[Git][ghc/ghc][wip/strings] Fixes

Sebastian Graf (@sgraf812) gitlab at gitlab.haskell.org
Wed Aug 14 12:57:42 UTC 2024



Sebastian Graf pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
b5d4cce6 by Sebastian Graf at 2024-08-14T14:57:36+02:00
Fixes

- - - - -


3 changed files:

- compiler/GHC/Data/StringBuffer.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs


Changes:

=====================================
compiler/GHC/Data/StringBuffer.hs
=====================================
@@ -96,8 +96,11 @@ unsafeWithForeignPtr = withForeignPtr
 data StringBuffer
  = StringBuffer {
      buf :: {-# UNPACK #-} !(ForeignPtr Word8),
-     len :: {-# UNPACK #-} !Int,        -- length
-     cur :: {-# UNPACK #-} !Int         -- current pos
+     len :: {-# UNPACK #-} !Int,
+      -- length, i.e., *end* position of the buffer, exclusive
+     cur :: {-# UNPACK #-} !Int
+      -- current pos. Invariant: cur < len. The true length of the represented
+      -- string is (len - cur - 1).
   }
   -- The buffer is assumed to be UTF-8 encoded, and furthermore
   -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
@@ -324,9 +327,13 @@ offsetBytes i s = s { cur = cur s + i }
 byteDiff :: StringBuffer -> StringBuffer -> Int
 byteDiff s1 s2 = cur s2 - cur s1
 
--- | Restricts the length of the string buffer to the given number of bytes.
+-- | Restricts the length of the string buffer to the current position plus the
+-- given number of bytes.
 takeBytes :: Int -> StringBuffer -> StringBuffer
-takeBytes new_len s = assert (len s >= new_len) $ s { len = new_len }
+takeBytes n s = assert (cur s + n < len s) s { len = cur s + n }
+
+byteLength :: StringBuffer -> Int
+byteLength s = len s - cur s - 1
 
 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
 atEnd :: StringBuffer -> Bool


=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -41,6 +41,7 @@
 -- Alex "Haskell code fragment top"
 
 {
+{-# OPTIONS_GHC -ddump-simpl -ddump-to-file #-}
 {-# LANGUAGE ViewPatterns #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiWayIf #-}
@@ -2264,12 +2265,14 @@ tok_string_multi_line span buf len _buf2 = do
     lastChar = currentChar (offsetBytes (len - 1) buf)
 
 tok_string_like :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
-tok_string_like (startDelim, endDelim) span buf len = escaped_string_content loc buf_content
+tok_string_like (startDelim, endDelim) span buf bytes =
+  -- pprTrace "tok_string_like" (ppr (mkSrcSpanPs span) <+> ppr bytes <+> text startDelim <+> text endDelim $$ ppr (cur buf) <+> ppr (len buf) $$ ppr (cur buf_content) <+> ppr (len buf_content)) $
+  escaped_string_content loc buf_content
   where
     loc = foldr (flip advancePsLoc) (psSpanStart span) startDelim
     buf_content =
-      takeBytes (len - length endDelim) $ -- NB: ASCII => 1 byte per Char
-      offsetBytes (length startDelim) $
+      offsetBytes (length startDelim) $ -- NB: ASCII => 1 byte per Char
+      takeBytes (bytes - length endDelim) $
       buf
 
 newtype LexErrE = LexErrE LexErr deriving (Show)
@@ -2278,21 +2281,22 @@ instance Exception LexErrE
 escaped_string_content :: PsLoc -> StringBuffer -> P String
 escaped_string_content loc buf0 = resolve id buf0 -- id: empty DList for the result string
   where
-    !h !: !tl = h : tl
+    snoc pre !h = \(!post) -> pre (h : post)
     -- the following FSM assumes the string was lexed correctly
     resolve res buf = case nextChar buf of
       _ | atEnd buf -> pure $! res []
-      ('\\', buf')  -> resolve_esc res           buf'
-      (c,    buf')  -> resolve     ((c!:) . res) buf'
+      ('\\', buf')  -> resolve_esc res            buf'
+      (c,    buf')  -> resolve     (res `snoc` c) buf'
 
     resolve_esc res buf = case nextChar buf of
       ('&', buf') -> resolve res buf'
       (c,   buf') | is_space c -> resolve res (drop_gap buf')
-      _ -> case resolveEscapeCharacter buf nextChar of
-        Right (esc, s) -> resolve ((esc!:) . res) s
-        Left  e        -> do
-          setInput (AI loc buf0 `advance_input_to` cur buf)
-          lexError e
+      (_c,  _) -> -- pprTrace "escape" (ppr (mkSrcSpanPs (mkPsSpan loc loc)) <+> char _c) $
+        case resolveEscapeCharacter buf nextChar of
+          Right (esc, s) -> resolve (res `snoc` esc) s
+          Left  e        -> do
+            setInput (AI loc buf0 `advance_input_to` cur buf)
+            lexError e
 
     drop_gap buf = case nextChar buf of
       ('\\', buf') -> buf'


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -75,10 +75,14 @@ resolveEscapeCharacter s next = case next' s of
 parseLongEscape :: s -> (s -> (Char, s)) -> Maybe (Char, s)
 parseLongEscape s next = match longEscapeCodes s
   where
-    match []        !_ = Nothing
-    match [("", c)] !s = Just (c, s)
-    match codes      s = case next s of
-      (c, s') -> match (mapMaybe (pop_matching_head c) codes) s'
+    match codes s | (c, s') <- next s =
+      -- pprTrace "match" (char c <+> pprWithCommas (text . fst) codes) $
+      case mapMaybe (pop_matching_head c) codes of
+        [] -> case lookup "" codes of
+          Just c  -> Just (c, s)
+          Nothing -> Nothing
+        codes' -> match codes' s'
+
     pop_matching_head c1 (esc, r)
       | c2:esc <- esc
       , c1 == c2  = Just (esc, r)



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b5d4cce620fb238f6fbac4cca8b224e0479d5f64

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b5d4cce620fb238f6fbac4cca8b224e0479d5f64
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240814/9f717da9/attachment-0001.html>


More information about the ghc-commits mailing list