[Git][ghc/ghc][wip/strings] StringBuffer-centric `escaped_string_content`

Wed Aug 14 09:24:46 UTC 2024


Sebastian Graf pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
27ee0e34 by Sebastian Graf at 2024-08-14T11:24:15+02:00
StringBuffer-centric `escaped_string_content`

- - - - -


3 changed files:

- compiler/GHC/Data/StringBuffer.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs


Changes:

=====================================
compiler/GHC/Data/StringBuffer.hs
=====================================
@@ -38,6 +38,7 @@ module GHC.Data.StringBuffer
         stepOn,
         offsetBytes,
         byteDiff,
+        takeBytes,
         atLine,
 
         -- * Conversion
@@ -323,6 +324,10 @@ offsetBytes i s = s { cur = cur s + i }
 byteDiff :: StringBuffer -> StringBuffer -> Int
 byteDiff s1 s2 = cur s2 - cur s1
 
+-- | Restricts the length of the string buffer to the given number of bytes.
+takeBytes :: Int -> StringBuffer -> StringBuffer
+takeBytes new_len s = assert (len s >= new_len) $ s { len = new_len }
+
 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
 atEnd :: StringBuffer -> Bool
 atEnd (StringBuffer _ l c) = l == c


=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -84,11 +84,8 @@ import GHC.Prelude
 import qualified GHC.Data.Strict as Strict
 
 -- base
-import Control.DeepSeq (deepseq)
-import Control.Exception (catch, throw)
 import Control.Monad
 import Control.Applicative
-import Data.Bifunctor (first)
 import Data.Char
 import Data.List (stripPrefix, isInfixOf, partition)
 import Data.List.NonEmpty ( NonEmpty(..) )
@@ -96,7 +93,6 @@ import qualified Data.List.NonEmpty as NE
 import Data.Maybe
 import Data.Word
 import Debug.Trace (trace)
-import System.IO.Unsafe (unsafePerformIO)
 
 import GHC.Data.EnumSet as EnumSet
 
@@ -2234,7 +2230,7 @@ lex_string_prag_comment mkTok span _buf _len _buf2
 -- FIXME.bchinn: throw better error for escaped smart quotes
 tok_string :: Action
 tok_string span buf len _buf2 = do
-  s <- lex_string ("\"", "\"") buf (if endsInHash then len - 1 else len)
+  s <- tok_string_like ("\"", "\"") span buf (if endsInHash then len - 1 else len)
 
   if endsInHash
     then do
@@ -2252,7 +2248,8 @@ tok_string span buf len _buf2 = do
 
 tok_string_multi_line :: Action
 tok_string_multi_line span buf len _buf2 = do
-  s <- lex_string' resolveEsc ("", "\"\"\"") buf len
+  -- TODO:
+  -- s <- tok_string_like ("", "\"\"\"") buf len
 
   -- this action guaranteed to only run after lexing a newline or a triple quote
   case lastChar of
@@ -2260,72 +2257,57 @@ tok_string_multi_line span buf len _buf2 = do
     '\n' -> popLexState >> pushLexState string_multi_line
     _    -> panic $ "Got unexpected last character: " ++ show lastChar
 
-  pure $ L span (ITstring_multi_line src (mkFastString s))
+  pure $ L span (ITstring_multi_line src s)
   where
-    src = SourceText $ lexemeToFastString buf len
+    s = lexemeToFastString buf len
+    src = SourceText s
     lastChar = currentChar (offsetBytes (len - 1) buf)
 
-    -- Don't resolve escape characters here, defer until postprocessMultilineString.
-    -- However, we still want to validate them
-    resolveEsc cs =
-      let result =
-            case cs of
-              c : cs' -> (['\\', c], cs') -- for the sake of validation, pretend we always escape just one character
-              [] -> panic "Unexpectedly resolving an empty escape character"
-       in result <$ resolveEscapeCharacter cs
-
-lex_string :: (String, String) -> StringBuffer -> Int -> P String
-lex_string = lex_string' resolveEsc
+tok_string_like :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+tok_string_like (startDelim, endDelim) span buf len = escaped_string_content loc buf_content
   where
-    resolveEsc = fmap (first (:[])) . resolveEscapeCharacter
+    loc = foldr (flip advancePsLoc) (psSpanStart span) startDelim
+    buf_content =
+      takeBytes (len - length endDelim) $ -- NB: ASCII => 1 byte per Char
+      offsetBytes (length startDelim) $
+      buf
 
 newtype LexErrE = LexErrE LexErr deriving (Show)
 instance Exception LexErrE
 
-lex_string' :: ([Char] -> Either LexErr ([Char], [Char])) -> (String, String) -> StringBuffer -> Int -> P String
-lex_string' resolveEsc (startDelim, endDelim) buf len = do
-  let s = resolve . stripDelims $ lexemeToString buf len
-
-  -- Unfortunately, `resolve` is only performant if it's pure; allocations
-  -- and performance degrade when `resolve` is implemented in P or ST. So
-  -- we'll throw an impure exception and catch it here
-  unsafePerformIO $ (s `deepseq` pure (pure ())) `catch` \(LexErrE e) -> pure $ lexError e
-
-  pure s
+escaped_string_content :: PsLoc -> StringBuffer -> P String
+escaped_string_content loc buf0 = resolve id buf0 -- id: empty DList for the result string
   where
-    stripDelims = stripPrefixOrId startDelim . stripSuffixOrId endDelim
-    stripPrefixOrId pre s = fromMaybe s (stripPrefix pre s)
-
-    -- assumes string was lexed correctly
-    resolve = \case
-      [] -> []
-      '\\' : '&' : cs -> resolve cs
-      '\\' : c : cs | is_space c -> resolve $ dropGap cs
-      '\\' : cs ->
-        case resolveEsc cs of
-          Right (esc, cs') -> esc ++ resolve cs'
-          Left e -> throw (LexErrE e)
-      c : cs -> c : resolve cs
-
-    dropGap = \case
-      '\\' : cs -> cs
-      _ : cs -> dropGap cs
-      [] -> panic "gap unexpectedly ended"
-
-    -- unlike stripPrefixOrId, we implement this manually,
-    -- because implementing stripSuffix returning a Maybe
-    -- has much worse performance
-    -- TODO: move to Data.List?
-    stripSuffixOrId :: Eq a => [a] -> [a] -> [a]
-    stripSuffixOrId suf xs =
-      let go (a:as) (_:bs) = a : go as bs
-          go as _ = if as == suf then [] else as
-       in go xs (drop (length suf) xs)
-
+    !h !: !tl = h : tl
+    -- the following FSM assumes the string was lexed correctly
+    resolve res buf = case nextChar buf of
+      _ | atEnd buf -> pure $! res []
+      ('\\', buf')  -> resolve_esc res           buf'
+      (c,    buf')  -> resolve     ((c!:) . res) buf'
+
+    resolve_esc res buf = case nextChar buf of
+      ('&', buf') -> resolve res buf'
+      (c,   buf') | is_space c -> resolve res (drop_gap buf')
+      _ -> case resolveEscapeCharacter buf nextChar of
+        Right (esc, s) -> resolve ((esc!:) . res) s
+        Left  e        -> do
+          setInput (AI loc buf0 `advance_input_to` cur buf)
+          lexError e
+
+    drop_gap buf = case nextChar buf of
+      ('\\', buf') -> buf'
+      (_,    buf') -> drop_gap buf'
+
+    advance_input_to ai@(AI _ buf) offs
+      | cur buf < offs
+      , Just (_, ai') <- alexGetChar' ai
+      = advance_input_to ai' offs
+      | otherwise
+      = assert (cur buf == offs) ai
 
 tok_quoted_label :: Action
 tok_quoted_label span buf len _buf2 = do
-  s <- lex_string ("#\"", "\"") buf len
+  s <- tok_string_like ("#\"", "\"") span buf len
   (AI end bufEnd) <- getInput
   let
     token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2337,7 +2319,7 @@ tok_quoted_label span buf len _buf2 = do
 
 tok_char :: Action
 tok_char span buf len _buf2 = do
-  c <- lex_string ("'", "'") buf (if endsInHash then len - 1 else len) >>= \case
+  c <- tok_string_like ("'", "'") span buf (if endsInHash then len - 1 else len) >>= \case
     [c] -> pure c
     s -> panic $ "tok_char expected exactly one character, got: " ++ show s
   pure . L span $


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -18,7 +18,7 @@ import Control.Monad (when)
 import Data.Char (chr, ord)
 import qualified Data.Foldable1 as Foldable1
 import qualified Data.List.NonEmpty as NonEmpty
-import Data.Maybe (listToMaybe, mapMaybe)
+import Data.Maybe (mapMaybe)
 import GHC.Parser.CharClass (
   hexDigit,
   is_decdigit,
@@ -34,46 +34,55 @@ import GHC.Utils.Panic (panic)
 -- Escape characters
 
 -- Assumes escape character is valid
-resolveEscapeCharacter :: [Char] -> Either LexErr (Char, [Char])
-resolveEscapeCharacter = \case
-  'a'  : cs -> pure ('\a', cs)
-  'b'  : cs -> pure ('\b', cs)
-  'f'  : cs -> pure ('\f', cs)
-  'n'  : cs -> pure ('\n', cs)
-  'r'  : cs -> pure ('\r', cs)
-  't'  : cs -> pure ('\t', cs)
-  'v'  : cs -> pure ('\v', cs)
-  '\\' : cs -> pure ('\\', cs)
-  '"'  : cs -> pure ('\"', cs)
-  '\'' : cs -> pure ('\'', cs)
+resolveEscapeCharacter :: s -> (s -> (Char, s)) -> Either LexErr (Char, s)
+-- In practice, s is a StringBuffer. We never check for `atEnd` before we
+-- call `nextChar`, which sounds dangerous.
+-- However, in practice string content is terminated by a string delimiter, so
+-- we avoid UB by never progressing beyond such a character.
+resolveEscapeCharacter s next = case next' s of
+  ('a',  s') -> pure ('\a', s')
+  ('b',  s') -> pure ('\b', s')
+  ('f',  s') -> pure ('\f', s')
+  ('n',  s') -> pure ('\n', s')
+  ('r',  s') -> pure ('\r', s')
+  ('t',  s') -> pure ('\t', s')
+  ('v',  s') -> pure ('\v', s')
+  ('\\', s') -> pure ('\\', s')
+  ('"' , s') -> pure ('\"', s')
+  ('\'', s') -> pure ('\'', s')
   -- escape codes
-  'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
-  'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
-  c : cs | is_decdigit c -> parseNum is_decdigit 10 octDecDigit (c : cs)
+  ('x',  s') -> parseNum is_hexdigit 16 hexDigit s'
+  ('o',  s') -> parseNum is_octdigit 8 octDecDigit s'
+  (c,    _)  | is_decdigit c -> parseNum is_decdigit 10 octDecDigit s
   -- control characters (e.g. '\^M')
-  '^' : c : cs -> pure (chr $ ord c - ord '@', cs)
+  ('^',  s') | (c, s') <- next' s' -> pure (chr $ ord c - ord '@', s')
   -- long form escapes (e.g. '\NUL')
-  cs | Just (c, cs') <- parseLongEscape cs -> pure (c, cs')
+  _ | Just (!c, !s') <- parseLongEscape s next' -> pure (c, s')
   -- shouldn't happen
-  c : _ -> panic $ "found unexpected escape character: " ++ show c
-  [] -> panic $ "escape character unexpectedly ended"
+  (c,    _)  -> panic $ "found unexpected escape character: " ++ show c
   where
+    next' s = case next s of
+      (!c, !s') -> (c, s')
     parseNum isDigit base toDigit =
-      let go x = \case
-            c : cs | isDigit c -> do
+      let go !x s = case next' s of
+            (c, s') | isDigit c -> do
               let x' = x * base + toDigit c
               when (x' > 0x10ffff) $ Left LexNumEscapeRange
-              go x' cs
-            cs -> pure (chr x, cs)
+              go x' s'
+            _ -> pure (chr x, s)
        in go 0
 
-parseLongEscape :: [Char] -> Maybe (Char, [Char])
-parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
+parseLongEscape :: s -> (s -> (Char, s)) -> Maybe (Char, s)
+parseLongEscape s next = match longEscapeCodes s
   where
-    tryParse (code, c) =
-      case splitAt (length code) cs of
-        (pre, cs') | pre == code -> Just (c, cs')
-        _ -> Nothing
+    match []        !_ = Nothing
+    match [("", c)] !s = Just (c, s)
+    match codes      s = case next s of
+      (c, s') -> match (mapMaybe (pop_matching_head c) codes) s'
+    pop_matching_head c1 (esc, r)
+      | c2:esc <- esc
+      , c1 == c2  = Just (esc, r)
+      | otherwise = Nothing
 
     longEscapeCodes =
       [ ("NUL", '\NUL')
@@ -195,7 +204,7 @@ postprocessMultilineString =
     resolveEscapeChars = \case
       [] -> []
       '\\' : s ->
-        case resolveEscapeCharacter s of
+        case resolveEscapeCharacter s (\s -> (head s, tail s)) of
           Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
           Right (c, s') -> c : resolveEscapeChars s'
       c : s -> c : resolveEscapeChars s



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/27ee0e346ac5c24d41a621b843f6313fdb6defbc

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/27ee0e346ac5c24d41a621b843f6313fdb6defbc
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240814/76c6d65c/attachment-0001.html>