[Git][ghc/ghc][wip/strings] [ci skip] wip

Sat Aug 24 23:18:42 UTC 2024

Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
8cc82093 by Brandon Chinn at 2024-08-24T16:18:32-07:00
[ci skip] wip

- - - - -


2 changed files:

- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs


Changes:

=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -2271,7 +2271,9 @@ tok_string_multi startSpan startBuf _len _buf2 = do
 
   -- load the content of the multiline string
   let contentLen = byteDiff contentStartBuf contentEndBuf
-  s <- either lexError pure . postprocessMultilineString $ lexemeToString contentStartBuf contentLen
+  s <-
+    either (throwStringLexError (AI startLoc startBuf)) pure $
+      lexMultilineString contentLen contentStartBuf
 
   setInput i'
   pure $ L span $ ITstring_multi src (mkFastString s)
@@ -2325,22 +2327,24 @@ tok_string_multi_content :: Action
 tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
 
 lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
-lex_chars (startDelim, endDelim) span buf len = resolveEscapes' . collapseGaps $ lexemeToString content_buf content_len
+lex_chars (startDelim, endDelim) span buf len =
+  either (throwStringLexError (AI (psSpanStart span) buf)) pure $
+    lexString content_len content_buf
   where
-    resolveEscapes' = either throwEscapeErr pure . resolveEscapes
-
-    -- match the normal lexical errors:
-    --   * position = beginning of string
-    --   * character = the illegal character
-    throwEscapeErr e =
-      let loc = psSpanStart span
-          buf' = buf -- FIXME.bchinn
-       in setInput (AI loc buf') >> lexError e
-
     -- assumes delimiters are ASCII, with 1 byte per Char
     content_len = len - length startDelim - length endDelim
     content_buf = offsetBytes (length startDelim) buf
 
+throwStringLexError :: AlexInput -> StringLexError -> P a
+throwStringLexError i (StringLexError e off) = setInput i' >> lexError e
+  where
+    -- Match the AlexInput that happens with normal lexical errors:
+    --   * position  = beginning of string literal
+    --   * character = the illegal character
+    i' =
+      let AI loc buf = i
+       in AI loc (offsetBytes off buf)
+
 
 tok_quoted_label :: Action
 tok_quoted_label span buf len _buf2 = do


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,11 +1,10 @@
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}
 
 module GHC.Parser.String (
-  collapseGaps,
-  resolveEscapes,
-
-  -- * Multiline strings
-  postprocessMultilineString,
+  StringLexError (..),
+  lexString,
+  lexMultilineString,
 
   -- * Unicode smart quote helpers
   isDoubleSmartQuote,
@@ -15,13 +14,14 @@ module GHC.Parser.String (
 import GHC.Prelude
 
 import Control.Arrow ((>>>))
-import Control.DeepSeq (deepseq)
-import Control.Exception (Exception, catch, throw)
 import Control.Monad (when)
+import Data.Bifunctor (first)
 import Data.Char (chr, ord)
 import qualified Data.Foldable1 as Foldable1
 import qualified Data.List.NonEmpty as NonEmpty
-import Data.Maybe (listToMaybe, mapMaybe)
+import Data.Maybe (mapMaybe)
+import GHC.Data.StringBuffer (StringBuffer)
+import qualified GHC.Data.StringBuffer as StringBuffer
 import GHC.Parser.CharClass (
   hexDigit,
   is_decdigit,
@@ -32,90 +32,122 @@ import GHC.Parser.CharClass (
  )
 import GHC.Parser.Errors.Types (LexErr (..))
 import GHC.Utils.Panic (panic)
-import System.IO.Unsafe (unsafePerformIO)

--- | Collapse string gaps. Assumes the string is lexically valid.
-collapseGaps :: String -> String
-collapseGaps = go
+-- | A lex error occurring when parsing a string, containing the error itself
+-- and the position in the buffer where the error occurred.
+data StringLexError = StringLexError LexErr Int
+
+lexString :: Int -> StringBuffer -> Either StringLexError String
+lexString = lexStringWith resolveEscapeCharBuf
+
+-- | Lex a StringBuffer into a String, with escapes resolved using the given
+-- function.
+--
+-- Ideally, this would be broken up into three functions composed together:
+--   1. Convert StringBuffer -> String
+--   2. Collapse gaps
+--   3. Resolve escapes
+-- However, escape resolution needs to be able to throw errors containing the
+-- location of the error, which we lose information for after converting to
+-- String. So we have to do these three steps at the same time so we have
+-- location information when resolving escapes.
+lexStringWith ::
+     (StringBuffer -> Either StringLexError ([Char], StringBuffer))
+  -> Int
+  -> StringBuffer
+  -> Either StringLexError String
+lexStringWith resolveEscape len initialBuf = go id initialBuf
   where
-    go = \case
-      '\\' : '\\' : cs -> '\\' : '\\' : go cs
-      '\\' : c : cs | is_space c -> go $ dropGap cs
-      c : cs -> c : go cs
-      [] -> []
+    go acc buf0 =
+      case nextCharBuf buf0 of
+        _ | atEnd buf0 -> pure $ acc []
+        ('\\', buf1)
+          | ('&', buf2) <- nextCharBuf buf1 -> go acc buf2
+          | (c, buf2) <- nextCharBuf buf1, is_space c -> go acc $ dropGap buf2
+          | otherwise ->
+              case resolveEscape buf1 of
+                Right (cs, buf2) -> go (acc . (cs ++)) buf2
+                Left e -> Left e
+        (c, buf1) -> go (acc . (c :)) buf1
+
+    dropGap buf =
+      case nextCharBuf buf of
+        ('\\', buf') -> buf'
+        (_, buf') -> dropGap buf'
 
-    dropGap = \case
-      '\\' : cs -> cs
-      _ : cs -> dropGap cs
-      [] -> panic "gap unexpectedly ended"
+    nextCharBuf = StringBuffer.nextChar
+
+    endBuf = StringBuffer.offsetBytes len initialBuf
+    atEnd buf = StringBuffer.byteDiff buf endBuf <= 0 -- should never be negative, but just in case
 
 -- -----------------------------------------------------------------------------
 -- Escape characters
 
-newtype LexErrE = LexErrE LexErr
-instance Show LexErrE where
-  show _ = "<LexErr>" -- we only need this for the resolveEscapes hack, so doesn't have to be meaningful
-instance Exception LexErrE
-
--- | Resolve escape characters. Assumes the string is lexically valid + gaps have been collapsed.
-resolveEscapes :: String -> Either LexErr String
-resolveEscapes s0 = do
-  let s = resolve s0
-  unsafePerformIO $ (s `deepseq` pure (Right s)) `catch` \(LexErrE e) -> pure (Left e)
+-- | 'resolveEscapeChar' specialized to StringBuffer.
+resolveEscapeCharBuf :: StringBuffer -> Either StringLexError ([Char], StringBuffer)
+resolveEscapeCharBuf = fmap (first (:[])) . resolveEscapeChar StringBuffer.nextChar mkStrLexError
   where
-    -- Unfortunately, `resolve` is only performant if it's pure; allocations
-    -- and performance degrade when `resolve` is implemented in P or ST. So
-    -- we'll throw an impure exception and catch it above with unsafePerformIO
-    resolve = \case
-      [] -> []
-      '\\' : '&' : cs -> resolve cs
-      '\\' : cs ->
-        case resolveEscapeCharacter cs of
-          Right (c, cs') -> c : resolve cs'
-          Left e -> throw (LexErrE e)
-      c : cs -> c : resolve cs
-
--- Assumes escape character is valid
-resolveEscapeCharacter :: [Char] -> Either LexErr (Char, [Char])
-resolveEscapeCharacter = \case
-  'a'  : cs -> pure ('\a', cs)
-  'b'  : cs -> pure ('\b', cs)
-  'f'  : cs -> pure ('\f', cs)
-  'n'  : cs -> pure ('\n', cs)
-  'r'  : cs -> pure ('\r', cs)
-  't'  : cs -> pure ('\t', cs)
-  'v'  : cs -> pure ('\v', cs)
-  '\\' : cs -> pure ('\\', cs)
-  '"'  : cs -> pure ('\"', cs)
-  '\'' : cs -> pure ('\'', cs)
-  -- escape codes
-  'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
-  'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
-  c : cs | is_decdigit c -> parseNum is_decdigit 10 octDecDigit (c : cs)
-  -- control characters (e.g. '\^M')
-  '^' : c : cs -> pure (chr $ ord c - ord '@', cs)
-  -- long form escapes (e.g. '\NUL')
-  cs | Just (c, cs') <- parseLongEscape cs -> pure (c, cs')
-  -- shouldn't happen
-  c : _ -> panic $ "found unexpected escape character: " ++ show c
-  [] -> panic $ "escape character unexpectedly ended"
+    mkStrLexError buf e = StringLexError e (StringBuffer.cur buf)
+
+-- | Resolve a escape character, after having just lexed a backslash.
+-- Assumes escape character is valid.
+resolveEscapeChar ::
+     (s -> (Char, s))
+  -> (s -> LexErr -> e)
+  -> s
+  -> Either e (Char, s)
+resolveEscapeChar nextChar mkStrLexError s0 =
+  case nextChar s0 of
+    ('a' , s1) -> pure ('\a', s1)
+    ('b' , s1) -> pure ('\b', s1)
+    ('f' , s1) -> pure ('\f', s1)
+    ('n' , s1) -> pure ('\n', s1)
+    ('r' , s1) -> pure ('\r', s1)
+    ('t' , s1) -> pure ('\t', s1)
+    ('v' , s1) -> pure ('\v', s1)
+    ('\\', s1) -> pure ('\\', s1)
+    ('"' , s1) -> pure ('\"', s1)
+    ('\'', s1) -> pure ('\'', s1)
+    -- escape codes
+    ('x', s1) -> parseNum is_hexdigit 16 hexDigit s1
+    ('o', s1) -> parseNum is_octdigit 8 octDecDigit s1
+    (c, _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit s0
+    -- control characters (e.g. '\^M')
+    ('^', s1) | (c, s2) <- nextChar s1 -> pure (chr $ ord c - ord '@', s2)
+    -- long form escapes (e.g. '\NUL')
+    _ | Just (c, s1) <- parseLongEscape nextChar s0 -> pure (c, s1)
+    -- shouldn't happen
+    (c, _) -> panic $ "found unexpected escape character: " ++ show c
   where
     parseNum isDigit base toDigit =
-      let go x = \case
-            c : cs | isDigit c -> do
-              let x' = x * base + toDigit c
-              when (x' > 0x10ffff) $ Left LexNumEscapeRange
-              go x' cs
-            cs -> pure (chr x, cs)
+      let go x s =
+            case nextChar s of
+              (c, s') | isDigit c -> do
+                let x' = x * base + toDigit c
+                when (x' > 0x10ffff) $ Left (mkStrLexError s LexNumEscapeRange)
+                go x' s'
+              _ -> pure (chr x, s)
        in go 0
 
-parseLongEscape :: [Char] -> Maybe (Char, [Char])
-parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
+parseLongEscape :: (s -> (Char, s)) -> s -> Maybe (Char, s)
+parseLongEscape nextChar = match longEscapeCodes
   where
-    tryParse (code, c) =
-      case splitAt (length code) cs of
-        (pre, cs') | pre == code -> Just (c, cs')
-        _ -> Nothing
+    -- Iteratively get the next character and filter the list of codes
+    -- by codes starting with that character, until we find a code that
+    -- completely matches.
+    -- Assumes that none of the codes are prefixed by another code
+    match []        _ = Nothing
+    match [("", c)] s = Just (c, s)
+    match codes     s =
+      let (c, s') = nextChar s
+       in match (mapMaybe (fmapFst $ stripHead c) codes) s'
+
+    stripHead :: Eq a => a -> [a] -> Maybe [a]
+    stripHead x1 (x2 : xs) | x1 == x2 = Just xs
+    stripHead _ _ = Nothing
+
+    fmapFst :: Functor f => (a -> f b) -> (a, c) -> f (b, c)
+    fmapFst f (a, c) = (, c) <$> f a
 
     longEscapeCodes =
       [ ("NUL", '\NUL')
@@ -126,14 +158,14 @@ parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
       , ("ENQ", '\ENQ')
       , ("ACK", '\ACK')
       , ("BEL", '\BEL')
-      , ("BS", '\BS')
-      , ("HT", '\HT')
-      , ("LF", '\LF')
-      , ("VT", '\VT')
-      , ("FF", '\FF')
-      , ("CR", '\CR')
-      , ("SO", '\SO')
-      , ("SI", '\SI')
+      , ("BS" , '\BS' )
+      , ("HT" , '\HT' )
+      , ("LF" , '\LF' )
+      , ("VT" , '\VT' )
+      , ("FF" , '\FF' )
+      , ("CR" , '\CR' )
+      , ("SO" , '\SO' )
+      , ("SI" , '\SI' )
       , ("DLE", '\DLE')
       , ("DC1", '\DC1')
       , ("DC2", '\DC2')
@@ -143,14 +175,14 @@ parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
       , ("SYN", '\SYN')
       , ("ETB", '\ETB')
       , ("CAN", '\CAN')
-      , ("EM", '\EM')
+      , ("EM" , '\EM' )
       , ("SUB", '\SUB')
       , ("ESC", '\ESC')
-      , ("FS", '\FS')
-      , ("GS", '\GS')
-      , ("RS", '\RS')
-      , ("US", '\US')
-      , ("SP", '\SP')
+      , ("FS" , '\FS' )
+      , ("GS" , '\GS' )
+      , ("RS" , '\RS' )
+      , ("US" , '\US' )
+      , ("SP" , '\SP' )
       , ("DEL", '\DEL')
       ]
 
@@ -177,16 +209,24 @@ isSingleSmartQuote = \case
 -- Assumes string is lexically valid. Skips the steps about splitting
 -- and rejoining lines, and instead manually find newline characters,
 -- for performance.
-postprocessMultilineString :: String -> Either LexErr String
-postprocessMultilineString =
-      collapseGaps             -- Step 1
-  >>> expandLeadingTabs        -- Step 3
-  >>> rmCommonWhitespacePrefix -- Step 4
-  >>> collapseOnlyWsLines      -- Step 5
-  >>> rmFirstNewline           -- Step 7a
-  >>> rmLastNewline            -- Step 7b
-  >>> resolveEscapes           -- Step 8
+lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
+lexMultilineString len buf = postprocess <$> lexStringWith validateEscapeChar len buf
   where
+    -- When initially lexing the string, throw any errors when parsing escape
+    -- characters, but otherwise, keep the escape characters verbatim and resolve
+    -- them at the end of postprocessing.
+    validateEscapeChar buf =
+      let (c, buf') = StringBuffer.nextChar buf
+       in (['\\', c], buf') <$ resolveEscapeCharBuf buf
+
+    postprocess =
+          expandLeadingTabs        -- Step 3
+      >>> rmCommonWhitespacePrefix -- Step 4
+      >>> collapseOnlyWsLines      -- Step 5
+      >>> rmFirstNewline           -- Step 7a
+      >>> rmLastNewline            -- Step 7b
+      >>> resolveEscapes           -- Step 8
+
     expandLeadingTabs =
       let go !col = \case
             c@' ' : cs -> c : go (col + 1) cs
@@ -237,6 +277,20 @@ postprocessMultilineString =
             c : cs -> c : go cs
        in go
 
+    resolveEscapes =
+      let uncons = \case
+            c : s -> (c, s)
+            [] -> panic "escape character unexpectedly ended"
+          go = \case
+            [] -> []
+            '\\' : s ->
+              case resolveEscapeChar uncons (\_ _ -> ()) s of
+                Right (c, s') -> c : go s'
+                -- shouldn't happen, as we've already validated escapes in validateEscapeChar
+                Left _ -> panic "unexpectedly encountered invalid escape character"
+            c : s -> c : go s
+       in go
+
 -- | See step 4 in Note [Multiline string literals]
 --
 -- Assumes tabs have already been expanded.



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/8cc82093263c028854381e3fc8b201a5a2689ffd

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/8cc82093263c028854381e3fc8b201a5a2689ffd
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240824/17b72dc1/attachment-0001.html>