[Git][ghc/ghc][wip/strings] Lex multiline strings using LexState, to allow bare quotes

Mon Aug 12 05:52:46 UTC 2024

Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
e3442c21 by Brandon Chinn at 2024-08-11T22:52:37-07:00
Lex multiline strings using LexState, to allow bare quotes

- - - - -


5 changed files:

- compiler/GHC/Parser.y
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- compiler/GHC/Types/SourceText.hs
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs


Changes:

=====================================
compiler/GHC/Parser.y
=====================================
@@ -88,6 +88,7 @@ import GHC.Parser.HaddockLex
 import GHC.Parser.Annotation
 import GHC.Parser.Errors.Types
 import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.String
 
 import GHC.Builtin.Types ( unitTyCon, unitDataCon, sumTyCon,
                            tupleTyCon, tupleDataCon, nilDataCon,
@@ -728,7 +729,9 @@ are the most common patterns, rewritten as regular expressions for clarity:
 
  CHAR           { L _ (ITchar   _ _) }
  STRING         { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI_BEGIN   { L _ (ITstring_multi_begin      ) }
+ STRING_MULTI_CONTENT { L _ (ITstring_multi_content _ _) }
+ STRING_MULTI_END     { L _ (ITstring_multi_end        ) }
  INTEGER        { L _ (ITinteger _) }
  RATIONAL       { L _ (ITrational _) }
 
@@ -2341,8 +2344,8 @@ atype :: { LHsType GhcPs }
                                                                         (getCHAR $1) }
         | STRING               { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
                                                                      (getSTRING  $1) }
-        | MULTILINESTRING      { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
-                                                                     (getMULTILINESTRING  $1) }
+        | string_multi         { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
+                                                                     (getSTRING  $1) }
         | '_'                  { sL1a $1 $ mkAnonWildCardTy }
         -- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
         -- We let it pass the parser because the renamer can generate a better error message.
@@ -4031,8 +4034,8 @@ literal :: { Located (HsLit GhcPs) }
         : CHAR              { sL1 $1 $ HsChar       (getCHARs $1) $ getCHAR $1 }
         | STRING            { sL1 $1 $ HsString     (getSTRINGs $1)
                                                     $ getSTRING $1 }
-        | MULTILINESTRING   { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
-                                                    $ getMULTILINESTRING $1 }
+        | string_multi      { sL1 $1 $ HsMultilineString (getSTRINGs $1)
+                                                    $ getSTRING $1 }
         | PRIMINTEGER       { sL1 $1 $ HsIntPrim    (getPRIMINTEGERs $1)
                                                     $ getPRIMINTEGER $1 }
         | PRIMWORD          { sL1 $1 $ HsWordPrim   (getPRIMWORDs $1)
@@ -4060,6 +4063,13 @@ literal :: { Located (HsLit GhcPs) }
         | PRIMFLOAT         { sL1 $1 $ HsFloatPrim  noExtField $ getPRIMFLOAT $1 }
         | PRIMDOUBLE        { sL1 $1 $ HsDoublePrim noExtField $ getPRIMDOUBLE $1 }
 
+string_multi :: { Located Token }
+        : STRING_MULTI_BEGIN string_multi_content STRING_MULTI_END { resolveStringMultiContent $1 (reverse $2) $3 }
+
+string_multi_content :: { [Located Token] }
+        : {- empty -}                               { [] }
+        | string_multi_content STRING_MULTI_CONTENT { $2 : $1 }
+
 -----------------------------------------------------------------------------
 -- Layout
 
@@ -4138,7 +4148,6 @@ getIPDUPVARID     (L _ (ITdupipvarid   x)) = x
 getLABELVARID     (L _ (ITlabelvarid _ x)) = x
 getCHAR           (L _ (ITchar   _ x)) = x
 getSTRING         (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
 getINTEGER        (L _ (ITinteger x))  = x
 getRATIONAL       (L _ (ITrational x)) = x
 getPRIMCHAR       (L _ (ITprimchar _ x)) = x
@@ -4164,7 +4173,6 @@ getVOCURLY        (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
 getINTEGERs       (L _ (ITinteger (IL src _ _))) = src
 getCHARs          (L _ (ITchar       src _)) = src
 getSTRINGs        (L _ (ITstring     src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
 getPRIMCHARs      (L _ (ITprimchar   src _)) = src
 getPRIMSTRINGs    (L _ (ITprimstring src _)) = src
 getPRIMINTEGERs   (L _ (ITprimint    src _)) = src
@@ -4202,6 +4210,17 @@ getCTYPEs             (L _ (ITctype             src)) = src
 
 getStringLiteral l = StringLiteral (getSTRINGs l) (getSTRING l) Nothing
 
+resolveStringMultiContent begin contents end =
+    let loc = foldr3 combineSrcSpans (getLoc begin) (map getLoc contents) (getLoc end)
+        src = foldr3 combineSourceText delim (map getMultiContentSrc contents) delim
+        s = mkFastString . postprocessMultilineString . unpackFS $ mconcat (map getMultiContent contents)
+     in L loc (ITstring src s)
+  where
+    delim = SourceText $ mkFastString "\"\"\""
+    foldr3 f x0 x1s x2 = foldr f x0 (x1s ++ [x2])
+    getMultiContentSrc (L _ (ITstring_multi_content src _)) = src
+    getMultiContent    (L _ (ITstring_multi_content _   s)) = s
+
 isUnicode :: Located Token -> Bool
 isUnicode (L _ (ITforall         iu)) = iu == UnicodeSyntax
 isUnicode (L _ (ITdarrow         iu)) = iu == UnicodeSyntax


=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -88,6 +88,7 @@ import Control.DeepSeq (deepseq)
 import Control.Exception (catch, throw)
 import Control.Monad
 import Control.Applicative
+import Data.Bifunctor (first)
 import Data.Char
 import Data.List (stripPrefix, isInfixOf, partition)
 import Data.List.NonEmpty ( NonEmpty(..) )
@@ -225,8 +226,8 @@ $docsym    = [\| \^ \* \$]
        | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
 @escape     = \\ ( $charesc      | @ascii | @decimal | o @octal | x @hexadecimal )
 @escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
- at stringchar = ($graphic # [\\ \"]) | \ | @escape     | @gap
- at char       = ($graphic # [\\ \']) | \ | @escapechar | @gap
+ at stringchar = ($graphic # [\\ \"]) | " " | @escape     | @gap
+ at char       = ($graphic # [\\ \']) | " " | @escapechar | @gap
 
 -- normal signed numerical literals can only be explicitly negative,
 -- not explicitly positive (contrast @exponent)
@@ -675,16 +676,30 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
 
 }

--- Strings and chars are lexed by hand-written code.  The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
 <0> {
-  \"\"\" (@stringchar | $whitechar | $tab)* \"\"\" / { ifExtension MultilineStringsBit} { tok_string_multi }
+  \"\"\" / { ifExtension MultilineStringsBit } { push_and string_multi_content $ token ITstring_multi_begin }
   \" @stringchar* \" \#? { tok_string }
   \' @char        \' \#? { tok_char }
 }
 
+<string_multi_bol, string_multi_content> {
+  \"\"\" { pop_and $ token ITstring_multi_end }
+}
+
+-- FIXME.bchinn: whitespace possibly getting slurped by $white_no_nl+ rule at top? maybe include @stringchar* here too?
+<string_multi_bol> {
+  ([\ $tab] | @gap)* { pop_and . push_and string_multi_content $ tok_string_multi_content }
+}
+
+<string_multi_content> {
+  @stringchar* { tok_string_multi_content }
+  $nl          { pop_and . push_and string_multi_bol $ tok_string_multi_content }
+
+  -- if we see a bare quote, but we haven't seen the triple quote,
+  -- this is a safe bare quote
+  \" { tok_string_multi_content }
+}
+
 <0> {
   \' \' { token ITtyQuote }
 
@@ -978,7 +993,9 @@ data Token
 
   | ITchar     SourceText Char       -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITstring   SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
-  | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+  | ITstring_multi_begin
+  | ITstring_multi_content SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+  | ITstring_multi_end
   | ITinteger  IntegralLit           -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITrational FractionalLit
 
@@ -1302,6 +1319,11 @@ pop_and act span buf len buf2 =
   do _ <- popLexState
      act span buf len buf2
 
+push_and :: Int -> Action -> Action
+push_and ls act span buf len buf2 =
+  do pushLexState ls
+     act span buf len buf2
+
 -- See Note [Whitespace-sensitive operator parsing]
 followedByOpeningToken, precededByClosingToken :: AlexAccPred ExtsBitmap
 followedByOpeningToken _ _ _ (AI _ buf) = followedByOpeningToken' buf
@@ -2206,10 +2228,10 @@ lex_string_prag_comment mkTok span _buf _len _buf2
 -- -----------------------------------------------------------------------------
 -- Strings & Chars
 
--- FIXME: figure out smart quotes (escaped smart quotes + smart quotes failing to end string lex)
+-- FIXME.bchinn: throw better error for escaped smart quotes
 tok_string :: Action
 tok_string span buf len _buf2 = do
-  s <- lex_string span buf (if isMagicHash then len - 1 else len) "\"" "\""
+  s <- lex_string ("\"", "\"") span buf (if isMagicHash then len - 1 else len)
 
   if isMagicHash
     then do
@@ -2225,30 +2247,53 @@ tok_string span buf len _buf2 = do
     src = SourceText $ lexemeToFastString buf len
     isMagicHash = currentChar (offsetBytes (len - 1) buf) == '#'
 
-tok_string_multi :: Action
-tok_string_multi span buf len _buf2 = do
-  s <- postprocessMultilineString <$> lex_string span buf len "\"\"\"" "\"\"\""
-  pure $ L span (ITmultilinestring src (mkFastString s))
+tok_string_multi_content :: Action
+tok_string_multi_content span buf len _buf2 = do
+  s <- lex_string' resolveEsc len $ AI (psSpanStart span) buf
+  pure $ L span (ITstring_multi_content src (mkFastString s))
   where
     src = SourceText $ lexemeToFastString buf len
 
-lex_string :: PsSpan -> StringBuffer -> Int -> String -> String -> P String
-lex_string span buf len startDelim endDelim = do
-  let s = go $ lexemeToString (offsetBytes (length startDelim) buf) numChars
+    -- Don't resolve escape characters here, defer until postprocessMultilineString.
+    -- However, we still want to validate them
+    resolveEsc cs =
+      let result =
+            case cs of
+              c : cs' -> (['\\', c], cs') -- for the sake of validation, pretend we always escape just one character
+              [] -> panic "Unexpectedly resolving an empty escape character"
+       in result <$ resolveEscapeCharacter cs
+
+lex_string :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+lex_string (startDelim, endDelim) span buf len = lex_string' resolveEsc numChars i0
+  where
+    resolveEsc = fmap (first (:[])) . resolveEscapeCharacter
+    numChars = len - (length startDelim + length endDelim)
+    i0 =
+      case applyM (length startDelim) (fmap snd . alexGetChar') (AI (psSpanStart span) buf) of
+        Just i -> i
+        Nothing -> panic "Unexpectedly reached EOF when advancing past string delimiter"
+
+    -- applyM 3 f x = f x >>= f >>= f
+    applyM n f = foldr (>=>) pure $ replicate n f
+
+lex_string' :: ([Char] -> Either ParseEscapeErr ([Char], [Char])) -> Int -> AlexInput -> P String
+lex_string' resolveEsc numChars i0@(AI _ buf) = do
+  let s = go $ lexemeToString buf numChars
 
   -- Unfortunately, `go` is only performant if it's pure; allocations
   -- and performance degrade when `go` is implemented in P or ST. So
   -- we'll throw an impure exception and catch it here
   unsafePerformIO $
     (s `deepseq` pure (pure ())) `catch` \e -> do
-      let i0 = AI (psSpanStart span) buf
-      let (e', i) = resolveParseEscapeErr alexGetChar' i0 numChars e
+      let (e', i) = resolveParseEscapeErr getNextLoc i0 numChars e
       pure $ setInput i >> lexError e'
 
   pure s
   where
-    -- the number of characters in the string
-    numChars = len - (length startDelim + length endDelim)
+    getNextLoc i =
+      case alexGetChar' i of
+        Just (_, i') -> i'
+        Nothing -> panic "Unexpectedly reached EOF when resolving ParseEscapeErr"
 
     -- assumes string was lexed correctly
     go = \case
@@ -2256,8 +2301,8 @@ lex_string span buf len startDelim endDelim = do
       '\\' : '&' : cs -> go cs
       '\\' : c : cs | is_space c -> go $ dropGap cs
       '\\' : cs ->
-        case resolveEscapeCharacter cs of
-          Right (c, cs') -> c : go cs'
+        case resolveEsc cs of
+          Right (esc, cs') -> esc ++ go cs'
           Left e -> throw e
       c : cs -> c : go cs
 
@@ -2269,7 +2314,7 @@ lex_string span buf len startDelim endDelim = do
 
 tok_quoted_label :: Action
 tok_quoted_label span buf len _buf2 = do
-  s <- lex_string span buf len "#\"" "\""
+  s <- lex_string ("#\"", "\"") span buf len
   (AI end bufEnd) <- getInput
   let
     token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2281,7 +2326,7 @@ tok_quoted_label span buf len _buf2 = do
 
 tok_char :: Action
 tok_char span buf len _buf2 = do
-  c <- lex_string span buf (if isMagicHash then len - 1 else len) "'" "'" >>= \case
+  c <- lex_string ("'", "'") span buf (if isMagicHash then len - 1 else len) >>= \case
     [c] -> pure c
     s -> panic $ "tok_char expected exactly one character, got: " ++ show s
   pure . L span $
@@ -3490,6 +3535,8 @@ lexToken = do
         setLastToken span 0
         return (L span ITeof)
     AlexError (AI loc2 buf) ->
+        -- FIXME.bchinn - check if any smart quotes between loc1 and loc2; that might
+        -- indicate that someone expected the smart quote to end a string literal
         reportLexError (psRealLoc loc1) (psRealLoc loc2) buf
           (\k srcLoc -> mkPlainErrorMsgEnvelope srcLoc $ PsErrLexer LexError k)
     AlexSkip inp2 _ -> do


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -79,21 +79,24 @@ newtype ParseEscapeErr =
 
 instance Exception ParseEscapeErr
 
--- | Get the LexErr and location of the error, given the location of the initial
--- string delimiter, a function to get the next location, and the total length of
+-- | Get the LexErr and location of the error, given the location of the first
+-- character in the string, a function to get the next location, and the total length of
 -- the string.
-resolveParseEscapeErr :: (loc -> Maybe (Char, loc)) -> loc -> Int -> ParseEscapeErr -> (LexErr, loc)
-resolveParseEscapeErr getChar loc len (ParseEscapeErr (e, indexFromEnd)) = (e, loc')
+--
+-- @
+--             v indexFromEnd = 4
+-- "a b \xffffff c d"
+--  ^ loc0     ^ loc, index = 11
+--  ^^^^^^^^^^^^^^^^ len = 16
+-- @
+resolveParseEscapeErr :: (loc -> loc) -> loc -> Int -> ParseEscapeErr -> (LexErr, loc)
+resolveParseEscapeErr getNextLoc loc0 len (ParseEscapeErr (e, indexFromEnd)) = (e, loc)
   where
     -- the index of the error, where 0 is the first character after the initial string delimiter
     index = len - indexFromEnd - 1
 
     -- the 'loc' corresponding to 'index'
-    loc' = iterate getNextLoc loc !! (index + 1)
-    getNextLoc l =
-      case getChar l of
-        Just (_, l') -> l'
-        Nothing -> panic "Unexpectedly reached EOF when resolving ParseEscapeErr"
+    loc = iterate getNextLoc loc0 !! index
 
 parseLongEscape :: [Char] -> Maybe (Char, [Char])
 parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes


=====================================
compiler/GHC/Types/SourceText.hs
=====================================
@@ -10,6 +10,7 @@
 module GHC.Types.SourceText
    ( SourceText (..)
    , pprWithSourceText
+   , combineSourceText
 
    -- * Literals
    , IntegralLit(..)
@@ -135,6 +136,10 @@ pprWithSourceText :: SourceText -> SDoc -> SDoc
 pprWithSourceText NoSourceText     d = d
 pprWithSourceText (SourceText src) _ = ftext src
 
+combineSourceText :: SourceText -> SourceText -> SourceText
+combineSourceText (SourceText s1) (SourceText s2) = SourceText (mappend s1 s2)
+combineSourceText _ _ = NoSourceText
+
 ------------------------------------------------
 -- Literals
 ------------------------------------------------


=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,9 @@ classify tok =
     ITlabelvarid{} -> TkUnknown
     ITchar{} -> TkChar
     ITstring{} -> TkString
-    ITmultilinestring{} -> TkString
+    ITstring_multi_begin{} -> TkSpecial
+    ITstring_multi_content{} -> TkString
+    ITstring_multi_end{} -> TkSpecial
     ITinteger{} -> TkNumber
     ITrational{} -> TkNumber
     ITprimchar{} -> TkChar



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e3442c21a643340802c108fc3d369a7ec1a7ae04

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e3442c21a643340802c108fc3d369a7ec1a7ae04
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240812/f1aadd83/attachment-0001.html>