[Git][ghc/ghc][wip/strings] [ci skip] wip

Sun Aug 11 23:26:19 UTC 2024

Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
2a2f1cf0 by Brandon Chinn at 2024-08-11T16:26:08-07:00
[ci skip] wip

- - - - -


4 changed files:

- compiler/GHC/Parser.y
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Types/SourceText.hs
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs


Changes:

=====================================
compiler/GHC/Parser.y
=====================================
@@ -88,6 +88,7 @@ import GHC.Parser.HaddockLex
 import GHC.Parser.Annotation
 import GHC.Parser.Errors.Types
 import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.String
 
 import GHC.Builtin.Types ( unitTyCon, unitDataCon, sumTyCon,
                            tupleTyCon, tupleDataCon, nilDataCon,
@@ -728,7 +729,9 @@ are the most common patterns, rewritten as regular expressions for clarity:
 
  CHAR           { L _ (ITchar   _ _) }
  STRING         { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI_BEGIN   { L _ (ITstring_multi_begin      ) }
+ STRING_MULTI_CONTENT { L _ (ITstring_multi_content _ _) }
+ STRING_MULTI_END     { L _ (ITstring_multi_end        ) }
  INTEGER        { L _ (ITinteger _) }
  RATIONAL       { L _ (ITrational _) }
 
@@ -762,7 +765,7 @@ TH_QUASIQUOTE   { L _ (ITquasiQuote _) }
 TH_QQUASIQUOTE  { L _ (ITqQuasiQuote _) }
 
 %monad { P } { >>= } { return }
-%lexer { (lexer True) } { L _ ITeof }
+%lexer { (lexerDbg True) } { L _ ITeof } -- FIXME: revert
   -- Replace 'lexer' above with 'lexerDbg'
   -- to dump the tokens fed to the parser.
 %tokentype { (Located Token) }
@@ -2341,8 +2344,8 @@ atype :: { LHsType GhcPs }
                                                                         (getCHAR $1) }
         | STRING               { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
                                                                      (getSTRING  $1) }
-        | MULTILINESTRING      { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
-                                                                     (getMULTILINESTRING  $1) }
+        | string_multi         { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
+                                                                     (getSTRING  $1) }
         | '_'                  { sL1a $1 $ mkAnonWildCardTy }
         -- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
         -- We let it pass the parser because the renamer can generate a better error message.
@@ -4031,8 +4034,8 @@ literal :: { Located (HsLit GhcPs) }
         : CHAR              { sL1 $1 $ HsChar       (getCHARs $1) $ getCHAR $1 }
         | STRING            { sL1 $1 $ HsString     (getSTRINGs $1)
                                                     $ getSTRING $1 }
-        | MULTILINESTRING   { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
-                                                    $ getMULTILINESTRING $1 }
+        | string_multi      { sL1 $1 $ HsMultilineString (getSTRINGs $1)
+                                                    $ getSTRING $1 }
         | PRIMINTEGER       { sL1 $1 $ HsIntPrim    (getPRIMINTEGERs $1)
                                                     $ getPRIMINTEGER $1 }
         | PRIMWORD          { sL1 $1 $ HsWordPrim   (getPRIMWORDs $1)
@@ -4060,6 +4063,13 @@ literal :: { Located (HsLit GhcPs) }
         | PRIMFLOAT         { sL1 $1 $ HsFloatPrim  noExtField $ getPRIMFLOAT $1 }
         | PRIMDOUBLE        { sL1 $1 $ HsDoublePrim noExtField $ getPRIMDOUBLE $1 }
 
+string_multi :: { Located Token }
+        : STRING_MULTI_BEGIN string_multi_content STRING_MULTI_END { resolveStringMultiContent $1 (reverse $2) $3 }
+
+string_multi_content :: { [Located Token] }
+        : {- empty -}                               { [] }
+        | string_multi_content STRING_MULTI_CONTENT { $2 : $1 }
+
 -----------------------------------------------------------------------------
 -- Layout
 
@@ -4138,7 +4148,6 @@ getIPDUPVARID     (L _ (ITdupipvarid   x)) = x
 getLABELVARID     (L _ (ITlabelvarid _ x)) = x
 getCHAR           (L _ (ITchar   _ x)) = x
 getSTRING         (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
 getINTEGER        (L _ (ITinteger x))  = x
 getRATIONAL       (L _ (ITrational x)) = x
 getPRIMCHAR       (L _ (ITprimchar _ x)) = x
@@ -4164,7 +4173,6 @@ getVOCURLY        (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
 getINTEGERs       (L _ (ITinteger (IL src _ _))) = src
 getCHARs          (L _ (ITchar       src _)) = src
 getSTRINGs        (L _ (ITstring     src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
 getPRIMCHARs      (L _ (ITprimchar   src _)) = src
 getPRIMSTRINGs    (L _ (ITprimstring src _)) = src
 getPRIMINTEGERs   (L _ (ITprimint    src _)) = src
@@ -4202,6 +4210,17 @@ getCTYPEs             (L _ (ITctype             src)) = src
 
 getStringLiteral l = StringLiteral (getSTRINGs l) (getSTRING l) Nothing
 
+resolveStringMultiContent begin contents end =
+    let loc = foldr3 combineSrcSpans (getLoc begin) (map getLoc contents) (getLoc end)
+        src = foldr3 combineSourceText delim (map getMultiContentSrc contents) delim
+        s = mkFastString . postprocessMultilineString . unpackFS $ mconcat (map getMultiContent contents)
+     in L loc (ITstring src s)
+  where
+    delim = SourceText $ mkFastString "\"\"\""
+    foldr3 f x0 x1s x2 = foldr f x0 (x1s ++ [x2])
+    getMultiContentSrc (L _ (ITstring_multi_content src _)) = src
+    getMultiContent    (L _ (ITstring_multi_content _   s)) = s
+
 isUnicode :: Located Token -> Bool
 isUnicode (L _ (ITforall         iu)) = iu == UnicodeSyntax
 isUnicode (L _ (ITdarrow         iu)) = iu == UnicodeSyntax


=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -225,8 +225,8 @@ $docsym    = [\| \^ \* \$]
        | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
 @escape     = \\ ( $charesc      | @ascii | @decimal | o @octal | x @hexadecimal )
 @escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
- at stringchar = ($graphic # [\\ \"]) | \ | @escape     | @gap
- at char       = ($graphic # [\\ \']) | \ | @escapechar | @gap
+ at stringchar = ($graphic # [\\ \"]) | " " | @escape     | @gap
+ at char       = ($graphic # [\\ \']) | " " | @escapechar | @gap
 
 -- normal signed numerical literals can only be explicitly negative,
 -- not explicitly positive (contrast @exponent)
@@ -675,16 +675,30 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
 
 }

--- Strings and chars are lexed by hand-written code.  The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
 <0> {
-  \"\"\" (@stringchar | $whitechar | $tab)* \"\"\" / { ifExtension MultilineStringsBit} { tok_string_multi }
+  \"\"\" / { ifExtension MultilineStringsBit } { push_and string_multi_content $ token ITstring_multi_begin }
   \" @stringchar* \" \#? { tok_string }
   \' @char        \' \#? { tok_char }
 }
 
+<string_multi_bol, string_multi_content> {
+  \"\"\" { pop_and $ token ITstring_multi_end }
+}
+
+-- FIXME: whitespace getting slurped by $white_no_nl+ rule at top; maybe include @stringchar* here too?
+<string_multi_bol> {
+  ([\ $tab] | @gap)* { pop_and . push_and string_multi_content $ tok_string_multi_content }
+}
+
+<string_multi_content> {
+  @stringchar* { tok_string_multi_content }
+  $nl          { pop_and . push_and string_multi_bol $ tok_string_multi_content }
+
+  -- if we see a bare quote, but we haven't seen the triple quote,
+  -- this is a safe bare quote
+  \" { tok_string_multi_content }
+}
+
 <0> {
   \' \' { token ITtyQuote }
 
@@ -978,7 +992,9 @@ data Token
 
   | ITchar     SourceText Char       -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITstring   SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
-  | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+  | ITstring_multi_begin
+  | ITstring_multi_content SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+  | ITstring_multi_end
   | ITinteger  IntegralLit           -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITrational FractionalLit
 
@@ -1302,6 +1318,11 @@ pop_and act span buf len buf2 =
   do _ <- popLexState
      act span buf len buf2
 
+push_and :: Int -> Action -> Action
+push_and ls act span buf len buf2 =
+  do pushLexState ls
+     act span buf len buf2
+
 -- See Note [Whitespace-sensitive operator parsing]
 followedByOpeningToken, precededByClosingToken :: AlexAccPred ExtsBitmap
 followedByOpeningToken _ _ _ (AI _ buf) = followedByOpeningToken' buf
@@ -2225,10 +2246,11 @@ tok_string span buf len _buf2 = do
     src = SourceText $ lexemeToFastString buf len
     isMagicHash = currentChar (offsetBytes (len - 1) buf) == '#'
 
-tok_string_multi :: Action
-tok_string_multi span buf len _buf2 = do
-  s <- postprocessMultilineString <$> lex_string span buf len "\"\"\"" "\"\"\""
-  pure $ L span (ITmultilinestring src (mkFastString s))
+tok_string_multi_content :: Action
+tok_string_multi_content span buf len _buf2 = do
+  -- TODO: don't resolve escape characters here, but validate them
+  s <- lex_string span buf len "" ""
+  pure $ L span (ITstring_multi_content src (mkFastString s))
   where
     src = SourceText $ lexemeToFastString buf len
 


=====================================
compiler/GHC/Types/SourceText.hs
=====================================
@@ -10,6 +10,7 @@
 module GHC.Types.SourceText
    ( SourceText (..)
    , pprWithSourceText
+   , combineSourceText
 
    -- * Literals
    , IntegralLit(..)
@@ -135,6 +136,10 @@ pprWithSourceText :: SourceText -> SDoc -> SDoc
 pprWithSourceText NoSourceText     d = d
 pprWithSourceText (SourceText src) _ = ftext src
 
+combineSourceText :: SourceText -> SourceText -> SourceText
+combineSourceText (SourceText s1) (SourceText s2) = SourceText (mappend s1 s2)
+combineSourceText _ _ = NoSourceText
+
 ------------------------------------------------
 -- Literals
 ------------------------------------------------


=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,9 @@ classify tok =
     ITlabelvarid{} -> TkUnknown
     ITchar{} -> TkChar
     ITstring{} -> TkString
-    ITmultilinestring{} -> TkString
+    ITstring_multi_begin{} -> TkSpecial
+    ITstring_multi_content{} -> TkString
+    ITstring_multi_end{} -> TkSpecial
     ITinteger{} -> TkNumber
     ITrational{} -> TkNumber
     ITprimchar{} -> TkChar



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/2a2f1cf0102cbccdbe3ea26bc24ddacfcb8372b5

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/2a2f1cf0102cbccdbe3ea26bc24ddacfcb8372b5
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240811/cfa09cf4/attachment-0001.html>