[Git][ghc/ghc][wip/strings] [ci skip] wip
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Sun Aug 11 23:26:19 UTC 2024
Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
2a2f1cf0 by Brandon Chinn at 2024-08-11T16:26:08-07:00
[ci skip] wip
- - - - -
4 changed files:
- compiler/GHC/Parser.y
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Types/SourceText.hs
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
Changes:
=====================================
compiler/GHC/Parser.y
=====================================
@@ -88,6 +88,7 @@ import GHC.Parser.HaddockLex
import GHC.Parser.Annotation
import GHC.Parser.Errors.Types
import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.String
import GHC.Builtin.Types ( unitTyCon, unitDataCon, sumTyCon,
tupleTyCon, tupleDataCon, nilDataCon,
@@ -728,7 +729,9 @@ are the most common patterns, rewritten as regular expressions for clarity:
CHAR { L _ (ITchar _ _) }
STRING { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI_BEGIN { L _ (ITstring_multi_begin ) }
+ STRING_MULTI_CONTENT { L _ (ITstring_multi_content _ _) }
+ STRING_MULTI_END { L _ (ITstring_multi_end ) }
INTEGER { L _ (ITinteger _) }
RATIONAL { L _ (ITrational _) }
@@ -762,7 +765,7 @@ TH_QUASIQUOTE { L _ (ITquasiQuote _) }
TH_QQUASIQUOTE { L _ (ITqQuasiQuote _) }
%monad { P } { >>= } { return }
-%lexer { (lexer True) } { L _ ITeof }
+%lexer { (lexerDbg True) } { L _ ITeof } -- FIXME: revert
-- Replace 'lexer' above with 'lexerDbg'
-- to dump the tokens fed to the parser.
%tokentype { (Located Token) }
@@ -2341,8 +2344,8 @@ atype :: { LHsType GhcPs }
(getCHAR $1) }
| STRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
(getSTRING $1) }
- | MULTILINESTRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
- (getMULTILINESTRING $1) }
+ | string_multi { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
+ (getSTRING $1) }
| '_' { sL1a $1 $ mkAnonWildCardTy }
-- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
-- We let it pass the parser because the renamer can generate a better error message.
@@ -4031,8 +4034,8 @@ literal :: { Located (HsLit GhcPs) }
: CHAR { sL1 $1 $ HsChar (getCHARs $1) $ getCHAR $1 }
| STRING { sL1 $1 $ HsString (getSTRINGs $1)
$ getSTRING $1 }
- | MULTILINESTRING { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
- $ getMULTILINESTRING $1 }
+ | string_multi { sL1 $1 $ HsMultilineString (getSTRINGs $1)
+ $ getSTRING $1 }
| PRIMINTEGER { sL1 $1 $ HsIntPrim (getPRIMINTEGERs $1)
$ getPRIMINTEGER $1 }
| PRIMWORD { sL1 $1 $ HsWordPrim (getPRIMWORDs $1)
@@ -4060,6 +4063,13 @@ literal :: { Located (HsLit GhcPs) }
| PRIMFLOAT { sL1 $1 $ HsFloatPrim noExtField $ getPRIMFLOAT $1 }
| PRIMDOUBLE { sL1 $1 $ HsDoublePrim noExtField $ getPRIMDOUBLE $1 }
+string_multi :: { Located Token }
+ : STRING_MULTI_BEGIN string_multi_content STRING_MULTI_END { resolveStringMultiContent $1 (reverse $2) $3 }
+
+string_multi_content :: { [Located Token] }
+ : {- empty -} { [] }
+ | string_multi_content STRING_MULTI_CONTENT { $2 : $1 }
+
-----------------------------------------------------------------------------
-- Layout
@@ -4138,7 +4148,6 @@ getIPDUPVARID (L _ (ITdupipvarid x)) = x
getLABELVARID (L _ (ITlabelvarid _ x)) = x
getCHAR (L _ (ITchar _ x)) = x
getSTRING (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
getINTEGER (L _ (ITinteger x)) = x
getRATIONAL (L _ (ITrational x)) = x
getPRIMCHAR (L _ (ITprimchar _ x)) = x
@@ -4164,7 +4173,6 @@ getVOCURLY (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
getINTEGERs (L _ (ITinteger (IL src _ _))) = src
getCHARs (L _ (ITchar src _)) = src
getSTRINGs (L _ (ITstring src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
getPRIMCHARs (L _ (ITprimchar src _)) = src
getPRIMSTRINGs (L _ (ITprimstring src _)) = src
getPRIMINTEGERs (L _ (ITprimint src _)) = src
@@ -4202,6 +4210,17 @@ getCTYPEs (L _ (ITctype src)) = src
getStringLiteral l = StringLiteral (getSTRINGs l) (getSTRING l) Nothing
+resolveStringMultiContent begin contents end =
+ let loc = foldr3 combineSrcSpans (getLoc begin) (map getLoc contents) (getLoc end)
+ src = foldr3 combineSourceText delim (map getMultiContentSrc contents) delim
+ s = mkFastString . postprocessMultilineString . unpackFS $ mconcat (map getMultiContent contents)
+ in L loc (ITstring src s)
+ where
+ delim = SourceText $ mkFastString "\"\"\""
+ foldr3 f x0 x1s x2 = foldr f x0 (x1s ++ [x2])
+ getMultiContentSrc (L _ (ITstring_multi_content src _)) = src
+ getMultiContent (L _ (ITstring_multi_content _ s)) = s
+
isUnicode :: Located Token -> Bool
isUnicode (L _ (ITforall iu)) = iu == UnicodeSyntax
isUnicode (L _ (ITdarrow iu)) = iu == UnicodeSyntax
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -225,8 +225,8 @@ $docsym = [\| \^ \* \$]
| "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
@escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
@escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
- at stringchar = ($graphic # [\\ \"]) | \ | @escape | @gap
- at char = ($graphic # [\\ \']) | \ | @escapechar | @gap
+ at stringchar = ($graphic # [\\ \"]) | " " | @escape | @gap
+ at char = ($graphic # [\\ \']) | " " | @escapechar | @gap
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@@ -675,16 +675,30 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
}
--- Strings and chars are lexed by hand-written code. The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
<0> {
- \"\"\" (@stringchar | $whitechar | $tab)* \"\"\" / { ifExtension MultilineStringsBit} { tok_string_multi }
+ \"\"\" / { ifExtension MultilineStringsBit } { push_and string_multi_content $ token ITstring_multi_begin }
\" @stringchar* \" \#? { tok_string }
\' @char \' \#? { tok_char }
}
+<string_multi_bol, string_multi_content> {
+ \"\"\" { pop_and $ token ITstring_multi_end }
+}
+
+-- FIXME: whitespace getting slurped by $white_no_nl+ rule at top; maybe include @stringchar* here too?
+<string_multi_bol> {
+ ([\ $tab] | @gap)* { pop_and . push_and string_multi_content $ tok_string_multi_content }
+}
+
+<string_multi_content> {
+ @stringchar* { tok_string_multi_content }
+ $nl { pop_and . push_and string_multi_bol $ tok_string_multi_content }
+
+ -- if we see a bare quote, but we haven't seen the triple quote,
+ -- this is a safe bare quote
+ \" { tok_string_multi_content }
+}
+
<0> {
\' \' { token ITtyQuote }
@@ -978,7 +992,9 @@ data Token
| ITchar SourceText Char -- Note [Literal source text] in "GHC.Types.SourceText"
| ITstring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
- | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstring_multi_begin
+ | ITstring_multi_content SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstring_multi_end
| ITinteger IntegralLit -- Note [Literal source text] in "GHC.Types.SourceText"
| ITrational FractionalLit
@@ -1302,6 +1318,11 @@ pop_and act span buf len buf2 =
do _ <- popLexState
act span buf len buf2
+push_and :: Int -> Action -> Action
+push_and ls act span buf len buf2 =
+ do pushLexState ls
+ act span buf len buf2
+
-- See Note [Whitespace-sensitive operator parsing]
followedByOpeningToken, precededByClosingToken :: AlexAccPred ExtsBitmap
followedByOpeningToken _ _ _ (AI _ buf) = followedByOpeningToken' buf
@@ -2225,10 +2246,11 @@ tok_string span buf len _buf2 = do
src = SourceText $ lexemeToFastString buf len
isMagicHash = currentChar (offsetBytes (len - 1) buf) == '#'
-tok_string_multi :: Action
-tok_string_multi span buf len _buf2 = do
- s <- postprocessMultilineString <$> lex_string span buf len "\"\"\"" "\"\"\""
- pure $ L span (ITmultilinestring src (mkFastString s))
+tok_string_multi_content :: Action
+tok_string_multi_content span buf len _buf2 = do
+ -- TODO: don't resolve escape characters here, but validate them
+ s <- lex_string span buf len "" ""
+ pure $ L span (ITstring_multi_content src (mkFastString s))
where
src = SourceText $ lexemeToFastString buf len
=====================================
compiler/GHC/Types/SourceText.hs
=====================================
@@ -10,6 +10,7 @@
module GHC.Types.SourceText
( SourceText (..)
, pprWithSourceText
+ , combineSourceText
-- * Literals
, IntegralLit(..)
@@ -135,6 +136,10 @@ pprWithSourceText :: SourceText -> SDoc -> SDoc
pprWithSourceText NoSourceText d = d
pprWithSourceText (SourceText src) _ = ftext src
+combineSourceText :: SourceText -> SourceText -> SourceText
+combineSourceText (SourceText s1) (SourceText s2) = SourceText (mappend s1 s2)
+combineSourceText _ _ = NoSourceText
+
------------------------------------------------
-- Literals
------------------------------------------------
=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,9 @@ classify tok =
ITlabelvarid{} -> TkUnknown
ITchar{} -> TkChar
ITstring{} -> TkString
- ITmultilinestring{} -> TkString
+ ITstring_multi_begin{} -> TkSpecial
+ ITstring_multi_content{} -> TkString
+ ITstring_multi_end{} -> TkSpecial
ITinteger{} -> TkNumber
ITrational{} -> TkNumber
ITprimchar{} -> TkChar
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/2a2f1cf0102cbccdbe3ea26bc24ddacfcb8372b5
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/2a2f1cf0102cbccdbe3ea26bc24ddacfcb8372b5
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240811/cfa09cf4/attachment-0001.html>
More information about the ghc-commits
mailing list