[Git][ghc/ghc][wip/strings] Lex multiline strings using LexState, to allow bare quotes
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Mon Aug 12 05:52:46 UTC 2024
Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
e3442c21 by Brandon Chinn at 2024-08-11T22:52:37-07:00
Lex multiline strings using LexState, to allow bare quotes
- - - - -
5 changed files:
- compiler/GHC/Parser.y
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- compiler/GHC/Types/SourceText.hs
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
Changes:
=====================================
compiler/GHC/Parser.y
=====================================
@@ -88,6 +88,7 @@ import GHC.Parser.HaddockLex
import GHC.Parser.Annotation
import GHC.Parser.Errors.Types
import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.String
import GHC.Builtin.Types ( unitTyCon, unitDataCon, sumTyCon,
tupleTyCon, tupleDataCon, nilDataCon,
@@ -728,7 +729,9 @@ are the most common patterns, rewritten as regular expressions for clarity:
CHAR { L _ (ITchar _ _) }
STRING { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI_BEGIN { L _ (ITstring_multi_begin ) }
+ STRING_MULTI_CONTENT { L _ (ITstring_multi_content _ _) }
+ STRING_MULTI_END { L _ (ITstring_multi_end ) }
INTEGER { L _ (ITinteger _) }
RATIONAL { L _ (ITrational _) }
@@ -2341,8 +2344,8 @@ atype :: { LHsType GhcPs }
(getCHAR $1) }
| STRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
(getSTRING $1) }
- | MULTILINESTRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
- (getMULTILINESTRING $1) }
+ | string_multi { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
+ (getSTRING $1) }
| '_' { sL1a $1 $ mkAnonWildCardTy }
-- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
-- We let it pass the parser because the renamer can generate a better error message.
@@ -4031,8 +4034,8 @@ literal :: { Located (HsLit GhcPs) }
: CHAR { sL1 $1 $ HsChar (getCHARs $1) $ getCHAR $1 }
| STRING { sL1 $1 $ HsString (getSTRINGs $1)
$ getSTRING $1 }
- | MULTILINESTRING { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
- $ getMULTILINESTRING $1 }
+ | string_multi { sL1 $1 $ HsMultilineString (getSTRINGs $1)
+ $ getSTRING $1 }
| PRIMINTEGER { sL1 $1 $ HsIntPrim (getPRIMINTEGERs $1)
$ getPRIMINTEGER $1 }
| PRIMWORD { sL1 $1 $ HsWordPrim (getPRIMWORDs $1)
@@ -4060,6 +4063,13 @@ literal :: { Located (HsLit GhcPs) }
| PRIMFLOAT { sL1 $1 $ HsFloatPrim noExtField $ getPRIMFLOAT $1 }
| PRIMDOUBLE { sL1 $1 $ HsDoublePrim noExtField $ getPRIMDOUBLE $1 }
+string_multi :: { Located Token }
+ : STRING_MULTI_BEGIN string_multi_content STRING_MULTI_END { resolveStringMultiContent $1 (reverse $2) $3 }
+
+string_multi_content :: { [Located Token] }
+ : {- empty -} { [] }
+ | string_multi_content STRING_MULTI_CONTENT { $2 : $1 }
+
-----------------------------------------------------------------------------
-- Layout
@@ -4138,7 +4148,6 @@ getIPDUPVARID (L _ (ITdupipvarid x)) = x
getLABELVARID (L _ (ITlabelvarid _ x)) = x
getCHAR (L _ (ITchar _ x)) = x
getSTRING (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
getINTEGER (L _ (ITinteger x)) = x
getRATIONAL (L _ (ITrational x)) = x
getPRIMCHAR (L _ (ITprimchar _ x)) = x
@@ -4164,7 +4173,6 @@ getVOCURLY (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
getINTEGERs (L _ (ITinteger (IL src _ _))) = src
getCHARs (L _ (ITchar src _)) = src
getSTRINGs (L _ (ITstring src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
getPRIMCHARs (L _ (ITprimchar src _)) = src
getPRIMSTRINGs (L _ (ITprimstring src _)) = src
getPRIMINTEGERs (L _ (ITprimint src _)) = src
@@ -4202,6 +4210,17 @@ getCTYPEs (L _ (ITctype src)) = src
getStringLiteral l = StringLiteral (getSTRINGs l) (getSTRING l) Nothing
+resolveStringMultiContent begin contents end =
+ let loc = foldr3 combineSrcSpans (getLoc begin) (map getLoc contents) (getLoc end)
+ src = foldr3 combineSourceText delim (map getMultiContentSrc contents) delim
+ s = mkFastString . postprocessMultilineString . unpackFS $ mconcat (map getMultiContent contents)
+ in L loc (ITstring src s)
+ where
+ delim = SourceText $ mkFastString "\"\"\""
+ foldr3 f x0 x1s x2 = foldr f x0 (x1s ++ [x2])
+ getMultiContentSrc (L _ (ITstring_multi_content src _)) = src
+ getMultiContent (L _ (ITstring_multi_content _ s)) = s
+
isUnicode :: Located Token -> Bool
isUnicode (L _ (ITforall iu)) = iu == UnicodeSyntax
isUnicode (L _ (ITdarrow iu)) = iu == UnicodeSyntax
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -88,6 +88,7 @@ import Control.DeepSeq (deepseq)
import Control.Exception (catch, throw)
import Control.Monad
import Control.Applicative
+import Data.Bifunctor (first)
import Data.Char
import Data.List (stripPrefix, isInfixOf, partition)
import Data.List.NonEmpty ( NonEmpty(..) )
@@ -225,8 +226,8 @@ $docsym = [\| \^ \* \$]
| "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
@escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
@escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
- at stringchar = ($graphic # [\\ \"]) | \ | @escape | @gap
- at char = ($graphic # [\\ \']) | \ | @escapechar | @gap
+ at stringchar = ($graphic # [\\ \"]) | " " | @escape | @gap
+ at char = ($graphic # [\\ \']) | " " | @escapechar | @gap
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@@ -675,16 +676,30 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
}
--- Strings and chars are lexed by hand-written code. The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
<0> {
- \"\"\" (@stringchar | $whitechar | $tab)* \"\"\" / { ifExtension MultilineStringsBit} { tok_string_multi }
+ \"\"\" / { ifExtension MultilineStringsBit } { push_and string_multi_content $ token ITstring_multi_begin }
\" @stringchar* \" \#? { tok_string }
\' @char \' \#? { tok_char }
}
+<string_multi_bol, string_multi_content> {
+ \"\"\" { pop_and $ token ITstring_multi_end }
+}
+
+-- FIXME.bchinn: whitespace possibly getting slurped by $white_no_nl+ rule at top? maybe include @stringchar* here too?
+<string_multi_bol> {
+ ([\ $tab] | @gap)* { pop_and . push_and string_multi_content $ tok_string_multi_content }
+}
+
+<string_multi_content> {
+ @stringchar* { tok_string_multi_content }
+ $nl { pop_and . push_and string_multi_bol $ tok_string_multi_content }
+
+ -- if we see a bare quote, but we haven't seen the triple quote,
+ -- this is a safe bare quote
+ \" { tok_string_multi_content }
+}
+
<0> {
\' \' { token ITtyQuote }
@@ -978,7 +993,9 @@ data Token
| ITchar SourceText Char -- Note [Literal source text] in "GHC.Types.SourceText"
| ITstring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
- | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstring_multi_begin
+ | ITstring_multi_content SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstring_multi_end
| ITinteger IntegralLit -- Note [Literal source text] in "GHC.Types.SourceText"
| ITrational FractionalLit
@@ -1302,6 +1319,11 @@ pop_and act span buf len buf2 =
do _ <- popLexState
act span buf len buf2
+push_and :: Int -> Action -> Action
+push_and ls act span buf len buf2 =
+ do pushLexState ls
+ act span buf len buf2
+
-- See Note [Whitespace-sensitive operator parsing]
followedByOpeningToken, precededByClosingToken :: AlexAccPred ExtsBitmap
followedByOpeningToken _ _ _ (AI _ buf) = followedByOpeningToken' buf
@@ -2206,10 +2228,10 @@ lex_string_prag_comment mkTok span _buf _len _buf2
-- -----------------------------------------------------------------------------
-- Strings & Chars
--- FIXME: figure out smart quotes (escaped smart quotes + smart quotes failing to end string lex)
+-- FIXME.bchinn: throw better error for escaped smart quotes
tok_string :: Action
tok_string span buf len _buf2 = do
- s <- lex_string span buf (if isMagicHash then len - 1 else len) "\"" "\""
+ s <- lex_string ("\"", "\"") span buf (if isMagicHash then len - 1 else len)
if isMagicHash
then do
@@ -2225,30 +2247,53 @@ tok_string span buf len _buf2 = do
src = SourceText $ lexemeToFastString buf len
isMagicHash = currentChar (offsetBytes (len - 1) buf) == '#'
-tok_string_multi :: Action
-tok_string_multi span buf len _buf2 = do
- s <- postprocessMultilineString <$> lex_string span buf len "\"\"\"" "\"\"\""
- pure $ L span (ITmultilinestring src (mkFastString s))
+tok_string_multi_content :: Action
+tok_string_multi_content span buf len _buf2 = do
+ s <- lex_string' resolveEsc len $ AI (psSpanStart span) buf
+ pure $ L span (ITstring_multi_content src (mkFastString s))
where
src = SourceText $ lexemeToFastString buf len
-lex_string :: PsSpan -> StringBuffer -> Int -> String -> String -> P String
-lex_string span buf len startDelim endDelim = do
- let s = go $ lexemeToString (offsetBytes (length startDelim) buf) numChars
+ -- Don't resolve escape characters here, defer until postprocessMultilineString.
+ -- However, we still want to validate them
+ resolveEsc cs =
+ let result =
+ case cs of
+ c : cs' -> (['\\', c], cs') -- for the sake of validation, pretend we always escape just one character
+ [] -> panic "Unexpectedly resolving an empty escape character"
+ in result <$ resolveEscapeCharacter cs
+
+lex_string :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+lex_string (startDelim, endDelim) span buf len = lex_string' resolveEsc numChars i0
+ where
+ resolveEsc = fmap (first (:[])) . resolveEscapeCharacter
+ numChars = len - (length startDelim + length endDelim)
+ i0 =
+ case applyM (length startDelim) (fmap snd . alexGetChar') (AI (psSpanStart span) buf) of
+ Just i -> i
+ Nothing -> panic "Unexpectedly reached EOF when advancing past string delimiter"
+
+ -- applyM 3 f x = f x >>= f >>= f
+ applyM n f = foldr (>=>) pure $ replicate n f
+
+lex_string' :: ([Char] -> Either ParseEscapeErr ([Char], [Char])) -> Int -> AlexInput -> P String
+lex_string' resolveEsc numChars i0@(AI _ buf) = do
+ let s = go $ lexemeToString buf numChars
-- Unfortunately, `go` is only performant if it's pure; allocations
-- and performance degrade when `go` is implemented in P or ST. So
-- we'll throw an impure exception and catch it here
unsafePerformIO $
(s `deepseq` pure (pure ())) `catch` \e -> do
- let i0 = AI (psSpanStart span) buf
- let (e', i) = resolveParseEscapeErr alexGetChar' i0 numChars e
+ let (e', i) = resolveParseEscapeErr getNextLoc i0 numChars e
pure $ setInput i >> lexError e'
pure s
where
- -- the number of characters in the string
- numChars = len - (length startDelim + length endDelim)
+ getNextLoc i =
+ case alexGetChar' i of
+ Just (_, i') -> i'
+ Nothing -> panic "Unexpectedly reached EOF when resolving ParseEscapeErr"
-- assumes string was lexed correctly
go = \case
@@ -2256,8 +2301,8 @@ lex_string span buf len startDelim endDelim = do
'\\' : '&' : cs -> go cs
'\\' : c : cs | is_space c -> go $ dropGap cs
'\\' : cs ->
- case resolveEscapeCharacter cs of
- Right (c, cs') -> c : go cs'
+ case resolveEsc cs of
+ Right (esc, cs') -> esc ++ go cs'
Left e -> throw e
c : cs -> c : go cs
@@ -2269,7 +2314,7 @@ lex_string span buf len startDelim endDelim = do
tok_quoted_label :: Action
tok_quoted_label span buf len _buf2 = do
- s <- lex_string span buf len "#\"" "\""
+ s <- lex_string ("#\"", "\"") span buf len
(AI end bufEnd) <- getInput
let
token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2281,7 +2326,7 @@ tok_quoted_label span buf len _buf2 = do
tok_char :: Action
tok_char span buf len _buf2 = do
- c <- lex_string span buf (if isMagicHash then len - 1 else len) "'" "'" >>= \case
+ c <- lex_string ("'", "'") span buf (if isMagicHash then len - 1 else len) >>= \case
[c] -> pure c
s -> panic $ "tok_char expected exactly one character, got: " ++ show s
pure . L span $
@@ -3490,6 +3535,8 @@ lexToken = do
setLastToken span 0
return (L span ITeof)
AlexError (AI loc2 buf) ->
+ -- FIXME.bchinn - check if any smart quotes between loc1 and loc2; that might
+ -- indicate that someone expected the smart quote to end a string literal
reportLexError (psRealLoc loc1) (psRealLoc loc2) buf
(\k srcLoc -> mkPlainErrorMsgEnvelope srcLoc $ PsErrLexer LexError k)
AlexSkip inp2 _ -> do
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -79,21 +79,24 @@ newtype ParseEscapeErr =
instance Exception ParseEscapeErr
--- | Get the LexErr and location of the error, given the location of the initial
--- string delimiter, a function to get the next location, and the total length of
+-- | Get the LexErr and location of the error, given the location of the first
+-- character in the string, a function to get the next location, and the total length of
-- the string.
-resolveParseEscapeErr :: (loc -> Maybe (Char, loc)) -> loc -> Int -> ParseEscapeErr -> (LexErr, loc)
-resolveParseEscapeErr getChar loc len (ParseEscapeErr (e, indexFromEnd)) = (e, loc')
+--
+-- @
+-- v indexFromEnd = 4
+-- "a b \xffffff c d"
+-- ^ loc0 ^ loc, index = 11
+-- ^^^^^^^^^^^^^^^^ len = 16
+-- @
+resolveParseEscapeErr :: (loc -> loc) -> loc -> Int -> ParseEscapeErr -> (LexErr, loc)
+resolveParseEscapeErr getNextLoc loc0 len (ParseEscapeErr (e, indexFromEnd)) = (e, loc)
where
-- the index of the error, where 0 is the first character after the initial string delimiter
index = len - indexFromEnd - 1
-- the 'loc' corresponding to 'index'
- loc' = iterate getNextLoc loc !! (index + 1)
- getNextLoc l =
- case getChar l of
- Just (_, l') -> l'
- Nothing -> panic "Unexpectedly reached EOF when resolving ParseEscapeErr"
+ loc = iterate getNextLoc loc0 !! index
parseLongEscape :: [Char] -> Maybe (Char, [Char])
parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
=====================================
compiler/GHC/Types/SourceText.hs
=====================================
@@ -10,6 +10,7 @@
module GHC.Types.SourceText
( SourceText (..)
, pprWithSourceText
+ , combineSourceText
-- * Literals
, IntegralLit(..)
@@ -135,6 +136,10 @@ pprWithSourceText :: SourceText -> SDoc -> SDoc
pprWithSourceText NoSourceText d = d
pprWithSourceText (SourceText src) _ = ftext src
+combineSourceText :: SourceText -> SourceText -> SourceText
+combineSourceText (SourceText s1) (SourceText s2) = SourceText (mappend s1 s2)
+combineSourceText _ _ = NoSourceText
+
------------------------------------------------
-- Literals
------------------------------------------------
=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,9 @@ classify tok =
ITlabelvarid{} -> TkUnknown
ITchar{} -> TkChar
ITstring{} -> TkString
- ITmultilinestring{} -> TkString
+ ITstring_multi_begin{} -> TkSpecial
+ ITstring_multi_content{} -> TkString
+ ITstring_multi_end{} -> TkSpecial
ITinteger{} -> TkNumber
ITrational{} -> TkNumber
ITprimchar{} -> TkChar
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e3442c21a643340802c108fc3d369a7ec1a7ae04
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e3442c21a643340802c108fc3d369a7ec1a7ae04
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240812/f1aadd83/attachment-0001.html>
More information about the ghc-commits
mailing list