[Git][ghc/ghc][wip/strings] Replace manual string lexing (#25158)
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Sun Sep 22 03:22:25 UTC 2024
Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
e7c83fe4 by Brandon Chinn at 2024-09-21T20:22:05-07:00
Replace manual string lexing (#25158)
Metric Increase:
MultilineStringsPerf
- - - - -
29 changed files:
- compiler/GHC/Parser.y
- compiler/GHC/Parser/CharClass.hs
- compiler/GHC/Parser/Errors/Ppr.hs
- compiler/GHC/Parser/Errors/Types.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- testsuite/driver/testlib.py
- testsuite/tests/ghci/prog013/prog013.stderr
- testsuite/tests/ghci/scripts/ghci022.stderr
- testsuite/tests/parser/should_fail/MultilineStringsInnerTab.stderr
- testsuite/tests/parser/should_fail/MultilineStringsSmartQuotes.stderr
- testsuite/tests/parser/should_fail/T21843c.stderr
- testsuite/tests/parser/should_fail/T21843e.stderr
- testsuite/tests/parser/should_fail/T21843f.stderr
- testsuite/tests/parser/should_fail/T3751.stderr
- testsuite/tests/parser/should_fail/T5425.stderr
- testsuite/tests/parser/should_fail/readFail002.stderr
- testsuite/tests/parser/should_fail/readFail004.stderr
- testsuite/tests/parser/should_fail/readFail005.stderr
- testsuite/tests/parser/should_fail/readFail033.stderr
- testsuite/tests/parser/unicode/all.T
- + testsuite/tests/parser/unicode/lex_unicode_ids.hs
- + testsuite/tests/parser/unicode/lex_unispace.hs
- testsuite/tests/parser/unicode/utf8_010.stderr
- testsuite/tests/parser/unicode/utf8_011.stderr
- testsuite/tests/parser/unicode/utf8_020.stderr
- testsuite/tests/parser/unicode/utf8_021.stderr
- testsuite/tests/parser/unicode/utf8_022.stderr
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
Changes:
=====================================
compiler/GHC/Parser.y
=====================================
@@ -728,7 +728,7 @@ are the most common patterns, rewritten as regular expressions for clarity:
CHAR { L _ (ITchar _ _) }
STRING { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI { L _ (ITstringMulti _ _) }
INTEGER { L _ (ITinteger _) }
RATIONAL { L _ (ITrational _) }
@@ -2357,8 +2357,8 @@ atype :: { LHsType GhcPs }
(getCHAR $1) }
| STRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
(getSTRING $1) }
- | MULTILINESTRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
- (getMULTILINESTRING $1) }
+ | STRING_MULTI { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGMULTIs $1)
+ (getSTRINGMULTI $1) }
| '_' { sL1a $1 $ mkAnonWildCardTy }
-- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
-- We let it pass the parser because the renamer can generate a better error message.
@@ -4047,8 +4047,8 @@ literal :: { Located (HsLit GhcPs) }
: CHAR { sL1 $1 $ HsChar (getCHARs $1) $ getCHAR $1 }
| STRING { sL1 $1 $ HsString (getSTRINGs $1)
$ getSTRING $1 }
- | MULTILINESTRING { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
- $ getMULTILINESTRING $1 }
+ | STRING_MULTI { sL1 $1 $ HsMultilineString (getSTRINGMULTIs $1)
+ $ getSTRINGMULTI $1 }
| PRIMINTEGER { sL1 $1 $ HsIntPrim (getPRIMINTEGERs $1)
$ getPRIMINTEGER $1 }
| PRIMWORD { sL1 $1 $ HsWordPrim (getPRIMWORDs $1)
@@ -4154,7 +4154,7 @@ getIPDUPVARID (L _ (ITdupipvarid x)) = x
getLABELVARID (L _ (ITlabelvarid _ x)) = x
getCHAR (L _ (ITchar _ x)) = x
getSTRING (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
+getSTRINGMULTI (L _ (ITstringMulti _ x)) = x
getINTEGER (L _ (ITinteger x)) = x
getRATIONAL (L _ (ITrational x)) = x
getPRIMCHAR (L _ (ITprimchar _ x)) = x
@@ -4180,7 +4180,7 @@ getVOCURLY (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
getINTEGERs (L _ (ITinteger (IL src _ _))) = src
getCHARs (L _ (ITchar src _)) = src
getSTRINGs (L _ (ITstring src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
+getSTRINGMULTIs (L _ (ITstringMulti src _)) = src
getPRIMCHARs (L _ (ITprimchar src _)) = src
getPRIMSTRINGs (L _ (ITprimstring src _)) = src
getPRIMINTEGERs (L _ (ITprimint src _)) = src
=====================================
compiler/GHC/Parser/CharClass.hs
=====================================
@@ -36,14 +36,14 @@ cDigit = 64
{-# INLINABLE is_ctype #-}
is_ctype :: Word8 -> Char -> Bool
-is_ctype mask c = (charType c .&. mask) /= 0
+is_ctype mask c = c <= '\127' && (charType c .&. mask) /= 0
is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit,
is_alphanum :: Char -> Bool
is_ident = is_ctype cIdent
is_symbol = is_ctype cSymbol
is_any = is_ctype cAny
-is_space = \c -> c <= '\x7f' && is_ctype cSpace c -- is_space only works for <= '\x7f' (#3751, #5425)
+is_space = is_ctype cSpace
is_lower = is_ctype cLower
is_upper = is_ctype cUpper
is_digit = is_ctype cDigit
=====================================
compiler/GHC/Parser/Errors/Ppr.hs
=====================================
@@ -157,8 +157,6 @@ instance Diagnostic PsMessage where
LexUnknownPragma -> text "unknown pragma"
LexErrorInPragma -> text "lexical error in pragma"
LexNumEscapeRange -> text "numeric escape sequence out of range"
- LexStringCharLit -> text "lexical error in string/character literal"
- LexStringCharLitEOF -> text "unexpected end-of-file in string/character literal"
LexUnterminatedComment -> text "unterminated `{-'"
LexUnterminatedOptions -> text "unterminated OPTIONS pragma"
LexUnterminatedQQ -> text "unterminated quasiquotation"
=====================================
compiler/GHC/Parser/Errors/Types.hs
=====================================
@@ -591,8 +591,6 @@ data LexErr
| LexUnknownPragma -- ^ Unknown pragma
| LexErrorInPragma -- ^ Lexical error in pragma
| LexNumEscapeRange -- ^ Numeric escape sequence out of range
- | LexStringCharLit -- ^ Lexical error in string/character literal
- | LexStringCharLitEOF -- ^ Unexpected end-of-file in string/character literal
| LexUnterminatedComment -- ^ Unterminated `{-'
| LexUnterminatedOptions -- ^ Unterminated OPTIONS pragma
| LexUnterminatedQQ -- ^ Unterminated quasiquotation
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -140,7 +140,8 @@ import GHC.Parser.String
-- Any changes here should likely be reflected there.
$unispace = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$nl = [\n\r\f]
-$whitechar = [$nl\v\ $unispace]
+$space = [\ $unispace]
+$whitechar = [$nl \v $space]
$white_no_nl = $whitechar # \n -- TODO #8424
$tab = \t
@@ -167,6 +168,7 @@ $idchar = [$small $large $digit $uniidchar \']
$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
+$charesc = [a b f n r t v \\ \" \' \&]
$binit = 0-1
$octit = 0-7
@@ -213,6 +215,20 @@ $docsym = [\| \^ \* \$]
@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent
@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent
+ at gap = \\ $whitechar+ \\
+ at cntrl = $asclarge | \@ | \[ | \\ | \] | \^ | \_
+ at ascii = \^ @cntrl | "NUL" | "SOH" | "STX" | "ETX" | "EOT" | "ENQ" | "ACK"
+ | "BEL" | "BS" | "HT" | "LF" | "VT" | "FF" | "CR" | "SO" | "SI" | "DLE"
+ | "DC1" | "DC2" | "DC3" | "DC4" | "NAK" | "SYN" | "ETB" | "CAN"
+ | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
+-- N.B. ideally, we would do `@escape # \\ \&` instead of duplicating in @escapechar,
+-- which is what the Haskell Report says, but this isn't valid Alex syntax, as only
+-- character sets can be subtracted, not strings
+ at escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
+ at escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
+ at stringchar = ($graphic # [\\ \"]) | $space | @escape | @gap
+ at char = ($graphic # [\\ \']) | $space | @escapechar
+
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@negative = \-
@@ -460,7 +476,7 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
<0> {
"#" $idchar+ / { ifExtension OverloadedLabelsBit } { skip_one_varid_src ITlabelvarid }
- "#" \" / { ifExtension OverloadedLabelsBit } { lex_quoted_label }
+ "#" \" @stringchar* \" / { ifExtension OverloadedLabelsBit } { tok_quoted_label }
}
<0> {
@@ -660,14 +676,38 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
}
--- Strings and chars are lexed by hand-written code. The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
<0> {
- \' { lex_char_tok }
- \"\"\" / { ifExtension MultilineStringsBit} { lex_string_tok StringTypeMulti }
- \" { lex_string_tok StringTypeSingle }
+ \"\"\" / { ifExtension MultilineStringsBit } { tok_string_multi }
+ \" @stringchar* \" { tok_string }
+ \" @stringchar* \" \# / { ifExtension MagicHashBit } { tok_string }
+ \' @char \' { tok_char }
+ \' @char \' \# / { ifExtension MagicHashBit } { tok_char }
+
+ -- Check for smart quotes and throw better errors than a plain lexical error (#21843)
+ \' \\ $unigraphic / { isSmartQuote } { smart_quote_error }
+ \" @stringchar* \\ $unigraphic / { isSmartQuote } { smart_quote_error }
+ -- See Note [Bare smart quote error]
+ -- The valid string rule will take precedence because it'll match more
+ -- characters than this rule, so this rule will only fire if the string
+ -- could not be lexed correctly
+ \" @stringchar* $unigraphic / { isSmartQuote } { smart_quote_error }
+}
+
+<string_multi_content> {
+ -- Parse as much of the multiline string as possible, except for quotes
+ @stringchar* ($nl ([\ $tab] | @gap)* @stringchar*)* { tok_string_multi_content }
+ -- Allow bare quotes if it's not a triple quote
+ (\" | \"\") / ([\n .] # \") { tok_string_multi_content }
+}
+
+<0> {
+ \'\' { token ITtyQuote }
+
+ -- The normal character match takes precedence over this because it matches
+ -- more characters. However, if that pattern didn't match, then this quote
+ -- could be a quoted identifier, like 'x. Here, just return ITsimpleQuote,
+ -- as the parser will lex the varid separately.
+ \' / ($graphic # \\ | " ") { token ITsimpleQuote }
}
-- Note [Whitespace-sensitive operator parsing]
@@ -953,7 +993,7 @@ data Token
| ITchar SourceText Char -- Note [Literal source text] in "GHC.Types.SourceText"
| ITstring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
- | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstringMulti SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
| ITinteger IntegralLit -- Note [Literal source text] in "GHC.Types.SourceText"
| ITrational FractionalLit
@@ -2181,156 +2221,128 @@ lex_string_prag_comment mkTok span _buf _len _buf2
-- -----------------------------------------------------------------------------
-- Strings & Chars
--- This stuff is horrible. I hates it.
-
-lex_string_tok :: LexStringType -> Action
-lex_string_tok strType span buf _len _buf2 = do
- s <- lex_string strType
-
- i <- getInput
- case strType of
- StringTypeSingle ->
- lex_magic_hash i >>= \case
- Just i' -> do
- when (any (> '\xFF') s) $ do
- pState <- getPState
- let msg = PsErrPrimStringInvalidChar
- let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
- addError err
-
- setInput i'
- let (psSpan, src) = getStringLoc (buf, locStart) i'
- pure $ L psSpan (ITprimstring src (unsafeMkByteString s))
- Nothing -> do
- let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITstring src (mkFastString s))
- StringTypeMulti -> do
- let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITmultilinestring src (mkFastString s))
+tok_string :: Action
+tok_string span buf len _buf2 = do
+ s <- lex_chars ("\"", "\"") span buf (if endsInHash then len - 1 else len)
+
+ if endsInHash
+ then do
+ when (any (> '\xFF') s) $ do
+ pState <- getPState
+ let msg = PsErrPrimStringInvalidChar
+ let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
+ addError err
+ pure $ L span (ITprimstring src (unsafeMkByteString s))
+ else
+ pure $ L span (ITstring src (mkFastString s))
where
- locStart = psSpanStart span
+ src = SourceText $ lexemeToFastString buf len
+ endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
+-- | Ideally, we would define this completely with Alex syntax, like normal strings.
+-- Instead, this is defined as a hybrid solution by manually invoking lex states, which
+-- we're doing for two reasons:
+-- 1. The multiline string should all be one lexical token, not multiple
+-- 2. We need to allow bare quotes, which can't be done with one regex
+tok_string_multi :: Action
+tok_string_multi startSpan startBuf _len _buf2 = do
+ -- advance to the end of the multiline string
+ let startLoc = psSpanStart startSpan
+ let i@(AI _ contentStartBuf) =
+ case lexDelim $ AI startLoc startBuf of
+ Just i -> i
+ Nothing -> panic "tok_string_multi did not start with a delimiter"
+ (AI _ contentEndBuf, i'@(AI endLoc endBuf)) <- goContent i
+
+ -- build the values pertaining to the entire multiline string, including delimiters
+ let span = mkPsSpan startLoc endLoc
+ let len = byteDiff startBuf endBuf
+ let src = SourceText $ lexemeToFastString startBuf len
+
+ -- load the content of the multiline string
+ let contentLen = byteDiff contentStartBuf contentEndBuf
+ s <-
+ either (throwStringLexError (AI startLoc startBuf)) pure $
+ lexMultilineString contentLen contentStartBuf
+
+ setInput i'
+ pure $ L span $ ITstringMulti src (mkFastString s)
+ where
+ goContent i0 =
+ case alexScan i0 string_multi_content of
+ AlexToken i1 len _
+ | Just i2 <- lexDelim i1 -> pure (i1, i2)
+ | -- is the next token a tab character?
+ -- need this explicitly because there's a global rule matching $tab
+ Just ('\t', _) <- alexGetChar' i1 -> setInput i1 >> lexError LexError
+ | isEOF i1 -> checkSmartQuotes >> lexError LexError
+ | len == 0 -> panic $ "parsing multiline string got into infinite loop at: " ++ show i0
+ | otherwise -> goContent i1
+ AlexSkip i1 _ -> goContent i1
+ _ -> lexError LexError
+
+ lexDelim =
+ let go 0 i = Just i
+ go n i =
+ case alexGetChar' i of
+ Just ('"', i') -> go (n - 1) i'
+ _ -> Nothing
+ in go (3 :: Int)
+
+ -- See Note [Bare smart quote error]
+ checkSmartQuotes = do
+ let findSmartQuote i0@(AI loc _) =
+ case alexGetChar' i0 of
+ Just ('\\', i1) | Just (_, i2) <- alexGetChar' i1 -> findSmartQuote i2
+ Just (c, i1)
+ | isDoubleSmartQuote c -> Just (c, loc)
+ | otherwise -> findSmartQuote i1
+ _ -> Nothing
+ case findSmartQuote (AI (psSpanStart startSpan) startBuf) of
+ Just (c, loc) -> throwSmartQuoteError c loc
+ Nothing -> pure ()
+
+-- | Dummy action that should never be called. Should only be used in lex states
+-- that are manually lexed in tok_string_multi.
+tok_string_multi_content :: Action
+tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
+
+lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+lex_chars (startDelim, endDelim) span buf len =
+ either (throwStringLexError i0) pure $
+ lexString contentLen contentBuf
+ where
+ i0@(AI _ contentBuf) = advanceInputBytes (length startDelim) $ AI (psSpanStart span) buf
+ -- assumes delimiters are ASCII, with 1 byte per Char
+ contentLen = len - length startDelim - length endDelim
-lex_quoted_label :: Action
-lex_quoted_label span buf _len _buf2 = do
- s <- lex_string StringTypeSingle
- (AI end bufEnd) <- getInput
- let
- token = ITlabelvarid (SourceText src) (mkFastString s)
- src = lexemeToFastString (stepOn buf) (cur bufEnd - cur buf - 1)
- start = psSpanStart span
-
- return $ L (mkPsSpan start end) token
-
-
-lex_string :: LexStringType -> P String
-lex_string strType = do
- start <- getInput
- (str, next) <- either fromStringLexError pure $ lexString strType alexGetChar' start
- setInput next
- pure str
-
-
-lex_char_tok :: Action
--- Here we are basically parsing character literals, such as 'x' or '\n'
--- but we additionally spot 'x and ''T, returning ITsimpleQuote and
--- ITtyQuote respectively, but WITHOUT CONSUMING the x or T part
--- (the parser does that).
--- So we have to do two characters of lookahead: when we see 'x we need to
--- see if there's a trailing quote
-lex_char_tok span buf _len _buf2 = do -- We've seen '
- i1 <- getInput -- Look ahead to first character
- let loc = psSpanStart span
- case alexGetChar' i1 of
- Nothing -> lit_error i1
-
- Just ('\'', i2@(AI end2 _)) -> do -- We've seen ''
- setInput i2
- return (L (mkPsSpan loc end2) ITtyQuote)
-
- Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash
- (lit_ch, i3) <-
- either fromStringLexError pure $
- resolveEscapeCharacter alexGetChar' i2
- case alexGetChar' i3 of
- Just ('\'', i4) -> do
- setInput i4
- finish_char_tok buf loc lit_ch
- Just (mc, _) | isSingleSmartQuote mc -> add_smart_quote_error mc end2
- _ -> lit_error i3
-
- Just (c, i2@(AI end2 _))
- | not (isAnyChar c) -> lit_error i1
- | otherwise ->
-
- -- We've seen 'x, where x is a valid character
- -- (i.e. not newline etc) but not a quote or backslash
- case alexGetChar' i2 of -- Look ahead one more character
- Just ('\'', i3) -> do -- We've seen 'x'
- setInput i3
- finish_char_tok buf loc c
- Just (c, _) | isSingleSmartQuote c -> add_smart_quote_error c end2
- _other -> do -- We've seen 'x not followed by quote
- -- (including the possibility of EOF)
- -- Just parse the quote only
- let (AI end _) = i1
- return (L (mkPsSpan loc end) ITsimpleQuote)
-
--- We've already seen the closing quote
--- Just need to check for trailing #
-finish_char_tok :: StringBuffer -> PsLoc -> Char -> P (PsLocated Token)
-finish_char_tok buf loc ch = do
- i <- getInput
- lex_magic_hash i >>= \case
- Just i' -> do
- setInput i'
- -- Include the trailing # in SourceText
- let (psSpan, src) = getStringLoc (buf, loc) i'
- pure $ L psSpan (ITprimchar src ch)
- Nothing -> do
- let (psSpan, src) = getStringLoc (buf, loc) i
- pure $ L psSpan (ITchar src ch)
+throwStringLexError :: AlexInput -> StringLexError -> P a
+throwStringLexError i (StringLexError e pos) = setInput (advanceInputTo pos i) >> lexError e
--- | Get the span and source text for a string from the given start to the given end.
-getStringLoc :: (StringBuffer, PsLoc) -> AlexInput -> (PsSpan, SourceText)
-getStringLoc (bufStart, locStart) (AI locEnd bufEnd) = (psSpan, SourceText src)
+tok_quoted_label :: Action
+tok_quoted_label span buf len _buf2 = do
+ s <- lex_chars ("#\"", "\"") span buf len
+ pure $ L span (ITlabelvarid src (mkFastString s))
where
- psSpan = mkPsSpan locStart locEnd
- src = lexemeToFastString bufStart (cur bufEnd - cur bufStart)
-
-
--- Return Just if we found the magic hash, with the next input.
-lex_magic_hash :: AlexInput -> P (Maybe AlexInput)
-lex_magic_hash i = do
- magicHash <- getBit MagicHashBit
- if magicHash
- then
- case alexGetChar' i of
- Just ('#', i') -> pure (Just i')
- _other -> pure Nothing
- else pure Nothing
-
-fromStringLexError :: StringLexError AlexInput -> P a
-fromStringLexError = \case
- UnexpectedEOF i squote -> checkSQuote squote >> throw i LexStringCharLitEOF
- BadCharInitialLex i squote -> checkSQuote squote >> throw i LexStringCharLit
- EscapeBadChar i -> throw i LexStringCharLit
- EscapeUnexpectedEOF i -> throw i LexStringCharLitEOF
- EscapeNumRangeError i -> throw i LexNumEscapeRange
- EscapeSmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
+ -- skip leading '#'
+ src = SourceText . mkFastString . drop 1 $ lexemeToString buf len
+
+
+tok_char :: Action
+tok_char span buf len _buf2 = do
+ c <- lex_chars ("'", "'") span buf (if endsInHash then len - 1 else len) >>= \case
+ [c] -> pure c
+ s -> panic $ "tok_char expected exactly one character, got: " ++ show s
+ pure . L span $
+ if endsInHash
+ then ITprimchar src c
+ else ITchar src c
where
- throw i e = setInput i >> lexError e
- checkSQuote = \case
- NoSmartQuote -> pure ()
- SmartQuote c (AI loc _) -> add_nonfatal_smart_quote_error c loc
-
--- before calling lit_error, ensure that the current input is pointing to
--- the position of the error in the buffer. This is so that we can report
--- a correct location to the user, but also so we can detect UTF-8 decoding
--- errors if they occur.
-lit_error :: AlexInput -> P a
-lit_error i = do setInput i; lexError LexStringCharLit
+ src = SourceText $ lexemeToFastString buf len
+ endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
-- -----------------------------------------------------------------------------
-- QuasiQuote
@@ -2389,32 +2401,28 @@ quasiquote_error start = do
isSmartQuote :: AlexAccPred ExtsBitmap
isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c
-smart_quote_error_message :: Char -> PsLoc -> MsgEnvelope PsMessage
-smart_quote_error_message c loc =
- let (correct_char, correct_char_name) =
- if isSingleSmartQuote c then ('\'', "Single Quote") else ('"', "Quotation Mark")
- err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
- PsErrUnicodeCharLooksLike c correct_char correct_char_name in
- err
-
+throwSmartQuoteError :: Char -> PsLoc -> P a
+throwSmartQuoteError c loc = addFatalError err
+ where
+ err =
+ mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
+ PsErrUnicodeCharLooksLike c correct_char correct_char_name
+ (correct_char, correct_char_name) =
+ if isSingleSmartQuote c
+ then ('\'', "Single Quote")
+ else ('"', "Quotation Mark")
+
+-- | Throw a smart quote error, where the smart quote was the last character lexed
smart_quote_error :: Action
-smart_quote_error span buf _len _buf2 = do
- let c = currentChar buf
- addFatalError (smart_quote_error_message c (psSpanStart span))
-
-add_smart_quote_error :: Char -> PsLoc -> P a
-add_smart_quote_error c loc = addFatalError (smart_quote_error_message c loc)
-
-add_nonfatal_smart_quote_error :: Char -> PsLoc -> P ()
-add_nonfatal_smart_quote_error c loc = addError (smart_quote_error_message c loc)
+smart_quote_error span _ _ buf2 = do
+ let c = prevChar buf2 (panic "smart_quote_error unexpectedly called on beginning of input")
+ throwSmartQuoteError c (psSpanStart span)
-advance_to_smart_quote_character :: P ()
-advance_to_smart_quote_character = do
- i <- getInput
- case alexGetChar' i of
- Just (c, _) | isDoubleSmartQuote c -> return ()
- Just (_, i2) -> do setInput i2; advance_to_smart_quote_character
- Nothing -> return () -- should never get here
+-- Note [Bare smart quote error]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- A smart quote inside of a string is allowed, but if a complete valid string
+-- couldn't be lexed, we want to see if there's a smart quote that the user
+-- thought ended the string, but in fact didn't.
-- -----------------------------------------------------------------------------
-- Warnings
@@ -2652,7 +2660,7 @@ getLastLocIncludingComments = P $ \s@(PState { prev_loc = prev_loc }) -> POk s p
getLastLoc :: P PsSpan
getLastLoc = P $ \s@(PState { last_loc = last_loc }) -> POk s last_loc
-data AlexInput = AI !PsLoc !StringBuffer
+data AlexInput = AI !PsLoc !StringBuffer deriving (Show)
{-
Note [Unicode in Alex]
@@ -2763,6 +2771,19 @@ alexGetChar' (AI loc s)
where (c,s') = nextChar s
loc' = advancePsLoc loc c
+-- | Advance the given input N bytes.
+advanceInputBytes :: Int -> AlexInput -> AlexInput
+advanceInputBytes n i0@(AI _ buf0) = advanceInputTo (cur buf0 + n) i0
+
+-- | Advance the given input to the given position.
+advanceInputTo :: Int -> AlexInput -> AlexInput
+advanceInputTo pos = go
+ where
+ go i@(AI _ buf)
+ | cur buf >= pos = i
+ | Just (_, i') <- alexGetChar' i = go i'
+ | otherwise = i -- reached the end, just return the last input
+
getInput :: P AlexInput
getInput = P $ \s at PState{ loc=l, buffer=b } -> POk s (AI l b)
@@ -2770,9 +2791,10 @@ setInput :: AlexInput -> P ()
setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
nextIsEOF :: P Bool
-nextIsEOF = do
- AI _ s <- getInput
- return $ atEnd s
+nextIsEOF = isEOF <$> getInput
+
+isEOF :: AlexInput -> Bool
+isEOF (AI _ buf) = atEnd buf
pushLexState :: Int -> P ()
pushLexState ls = P $ \s at PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
@@ -3516,6 +3538,11 @@ topNoLayoutContainsCommas [] = False
topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
+-- If the generated alexScan/alexScanUser functions are called multiple times
+-- in this file, alexScanUser gets broken out into a separate function and
+-- increases memory usage. Make sure GHC inlines this function and optimizes it.
+{-# INLINE alexScanUser #-}
+
lexToken :: P (PsLocated Token)
lexToken = do
inp@(AI loc1 buf) <- getInput
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,284 +1,199 @@
{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE PatternSynonyms #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE ViewPatterns #-}
module GHC.Parser.String (
StringLexError (..),
- ContainsSmartQuote (..),
- LexStringType (..),
lexString,
+ lexMultilineString,
-- * Unicode smart quote helpers
isDoubleSmartQuote,
isSingleSmartQuote,
-
- -- * Other helpers
- isAnyChar,
- resolveEscapeCharacter,
) where
-import GHC.Prelude
+import GHC.Prelude hiding (getChar)
import Control.Arrow ((>>>))
-import Control.Monad (guard, unless, when)
-import Data.Char (chr, isPrint, ord)
-import Data.List (unfoldr)
+import Control.Monad (when)
+import Data.Char (chr, ord)
+import qualified Data.Foldable1 as Foldable1
+import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (listToMaybe, mapMaybe)
+import GHC.Data.StringBuffer (StringBuffer)
+import qualified GHC.Data.StringBuffer as StringBuffer
import GHC.Parser.CharClass (
hexDigit,
- is_any,
is_decdigit,
is_hexdigit,
is_octdigit,
is_space,
octDecDigit,
)
+import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)
-data LexStringType = StringTypeSingle | StringTypeMulti
+type BufPos = Int
+data StringLexError = StringLexError LexErr BufPos
--- | State to accumulate while iterating through string literal.
---
--- Fields are strict here to avoid space leak when iterating through large string literal
--- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
-data LexStringState loc = LexStringState
- { stringAcc :: !String
- -- ^ The string seen so far, reversed
- , multilineCommonWsPrefix :: !Int
- -- ^ The common prefix for multiline strings. See Note [Multiline string literals]
- , initialLoc :: !loc
- -- ^ The location of the beginning of the string literal
- }
-
--- | Get the character at the given location, with the location
--- of the next character. Returns Nothing if at the end of the
--- input.
-type GetChar loc = loc -> Maybe (Char, loc)
-
-lexString :: LexStringType -> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
-lexString strType getChar initialLoc = go initialState initialLoc
+lexString :: Int -> StringBuffer -> Either StringLexError String
+lexString = lexStringWith processChars processChars
where
- initialState =
- LexStringState
- { stringAcc = ""
- , multilineCommonWsPrefix =
- case strType of
- StringTypeMulti -> maxBound
- _ -> 0
- , initialLoc = initialLoc
- }
-
- -- 's' is strict here to avoid space leak when iterating through large string literal
- -- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
- go !s loc0 =
- case getChar loc0 of
- -- found closing delimiter
- Just ('"', _) | Just loc1 <- checkDelimiter strType getChar loc0 -> do
- let postprocess =
- case strType of
- StringTypeSingle -> id
- StringTypeMulti -> postprocessMultiline (multilineCommonWsPrefix s)
- Right (postprocess . reverse $ stringAcc s, loc1)
-
- -- found backslash
- Just (c0@'\\', loc1) -> do
- case getChar loc1 of
- -- found '\&' character, which should be elided
- Just ('&', loc2) -> go s loc2
- -- found start of a string gap
- Just (c1, loc2) | is_space c1 -> collapseStringGap getChar s loc2 >>= go s
- -- some other escape character
- Just (c1, loc2) ->
- case strType of
- StringTypeSingle -> do
- (c', loc') <- resolveEscapeCharacter getChar loc1
- go (addChar c' s) loc'
- StringTypeMulti -> do
- -- keep escape characters unresolved until after post-processing,
- -- to distinguish between a user-newline and the user writing "\n".
- -- but still process the characters here, to find any errors
- _ <- resolveEscapeCharacter getChar loc1
- go (addChar c1 . addChar c0 $ s) loc2
- -- backslash at end of input
- Nothing -> Left $ BadCharInitialLex loc1 (hasSQuote getChar s)
-
- -- found newline character in multiline string
- Just (c0@'\n', loc1) | StringTypeMulti <- strType ->
- uncurry go $ parseLeadingWS getChar (addChar c0 s) loc1
-
- -- found some other character
- Just (c0, loc1) | isAnyChar c0 -> go (addChar c0 s) loc1
-
- -- found some unknown character
- Just (_, _) -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-
- -- reached EOF before finding end of string
- Nothing -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-{-# INLINE lexString #-}
-
-checkDelimiter :: LexStringType -> GetChar loc -> loc -> Maybe loc
-checkDelimiter strType getChar loc0 =
- case strType of
- StringTypeSingle -> do
- ('"', loc1) <- getChar loc0
- Just loc1
- StringTypeMulti -> do
- ('"', loc1) <- getChar loc0
- ('"', loc2) <- getChar loc1
- ('"', loc3) <- getChar loc2
- Just loc3
-{-# INLINE checkDelimiter #-}
-
--- | A helper for adding the given character to the lexed string.
-addChar :: Char -> LexStringState loc -> LexStringState loc
-addChar c s = s{stringAcc = c : stringAcc s}
-{-# INLINE addChar #-}
-
--- | Return whether the string we've parsed so far contains any smart quotes.
-hasSQuote :: GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
-hasSQuote getChar s
- | any isDoubleSmartQuote (stringAcc s)
- , (c, loc) : _ <- filter (isDoubleSmartQuote . fst) allChars =
- SmartQuote c loc
- | otherwise =
- NoSmartQuote
+ processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
+ processChars =
+ collapseGaps
+ >>> resolveEscapes
+
+-- -----------------------------------------------------------------------------
+-- Lexing interface
+
+{-
+Note [Lexing strings]
+~~~~~~~~~~~~~~~~~~~~~
+
+After verifying if a string is lexically valid with Alex, we still need to do
+some post processing of the string, namely:
+1. Collapse string gaps
+2. Resolve escape characters
+
+The problem: 'lexemeToString' is more performant than manually reading
+characters from the StringBuffer. However, that completely erases the position
+of each character, which we need in order to report the correct position for
+error messages (e.g. when resolving escape characters).
+
+So what we'll do is do two passes. The first pass is optimistic; just convert
+to a plain String and process it. If this results in an error, we do a second
+pass, this time where each character is annotated with its position. Now, the
+error has all the information it needs.
+
+Ideally, lexStringWith would take a single (forall c. HasChar c => ...) function,
+but to help the specializer, we pass it in twice to concretize it for the two
+types we actually use.
+-}
+
+-- | See Note [Lexing strings]
+lexStringWith ::
+ ([Char] -> Either (Char, LexErr) [Char])
+ -> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
+ -> Int
+ -> StringBuffer
+ -> Either StringLexError String
+lexStringWith processChars processCharsPos len buf =
+ case processChars $ bufferChars buf len of
+ Right s -> Right s
+ Left _ ->
+ case processCharsPos $ bufferLocatedChars buf len of
+ Right _ -> panic "expected lex error on second pass"
+ Left ((_, pos), e) -> Left $ StringLexError e pos
+
+class HasChar c where
+ getChar :: c -> Char
+ setChar :: Char -> c -> c
+
+instance HasChar Char where
+ getChar = id
+ setChar = const
+
+instance HasChar (Char, x) where
+ getChar = fst
+ setChar c (_, x) = (c, x)
+
+pattern Char :: HasChar c => Char -> c
+pattern Char c <- (getChar -> c)
+{-# COMPLETE Char #-}
+
+bufferChars :: StringBuffer -> Int -> [Char]
+bufferChars = StringBuffer.lexemeToString
+
+type CharPos = (Char, BufPos)
+
+bufferLocatedChars :: StringBuffer -> Int -> [CharPos]
+bufferLocatedChars initialBuf len = go initialBuf
where
- allChars = unfoldr getCharWithLoc (initialLoc s)
- getCharWithLoc loc =
- case getChar loc of
- Just (c, loc') -> Just ((c, loc), loc')
- Nothing -> Nothing
-{-# INLINE hasSQuote #-}
-
--- | After parsing a backslash and a space character, consume the rest of
--- the string gap and return the next location.
-collapseStringGap :: GetChar loc -> LexStringState loc -> loc -> Either (StringLexError loc) loc
-collapseStringGap getChar s = go
+ go buf
+ | atEnd buf = []
+ | otherwise =
+ let (c, buf') = StringBuffer.nextChar buf
+ in (c, StringBuffer.cur buf) : go buf'
+
+ atEnd buf = StringBuffer.byteDiff initialBuf buf >= len
+
+-- -----------------------------------------------------------------------------
+-- Lexing phases
+
+collapseGaps :: HasChar c => [c] -> [c]
+collapseGaps = go
where
- go loc0 =
- case getChar loc0 of
- Just ('\\', loc1) -> pure loc1
- Just (c0, loc1) | is_space c0 -> go loc1
- Just _ -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
- Nothing -> Left $ UnexpectedEOF loc0 (hasSQuote getChar s)
-{-# INLINE collapseStringGap #-}
+ go = \case
+ c1@(Char '\\') : c2@(Char c) : cs
+ | is_space c -> go $ dropGap cs
+ | otherwise -> c1 : c2 : go cs
+ c : cs -> c : go cs
+ [] -> []
--- | See Note [Multiline string literals]
-parseLeadingWS :: GetChar loc -> LexStringState loc -> loc -> (LexStringState loc, loc)
-parseLeadingWS getChar = go 0
+ dropGap = \case
+ Char '\\' : cs -> cs
+ _ : cs -> dropGap cs
+ [] -> panic "gap unexpectedly ended"
+
+resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c]
+resolveEscapes = go dlistEmpty
where
- go !col s loc =
- case getChar loc of
- Just (c@' ', loc') -> go (col + 1) (addChar c s) loc'
- -- expand tabs
- Just ('\t', loc') ->
- let fill = 8 - (col `mod` 8)
- s' = applyN fill (addChar ' ') s
- in go (col + fill) s' loc'
- -- if we see a newline or string delimiter, then this line only contained whitespace, so
- -- don't include it in the common whitespace prefix
- Just ('\n', _) -> (s, loc)
- Just ('"', _) | Just _ <- checkDelimiter StringTypeMulti getChar loc -> (s, loc)
- -- found some other character, so we're done parsing leading whitespace
- _ ->
- let s' = s{multilineCommonWsPrefix = min col (multilineCommonWsPrefix s)}
- in (s', loc)
-
- applyN :: Int -> (a -> a) -> a -> a
- applyN n f x0 = iterate f x0 !! n
-{-# INLINE parseLeadingWS #-}
-
-data StringLexError loc
- = UnexpectedEOF !loc !(ContainsSmartQuote loc)
- -- ^ Unexpectedly hit EOF when lexing string
- | BadCharInitialLex !loc !(ContainsSmartQuote loc)
- -- ^ Found invalid character when initially lexing string
- | EscapeBadChar !loc
- -- ^ Found invalid character when parsing an escaped character
- | EscapeUnexpectedEOF !loc
- -- ^ Unexpectedly hit EOF when parsing an escaped character
- | EscapeNumRangeError !loc
- -- ^ Escaped number exceeds range
- | EscapeSmartQuoteError !Char !loc
- -- ^ Found escaped smart unicode chars as `\’` or `\”`
- deriving (Show)
-
--- | When initially lexing the string, we want to track if we've
--- seen a smart quote, to show a helpful "you might be accidentally
--- using a smart quote" error.
-data ContainsSmartQuote loc
- = NoSmartQuote
- | SmartQuote !Char !loc
- deriving (Show)
+ go !acc = \case
+ [] -> pure $ dlistToList acc
+ Char '\\' : Char '&' : cs -> go acc cs
+ backslash@(Char '\\') : cs ->
+ case resolveEscapeChar cs of
+ Right (esc, cs') -> go (acc `dlistSnoc` setChar esc backslash) cs'
+ Left (c, e) -> Left (c, e)
+ c : cs -> go (acc `dlistSnoc` c) cs
-- -----------------------------------------------------------------------------
-- Escape characters
--- | After finding a backslash, parse the rest of the escape character, starting
--- at the given location.
-resolveEscapeCharacter :: GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
-resolveEscapeCharacter getChar loc0 = do
- (c0, loc1) <- expectChar loc0
- case c0 of
- 'a' -> pure ('\a', loc1)
- 'b' -> pure ('\b', loc1)
- 'f' -> pure ('\f', loc1)
- 'n' -> pure ('\n', loc1)
- 'r' -> pure ('\r', loc1)
- 't' -> pure ('\t', loc1)
- 'v' -> pure ('\v', loc1)
- '\\' -> pure ('\\', loc1)
- '"' -> pure ('\"', loc1)
- '\'' -> pure ('\'', loc1)
- -- escape codes
- 'x' -> expectNum is_hexdigit 16 hexDigit loc1
- 'o' -> expectNum is_octdigit 8 octDecDigit loc1
- _ | is_decdigit c0 -> expectNum is_decdigit 10 octDecDigit loc0
- -- control characters (e.g. '\^M')
- '^' -> do
- (c1, loc2) <- expectChar loc1
- unless ('@' <= c1 && c1 <= '_') $ Left $ EscapeBadChar loc1
- pure (chr $ ord c1 - ord '@', loc2)
- -- long form escapes (e.g. '\NUL')
- _ | Just (c1, loc2) <- parseLongEscape getChar c0 loc1 -> pure (c1, loc2)
- -- check unicode smart quotes (#21843)
- _ | isDoubleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
- _ | isSingleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
- -- unknown escape
- _ -> Left $ EscapeBadChar loc0
+-- | Resolve a escape character, after having just lexed a backslash.
+-- Assumes escape character is valid.
+resolveEscapeChar :: HasChar c => [c] -> Either (c, LexErr) (Char, [c])
+resolveEscapeChar = \case
+ Char 'a' : cs -> pure ('\a', cs)
+ Char 'b' : cs -> pure ('\b', cs)
+ Char 'f' : cs -> pure ('\f', cs)
+ Char 'n' : cs -> pure ('\n', cs)
+ Char 'r' : cs -> pure ('\r', cs)
+ Char 't' : cs -> pure ('\t', cs)
+ Char 'v' : cs -> pure ('\v', cs)
+ Char '\\' : cs -> pure ('\\', cs)
+ Char '"' : cs -> pure ('\"', cs)
+ Char '\'' : cs -> pure ('\'', cs)
+ -- escape codes
+ Char 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
+ Char 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
+ cs@(Char c : _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit cs
+ -- control characters (e.g. '\^M')
+ Char '^' : Char c : cs -> pure (chr $ ord c - ord '@', cs)
+ -- long form escapes (e.g. '\NUL')
+ cs | Just (esc, cs') <- parseLongEscape cs -> pure (esc, cs')
+ -- shouldn't happen
+ Char c : _ -> panic $ "found unexpected escape character: " ++ show c
+ [] -> panic "escape character unexpectedly ended"
where
- expectChar loc =
- case getChar loc of
- Just x -> pure x
- Nothing -> Left $ EscapeUnexpectedEOF loc
-
- expectNum isDigit base toDigit loc1 = do
- (c1, loc2) <- expectChar loc1
- unless (isDigit c1) $ Left $ EscapeBadChar loc1
- let parseNum x loc =
- case getChar loc of
- Just (c, loc') | isDigit c -> do
- let x' = x * base + toDigit c
- when (x' > 0x10ffff) $ Left $ EscapeNumRangeError loc
- parseNum x' loc'
- _ ->
- pure (chr x, loc)
- parseNum (toDigit c1) loc2
-{-# INLINE resolveEscapeCharacter #-}
-
-parseLongEscape :: GetChar loc -> Char -> loc -> Maybe (Char, loc)
-parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCodes
+ parseNum isDigit base toDigit =
+ let go x = \case
+ ch@(Char c) : cs | isDigit c -> do
+ let x' = x * base + toDigit c
+ when (x' > 0x10ffff) $ Left (ch, LexNumEscapeRange)
+ go x' cs
+ cs -> pure (chr x, cs)
+ in go 0
+
+parseLongEscape :: HasChar c => [c] -> Maybe (Char, [c])
+parseLongEscape cs = listToMaybe (mapMaybe tryParse longEscapeCodes)
where
- tryParse (prefix, c) = do
- p0 : p <- pure prefix
- guard (p0 == c0) -- see if the first character matches
- loc <- parsePrefix loc1 p -- see if the rest of the prefix matches
- pure (c, loc)
-
- parsePrefix loc = \case
- [] -> pure loc
- p : ps -> do
- (c, loc') <- getChar loc
- guard (p == c)
- parsePrefix loc' ps
+ tryParse (code, esc) =
+ case splitAt (length code) cs of
+ (pre, cs') | map getChar pre == code -> Just (esc, cs')
+ _ -> Nothing
longEscapeCodes =
[ ("NUL", '\NUL')
@@ -289,14 +204,14 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
, ("ENQ", '\ENQ')
, ("ACK", '\ACK')
, ("BEL", '\BEL')
- , ("BS", '\BS')
- , ("HT", '\HT')
- , ("LF", '\LF')
- , ("VT", '\VT')
- , ("FF", '\FF')
- , ("CR", '\CR')
- , ("SO", '\SO')
- , ("SI", '\SI')
+ , ("BS" , '\BS' )
+ , ("HT" , '\HT' )
+ , ("LF" , '\LF' )
+ , ("VT" , '\VT' )
+ , ("FF" , '\FF' )
+ , ("CR" , '\CR' )
+ , ("SO" , '\SO' )
+ , ("SI" , '\SI' )
, ("DLE", '\DLE')
, ("DC1", '\DC1')
, ("DC2", '\DC2')
@@ -306,17 +221,16 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
, ("SYN", '\SYN')
, ("ETB", '\ETB')
, ("CAN", '\CAN')
- , ("EM", '\EM')
+ , ("EM" , '\EM' )
, ("SUB", '\SUB')
, ("ESC", '\ESC')
- , ("FS", '\FS')
- , ("GS", '\GS')
- , ("RS", '\RS')
- , ("US", '\US')
- , ("SP", '\SP')
+ , ("FS" , '\FS' )
+ , ("GS" , '\GS' )
+ , ("RS" , '\RS' )
+ , ("US" , '\US' )
+ , ("SP" , '\SP' )
, ("DEL", '\DEL')
]
-{-# INLINE parseLongEscape #-}
-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)
@@ -333,6 +247,98 @@ isSingleSmartQuote = \case
'’' -> True
_ -> False
+-- -----------------------------------------------------------------------------
+-- Multiline strings
+
+-- | See Note [Multiline string literals]
+--
+-- Assumes string is lexically valid. Skips the steps about splitting
+-- and rejoining lines, and instead manually find newline characters,
+-- for performance.
+lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
+lexMultilineString = lexStringWith processChars processChars
+ where
+ processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
+ processChars =
+ collapseGaps -- Step 1
+ >>> expandLeadingTabs -- Step 3
+ >>> rmCommonWhitespacePrefix -- Step 4
+ >>> collapseOnlyWsLines -- Step 5
+ >>> rmFirstNewline -- Step 7a
+ >>> rmLastNewline -- Step 7b
+ >>> resolveEscapes -- Step 8
+
+ -- expands all tabs, since the lexer will verify that tabs can only appear
+ -- as leading indentation
+ expandLeadingTabs :: HasChar c => [c] -> [c]
+ expandLeadingTabs =
+ let go !col = \case
+ c@(Char '\t') : cs ->
+ let fill = 8 - (col `mod` 8)
+ in replicate fill (setChar ' ' c) ++ go (col + fill) cs
+ c : cs -> c : go (if getChar c == '\n' then 0 else col + 1) cs
+ [] -> []
+ in go 0
+
+ rmCommonWhitespacePrefix :: HasChar c => [c] -> [c]
+ rmCommonWhitespacePrefix cs0 =
+ let commonWSPrefix = getCommonWsPrefix (map getChar cs0)
+ go = \case
+ c@(Char '\n') : cs -> c : go (dropLine commonWSPrefix cs)
+ c : cs -> c : go cs
+ [] -> []
+ -- drop x characters from the string, or up to a newline, whichever
+ -- comes first
+ dropLine !x = \case
+ cs | x <= 0 -> cs
+ cs@(Char '\n' : _) -> cs
+ _ : cs -> dropLine (x - 1) cs
+ [] -> []
+ in go cs0
+
+ collapseOnlyWsLines :: HasChar c => [c] -> [c]
+ collapseOnlyWsLines =
+ let go = \case
+ c@(Char '\n') : cs | Just cs' <- checkAllWs cs -> c : go cs'
+ c : cs -> c : go cs
+ [] -> []
+ checkAllWs = \case
+ -- got all the way to a newline or the end of the string, return
+ cs@(Char '\n' : _) -> Just cs
+ cs@[] -> Just cs
+ -- found whitespace, continue
+ Char c : cs | is_space c -> checkAllWs cs
+ -- anything else, stop
+ _ -> Nothing
+ in go
+
+ rmFirstNewline :: HasChar c => [c] -> [c]
+ rmFirstNewline = \case
+ Char '\n' : cs -> cs
+ cs -> cs
+
+ rmLastNewline :: HasChar c => [c] -> [c]
+ rmLastNewline =
+ let go = \case
+ [] -> []
+ [Char '\n'] -> []
+ c : cs -> c : go cs
+ in go
+
+-- | See step 4 in Note [Multiline string literals]
+--
+-- Assumes tabs have already been expanded.
+getCommonWsPrefix :: String -> Int
+getCommonWsPrefix s =
+ case NonEmpty.nonEmpty includedLines of
+ Nothing -> 0
+ Just ls -> Foldable1.minimum $ NonEmpty.map (length . takeWhile is_space) ls
+ where
+ includedLines =
+ filter (not . all is_space) -- ignore whitespace-only lines
+ . drop 1 -- ignore first line in calculation
+ $ lines s
+
{-
Note [Multiline string literals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -351,23 +357,13 @@ The canonical steps for post processing a multiline string are:
2. Split the string by newlines
3. Convert leading tabs into spaces
* In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
-4. Remove common whitespace prefix in every line (see below)
+4. Remove common whitespace prefix in every line except the first (see below)
5. If a line contains only whitespace, remove all of the whitespace
6. Join the string back with `\n` delimiters
-7. If the first character of the string is a newline, remove it
+7a. If the first character of the string is a newline, remove it
+7b. If the last character of the string is a newline, remove it
8. Interpret escaped characters
-However, for performance reasons, we do as much of this in one pass as possible:
-1. As we lex the string, do the following steps as they appear:
- a. Collapse string gaps
- b. Keep track of the common whitespace prefix so far
- c. Validate escaped characters
-2. At the very end, post process the lexed string:
- a. Remove the common whitespace prefix from every line
- b. Remove all whitespace from all-whitespace lines
- c. Remove initial newline character
- d. Resolve escaped characters
-
The common whitespace prefix can be informally defined as "The longest
prefix of whitespace shared by all lines in the string, excluding the
first line and any whitespace-only lines".
@@ -382,75 +378,16 @@ It's more precisely defined with the following algorithm:
3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
-}
--- | See Note [Multiline string literals]
-postprocessMultiline :: Int -> String -> String
-postprocessMultiline commonWSPrefix =
- rmCommonWhitespacePrefix
- >>> collapseOnlyWsLines
- >>> rmFirstNewline
- >>> rmLastNewline
- >>> resolveEscapeChars
- where
- rmCommonWhitespacePrefix =
- let go = \case
- '\n' : s -> '\n' : go (dropLine commonWSPrefix s)
- c : s -> c : go s
- [] -> []
- -- drop x characters from the string, or up to a newline, whichever
- -- comes first
- dropLine !x = \case
- s | x <= 0 -> s
- s@('\n' : _) -> s
- _ : s -> dropLine (x - 1) s
- [] -> []
- in go
-
- collapseOnlyWsLines =
- let go = \case
- '\n' : s | Just s' <- checkAllWs s -> '\n' : go s'
- c : s -> c : go s
- [] -> []
- checkAllWs = \case
- -- got all the way to a newline or the end of the string, return
- s@('\n' : _) -> Just s
- s@[] -> Just s
- -- found whitespace, continue
- c : s | is_space c -> checkAllWs s
- -- anything else, stop
- _ -> Nothing
- in go
-
- rmFirstNewline = \case
- '\n' : s -> s
- s -> s
+-- -----------------------------------------------------------------------------
+-- DList
- rmLastNewline =
- let go = \case
- [] -> []
- ['\n'] -> []
- c : cs -> c : go cs
- in go
+newtype DList a = DList ([a] -> [a])
- -- resolve escape characters, deferred from lexString. guaranteed
- -- to not throw any errors, since we already checked them in lexString
- resolveEscapeChars = \case
- [] -> []
- '\\' : s ->
- -- concretizing 'loc' to String:
- -- resolveEscapeCharacter :: (String -> Maybe (Char, String)) -> String -> Either _ (Char, String)
- case resolveEscapeCharacter uncons s of
- Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
- Right (c, s') -> c : resolveEscapeChars s'
- c : s -> c : resolveEscapeChars s
-
- uncons = \case
- c : cs -> Just (c, cs)
- [] -> Nothing
+dlistEmpty :: DList a
+dlistEmpty = DList id
--- -----------------------------------------------------------------------------
--- Helpers
+dlistToList :: DList a -> [a]
+dlistToList (DList f) = f []
-isAnyChar :: Char -> Bool
-isAnyChar c
- | c > '\x7f' = isPrint c
- | otherwise = is_any c
+dlistSnoc :: DList a -> a -> DList a
+dlistSnoc (DList f) x = DList (f . (x :))
=====================================
testsuite/driver/testlib.py
=====================================
@@ -2868,7 +2868,7 @@ def normalise_errmsg(s: str) -> str:
# normalise slashes to minimise Windows/Unix filename differences,
# but don't normalize backslashes in chars
- s = re.sub(r"(?!')\\", '/', s)
+ s = re.sub(r"(?<!('|‘))\\", '/', s)
# Normalize the name of the GHC executable. Specifically,
# this catches the cases that:
=====================================
testsuite/tests/ghci/prog013/prog013.stderr
=====================================
@@ -1,17 +1,12 @@
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
-
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
<interactive>:9:1: error: [GHC-58481] parse error on input ‘+’
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
+
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/ghci/scripts/ghci022.stderr
=====================================
@@ -1,3 +1,2 @@
+ghci022.hs:2:5: error: [GHC-21231] lexical error at character '\n'
-ghci022.hs:2:6: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/MultilineStringsInnerTab.stderr
=====================================
@@ -1,3 +1,3 @@
-
MultilineStringsInnerTab.hs:8:5: error: [GHC-21231]
- lexical error in string/character literal at character '\t'
+ lexical error at character '\t'
+
=====================================
testsuite/tests/parser/should_fail/MultilineStringsSmartQuotes.stderr
=====================================
@@ -1,6 +1,2 @@
-
MultilineStringsSmartQuotes.hs:9:3: [GHC-31623]
Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
-
-MultilineStringsSmartQuotes.hs:10:1: [GHC-21231]
- lexical error in string/character literal at end of input
=====================================
testsuite/tests/parser/should_fail/T21843c.stderr
=====================================
@@ -1,6 +1,2 @@
-
-T21843c.hs:3:19: [GHC-31623]
+T21843c.hs:3:13: [GHC-31623]
Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
-
-T21843c.hs:3:20: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/T21843e.stderr
=====================================
@@ -1,3 +1,3 @@
-
-T21843e.hs:3:15: [GHC-31623]
+T21843e.hs:3:13: error: [GHC-31623]
Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
+
=====================================
testsuite/tests/parser/should_fail/T21843f.stderr
=====================================
@@ -1,3 +1,3 @@
-
-T21843f.hs:3:13: [GHC-31623]
+T21843f.hs:3:11: error: [GHC-31623]
Unicode character '‘' ('\8216') looks like ''' (Single Quote), but it is not
+
=====================================
testsuite/tests/parser/should_fail/T3751.stderr
=====================================
@@ -1,3 +1,2 @@
+T3751.hs:3:5: error: [GHC-21231] lexical error at character '\167'
-T3751.hs:3:7: error: [GHC-21231]
- lexical error in string/character literal at character '\167'
=====================================
testsuite/tests/parser/should_fail/T5425.stderr
=====================================
@@ -1,3 +1,2 @@
+T5425.hs:4:1: error: [GHC-21231] lexical error at character '\955'
-T5425.hs:5:2: error: [GHC-21231]
- lexical error in string/character literal at character '\955'
=====================================
testsuite/tests/parser/should_fail/readFail002.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail002.hs:5:5: error: [GHC-21231]
+ lexical error at character '\n'
-readFail002.hs:5:6: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/readFail004.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail004.hs:17:16: error: [GHC-21231]
+ lexical error at character '.'
-readFail004.hs:19:1: error: [GHC-21231]
- lexical error in string/character literal at character '.'
=====================================
testsuite/tests/parser/should_fail/readFail005.stderr
=====================================
@@ -1,3 +1,2 @@
-
-readFail005.hs:4:7: error: [GHC-21231]
- lexical error in string/character literal at character '&'
+readFail005.hs:4:5: [GHC-21231]
+ lexical error at character '&'
=====================================
testsuite/tests/parser/should_fail/readFail033.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail033.hs:2:5: error: [GHC-21231]
+ lexical error at character '\t'
-readFail033.hs:2:6: error: [GHC-21231]
- lexical error in string/character literal at character '\t'
=====================================
testsuite/tests/parser/unicode/all.T
=====================================
@@ -33,3 +33,6 @@ test('T18225B', normal, compile_fail, [''])
test('T18158', normal, compile, [''])
test('T18158b', normal, compile_fail, [''])
+
+test('lex_unispace', normal, compile, [''])
+test('lex_unicode_ids', normal, compile, [''])
=====================================
testsuite/tests/parser/unicode/lex_unicode_ids.hs
=====================================
@@ -0,0 +1,11 @@
+-- Regression tests for unicode identifiers
+
+{-# LANGUAGE TemplateHaskellQuotes #-}
+
+module ShouldCompile where
+
+ƞ = 1
+eta = 'ƞ
+
+data Ʊ
+upsilon = ''Ʊ
=====================================
testsuite/tests/parser/unicode/lex_unispace.hs
=====================================
@@ -0,0 +1,9 @@
+-- Regression tests for unicode whitespace
+
+module ShouldCompile where
+
+-- https://github.com/fosskers/aura/blob/cdebca1f48254ebb8286d8e38591bf644282866f/haskell/aura/lib/Aura/Languages.hs#L107
+x1 = ' ' -- \12288
+
+-- https://github.com/jgm/pandoc/blob/98e77e02f6436e4b74a164762d0f3149ae7ecefa/src/Text/Pandoc/Writers/FB2.hs#L295C3-L295C32
+x2 = " " -- \xa0
=====================================
testsuite/tests/parser/unicode/utf8_010.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_010.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_011.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_011.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_020.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_020.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_021.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_021.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_022.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_022.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,7 @@ classify tok =
ITlabelvarid{} -> TkUnknown
ITchar{} -> TkChar
ITstring{} -> TkString
- ITmultilinestring{} -> TkString
+ ITstringMulti{} -> TkString
ITinteger{} -> TkNumber
ITrational{} -> TkNumber
ITprimchar{} -> TkChar
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e7c83fe4834cbeb0c45c33a08d589133bfce7e10
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/e7c83fe4834cbeb0c45c33a08d589133bfce7e10
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240921/d581aa78/attachment-0001.html>
More information about the ghc-commits
mailing list