[Git][ghc/ghc][wip/strings] Replace manual string lexing (#25158)
Brandon Chinn (@brandonchinn178)
gitlab@gitlab.haskell.org
Wed Sep 25 15:53:53 UTC 2024
Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
92f9a4df by Brandon Chinn at 2024-09-25T08:48:02-07:00
Replace manual string lexing (#25158)
Metric Increase:
MultilineStringsPerf
This commit replaces the manual string lexing logic with native Alex
lexing syntax. This aligns the lexer much closer to the Haskell Report,
making it easier to see how the implementation and spec relate. This
slightly increases memory usage when compiling multiline strings because
we now have two distinct phases: lexing the multiline string with Alex
and post-processing the string afterwards. Before, these were done at
the same time, but separating them allows us to push as much logic into
normal Alex lexing as possible.
Since multiline strings are a new feature, this regression shouldn't be
too noticeable. We can optimize this over time.
- - - - -
29 changed files:
- compiler/GHC/Parser.y
- compiler/GHC/Parser/CharClass.hs
- compiler/GHC/Parser/Errors/Ppr.hs
- compiler/GHC/Parser/Errors/Types.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- testsuite/driver/testlib.py
- testsuite/tests/ghci/prog013/prog013.stderr
- testsuite/tests/ghci/scripts/ghci022.stderr
- testsuite/tests/parser/should_fail/MultilineStringsInnerTab.stderr
- testsuite/tests/parser/should_fail/MultilineStringsSmartQuotes.stderr
- testsuite/tests/parser/should_fail/T21843c.stderr
- testsuite/tests/parser/should_fail/T21843e.stderr
- testsuite/tests/parser/should_fail/T21843f.stderr
- testsuite/tests/parser/should_fail/T3751.stderr
- testsuite/tests/parser/should_fail/T5425.stderr
- testsuite/tests/parser/should_fail/readFail002.stderr
- testsuite/tests/parser/should_fail/readFail004.stderr
- testsuite/tests/parser/should_fail/readFail005.stderr
- testsuite/tests/parser/should_fail/readFail033.stderr
- testsuite/tests/parser/unicode/all.T
- + testsuite/tests/parser/unicode/lex_unicode_ids.hs
- + testsuite/tests/parser/unicode/lex_unispace.hs
- testsuite/tests/parser/unicode/utf8_010.stderr
- testsuite/tests/parser/unicode/utf8_011.stderr
- testsuite/tests/parser/unicode/utf8_020.stderr
- testsuite/tests/parser/unicode/utf8_021.stderr
- testsuite/tests/parser/unicode/utf8_022.stderr
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
Changes:
=====================================
compiler/GHC/Parser.y
=====================================
@@ -728,7 +728,7 @@ are the most common patterns, rewritten as regular expressions for clarity:
CHAR { L _ (ITchar _ _) }
STRING { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI { L _ (ITstringMulti _ _) }
INTEGER { L _ (ITinteger _) }
RATIONAL { L _ (ITrational _) }
@@ -2357,8 +2357,8 @@ atype :: { LHsType GhcPs }
(getCHAR $1) }
| STRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
(getSTRING $1) }
- | MULTILINESTRING { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
- (getMULTILINESTRING $1) }
+ | STRING_MULTI { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGMULTIs $1)
+ (getSTRINGMULTI $1) }
| '_' { sL1a $1 $ mkAnonWildCardTy }
-- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
-- We let it pass the parser because the renamer can generate a better error message.
@@ -4047,8 +4047,8 @@ literal :: { Located (HsLit GhcPs) }
: CHAR { sL1 $1 $ HsChar (getCHARs $1) $ getCHAR $1 }
| STRING { sL1 $1 $ HsString (getSTRINGs $1)
$ getSTRING $1 }
- | MULTILINESTRING { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
- $ getMULTILINESTRING $1 }
+ | STRING_MULTI { sL1 $1 $ HsMultilineString (getSTRINGMULTIs $1)
+ $ getSTRINGMULTI $1 }
| PRIMINTEGER { sL1 $1 $ HsIntPrim (getPRIMINTEGERs $1)
$ getPRIMINTEGER $1 }
| PRIMWORD { sL1 $1 $ HsWordPrim (getPRIMWORDs $1)
@@ -4154,7 +4154,7 @@ getIPDUPVARID (L _ (ITdupipvarid x)) = x
getLABELVARID (L _ (ITlabelvarid _ x)) = x
getCHAR (L _ (ITchar _ x)) = x
getSTRING (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
+getSTRINGMULTI (L _ (ITstringMulti _ x)) = x
getINTEGER (L _ (ITinteger x)) = x
getRATIONAL (L _ (ITrational x)) = x
getPRIMCHAR (L _ (ITprimchar _ x)) = x
@@ -4180,7 +4180,7 @@ getVOCURLY (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
getINTEGERs (L _ (ITinteger (IL src _ _))) = src
getCHARs (L _ (ITchar src _)) = src
getSTRINGs (L _ (ITstring src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
+getSTRINGMULTIs (L _ (ITstringMulti src _)) = src
getPRIMCHARs (L _ (ITprimchar src _)) = src
getPRIMSTRINGs (L _ (ITprimstring src _)) = src
getPRIMINTEGERs (L _ (ITprimint src _)) = src
=====================================
compiler/GHC/Parser/CharClass.hs
=====================================
@@ -36,14 +36,14 @@ cDigit = 64
{-# INLINABLE is_ctype #-}
is_ctype :: Word8 -> Char -> Bool
-is_ctype mask c = (charType c .&. mask) /= 0
+is_ctype mask c = c <= '\127' && (charType c .&. mask) /= 0
is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit,
is_alphanum :: Char -> Bool
is_ident = is_ctype cIdent
is_symbol = is_ctype cSymbol
is_any = is_ctype cAny
-is_space = \c -> c <= '\x7f' && is_ctype cSpace c -- is_space only works for <= '\x7f' (#3751, #5425)
+is_space = is_ctype cSpace
is_lower = is_ctype cLower
is_upper = is_ctype cUpper
is_digit = is_ctype cDigit
=====================================
compiler/GHC/Parser/Errors/Ppr.hs
=====================================
@@ -157,8 +157,6 @@ instance Diagnostic PsMessage where
LexUnknownPragma -> text "unknown pragma"
LexErrorInPragma -> text "lexical error in pragma"
LexNumEscapeRange -> text "numeric escape sequence out of range"
- LexStringCharLit -> text "lexical error in string/character literal"
- LexStringCharLitEOF -> text "unexpected end-of-file in string/character literal"
LexUnterminatedComment -> text "unterminated `{-'"
LexUnterminatedOptions -> text "unterminated OPTIONS pragma"
LexUnterminatedQQ -> text "unterminated quasiquotation"
=====================================
compiler/GHC/Parser/Errors/Types.hs
=====================================
@@ -591,8 +591,6 @@ data LexErr
| LexUnknownPragma -- ^ Unknown pragma
| LexErrorInPragma -- ^ Lexical error in pragma
| LexNumEscapeRange -- ^ Numeric escape sequence out of range
- | LexStringCharLit -- ^ Lexical error in string/character literal
- | LexStringCharLitEOF -- ^ Unexpected end-of-file in string/character literal
| LexUnterminatedComment -- ^ Unterminated `{-'
| LexUnterminatedOptions -- ^ Unterminated OPTIONS pragma
| LexUnterminatedQQ -- ^ Unterminated quasiquotation
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -140,7 +140,8 @@ import GHC.Parser.String
-- Any changes here should likely be reflected there.
$unispace = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$nl = [\n\r\f]
-$whitechar = [$nl\v\ $unispace]
+$space = [\ $unispace]
+$whitechar = [$nl \v $space]
$white_no_nl = $whitechar # \n -- TODO #8424
$tab = \t
@@ -167,6 +168,7 @@ $idchar = [$small $large $digit $uniidchar \']
$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
+$charesc = [a b f n r t v \\ \" \' \&]
$binit = 0-1
$octit = 0-7
@@ -213,6 +215,20 @@ $docsym = [\| \^ \* \$]
@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent
@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent
+@gap = \\ $whitechar+ \\
+@cntrl = $asclarge | \@ | \[ | \\ | \] | \^ | \_
+@ascii = \^ @cntrl | "NUL" | "SOH" | "STX" | "ETX" | "EOT" | "ENQ" | "ACK"
+ | "BEL" | "BS" | "HT" | "LF" | "VT" | "FF" | "CR" | "SO" | "SI" | "DLE"
+ | "DC1" | "DC2" | "DC3" | "DC4" | "NAK" | "SYN" | "ETB" | "CAN"
+ | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
+-- N.B. ideally, we would do `@escape # \\ \&` instead of duplicating in @escapechar,
+-- which is what the Haskell Report says, but this isn't valid Alex syntax, as only
+-- character sets can be subtracted, not strings
+@escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
+@escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
+@stringchar = ($graphic # [\\ \"]) | $space | @escape | @gap
+@char = ($graphic # [\\ \']) | $space | @escapechar
+
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@negative = \-
@@ -460,7 +476,7 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
<0> {
"#" $idchar+ / { ifExtension OverloadedLabelsBit } { skip_one_varid_src ITlabelvarid }
- "#" \" / { ifExtension OverloadedLabelsBit } { lex_quoted_label }
+ "#" \" @stringchar* \" / { ifExtension OverloadedLabelsBit } { tok_quoted_label }
}
<0> {
@@ -660,14 +676,38 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
}
--- Strings and chars are lexed by hand-written code. The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
<0> {
- \' { lex_char_tok }
- \"\"\" / { ifExtension MultilineStringsBit} { lex_string_tok StringTypeMulti }
- \" { lex_string_tok StringTypeSingle }
+ \"\"\" / { ifExtension MultilineStringsBit } { tok_string_multi }
+ \" @stringchar* \" { tok_string }
+ \" @stringchar* \" \# / { ifExtension MagicHashBit } { tok_string }
+ \' @char \' { tok_char }
+ \' @char \' \# / { ifExtension MagicHashBit } { tok_char }
+
+ -- Check for smart quotes and throw better errors than a plain lexical error (#21843)
+ \' \\ $unigraphic / { isSmartQuote } { smart_quote_error }
+ \" @stringchar* \\ $unigraphic / { isSmartQuote } { smart_quote_error }
+ -- See Note [Bare smart quote error]
+ -- The valid string rule will take precedence because it'll match more
+ -- characters than this rule, so this rule will only fire if the string
+ -- could not be lexed correctly
+ \" @stringchar* $unigraphic / { isSmartQuote } { smart_quote_error }
+}
+
+<string_multi_content> {
+ -- Parse as much of the multiline string as possible, except for quotes
+ @stringchar* ($nl ([\ $tab] | @gap)* @stringchar*)* { tok_string_multi_content }
+ -- Allow bare quotes if it's not a triple quote
+ (\" | \"\") / ([\n .] # \") { tok_string_multi_content }
+}
+
+<0> {
+ \'\' { token ITtyQuote }
+
+ -- The normal character match takes precedence over this because it matches
+ -- more characters. However, if that pattern didn't match, then this quote
+ -- could be a quoted identifier, like 'x. Here, just return ITsimpleQuote,
+ -- as the parser will lex the varid separately.
+ \' / ($graphic # \\ | " ") { token ITsimpleQuote }
}
-- Note [Whitespace-sensitive operator parsing]
@@ -953,7 +993,7 @@ data Token
| ITchar SourceText Char -- Note [Literal source text] in "GHC.Types.SourceText"
| ITstring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
- | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstringMulti SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
| ITinteger IntegralLit -- Note [Literal source text] in "GHC.Types.SourceText"
| ITrational FractionalLit
@@ -2181,156 +2221,128 @@ lex_string_prag_comment mkTok span _buf _len _buf2
-- -----------------------------------------------------------------------------
-- Strings & Chars
--- This stuff is horrible. I hates it.
-
-lex_string_tok :: LexStringType -> Action
-lex_string_tok strType span buf _len _buf2 = do
- s <- lex_string strType
-
- i <- getInput
- case strType of
- StringTypeSingle ->
- lex_magic_hash i >>= \case
- Just i' -> do
- when (any (> '\xFF') s) $ do
- pState <- getPState
- let msg = PsErrPrimStringInvalidChar
- let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
- addError err
-
- setInput i'
- let (psSpan, src) = getStringLoc (buf, locStart) i'
- pure $ L psSpan (ITprimstring src (unsafeMkByteString s))
- Nothing -> do
- let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITstring src (mkFastString s))
- StringTypeMulti -> do
- let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITmultilinestring src (mkFastString s))
+tok_string :: Action
+tok_string span buf len _buf2 = do
+ s <- lex_chars ("\"", "\"") span buf (if endsInHash then len - 1 else len)
+
+ if endsInHash
+ then do
+ when (any (> '\xFF') s) $ do
+ pState <- getPState
+ let msg = PsErrPrimStringInvalidChar
+ let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
+ addError err
+ pure $ L span (ITprimstring src (unsafeMkByteString s))
+ else
+ pure $ L span (ITstring src (mkFastString s))
where
- locStart = psSpanStart span
+ src = SourceText $ lexemeToFastString buf len
+ endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
+-- | Ideally, we would define this completely with Alex syntax, like normal strings.
+-- Instead, this is defined as a hybrid solution by manually invoking lex states, which
+-- we're doing for two reasons:
+-- 1. The multiline string should all be one lexical token, not multiple
+-- 2. We need to allow bare quotes, which can't be done with one regex
+tok_string_multi :: Action
+tok_string_multi startSpan startBuf _len _buf2 = do
+ -- advance to the end of the multiline string
+ let startLoc = psSpanStart startSpan
+ let i@(AI _ contentStartBuf) =
+ case lexDelim $ AI startLoc startBuf of
+ Just i -> i
+ Nothing -> panic "tok_string_multi did not start with a delimiter"
+ (AI _ contentEndBuf, i'@(AI endLoc endBuf)) <- goContent i
+
+ -- build the values pertaining to the entire multiline string, including delimiters
+ let span = mkPsSpan startLoc endLoc
+ let len = byteDiff startBuf endBuf
+ let src = SourceText $ lexemeToFastString startBuf len
+
+ -- load the content of the multiline string
+ let contentLen = byteDiff contentStartBuf contentEndBuf
+ s <-
+ either (throwStringLexError (AI startLoc startBuf)) pure $
+ lexMultilineString contentLen contentStartBuf
+
+ setInput i'
+ pure $ L span $ ITstringMulti src (mkFastString s)
+ where
+ goContent i0 =
+ case alexScan i0 string_multi_content of
+ AlexToken i1 len _
+ | Just i2 <- lexDelim i1 -> pure (i1, i2)
+ | -- is the next token a tab character?
+ -- need this explicitly because there's a global rule matching $tab
+ Just ('\t', _) <- alexGetChar' i1 -> setInput i1 >> lexError LexError
+ | isEOF i1 -> checkSmartQuotes >> lexError LexError
+ | len == 0 -> panic $ "parsing multiline string got into infinite loop at: " ++ show i0
+ | otherwise -> goContent i1
+ AlexSkip i1 _ -> goContent i1
+ _ -> lexError LexError
+
+ lexDelim =
+ let go 0 i = Just i
+ go n i =
+ case alexGetChar' i of
+ Just ('"', i') -> go (n - 1) i'
+ _ -> Nothing
+ in go (3 :: Int)
+
+ -- See Note [Bare smart quote error]
+ checkSmartQuotes = do
+ let findSmartQuote i0@(AI loc _) =
+ case alexGetChar' i0 of
+ Just ('\\', i1) | Just (_, i2) <- alexGetChar' i1 -> findSmartQuote i2
+ Just (c, i1)
+ | isDoubleSmartQuote c -> Just (c, loc)
+ | otherwise -> findSmartQuote i1
+ _ -> Nothing
+ case findSmartQuote (AI (psSpanStart startSpan) startBuf) of
+ Just (c, loc) -> throwSmartQuoteError c loc
+ Nothing -> pure ()
+
+-- | Dummy action that should never be called. Should only be used in lex states
+-- that are manually lexed in tok_string_multi.
+tok_string_multi_content :: Action
+tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
+
+lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+lex_chars (startDelim, endDelim) span buf len =
+ either (throwStringLexError i0) pure $
+ lexString contentLen contentBuf
+ where
+ i0@(AI _ contentBuf) = advanceInputBytes (length startDelim) $ AI (psSpanStart span) buf
+ -- assumes delimiters are ASCII, with 1 byte per Char
+ contentLen = len - length startDelim - length endDelim
-lex_quoted_label :: Action
-lex_quoted_label span buf _len _buf2 = do
- s <- lex_string StringTypeSingle
- (AI end bufEnd) <- getInput
- let
- token = ITlabelvarid (SourceText src) (mkFastString s)
- src = lexemeToFastString (stepOn buf) (cur bufEnd - cur buf - 1)
- start = psSpanStart span
-
- return $ L (mkPsSpan start end) token
-
-
-lex_string :: LexStringType -> P String
-lex_string strType = do
- start <- getInput
- (str, next) <- either fromStringLexError pure $ lexString strType alexGetChar' start
- setInput next
- pure str
-
-
-lex_char_tok :: Action
--- Here we are basically parsing character literals, such as 'x' or '\n'
--- but we additionally spot 'x and ''T, returning ITsimpleQuote and
--- ITtyQuote respectively, but WITHOUT CONSUMING the x or T part
--- (the parser does that).
--- So we have to do two characters of lookahead: when we see 'x we need to
--- see if there's a trailing quote
-lex_char_tok span buf _len _buf2 = do -- We've seen '
- i1 <- getInput -- Look ahead to first character
- let loc = psSpanStart span
- case alexGetChar' i1 of
- Nothing -> lit_error i1
-
- Just ('\'', i2@(AI end2 _)) -> do -- We've seen ''
- setInput i2
- return (L (mkPsSpan loc end2) ITtyQuote)
-
- Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash
- (lit_ch, i3) <-
- either fromStringLexError pure $
- resolveEscapeCharacter alexGetChar' i2
- case alexGetChar' i3 of
- Just ('\'', i4) -> do
- setInput i4
- finish_char_tok buf loc lit_ch
- Just (mc, _) | isSingleSmartQuote mc -> add_smart_quote_error mc end2
- _ -> lit_error i3
-
- Just (c, i2@(AI end2 _))
- | not (isAnyChar c) -> lit_error i1
- | otherwise ->
-
- -- We've seen 'x, where x is a valid character
- -- (i.e. not newline etc) but not a quote or backslash
- case alexGetChar' i2 of -- Look ahead one more character
- Just ('\'', i3) -> do -- We've seen 'x'
- setInput i3
- finish_char_tok buf loc c
- Just (c, _) | isSingleSmartQuote c -> add_smart_quote_error c end2
- _other -> do -- We've seen 'x not followed by quote
- -- (including the possibility of EOF)
- -- Just parse the quote only
- let (AI end _) = i1
- return (L (mkPsSpan loc end) ITsimpleQuote)
-
--- We've already seen the closing quote
--- Just need to check for trailing #
-finish_char_tok :: StringBuffer -> PsLoc -> Char -> P (PsLocated Token)
-finish_char_tok buf loc ch = do
- i <- getInput
- lex_magic_hash i >>= \case
- Just i' -> do
- setInput i'
- -- Include the trailing # in SourceText
- let (psSpan, src) = getStringLoc (buf, loc) i'
- pure $ L psSpan (ITprimchar src ch)
- Nothing -> do
- let (psSpan, src) = getStringLoc (buf, loc) i
- pure $ L psSpan (ITchar src ch)
+throwStringLexError :: AlexInput -> StringLexError -> P a
+throwStringLexError i (StringLexError e pos) = setInput (advanceInputTo pos i) >> lexError e
--- | Get the span and source text for a string from the given start to the given end.
-getStringLoc :: (StringBuffer, PsLoc) -> AlexInput -> (PsSpan, SourceText)
-getStringLoc (bufStart, locStart) (AI locEnd bufEnd) = (psSpan, SourceText src)
+tok_quoted_label :: Action
+tok_quoted_label span buf len _buf2 = do
+ s <- lex_chars ("#\"", "\"") span buf len
+ pure $ L span (ITlabelvarid src (mkFastString s))
where
- psSpan = mkPsSpan locStart locEnd
- src = lexemeToFastString bufStart (cur bufEnd - cur bufStart)
-
-
--- Return Just if we found the magic hash, with the next input.
-lex_magic_hash :: AlexInput -> P (Maybe AlexInput)
-lex_magic_hash i = do
- magicHash <- getBit MagicHashBit
- if magicHash
- then
- case alexGetChar' i of
- Just ('#', i') -> pure (Just i')
- _other -> pure Nothing
- else pure Nothing
-
-fromStringLexError :: StringLexError AlexInput -> P a
-fromStringLexError = \case
- UnexpectedEOF i squote -> checkSQuote squote >> throw i LexStringCharLitEOF
- BadCharInitialLex i squote -> checkSQuote squote >> throw i LexStringCharLit
- EscapeBadChar i -> throw i LexStringCharLit
- EscapeUnexpectedEOF i -> throw i LexStringCharLitEOF
- EscapeNumRangeError i -> throw i LexNumEscapeRange
- EscapeSmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
+ -- skip leading '#'
+ src = SourceText . mkFastString . drop 1 $ lexemeToString buf len
+
+
+tok_char :: Action
+tok_char span buf len _buf2 = do
+ c <- lex_chars ("'", "'") span buf (if endsInHash then len - 1 else len) >>= \case
+ [c] -> pure c
+ s -> panic $ "tok_char expected exactly one character, got: " ++ show s
+ pure . L span $
+ if endsInHash
+ then ITprimchar src c
+ else ITchar src c
where
- throw i e = setInput i >> lexError e
- checkSQuote = \case
- NoSmartQuote -> pure ()
- SmartQuote c (AI loc _) -> add_nonfatal_smart_quote_error c loc
-
--- before calling lit_error, ensure that the current input is pointing to
--- the position of the error in the buffer. This is so that we can report
--- a correct location to the user, but also so we can detect UTF-8 decoding
--- errors if they occur.
-lit_error :: AlexInput -> P a
-lit_error i = do setInput i; lexError LexStringCharLit
+ src = SourceText $ lexemeToFastString buf len
+ endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
-- -----------------------------------------------------------------------------
-- QuasiQuote
@@ -2389,32 +2401,28 @@ quasiquote_error start = do
isSmartQuote :: AlexAccPred ExtsBitmap
isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c
-smart_quote_error_message :: Char -> PsLoc -> MsgEnvelope PsMessage
-smart_quote_error_message c loc =
- let (correct_char, correct_char_name) =
- if isSingleSmartQuote c then ('\'', "Single Quote") else ('"', "Quotation Mark")
- err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
- PsErrUnicodeCharLooksLike c correct_char correct_char_name in
- err
-
+throwSmartQuoteError :: Char -> PsLoc -> P a
+throwSmartQuoteError c loc = addFatalError err
+ where
+ err =
+ mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
+ PsErrUnicodeCharLooksLike c correct_char correct_char_name
+ (correct_char, correct_char_name) =
+ if isSingleSmartQuote c
+ then ('\'', "Single Quote")
+ else ('"', "Quotation Mark")
+
+-- | Throw a smart quote error, where the smart quote was the last character lexed
smart_quote_error :: Action
-smart_quote_error span buf _len _buf2 = do
- let c = currentChar buf
- addFatalError (smart_quote_error_message c (psSpanStart span))
-
-add_smart_quote_error :: Char -> PsLoc -> P a
-add_smart_quote_error c loc = addFatalError (smart_quote_error_message c loc)
-
-add_nonfatal_smart_quote_error :: Char -> PsLoc -> P ()
-add_nonfatal_smart_quote_error c loc = addError (smart_quote_error_message c loc)
+smart_quote_error span _ _ buf2 = do
+ let c = prevChar buf2 (panic "smart_quote_error unexpectedly called on beginning of input")
+ throwSmartQuoteError c (psSpanStart span)
-advance_to_smart_quote_character :: P ()
-advance_to_smart_quote_character = do
- i <- getInput
- case alexGetChar' i of
- Just (c, _) | isDoubleSmartQuote c -> return ()
- Just (_, i2) -> do setInput i2; advance_to_smart_quote_character
- Nothing -> return () -- should never get here
+-- Note [Bare smart quote error]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- A smart quote inside of a string is allowed, but if a complete valid string
+-- couldn't be lexed, we want to see if there's a smart quote that the user
+-- thought ended the string, but in fact didn't.
-- -----------------------------------------------------------------------------
-- Warnings
@@ -2652,7 +2660,7 @@ getLastLocIncludingComments = P $ \s@(PState { prev_loc = prev_loc }) -> POk s p
getLastLoc :: P PsSpan
getLastLoc = P $ \s@(PState { last_loc = last_loc }) -> POk s last_loc
-data AlexInput = AI !PsLoc !StringBuffer
+data AlexInput = AI !PsLoc !StringBuffer deriving (Show)
{-
Note [Unicode in Alex]
@@ -2763,6 +2771,19 @@ alexGetChar' (AI loc s)
where (c,s') = nextChar s
loc' = advancePsLoc loc c
+-- | Advance the given input N bytes.
+advanceInputBytes :: Int -> AlexInput -> AlexInput
+advanceInputBytes n i0@(AI _ buf0) = advanceInputTo (cur buf0 + n) i0
+
+-- | Advance the given input to the given position.
+advanceInputTo :: Int -> AlexInput -> AlexInput
+advanceInputTo pos = go
+ where
+ go i@(AI _ buf)
+ | cur buf >= pos = i
+ | Just (_, i') <- alexGetChar' i = go i'
+ | otherwise = i -- reached the end, just return the last input
+
getInput :: P AlexInput
getInput = P $ \s@PState{ loc=l, buffer=b } -> POk s (AI l b)
@@ -2770,9 +2791,10 @@ setInput :: AlexInput -> P ()
setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
nextIsEOF :: P Bool
-nextIsEOF = do
- AI _ s <- getInput
- return $ atEnd s
+nextIsEOF = isEOF <$> getInput
+
+isEOF :: AlexInput -> Bool
+isEOF (AI _ buf) = atEnd buf
pushLexState :: Int -> P ()
pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
@@ -3516,6 +3538,11 @@ topNoLayoutContainsCommas [] = False
topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
+-- If the generated alexScan/alexScanUser functions are called multiple times
+-- in this file, alexScanUser gets broken out into a separate function and
+-- increases memory usage. Make sure GHC inlines this function and optimizes it.
+{-# INLINE alexScanUser #-}
+
lexToken :: P (PsLocated Token)
lexToken = do
inp@(AI loc1 buf) <- getInput
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,284 +1,199 @@
{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE PatternSynonyms #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE ViewPatterns #-}
module GHC.Parser.String (
StringLexError (..),
- ContainsSmartQuote (..),
- LexStringType (..),
lexString,
+ lexMultilineString,
-- * Unicode smart quote helpers
isDoubleSmartQuote,
isSingleSmartQuote,
-
- -- * Other helpers
- isAnyChar,
- resolveEscapeCharacter,
) where
-import GHC.Prelude
+import GHC.Prelude hiding (getChar)
import Control.Arrow ((>>>))
-import Control.Monad (guard, unless, when)
-import Data.Char (chr, isPrint, ord)
-import Data.List (unfoldr)
+import Control.Monad (when)
+import Data.Char (chr, ord)
+import qualified Data.Foldable1 as Foldable1
+import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (listToMaybe, mapMaybe)
+import GHC.Data.StringBuffer (StringBuffer)
+import qualified GHC.Data.StringBuffer as StringBuffer
import GHC.Parser.CharClass (
hexDigit,
- is_any,
is_decdigit,
is_hexdigit,
is_octdigit,
is_space,
octDecDigit,
)
+import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)
-data LexStringType = StringTypeSingle | StringTypeMulti
+type BufPos = Int
+data StringLexError = StringLexError LexErr BufPos
--- | State to accumulate while iterating through string literal.
---
--- Fields are strict here to avoid space leak when iterating through large string literal
--- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
-data LexStringState loc = LexStringState
- { stringAcc :: !String
- -- ^ The string seen so far, reversed
- , multilineCommonWsPrefix :: !Int
- -- ^ The common prefix for multiline strings. See Note [Multiline string literals]
- , initialLoc :: !loc
- -- ^ The location of the beginning of the string literal
- }
-
--- | Get the character at the given location, with the location
--- of the next character. Returns Nothing if at the end of the
--- input.
-type GetChar loc = loc -> Maybe (Char, loc)
-
-lexString :: LexStringType -> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
-lexString strType getChar initialLoc = go initialState initialLoc
+lexString :: Int -> StringBuffer -> Either StringLexError String
+lexString = lexStringWith processChars processChars
where
- initialState =
- LexStringState
- { stringAcc = ""
- , multilineCommonWsPrefix =
- case strType of
- StringTypeMulti -> maxBound
- _ -> 0
- , initialLoc = initialLoc
- }
-
- -- 's' is strict here to avoid space leak when iterating through large string literal
- -- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
- go !s loc0 =
- case getChar loc0 of
- -- found closing delimiter
- Just ('"', _) | Just loc1 <- checkDelimiter strType getChar loc0 -> do
- let postprocess =
- case strType of
- StringTypeSingle -> id
- StringTypeMulti -> postprocessMultiline (multilineCommonWsPrefix s)
- Right (postprocess . reverse $ stringAcc s, loc1)
-
- -- found backslash
- Just (c0@'\\', loc1) -> do
- case getChar loc1 of
- -- found '\&' character, which should be elided
- Just ('&', loc2) -> go s loc2
- -- found start of a string gap
- Just (c1, loc2) | is_space c1 -> collapseStringGap getChar s loc2 >>= go s
- -- some other escape character
- Just (c1, loc2) ->
- case strType of
- StringTypeSingle -> do
- (c', loc') <- resolveEscapeCharacter getChar loc1
- go (addChar c' s) loc'
- StringTypeMulti -> do
- -- keep escape characters unresolved until after post-processing,
- -- to distinguish between a user-newline and the user writing "\n".
- -- but still process the characters here, to find any errors
- _ <- resolveEscapeCharacter getChar loc1
- go (addChar c1 . addChar c0 $ s) loc2
- -- backslash at end of input
- Nothing -> Left $ BadCharInitialLex loc1 (hasSQuote getChar s)
-
- -- found newline character in multiline string
- Just (c0@'\n', loc1) | StringTypeMulti <- strType ->
- uncurry go $ parseLeadingWS getChar (addChar c0 s) loc1
-
- -- found some other character
- Just (c0, loc1) | isAnyChar c0 -> go (addChar c0 s) loc1
-
- -- found some unknown character
- Just (_, _) -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-
- -- reached EOF before finding end of string
- Nothing -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-{-# INLINE lexString #-}
-
-checkDelimiter :: LexStringType -> GetChar loc -> loc -> Maybe loc
-checkDelimiter strType getChar loc0 =
- case strType of
- StringTypeSingle -> do
- ('"', loc1) <- getChar loc0
- Just loc1
- StringTypeMulti -> do
- ('"', loc1) <- getChar loc0
- ('"', loc2) <- getChar loc1
- ('"', loc3) <- getChar loc2
- Just loc3
-{-# INLINE checkDelimiter #-}
-
--- | A helper for adding the given character to the lexed string.
-addChar :: Char -> LexStringState loc -> LexStringState loc
-addChar c s = s{stringAcc = c : stringAcc s}
-{-# INLINE addChar #-}
-
--- | Return whether the string we've parsed so far contains any smart quotes.
-hasSQuote :: GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
-hasSQuote getChar s
- | any isDoubleSmartQuote (stringAcc s)
- , (c, loc) : _ <- filter (isDoubleSmartQuote . fst) allChars =
- SmartQuote c loc
- | otherwise =
- NoSmartQuote
+ processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
+ processChars =
+ collapseGaps
+ >>> resolveEscapes
+
+-- -----------------------------------------------------------------------------
+-- Lexing interface
+
+{-
+Note [Lexing strings]
+~~~~~~~~~~~~~~~~~~~~~
+
+After verifying if a string is lexically valid with Alex, we still need to do
+some post processing of the string, namely:
+1. Collapse string gaps
+2. Resolve escape characters
+
+The problem: 'lexemeToString' is more performant than manually reading
+characters from the StringBuffer. However, that completely erases the position
+of each character, which we need in order to report the correct position for
+error messages (e.g. when resolving escape characters).
+
+So we do two passes. The first pass is optimistic; just convert
+to a plain String and process it. If this results in an error, we do a second
+pass, this time where each character is annotated with its position. Now, the
+error has all the information it needs.
+
+Ideally, lexStringWith would take a single (forall c. HasChar c => ...) function,
+but to help the specializer, we pass it in twice to concretize it for the two
+types we actually use.
+-}
+
+-- | See Note [Lexing strings]
+lexStringWith ::
+ ([Char] -> Either (Char, LexErr) [Char])
+ -> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
+ -> Int
+ -> StringBuffer
+ -> Either StringLexError String
+lexStringWith processChars processCharsPos len buf =
+ case processChars $ bufferChars buf len of
+ Right s -> Right s
+ Left _ ->
+ case processCharsPos $ bufferLocatedChars buf len of
+ Right _ -> panic "expected lex error on second pass"
+ Left ((_, pos), e) -> Left $ StringLexError e pos
+
+class HasChar c where
+ getChar :: c -> Char
+ setChar :: Char -> c -> c
+
+instance HasChar Char where
+ getChar = id
+ setChar = const
+
+instance HasChar (Char, x) where
+ getChar = fst
+ setChar c (_, x) = (c, x)
+
+pattern Char :: HasChar c => Char -> c
+pattern Char c <- (getChar -> c)
+{-# COMPLETE Char #-}
+
+bufferChars :: StringBuffer -> Int -> [Char]
+bufferChars = StringBuffer.lexemeToString
+
+type CharPos = (Char, BufPos)
+
+bufferLocatedChars :: StringBuffer -> Int -> [CharPos]
+bufferLocatedChars initialBuf len = go initialBuf
where
- allChars = unfoldr getCharWithLoc (initialLoc s)
- getCharWithLoc loc =
- case getChar loc of
- Just (c, loc') -> Just ((c, loc), loc')
- Nothing -> Nothing
-{-# INLINE hasSQuote #-}
-
--- | After parsing a backslash and a space character, consume the rest of
--- the string gap and return the next location.
-collapseStringGap :: GetChar loc -> LexStringState loc -> loc -> Either (StringLexError loc) loc
-collapseStringGap getChar s = go
+ go buf
+ | atEnd buf = []
+ | otherwise =
+ let (c, buf') = StringBuffer.nextChar buf
+ in (c, StringBuffer.cur buf) : go buf'
+
+ atEnd buf = StringBuffer.byteDiff initialBuf buf >= len
+
+-- -----------------------------------------------------------------------------
+-- Lexing phases
+
+collapseGaps :: HasChar c => [c] -> [c]
+collapseGaps = go
where
- go loc0 =
- case getChar loc0 of
- Just ('\\', loc1) -> pure loc1
- Just (c0, loc1) | is_space c0 -> go loc1
- Just _ -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
- Nothing -> Left $ UnexpectedEOF loc0 (hasSQuote getChar s)
-{-# INLINE collapseStringGap #-}
+ go = \case
+ c1@(Char '\\') : c2@(Char c) : cs
+ | is_space c -> go $ dropGap cs
+ | otherwise -> c1 : c2 : go cs
+ c : cs -> c : go cs
+ [] -> []
--- | See Note [Multiline string literals]
-parseLeadingWS :: GetChar loc -> LexStringState loc -> loc -> (LexStringState loc, loc)
-parseLeadingWS getChar = go 0
+ dropGap = \case
+ Char '\\' : cs -> cs
+ _ : cs -> dropGap cs
+ [] -> panic "gap unexpectedly ended"
+
+resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c]
+resolveEscapes = go dlistEmpty
where
- go !col s loc =
- case getChar loc of
- Just (c@' ', loc') -> go (col + 1) (addChar c s) loc'
- -- expand tabs
- Just ('\t', loc') ->
- let fill = 8 - (col `mod` 8)
- s' = applyN fill (addChar ' ') s
- in go (col + fill) s' loc'
- -- if we see a newline or string delimiter, then this line only contained whitespace, so
- -- don't include it in the common whitespace prefix
- Just ('\n', _) -> (s, loc)
- Just ('"', _) | Just _ <- checkDelimiter StringTypeMulti getChar loc -> (s, loc)
- -- found some other character, so we're done parsing leading whitespace
- _ ->
- let s' = s{multilineCommonWsPrefix = min col (multilineCommonWsPrefix s)}
- in (s', loc)
-
- applyN :: Int -> (a -> a) -> a -> a
- applyN n f x0 = iterate f x0 !! n
-{-# INLINE parseLeadingWS #-}
-
-data StringLexError loc
- = UnexpectedEOF !loc !(ContainsSmartQuote loc)
- -- ^ Unexpectedly hit EOF when lexing string
- | BadCharInitialLex !loc !(ContainsSmartQuote loc)
- -- ^ Found invalid character when initially lexing string
- | EscapeBadChar !loc
- -- ^ Found invalid character when parsing an escaped character
- | EscapeUnexpectedEOF !loc
- -- ^ Unexpectedly hit EOF when parsing an escaped character
- | EscapeNumRangeError !loc
- -- ^ Escaped number exceeds range
- | EscapeSmartQuoteError !Char !loc
- -- ^ Found escaped smart unicode chars as `\’` or `\”`
- deriving (Show)
-
--- | When initially lexing the string, we want to track if we've
--- seen a smart quote, to show a helpful "you might be accidentally
--- using a smart quote" error.
-data ContainsSmartQuote loc
- = NoSmartQuote
- | SmartQuote !Char !loc
- deriving (Show)
+ go !acc = \case
+ [] -> pure $ dlistToList acc
+ Char '\\' : Char '&' : cs -> go acc cs
+ backslash@(Char '\\') : cs ->
+ case resolveEscapeChar cs of
+ Right (esc, cs') -> go (acc `dlistSnoc` setChar esc backslash) cs'
+ Left (c, e) -> Left (c, e)
+ c : cs -> go (acc `dlistSnoc` c) cs
-- -----------------------------------------------------------------------------
-- Escape characters
--- | After finding a backslash, parse the rest of the escape character, starting
--- at the given location.
-resolveEscapeCharacter :: GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
-resolveEscapeCharacter getChar loc0 = do
- (c0, loc1) <- expectChar loc0
- case c0 of
- 'a' -> pure ('\a', loc1)
- 'b' -> pure ('\b', loc1)
- 'f' -> pure ('\f', loc1)
- 'n' -> pure ('\n', loc1)
- 'r' -> pure ('\r', loc1)
- 't' -> pure ('\t', loc1)
- 'v' -> pure ('\v', loc1)
- '\\' -> pure ('\\', loc1)
- '"' -> pure ('\"', loc1)
- '\'' -> pure ('\'', loc1)
- -- escape codes
- 'x' -> expectNum is_hexdigit 16 hexDigit loc1
- 'o' -> expectNum is_octdigit 8 octDecDigit loc1
- _ | is_decdigit c0 -> expectNum is_decdigit 10 octDecDigit loc0
- -- control characters (e.g. '\^M')
- '^' -> do
- (c1, loc2) <- expectChar loc1
- unless ('@' <= c1 && c1 <= '_') $ Left $ EscapeBadChar loc1
- pure (chr $ ord c1 - ord '@', loc2)
- -- long form escapes (e.g. '\NUL')
- _ | Just (c1, loc2) <- parseLongEscape getChar c0 loc1 -> pure (c1, loc2)
- -- check unicode smart quotes (#21843)
- _ | isDoubleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
- _ | isSingleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
- -- unknown escape
- _ -> Left $ EscapeBadChar loc0
+-- | Resolve an escape character, after having just lexed a backslash.
+-- Assumes escape character is valid.
+resolveEscapeChar :: HasChar c => [c] -> Either (c, LexErr) (Char, [c])
+resolveEscapeChar = \case
+ Char 'a' : cs -> pure ('\a', cs)
+ Char 'b' : cs -> pure ('\b', cs)
+ Char 'f' : cs -> pure ('\f', cs)
+ Char 'n' : cs -> pure ('\n', cs)
+ Char 'r' : cs -> pure ('\r', cs)
+ Char 't' : cs -> pure ('\t', cs)
+ Char 'v' : cs -> pure ('\v', cs)
+ Char '\\' : cs -> pure ('\\', cs)
+ Char '"' : cs -> pure ('\"', cs)
+ Char '\'' : cs -> pure ('\'', cs)
+ -- escape codes
+ Char 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
+ Char 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
+ cs@(Char c : _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit cs
+ -- control characters (e.g. '\^M')
+ Char '^' : Char c : cs -> pure (chr $ ord c - ord '@', cs)
+ -- long form escapes (e.g. '\NUL')
+ cs | Just (esc, cs') <- parseLongEscape cs -> pure (esc, cs')
+ -- shouldn't happen
+ Char c : _ -> panic $ "found unexpected escape character: " ++ show c
+ [] -> panic "escape character unexpectedly ended"
where
- expectChar loc =
- case getChar loc of
- Just x -> pure x
- Nothing -> Left $ EscapeUnexpectedEOF loc
-
- expectNum isDigit base toDigit loc1 = do
- (c1, loc2) <- expectChar loc1
- unless (isDigit c1) $ Left $ EscapeBadChar loc1
- let parseNum x loc =
- case getChar loc of
- Just (c, loc') | isDigit c -> do
- let x' = x * base + toDigit c
- when (x' > 0x10ffff) $ Left $ EscapeNumRangeError loc
- parseNum x' loc'
- _ ->
- pure (chr x, loc)
- parseNum (toDigit c1) loc2
-{-# INLINE resolveEscapeCharacter #-}
-
-parseLongEscape :: GetChar loc -> Char -> loc -> Maybe (Char, loc)
-parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCodes
+ parseNum isDigit base toDigit =
+ let go x = \case
+ ch@(Char c) : cs | isDigit c -> do
+ let x' = x * base + toDigit c
+ when (x' > 0x10ffff) $ Left (ch, LexNumEscapeRange)
+ go x' cs
+ cs -> pure (chr x, cs)
+ in go 0
+
+parseLongEscape :: HasChar c => [c] -> Maybe (Char, [c])
+parseLongEscape cs = listToMaybe (mapMaybe tryParse longEscapeCodes)
where
- tryParse (prefix, c) = do
- p0 : p <- pure prefix
- guard (p0 == c0) -- see if the first character matches
- loc <- parsePrefix loc1 p -- see if the rest of the prefix matches
- pure (c, loc)
-
- parsePrefix loc = \case
- [] -> pure loc
- p : ps -> do
- (c, loc') <- getChar loc
- guard (p == c)
- parsePrefix loc' ps
+ tryParse (code, esc) =
+ case splitAt (length code) cs of
+ (pre, cs') | map getChar pre == code -> Just (esc, cs')
+ _ -> Nothing
longEscapeCodes =
[ ("NUL", '\NUL')
@@ -289,14 +204,14 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
, ("ENQ", '\ENQ')
, ("ACK", '\ACK')
, ("BEL", '\BEL')
- , ("BS", '\BS')
- , ("HT", '\HT')
- , ("LF", '\LF')
- , ("VT", '\VT')
- , ("FF", '\FF')
- , ("CR", '\CR')
- , ("SO", '\SO')
- , ("SI", '\SI')
+ , ("BS" , '\BS' )
+ , ("HT" , '\HT' )
+ , ("LF" , '\LF' )
+ , ("VT" , '\VT' )
+ , ("FF" , '\FF' )
+ , ("CR" , '\CR' )
+ , ("SO" , '\SO' )
+ , ("SI" , '\SI' )
, ("DLE", '\DLE')
, ("DC1", '\DC1')
, ("DC2", '\DC2')
@@ -306,17 +221,16 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
, ("SYN", '\SYN')
, ("ETB", '\ETB')
, ("CAN", '\CAN')
- , ("EM", '\EM')
+ , ("EM" , '\EM' )
, ("SUB", '\SUB')
, ("ESC", '\ESC')
- , ("FS", '\FS')
- , ("GS", '\GS')
- , ("RS", '\RS')
- , ("US", '\US')
- , ("SP", '\SP')
+ , ("FS" , '\FS' )
+ , ("GS" , '\GS' )
+ , ("RS" , '\RS' )
+ , ("US" , '\US' )
+ , ("SP" , '\SP' )
, ("DEL", '\DEL')
]
-{-# INLINE parseLongEscape #-}
-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)
@@ -333,6 +247,98 @@ isSingleSmartQuote = \case
'’' -> True
_ -> False
+-- -----------------------------------------------------------------------------
+-- Multiline strings
+
+-- | See Note [Multiline string literals]
+--
+-- Assumes string is lexically valid. Skips the steps about splitting
+-- and rejoining lines, and instead manually finds newline characters,
+-- for performance.
+lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
+lexMultilineString = lexStringWith processChars processChars
+ where
+ processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
+ processChars =
+ collapseGaps -- Step 1
+ >>> expandLeadingTabs -- Step 3
+ >>> rmCommonWhitespacePrefix -- Step 4
+ >>> collapseOnlyWsLines -- Step 5
+ >>> rmFirstNewline -- Step 7a
+ >>> rmLastNewline -- Step 7b
+ >>> resolveEscapes -- Step 8
+
+ -- expands all tabs, since the lexer will verify that tabs can only appear
+ -- as leading indentation
+ expandLeadingTabs :: HasChar c => [c] -> [c]
+ expandLeadingTabs =
+ let go !col = \case
+ c@(Char '\t') : cs ->
+ let fill = 8 - (col `mod` 8)
+ in replicate fill (setChar ' ' c) ++ go (col + fill) cs
+ c : cs -> c : go (if getChar c == '\n' then 0 else col + 1) cs
+ [] -> []
+ in go 0
+
+ rmCommonWhitespacePrefix :: HasChar c => [c] -> [c]
+ rmCommonWhitespacePrefix cs0 =
+ let commonWSPrefix = getCommonWsPrefix (map getChar cs0)
+ go = \case
+ c@(Char '\n') : cs -> c : go (dropLine commonWSPrefix cs)
+ c : cs -> c : go cs
+ [] -> []
+ -- drop x characters from the string, or up to a newline, whichever
+ -- comes first
+ dropLine !x = \case
+ cs | x <= 0 -> cs
+ cs@(Char '\n' : _) -> cs
+ _ : cs -> dropLine (x - 1) cs
+ [] -> []
+ in go cs0
+
+ collapseOnlyWsLines :: HasChar c => [c] -> [c]
+ collapseOnlyWsLines =
+ let go = \case
+ c@(Char '\n') : cs | Just cs' <- checkAllWs cs -> c : go cs'
+ c : cs -> c : go cs
+ [] -> []
+ checkAllWs = \case
+ -- got all the way to a newline or the end of the string, return
+ cs@(Char '\n' : _) -> Just cs
+ cs@[] -> Just cs
+ -- found whitespace, continue
+ Char c : cs | is_space c -> checkAllWs cs
+ -- anything else, stop
+ _ -> Nothing
+ in go
+
+ rmFirstNewline :: HasChar c => [c] -> [c]
+ rmFirstNewline = \case
+ Char '\n' : cs -> cs
+ cs -> cs
+
+ rmLastNewline :: HasChar c => [c] -> [c]
+ rmLastNewline =
+ let go = \case
+ [] -> []
+ [Char '\n'] -> []
+ c : cs -> c : go cs
+ in go
+
+-- | See step 4 in Note [Multiline string literals]
+--
+-- Assumes tabs have already been expanded.
+getCommonWsPrefix :: String -> Int
+getCommonWsPrefix s =
+ case NonEmpty.nonEmpty includedLines of
+ Nothing -> 0
+ Just ls -> Foldable1.minimum $ NonEmpty.map (length . takeWhile is_space) ls
+ where
+ includedLines =
+ filter (not . all is_space) -- ignore whitespace-only lines
+ . drop 1 -- ignore first line in calculation
+ $ lines s
+
{-
Note [Multiline string literals]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -351,23 +357,13 @@ The canonical steps for post processing a multiline string are:
2. Split the string by newlines
3. Convert leading tabs into spaces
* In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
-4. Remove common whitespace prefix in every line (see below)
+4. Remove common whitespace prefix in every line except the first (see below)
5. If a line contains only whitespace, remove all of the whitespace
6. Join the string back with `\n` delimiters
-7. If the first character of the string is a newline, remove it
+7a. If the first character of the string is a newline, remove it
+7b. If the last character of the string is a newline, remove it
8. Interpret escaped characters
-However, for performance reasons, we do as much of this in one pass as possible:
-1. As we lex the string, do the following steps as they appear:
- a. Collapse string gaps
- b. Keep track of the common whitespace prefix so far
- c. Validate escaped characters
-2. At the very end, post process the lexed string:
- a. Remove the common whitespace prefix from every line
- b. Remove all whitespace from all-whitespace lines
- c. Remove initial newline character
- d. Resolve escaped characters
-
The common whitespace prefix can be informally defined as "The longest
prefix of whitespace shared by all lines in the string, excluding the
first line and any whitespace-only lines".
@@ -382,75 +378,16 @@ It's more precisely defined with the following algorithm:
3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
-}
--- | See Note [Multiline string literals]
-postprocessMultiline :: Int -> String -> String
-postprocessMultiline commonWSPrefix =
- rmCommonWhitespacePrefix
- >>> collapseOnlyWsLines
- >>> rmFirstNewline
- >>> rmLastNewline
- >>> resolveEscapeChars
- where
- rmCommonWhitespacePrefix =
- let go = \case
- '\n' : s -> '\n' : go (dropLine commonWSPrefix s)
- c : s -> c : go s
- [] -> []
- -- drop x characters from the string, or up to a newline, whichever
- -- comes first
- dropLine !x = \case
- s | x <= 0 -> s
- s@('\n' : _) -> s
- _ : s -> dropLine (x - 1) s
- [] -> []
- in go
-
- collapseOnlyWsLines =
- let go = \case
- '\n' : s | Just s' <- checkAllWs s -> '\n' : go s'
- c : s -> c : go s
- [] -> []
- checkAllWs = \case
- -- got all the way to a newline or the end of the string, return
- s@('\n' : _) -> Just s
- s@[] -> Just s
- -- found whitespace, continue
- c : s | is_space c -> checkAllWs s
- -- anything else, stop
- _ -> Nothing
- in go
-
- rmFirstNewline = \case
- '\n' : s -> s
- s -> s
+-- -----------------------------------------------------------------------------
+-- DList
- rmLastNewline =
- let go = \case
- [] -> []
- ['\n'] -> []
- c : cs -> c : go cs
- in go
+newtype DList a = DList ([a] -> [a])
- -- resolve escape characters, deferred from lexString. guaranteed
- -- to not throw any errors, since we already checked them in lexString
- resolveEscapeChars = \case
- [] -> []
- '\\' : s ->
- -- concretizing 'loc' to String:
- -- resolveEscapeCharacter :: (String -> Maybe (Char, String)) -> String -> Either _ (Char, String)
- case resolveEscapeCharacter uncons s of
- Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
- Right (c, s') -> c : resolveEscapeChars s'
- c : s -> c : resolveEscapeChars s
-
- uncons = \case
- c : cs -> Just (c, cs)
- [] -> Nothing
+dlistEmpty :: DList a
+dlistEmpty = DList id
--- -----------------------------------------------------------------------------
--- Helpers
+dlistToList :: DList a -> [a]
+dlistToList (DList f) = f []
-isAnyChar :: Char -> Bool
-isAnyChar c
- | c > '\x7f' = isPrint c
- | otherwise = is_any c
+dlistSnoc :: DList a -> a -> DList a
+dlistSnoc (DList f) x = DList (f . (x :))
=====================================
testsuite/driver/testlib.py
=====================================
@@ -2868,7 +2868,7 @@ def normalise_errmsg(s: str) -> str:
# normalise slashes to minimise Windows/Unix filename differences,
# but don't normalize backslashes in chars
- s = re.sub(r"(?!')\\", '/', s)
+ s = re.sub(r"(?<!('|‘))\\", '/', s)
# Normalize the name of the GHC executable. Specifically,
# this catches the cases that:
=====================================
testsuite/tests/ghci/prog013/prog013.stderr
=====================================
@@ -1,17 +1,12 @@
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
-
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
<interactive>:9:1: error: [GHC-58481] parse error on input ‘+’
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
+
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
-Bad.hs:3:8: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/ghci/scripts/ghci022.stderr
=====================================
@@ -1,3 +1,2 @@
+ghci022.hs:2:5: error: [GHC-21231] lexical error at character '\n'
-ghci022.hs:2:6: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/MultilineStringsInnerTab.stderr
=====================================
@@ -1,3 +1,3 @@
-
MultilineStringsInnerTab.hs:8:5: error: [GHC-21231]
- lexical error in string/character literal at character '\t'
+ lexical error at character '\t'
+
=====================================
testsuite/tests/parser/should_fail/MultilineStringsSmartQuotes.stderr
=====================================
@@ -1,6 +1,2 @@
-
MultilineStringsSmartQuotes.hs:9:3: [GHC-31623]
Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
-
-MultilineStringsSmartQuotes.hs:10:1: [GHC-21231]
- lexical error in string/character literal at end of input
=====================================
testsuite/tests/parser/should_fail/T21843c.stderr
=====================================
@@ -1,6 +1,2 @@
-
-T21843c.hs:3:19: [GHC-31623]
+T21843c.hs:3:13: [GHC-31623]
Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
-
-T21843c.hs:3:20: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/T21843e.stderr
=====================================
@@ -1,3 +1,3 @@
-
-T21843e.hs:3:15: [GHC-31623]
+T21843e.hs:3:13: error: [GHC-31623]
Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
+
=====================================
testsuite/tests/parser/should_fail/T21843f.stderr
=====================================
@@ -1,3 +1,3 @@
-
-T21843f.hs:3:13: [GHC-31623]
+T21843f.hs:3:11: error: [GHC-31623]
Unicode character '‘' ('\8216') looks like ''' (Single Quote), but it is not
+
=====================================
testsuite/tests/parser/should_fail/T3751.stderr
=====================================
@@ -1,3 +1,2 @@
+T3751.hs:3:5: error: [GHC-21231] lexical error at character '\167'
-T3751.hs:3:7: error: [GHC-21231]
- lexical error in string/character literal at character '\167'
=====================================
testsuite/tests/parser/should_fail/T5425.stderr
=====================================
@@ -1,3 +1,2 @@
+T5425.hs:4:1: error: [GHC-21231] lexical error at character '\955'
-T5425.hs:5:2: error: [GHC-21231]
- lexical error in string/character literal at character '\955'
=====================================
testsuite/tests/parser/should_fail/readFail002.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail002.hs:5:5: error: [GHC-21231]
+ lexical error at character '\n'
-readFail002.hs:5:6: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/readFail004.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail004.hs:17:16: error: [GHC-21231]
+ lexical error at character '.'
-readFail004.hs:19:1: error: [GHC-21231]
- lexical error in string/character literal at character '.'
=====================================
testsuite/tests/parser/should_fail/readFail005.stderr
=====================================
@@ -1,3 +1,2 @@
-
-readFail005.hs:4:7: error: [GHC-21231]
- lexical error in string/character literal at character '&'
+readFail005.hs:4:5: [GHC-21231]
+ lexical error at character '&'
=====================================
testsuite/tests/parser/should_fail/readFail033.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail033.hs:2:5: error: [GHC-21231]
+ lexical error at character '\t'
-readFail033.hs:2:6: error: [GHC-21231]
- lexical error in string/character literal at character '\t'
=====================================
testsuite/tests/parser/unicode/all.T
=====================================
@@ -33,3 +33,6 @@ test('T18225B', normal, compile_fail, [''])
test('T18158', normal, compile, [''])
test('T18158b', normal, compile_fail, [''])
+
+test('lex_unispace', normal, compile, [''])
+test('lex_unicode_ids', normal, compile, [''])
=====================================
testsuite/tests/parser/unicode/lex_unicode_ids.hs
=====================================
@@ -0,0 +1,11 @@
+-- Regression tests for unicode identifiers
+
+{-# LANGUAGE TemplateHaskellQuotes #-}
+
+module ShouldCompile where
+
+ƞ = 1
+eta = 'ƞ
+
+data Ʊ
+upsilon = ''Ʊ
=====================================
testsuite/tests/parser/unicode/lex_unispace.hs
=====================================
@@ -0,0 +1,9 @@
+-- Regression tests for unicode whitespace
+
+module ShouldCompile where
+
+-- https://github.com/fosskers/aura/blob/cdebca1f48254ebb8286d8e38591bf644282866f/haskell/aura/lib/Aura/Languages.hs#L107
+x1 = ' ' -- \12288
+
+-- https://github.com/jgm/pandoc/blob/98e77e02f6436e4b74a164762d0f3149ae7ecefa/src/Text/Pandoc/Writers/FB2.hs#L295C3-L295C32
+x2 = " " -- \xa0
=====================================
testsuite/tests/parser/unicode/utf8_010.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_010.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_011.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_011.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_020.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_020.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_021.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_021.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
testsuite/tests/parser/unicode/utf8_022.stderr
=====================================
@@ -1,3 +1,3 @@
-
utf8_022.hs:2:8: error: [GHC-21231]
- lexical error in string/character literal (UTF-8 decoding error)
+ lexical error (UTF-8 decoding error)
+
=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,7 @@ classify tok =
ITlabelvarid{} -> TkUnknown
ITchar{} -> TkChar
ITstring{} -> TkString
- ITmultilinestring{} -> TkString
+ ITstringMulti{} -> TkString
ITinteger{} -> TkNumber
ITrational{} -> TkNumber
ITprimchar{} -> TkChar
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/92f9a4dfbd862eb0b80e55d4ac388af356c32069
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/92f9a4dfbd862eb0b80e55d4ac388af356c32069
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240925/92a4e63e/attachment-0001.html>
More information about the ghc-commits
mailing list