[Git][ghc/ghc][wip/strings] 2 commits: Distinguish multiline string section more clearly
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Sun Aug 11 05:29:25 UTC 2024
Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
5a55357d by Brandon Chinn at 2024-08-10T22:25:05-07:00
Distinguish multiline string section more clearly
- - - - -
de6a3a26 by Brandon Chinn at 2024-08-10T22:29:11-07:00
Replace manual string lexing
- - - - -
9 changed files:
- compiler/GHC/Parser/Errors/Ppr.hs
- compiler/GHC/Parser/Errors/Types.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- testsuite/tests/parser/should_fail/T3751.stderr
- testsuite/tests/parser/should_fail/T5425.stderr
- testsuite/tests/parser/should_fail/readFail002.stderr
- testsuite/tests/parser/should_fail/readFail004.stderr
- testsuite/tests/parser/should_fail/readFail033.stderr
Changes:
=====================================
compiler/GHC/Parser/Errors/Ppr.hs
=====================================
@@ -157,8 +157,6 @@ instance Diagnostic PsMessage where
LexUnknownPragma -> text "unknown pragma"
LexErrorInPragma -> text "lexical error in pragma"
LexNumEscapeRange -> text "numeric escape sequence out of range"
- LexStringCharLit -> text "lexical error in string/character literal"
- LexStringCharLitEOF -> text "unexpected end-of-file in string/character literal"
LexUnterminatedComment -> text "unterminated `{-'"
LexUnterminatedOptions -> text "unterminated OPTIONS pragma"
LexUnterminatedQQ -> text "unterminated quasiquotation"
=====================================
compiler/GHC/Parser/Errors/Types.hs
=====================================
@@ -591,11 +591,10 @@ data LexErr
| LexUnknownPragma -- ^ Unknown pragma
| LexErrorInPragma -- ^ Lexical error in pragma
| LexNumEscapeRange -- ^ Numeric escape sequence out of range
- | LexStringCharLit -- ^ Lexical error in string/character literal
- | LexStringCharLitEOF -- ^ Unexpected end-of-file in string/character literal
| LexUnterminatedComment -- ^ Unterminated `{-'
| LexUnterminatedOptions -- ^ Unterminated OPTIONS pragma
| LexUnterminatedQQ -- ^ Unterminated quasiquotation
+ deriving (Show)
-- | Errors from the Cmm parser
data CmmParserError
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -84,6 +84,8 @@ import GHC.Prelude
import qualified GHC.Data.Strict as Strict
-- base
+import Control.DeepSeq (deepseq)
+import Control.Exception (catch, throw)
import Control.Monad
import Control.Applicative
import Data.Char
@@ -93,6 +95,7 @@ import qualified Data.List.NonEmpty as NE
import Data.Maybe
import Data.Word
import Debug.Trace (trace)
+import System.IO.Unsafe (unsafePerformIO)
import GHC.Data.EnumSet as EnumSet
@@ -167,6 +170,7 @@ $idchar = [$small $large $digit $uniidchar \']
$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
+$charesc = [a b f n r t v \\ \" \' \&]
$binit = 0-1
$octit = 0-7
@@ -213,6 +217,15 @@ $docsym = [\| \^ \* \$]
@floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent
@hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent
+ at gap = \\ $whitechar+ \\
+ at cntrl = $asclarge | \@ | \[ | \\ | \] | \^ | \_
+ at ascii = \^ @cntrl | "NUL" | "SOH" | "STX" | "ETX" | "EOT" | "ENQ" | "ACK"
+ | "BEL" | "BS" | "HT" | "LF" | "VT" | "FF" | "CR" | "SO" | "SI" | "DLE"
+ | "DC1" | "DC2" | "DC3" | "DC4" | "NAK" | "SYN" | "ETB" | "CAN"
+ | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
+ at escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
+ at stringchar = ($graphic # [\\ \"]) | \ | @escape | @gap
+
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@negative = \-
@@ -460,7 +473,7 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
<0> {
"#" $idchar+ / { ifExtension OverloadedLabelsBit } { skip_one_varid_src ITlabelvarid }
- "#" \" / { ifExtension OverloadedLabelsBit } { lex_quoted_label }
+ "#" \" @stringchar* \" / { ifExtension OverloadedLabelsBit } { tok_quoted_label }
}
<0> {
@@ -665,9 +678,18 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
-- lexer, we would still have to parse the string afterward in order
-- to convert it to a String.
<0> {
- \' { lex_char_tok }
- \"\"\" / { ifExtension MultilineStringsBit} { lex_string_tok StringTypeMulti }
- \" { lex_string_tok StringTypeSingle }
+ \' @stringchar \' \#? / { ifCurrentChar '&' {- disallow '\&' -} } { tok_char }
+ \"\"\" (@stringchar | $whitechar)* \"\"\" / { ifExtension MultilineStringsBit} { tok_string_multi }
+ \" @stringchar* \" \#? { tok_string }
+}
+
+<0> {
+ \' \' { token ITtyQuote }
+
+ -- if a Char was not lexed and we still see a single quote,
+ -- it's a quoted identifier, like 'x. Just return ITsimpleQuote,
+ -- as the parser will lex the varid separately.
+ \' { token ITsimpleQuote }
}
-- Note [Whitespace-sensitive operator parsing]
@@ -2181,39 +2203,71 @@ lex_string_prag_comment mkTok span _buf _len _buf2
-- -----------------------------------------------------------------------------
-- Strings & Chars
--- This stuff is horrible. I hates it.
-
-lex_string_tok :: LexStringType -> Action
-lex_string_tok strType span buf _len _buf2 = do
- s <- lex_string strType
-
- i <- getInput
- case strType of
- StringTypeSingle ->
- lex_magic_hash i >>= \case
- Just i' -> do
- when (any (> '\xFF') s) $ do
- pState <- getPState
- let msg = PsErrPrimStringInvalidChar
- let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
- addError err
-
- setInput i'
- let (psSpan, src) = getStringLoc (buf, locStart) i'
- pure $ L psSpan (ITprimstring src (unsafeMkByteString s))
- Nothing -> do
- let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITstring src (mkFastString s))
- StringTypeMulti -> do
- let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITmultilinestring src (mkFastString s))
+-- TODO: figure out smart quotes (escaped smart quotes + smart quotes failing to end string lex)
+tok_string :: Action
+tok_string span buf len _buf2 = do
+ s <- lex_string span buf (if isMagicHash then len - 1 else len) "\"" "\""
+
+ if isMagicHash
+ then do
+ when (any (> '\xFF') s) $ do
+ pState <- getPState
+ let msg = PsErrPrimStringInvalidChar
+ let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
+ addError err
+ pure $ L span (ITprimstring src (unsafeMkByteString s))
+ else
+ pure $ L span (ITstring src (mkFastString s))
where
- locStart = psSpanStart span
-
-
-lex_quoted_label :: Action
-lex_quoted_label span buf _len _buf2 = do
- s <- lex_string StringTypeSingle
+ src = SourceText $ lexemeToFastString buf len
+ isMagicHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
+tok_string_multi :: Action
+tok_string_multi span buf len _buf2 = do
+ s <- lex_string span buf len "\"\"\"" "\"\"\""
+ -- TODO: post-process string
+ pure $ L span (ITmultilinestring src (mkFastString s))
+ where
+ src = SourceText $ lexemeToFastString buf len
+
+lex_string :: PsSpan -> StringBuffer -> Int -> String -> String -> P String
+lex_string span buf len startDelim endDelim = do
+ let s = go $ lexemeToString (offsetBytes (length startDelim) buf) numChars
+
+ -- Unfortunately, `go` is only performant if it's pure; allocations
+ -- and performance degrade when `go` is implemented in P or ST. So
+ -- we'll throw an impure exception and catch it here
+ unsafePerformIO $
+ (s `deepseq` pure (pure ())) `catch` \e -> do
+ let i0 = AI (psSpanStart span) buf
+ let (e', i) = resolveParseEscapeErr alexGetChar' i0 numChars e
+ pure $ setInput i >> lexError e'
+
+ pure s
+ where
+ -- the number of characters in the string
+ numChars = len - (length startDelim + length endDelim)
+
+ -- assumes string was lexed correctly
+ go = \case
+ [] -> ""
+ '\\' : '&' : cs -> go cs
+ '\\' : c : cs | is_space c -> go $ dropGap cs
+ '\\' : cs ->
+ case resolveEscapeCharacter cs of
+ Right (c, cs') -> c : go cs'
+ Left e -> throw e
+ c : cs -> c : go cs
+
+ dropGap = \case
+ '\\' : cs -> cs
+ _ : cs -> dropGap cs
+ [] -> panic "gap unexpectedly ended"
+
+
+tok_quoted_label :: Action
+tok_quoted_label span buf len _buf2 = do
+ s <- lex_string span buf len "#\"" "\""
(AI end bufEnd) <- getInput
let
token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2223,114 +2277,19 @@ lex_quoted_label span buf _len _buf2 = do
return $ L (mkPsSpan start end) token
-lex_string :: LexStringType -> P String
-lex_string strType = do
- start <- getInput
- (str, next) <- either fromStringLexError pure $ lexString strType alexGetChar' start
- setInput next
- pure str
-
-
-lex_char_tok :: Action
--- Here we are basically parsing character literals, such as 'x' or '\n'
--- but we additionally spot 'x and ''T, returning ITsimpleQuote and
--- ITtyQuote respectively, but WITHOUT CONSUMING the x or T part
--- (the parser does that).
--- So we have to do two characters of lookahead: when we see 'x we need to
--- see if there's a trailing quote
-lex_char_tok span buf _len _buf2 = do -- We've seen '
- i1 <- getInput -- Look ahead to first character
- let loc = psSpanStart span
- case alexGetChar' i1 of
- Nothing -> lit_error i1
-
- Just ('\'', i2@(AI end2 _)) -> do -- We've seen ''
- setInput i2
- return (L (mkPsSpan loc end2) ITtyQuote)
-
- Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash
- (lit_ch, i3) <-
- either fromStringLexError pure $
- resolveEscapeCharacter alexGetChar' i2
- case alexGetChar' i3 of
- Just ('\'', i4) -> do
- setInput i4
- finish_char_tok buf loc lit_ch
- Just (mc, _) | isSingleSmartQuote mc -> add_smart_quote_error mc end2
- _ -> lit_error i3
-
- Just (c, i2@(AI end2 _))
- | not (isAnyChar c) -> lit_error i1
- | otherwise ->
-
- -- We've seen 'x, where x is a valid character
- -- (i.e. not newline etc) but not a quote or backslash
- case alexGetChar' i2 of -- Look ahead one more character
- Just ('\'', i3) -> do -- We've seen 'x'
- setInput i3
- finish_char_tok buf loc c
- Just (c, _) | isSingleSmartQuote c -> add_smart_quote_error c end2
- _other -> do -- We've seen 'x not followed by quote
- -- (including the possibility of EOF)
- -- Just parse the quote only
- let (AI end _) = i1
- return (L (mkPsSpan loc end) ITsimpleQuote)
-
--- We've already seen the closing quote
--- Just need to check for trailing #
-finish_char_tok :: StringBuffer -> PsLoc -> Char -> P (PsLocated Token)
-finish_char_tok buf loc ch = do
- i <- getInput
- lex_magic_hash i >>= \case
- Just i' -> do
- setInput i'
- -- Include the trailing # in SourceText
- let (psSpan, src) = getStringLoc (buf, loc) i'
- pure $ L psSpan (ITprimchar src ch)
- Nothing -> do
- let (psSpan, src) = getStringLoc (buf, loc) i
- pure $ L psSpan (ITchar src ch)
-
-
--- | Get the span and source text for a string from the given start to the given end.
-getStringLoc :: (StringBuffer, PsLoc) -> AlexInput -> (PsSpan, SourceText)
-getStringLoc (bufStart, locStart) (AI locEnd bufEnd) = (psSpan, SourceText src)
+tok_char :: Action
+tok_char span buf len _buf2 = do
+ c <- lex_string span buf (if isMagicHash then len - 1 else len) "'" "'" >>= \case
+ [c] -> pure c
+ s -> panic $ "tok_char expected exactly one character, got: " ++ show s
+ pure . L span $
+ if isMagicHash
+ then ITprimchar src c
+ else ITchar src c
where
- psSpan = mkPsSpan locStart locEnd
- src = lexemeToFastString bufStart (cur bufEnd - cur bufStart)
-
-
--- Return Just if we found the magic hash, with the next input.
-lex_magic_hash :: AlexInput -> P (Maybe AlexInput)
-lex_magic_hash i = do
- magicHash <- getBit MagicHashBit
- if magicHash
- then
- case alexGetChar' i of
- Just ('#', i') -> pure (Just i')
- _other -> pure Nothing
- else pure Nothing
-
-fromStringLexError :: StringLexError AlexInput -> P a
-fromStringLexError = \case
- UnexpectedEOF i squote -> checkSQuote squote >> throw i LexStringCharLitEOF
- BadCharInitialLex i squote -> checkSQuote squote >> throw i LexStringCharLit
- EscapeBadChar i -> throw i LexStringCharLit
- EscapeUnexpectedEOF i -> throw i LexStringCharLitEOF
- EscapeNumRangeError i -> throw i LexNumEscapeRange
- EscapeSmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
- where
- throw i e = setInput i >> lexError e
- checkSQuote = \case
- NoSmartQuote -> pure ()
- SmartQuote c (AI loc _) -> add_nonfatal_smart_quote_error c loc
-
--- before calling lit_error, ensure that the current input is pointing to
--- the position of the error in the buffer. This is so that we can report
--- a correct location to the user, but also so we can detect UTF-8 decoding
--- errors if they occur.
-lit_error :: AlexInput -> P a
-lit_error i = do setInput i; lexError LexStringCharLit
+ src = SourceText $ lexemeToFastString buf len
+ isMagicHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
-- -----------------------------------------------------------------------------
-- QuasiQuote
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,10 +1,12 @@
{-# LANGUAGE LambdaCase #-}
module GHC.Parser.String (
- StringLexError (..),
- ContainsSmartQuote (..),
- LexStringType (..),
- lexString,
+ resolveEscapeCharacter,
+ ParseEscapeErr,
+ resolveParseEscapeErr,
+
+ -- * Multiline strings
+ postprocessMultiline,
-- * Unicode smart quote helpers
isDoubleSmartQuote,
@@ -12,15 +14,14 @@ module GHC.Parser.String (
-- * Other helpers
isAnyChar,
- resolveEscapeCharacter,
) where
import GHC.Prelude
import Control.Arrow ((>>>))
-import Control.Monad (guard, unless, when)
+import Control.Exception (Exception)
+import Control.Monad (when)
import Data.Char (chr, isPrint, ord)
-import Data.List (unfoldr)
import Data.Maybe (listToMaybe, mapMaybe)
import GHC.Parser.CharClass (
hexDigit,
@@ -31,254 +32,78 @@ import GHC.Parser.CharClass (
is_space,
octDecDigit,
)
+import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)
-data LexStringType = StringTypeSingle | StringTypeMulti
-
--- | State to accumulate while iterating through string literal.
---
--- Fields are strict here to avoid space leak when iterating through large string literal
--- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
-data LexStringState loc = LexStringState
- { stringAcc :: !String
- -- ^ The string seen so far, reversed
- , multilineCommonWsPrefix :: !Int
- -- ^ The common prefix for multiline strings. See Note [Multiline string literals]
- , initialLoc :: !loc
- -- ^ The location of the beginning of the string literal
- }
-
--- | Get the character at the given location, with the location
--- of the next character. Returns Nothing if at the end of the
--- input.
-type GetChar loc = loc -> Maybe (Char, loc)
-
-lexString :: LexStringType -> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
-lexString strType getChar initialLoc = go initialState initialLoc
- where
- initialState =
- LexStringState
- { stringAcc = ""
- , multilineCommonWsPrefix =
- case strType of
- StringTypeMulti -> maxBound
- _ -> 0
- , initialLoc = initialLoc
- }
-
- -- 's' is strict here to avoid space leak when iterating through large string literal
- -- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
- go !s loc0 =
- case getChar loc0 of
- -- found closing delimiter
- Just ('"', _) | Just loc1 <- checkDelimiter strType getChar loc0 -> do
- let postprocess =
- case strType of
- StringTypeSingle -> id
- StringTypeMulti -> postprocessMultiline (multilineCommonWsPrefix s)
- Right (postprocess . reverse $ stringAcc s, loc1)
-
- -- found backslash
- Just (c0@'\\', loc1) -> do
- case getChar loc1 of
- -- found '\&' character, which should be elided
- Just ('&', loc2) -> go s loc2
- -- found start of a string gap
- Just (c1, loc2) | is_space c1 -> collapseStringGap getChar s loc2 >>= go s
- -- some other escape character
- Just (c1, loc2) ->
- case strType of
- StringTypeSingle -> do
- (c', loc') <- resolveEscapeCharacter getChar loc1
- go (addChar c' s) loc'
- StringTypeMulti -> do
- -- keep escape characters unresolved until after post-processing,
- -- to distinguish between a user-newline and the user writing "\n".
- -- but still process the characters here, to find any errors
- _ <- resolveEscapeCharacter getChar loc1
- go (addChar c1 . addChar c0 $ s) loc2
- -- backslash at end of input
- Nothing -> Left $ BadCharInitialLex loc1 (hasSQuote getChar s)
-
- -- found newline character in multiline string
- Just (c0@'\n', loc1) | StringTypeMulti <- strType ->
- uncurry go $ parseLeadingWS getChar (addChar c0 s) loc1
-
- -- found some other character
- Just (c0, loc1) | isAnyChar c0 -> go (addChar c0 s) loc1
-
- -- found some unknown character
- Just (_, _) -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-
- -- reached EOF before finding end of string
- Nothing -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-{-# INLINE lexString #-}
-
-checkDelimiter :: LexStringType -> GetChar loc -> loc -> Maybe loc
-checkDelimiter strType getChar loc0 =
- case strType of
- StringTypeSingle -> do
- ('"', loc1) <- getChar loc0
- Just loc1
- StringTypeMulti -> do
- ('"', loc1) <- getChar loc0
- ('"', loc2) <- getChar loc1
- ('"', loc3) <- getChar loc2
- Just loc3
-{-# INLINE checkDelimiter #-}
-
--- | A helper for adding the given character to the lexed string.
-addChar :: Char -> LexStringState loc -> LexStringState loc
-addChar c s = s{stringAcc = c : stringAcc s}
-{-# INLINE addChar #-}
-
--- | Return whether the string we've parsed so far contains any smart quotes.
-hasSQuote :: GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
-hasSQuote getChar s
- | any isDoubleSmartQuote (stringAcc s)
- , (c, loc) : _ <- filter (isDoubleSmartQuote . fst) allChars =
- SmartQuote c loc
- | otherwise =
- NoSmartQuote
- where
- allChars = unfoldr getCharWithLoc (initialLoc s)
- getCharWithLoc loc =
- case getChar loc of
- Just (c, loc') -> Just ((c, loc), loc')
- Nothing -> Nothing
-{-# INLINE hasSQuote #-}
-
--- | After parsing a backslash and a space character, consume the rest of
--- the string gap and return the next location.
-collapseStringGap :: GetChar loc -> LexStringState loc -> loc -> Either (StringLexError loc) loc
-collapseStringGap getChar s = go
- where
- go loc0 =
- case getChar loc0 of
- Just ('\\', loc1) -> pure loc1
- Just (c0, loc1) | is_space c0 -> go loc1
- Just _ -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
- Nothing -> Left $ UnexpectedEOF loc0 (hasSQuote getChar s)
-{-# INLINE collapseStringGap #-}
+-- -----------------------------------------------------------------------------
+-- Escape characters
--- | See Note [Multiline string literals]
-parseLeadingWS :: GetChar loc -> LexStringState loc -> loc -> (LexStringState loc, loc)
-parseLeadingWS getChar = go 0
+-- Assumes escape character is valid
+resolveEscapeCharacter :: [Char] -> Either ParseEscapeErr (Char, [Char])
+resolveEscapeCharacter = \case
+ 'a' : cs -> pure ('\a', cs)
+ 'b' : cs -> pure ('\b', cs)
+ 'f' : cs -> pure ('\f', cs)
+ 'n' : cs -> pure ('\n', cs)
+ 'r' : cs -> pure ('\r', cs)
+ 't' : cs -> pure ('\t', cs)
+ 'v' : cs -> pure ('\v', cs)
+ '\\' : cs -> pure ('\\', cs)
+ '"' : cs -> pure ('\"', cs)
+ '\'' : cs -> pure ('\'', cs)
+ -- escape codes
+ 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
+ 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
+ c : cs | is_decdigit c -> parseNum is_decdigit 10 octDecDigit (c : cs)
+ -- control characters (e.g. '\^M')
+ '^' : c : cs -> pure (chr $ ord c - ord '@', cs)
+ -- long form escapes (e.g. '\NUL')
+ cs | Just (c, cs') <- parseLongEscape cs -> pure (c, cs')
+ -- shouldn't happen
+ c : _ -> panic $ "found unexpected escape character: " ++ show c
+ [] -> panic $ "escape character unexpectedly ended"
where
- go !col s loc =
- case getChar loc of
- Just (c@' ', loc') -> go (col + 1) (addChar c s) loc'
- -- expand tabs
- Just ('\t', loc') ->
- let fill = 8 - (col `mod` 8)
- s' = applyN fill (addChar ' ') s
- in go (col + fill) s' loc'
- -- if we see a newline or string delimiter, then this line only contained whitespace, so
- -- don't include it in the common whitespace prefix
- Just ('\n', _) -> (s, loc)
- Just ('"', _) | Just _ <- checkDelimiter StringTypeMulti getChar loc -> (s, loc)
- -- found some other character, so we're done parsing leading whitespace
- _ ->
- let s' = s{multilineCommonWsPrefix = min col (multilineCommonWsPrefix s)}
- in (s', loc)
-
- applyN :: Int -> (a -> a) -> a -> a
- applyN n f x0 = iterate f x0 !! n
-{-# INLINE parseLeadingWS #-}
-
-data StringLexError loc
- = UnexpectedEOF !loc !(ContainsSmartQuote loc)
- -- ^ Unexpectedly hit EOF when lexing string
- | BadCharInitialLex !loc !(ContainsSmartQuote loc)
- -- ^ Found invalid character when initially lexing string
- | EscapeBadChar !loc
- -- ^ Found invalid character when parsing an escaped character
- | EscapeUnexpectedEOF !loc
- -- ^ Unexpectedly hit EOF when parsing an escaped character
- | EscapeNumRangeError !loc
- -- ^ Escaped number exceeds range
- | EscapeSmartQuoteError !Char !loc
- -- ^ Found escaped smart unicode chars as `\’` or `\”`
+ parseNum isDigit base toDigit =
+ let go x = \case
+ c : cs | isDigit c -> do
+ let x' = x * base + toDigit c
+ when (x' > 0x10ffff) $ Left $ ParseEscapeErr (LexNumEscapeRange, length cs)
+ go x' cs
+ cs -> pure (chr x, cs)
+ in go 0
+
+newtype ParseEscapeErr =
+ ParseEscapeErr
+ ( LexErr
+ , Int -- where the error occurred, as the number of characters from the end. e.g. 0 = last character in string
+ )
deriving (Show)
--- | When initially lexing the string, we want to track if we've
--- seen a smart quote, to show a helpful "you might be accidentally
--- using a smart quote" error.
-data ContainsSmartQuote loc
- = NoSmartQuote
- | SmartQuote !Char !loc
- deriving (Show)
+instance Exception ParseEscapeErr
--- -----------------------------------------------------------------------------
--- Escape characters
-
--- | After finding a backslash, parse the rest of the escape character, starting
--- at the given location.
-resolveEscapeCharacter :: GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
-resolveEscapeCharacter getChar loc0 = do
- (c0, loc1) <- expectChar loc0
- case c0 of
- 'a' -> pure ('\a', loc1)
- 'b' -> pure ('\b', loc1)
- 'f' -> pure ('\f', loc1)
- 'n' -> pure ('\n', loc1)
- 'r' -> pure ('\r', loc1)
- 't' -> pure ('\t', loc1)
- 'v' -> pure ('\v', loc1)
- '\\' -> pure ('\\', loc1)
- '"' -> pure ('\"', loc1)
- '\'' -> pure ('\'', loc1)
- -- escape codes
- 'x' -> expectNum is_hexdigit 16 hexDigit loc1
- 'o' -> expectNum is_octdigit 8 octDecDigit loc1
- _ | is_decdigit c0 -> expectNum is_decdigit 10 octDecDigit loc0
- -- control characters (e.g. '\^M')
- '^' -> do
- (c1, loc2) <- expectChar loc1
- unless ('@' <= c1 && c1 <= '_') $ Left $ EscapeBadChar loc1
- pure (chr $ ord c1 - ord '@', loc2)
- -- long form escapes (e.g. '\NUL')
- _ | Just (c1, loc2) <- parseLongEscape getChar c0 loc1 -> pure (c1, loc2)
- -- check unicode smart quotes (#21843)
- _ | isDoubleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
- _ | isSingleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
- -- unknown escape
- _ -> Left $ EscapeBadChar loc0
+-- | Get the LexErr and location of the error, given the location of the initial
+-- string delimiter, a function to get the next location, and the total length of
+-- the string.
+resolveParseEscapeErr :: (loc -> Maybe (Char, loc)) -> loc -> Int -> ParseEscapeErr -> (LexErr, loc)
+resolveParseEscapeErr getChar loc len (ParseEscapeErr (e, indexFromEnd)) = (e, loc')
where
- expectChar loc =
- case getChar loc of
- Just x -> pure x
- Nothing -> Left $ EscapeUnexpectedEOF loc
-
- expectNum isDigit base toDigit loc1 = do
- (c1, loc2) <- expectChar loc1
- unless (isDigit c1) $ Left $ EscapeBadChar loc1
- let parseNum x loc =
- case getChar loc of
- Just (c, loc') | isDigit c -> do
- let x' = x * base + toDigit c
- when (x' > 0x10ffff) $ Left $ EscapeNumRangeError loc
- parseNum x' loc'
- _ ->
- pure (chr x, loc)
- parseNum (toDigit c1) loc2
-{-# INLINE resolveEscapeCharacter #-}
-
-parseLongEscape :: GetChar loc -> Char -> loc -> Maybe (Char, loc)
-parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCodes
+ -- the index of the error, where 0 is the first character after the initial string delimiter
+ index = len - indexFromEnd - 1
+
+ -- the 'loc' corresponding to 'index'
+ loc' = iterate getNextLoc loc !! (index + 1)
+ getNextLoc l =
+ case getChar l of
+ Just (_, l') -> l'
+ Nothing -> panic "Unexpectedly reached EOF when resolving ParseEscapeErr"
+
+parseLongEscape :: [Char] -> Maybe (Char, [Char])
+parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
where
- tryParse (prefix, c) = do
- p0 : p <- pure prefix
- guard (p0 == c0) -- see if the first character matches
- loc <- parsePrefix loc1 p -- see if the rest of the prefix matches
- pure (c, loc)
-
- parsePrefix loc = \case
- [] -> pure loc
- p : ps -> do
- (c, loc') <- getChar loc
- guard (p == c)
- parsePrefix loc' ps
+ tryParse (code, c) =
+ case splitAt (length code) cs of
+ (pre, cs') | pre == code -> Just (c, cs')
+ _ -> Nothing
longEscapeCodes =
[ ("NUL", '\NUL')
@@ -316,7 +141,6 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
, ("SP", '\SP')
, ("DEL", '\DEL')
]
-{-# INLINE parseLongEscape #-}
-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)
@@ -333,54 +157,8 @@ isSingleSmartQuote = \case
'’' -> True
_ -> False
-{-
-Note [Multiline string literals]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Multiline string literals were added following the acceptance of the
-proposal: https://github.com/ghc-proposals/ghc-proposals/pull/569
-
-Multiline string literals are syntax sugar for normal string literals,
-with an extra post processing step. This all happens in the Lexer; that
-is, HsMultilineString will contain the post-processed string. This matches
-the same behavior as HsString, which contains the normalized string
-(see Note [Literal source text]).
-
-The canonical steps for post processing a multiline string are:
-1. Collapse string gaps
-2. Split the string by newlines
-3. Convert leading tabs into spaces
- * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
-4. Remove common whitespace prefix in every line (see below)
-5. If a line contains only whitespace, remove all of the whitespace
-6. Join the string back with `\n` delimiters
-7. If the first character of the string is a newline, remove it
-8. Interpret escaped characters
-
-However, for performance reasons, we do as much of this in one pass as possible:
-1. As we lex the string, do the following steps as they appear:
- a. Collapse string gaps
- b. Keep track of the common whitespace prefix so far
- c. Validate escaped characters
-2. At the very end, post process the lexed string:
- a. Remove the common whitespace prefix from every line
- b. Remove all whitespace from all-whitespace lines
- c. Remove initial newline character
- d. Resolve escaped characters
-
-The common whitespace prefix can be informally defined as "The longest
-prefix of whitespace shared by all lines in the string, excluding the
-first line and any whitespace-only lines".
-
-It's more precisely defined with the following algorithm:
-
-1. Take a list representing the lines in the string
-2. Ignore the following elements in the list:
- * The first line (we want to ignore everything before the first newline)
- * Empty lines
- * Lines with only whitespace characters
-3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
--}
+-- -----------------------------------------------------------------------------
+-- Multiline strings
-- | See Note [Multiline string literals]
postprocessMultiline :: Int -> String -> String
@@ -436,16 +214,59 @@ postprocessMultiline commonWSPrefix =
resolveEscapeChars = \case
[] -> []
'\\' : s ->
- -- concretizing 'loc' to String:
- -- resolveEscapeCharacter :: (String -> Maybe (Char, String)) -> String -> Either _ (Char, String)
- case resolveEscapeCharacter uncons s of
+ case resolveEscapeCharacter s of
Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
Right (c, s') -> c : resolveEscapeChars s'
c : s -> c : resolveEscapeChars s
- uncons = \case
- c : cs -> Just (c, cs)
- [] -> Nothing
+{-
+Note [Multiline string literals]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Multiline string literals were added following the acceptance of the
+proposal: https://github.com/ghc-proposals/ghc-proposals/pull/569
+
+Multiline string literals are syntax sugar for normal string literals,
+with an extra post processing step. This all happens in the Lexer; that
+is, HsMultilineString will contain the post-processed string. This matches
+the same behavior as HsString, which contains the normalized string
+(see Note [Literal source text]).
+
+The canonical steps for post processing a multiline string are:
+1. Collapse string gaps
+2. Split the string by newlines
+3. Convert leading tabs into spaces
+ * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
+4. Remove common whitespace prefix in every line (see below)
+5. If a line contains only whitespace, remove all of the whitespace
+6. Join the string back with `\n` delimiters
+7. If the first character of the string is a newline, remove it
+8. Interpret escaped characters
+
+However, for performance reasons, we do as much of this in one pass as possible:
+1. As we lex the string, do the following steps as they appear:
+ a. Collapse string gaps
+ b. Keep track of the common whitespace prefix so far
+ c. Validate escaped characters
+2. At the very end, post process the lexed string:
+ a. Remove the common whitespace prefix from every line
+ b. Remove all whitespace from all-whitespace lines
+ c. Remove initial newline character
+ d. Resolve escaped characters
+
+The common whitespace prefix can be informally defined as "The longest
+prefix of whitespace shared by all lines in the string, excluding the
+first line and any whitespace-only lines".
+
+It's more precisely defined with the following algorithm:
+
+1. Take a list representing the lines in the string
+2. Ignore the following elements in the list:
+ * The first line (we want to ignore everything before the first newline)
+ * Empty lines
+ * Lines with only whitespace characters
+3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
+-}
-- -----------------------------------------------------------------------------
-- Helpers
=====================================
testsuite/tests/parser/should_fail/T3751.stderr
=====================================
@@ -1,3 +1,2 @@
+T3751.hs:3:5: error: [GHC-21231] lexical error at character '\167'
-T3751.hs:3:7: error: [GHC-21231]
- lexical error in string/character literal at character '\167'
=====================================
testsuite/tests/parser/should_fail/T5425.stderr
=====================================
@@ -1,3 +1,2 @@
+T5425.hs:4:1: error: [GHC-21231] lexical error at character '\955'
-T5425.hs:5:2: error: [GHC-21231]
- lexical error in string/character literal at character '\955'
=====================================
testsuite/tests/parser/should_fail/readFail002.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail002.hs:5:5: error: [GHC-21231]
+ lexical error at character '\n'
-readFail002.hs:5:6: error: [GHC-21231]
- lexical error in string/character literal at character '\n'
=====================================
testsuite/tests/parser/should_fail/readFail004.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail004.hs:17:16: error: [GHC-21231]
+ lexical error at character '.'
-readFail004.hs:19:1: error: [GHC-21231]
- lexical error in string/character literal at character '.'
=====================================
testsuite/tests/parser/should_fail/readFail033.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail033.hs:2:5: error: [GHC-21231]
+ lexical error at character '\t'
-readFail033.hs:2:6: error: [GHC-21231]
- lexical error in string/character literal at character '\t'
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/91293f1e8865b6f3ed154bd2afb61964b1be141a...de6a3a26e9ba52104e8454e22dc59852d8ab32d4
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/91293f1e8865b6f3ed154bd2afb61964b1be141a...de6a3a26e9ba52104e8454e22dc59852d8ab32d4
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240811/f700d45e/attachment-0001.html>
More information about the ghc-commits
mailing list