[Git][ghc/ghc][wip/strings] 2 commits: Replace manual string lexing

Brandon Chinn (@brandonchinn178) gitlab at gitlab.haskell.org
Sun Aug 25 23:42:54 UTC 2024



Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
84f3dbbd by Brandon Chinn at 2024-08-25T16:42:36-07:00
Replace manual string lexing

- - - - -
c477484f by Brandon Chinn at 2024-08-25T16:42:36-07:00
Update tests for new lexing error messages

- - - - -


23 changed files:

- compiler/GHC/Parser.y
- compiler/GHC/Parser/Errors/Ppr.hs
- compiler/GHC/Parser/Errors/Types.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- testsuite/tests/ghci/prog013/prog013.stderr
- testsuite/tests/ghci/scripts/ghci022.stderr
- testsuite/tests/parser/should_fail/MultilineStringsSmartQuotes.stderr
- testsuite/tests/parser/should_fail/T21843c.stderr
- testsuite/tests/parser/should_fail/T21843e.stderr
- testsuite/tests/parser/should_fail/T21843f.stderr
- testsuite/tests/parser/should_fail/T3751.stderr
- testsuite/tests/parser/should_fail/T5425.stderr
- testsuite/tests/parser/should_fail/readFail002.stderr
- testsuite/tests/parser/should_fail/readFail004.stderr
- testsuite/tests/parser/should_fail/readFail005.stderr
- testsuite/tests/parser/should_fail/readFail033.stderr
- testsuite/tests/parser/unicode/utf8_010.stderr
- testsuite/tests/parser/unicode/utf8_011.stderr
- testsuite/tests/parser/unicode/utf8_020.stderr
- testsuite/tests/parser/unicode/utf8_021.stderr
- testsuite/tests/parser/unicode/utf8_022.stderr
- utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs


Changes:

=====================================
compiler/GHC/Parser.y
=====================================
@@ -728,7 +728,7 @@ are the most common patterns, rewritten as regular expressions for clarity:
 
  CHAR           { L _ (ITchar   _ _) }
  STRING         { L _ (ITstring _ _) }
- MULTILINESTRING { L _ (ITmultilinestring _ _) }
+ STRING_MULTI   { L _ (ITstring_multi _ _) }
  INTEGER        { L _ (ITinteger _) }
  RATIONAL       { L _ (ITrational _) }
 
@@ -2357,8 +2357,8 @@ atype :: { LHsType GhcPs }
                                                                         (getCHAR $1) }
         | STRING               { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGs $1)
                                                                      (getSTRING  $1) }
-        | MULTILINESTRING      { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getMULTILINESTRINGs $1)
-                                                                     (getMULTILINESTRING  $1) }
+        | STRING_MULTI         { sLLa $1 $> $ HsTyLit noExtField $ HsStrTy (getSTRINGMULTIs $1)
+                                                                     (getSTRINGMULTI  $1) }
         | '_'                  { sL1a $1 $ mkAnonWildCardTy }
         -- Type variables are never exported, so `M.tyvar` will be rejected by the renamer.
         -- We let it pass the parser because the renamer can generate a better error message.
@@ -4047,8 +4047,8 @@ literal :: { Located (HsLit GhcPs) }
         : CHAR              { sL1 $1 $ HsChar       (getCHARs $1) $ getCHAR $1 }
         | STRING            { sL1 $1 $ HsString     (getSTRINGs $1)
                                                     $ getSTRING $1 }
-        | MULTILINESTRING   { sL1 $1 $ HsMultilineString (getMULTILINESTRINGs $1)
-                                                    $ getMULTILINESTRING $1 }
+        | STRING_MULTI      { sL1 $1 $ HsMultilineString (getSTRINGMULTIs $1)
+                                                    $ getSTRINGMULTI $1 }
         | PRIMINTEGER       { sL1 $1 $ HsIntPrim    (getPRIMINTEGERs $1)
                                                     $ getPRIMINTEGER $1 }
         | PRIMWORD          { sL1 $1 $ HsWordPrim   (getPRIMWORDs $1)
@@ -4154,7 +4154,7 @@ getIPDUPVARID     (L _ (ITdupipvarid   x)) = x
 getLABELVARID     (L _ (ITlabelvarid _ x)) = x
 getCHAR           (L _ (ITchar   _ x)) = x
 getSTRING         (L _ (ITstring _ x)) = x
-getMULTILINESTRING (L _ (ITmultilinestring _ x)) = x
+getSTRINGMULTI    (L _ (ITstring_multi _ x)) = x
 getINTEGER        (L _ (ITinteger x))  = x
 getRATIONAL       (L _ (ITrational x)) = x
 getPRIMCHAR       (L _ (ITprimchar _ x)) = x
@@ -4180,7 +4180,7 @@ getVOCURLY        (L (RealSrcSpan l _) ITvocurly) = srcSpanStartCol l
 getINTEGERs       (L _ (ITinteger (IL src _ _))) = src
 getCHARs          (L _ (ITchar       src _)) = src
 getSTRINGs        (L _ (ITstring     src _)) = src
-getMULTILINESTRINGs (L _ (ITmultilinestring src _)) = src
+getSTRINGMULTIs   (L _ (ITstring_multi src _)) = src
 getPRIMCHARs      (L _ (ITprimchar   src _)) = src
 getPRIMSTRINGs    (L _ (ITprimstring src _)) = src
 getPRIMINTEGERs   (L _ (ITprimint    src _)) = src


=====================================
compiler/GHC/Parser/Errors/Ppr.hs
=====================================
@@ -157,8 +157,6 @@ instance Diagnostic PsMessage where
               LexUnknownPragma       -> text "unknown pragma"
               LexErrorInPragma       -> text "lexical error in pragma"
               LexNumEscapeRange      -> text "numeric escape sequence out of range"
-              LexStringCharLit       -> text "lexical error in string/character literal"
-              LexStringCharLitEOF    -> text "unexpected end-of-file in string/character literal"
               LexUnterminatedComment -> text "unterminated `{-'"
               LexUnterminatedOptions -> text "unterminated OPTIONS pragma"
               LexUnterminatedQQ      -> text "unterminated quasiquotation"


=====================================
compiler/GHC/Parser/Errors/Types.hs
=====================================
@@ -591,8 +591,6 @@ data LexErr
    | LexUnknownPragma       -- ^ Unknown pragma
    | LexErrorInPragma       -- ^ Lexical error in pragma
    | LexNumEscapeRange      -- ^ Numeric escape sequence out of range
-   | LexStringCharLit       -- ^ Lexical error in string/character literal
-   | LexStringCharLitEOF    -- ^ Unexpected end-of-file in string/character literal
    | LexUnterminatedComment -- ^ Unterminated `{-'
    | LexUnterminatedOptions -- ^ Unterminated OPTIONS pragma
    | LexUnterminatedQQ      -- ^ Unterminated quasiquotation


=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -167,6 +167,7 @@ $idchar    = [$small $large $digit $uniidchar \']
 
 $unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
 $graphic   = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
+$charesc   = [a b f n r t v \\ \" \' \&]
 
 $binit     = 0-1
 $octit     = 0-7
@@ -213,6 +214,20 @@ $docsym    = [\| \^ \* \$]
 @floating_point = @numspc @decimal \. @decimal @exponent? | @numspc @decimal @exponent
 @hex_floating_point = @numspc @hexadecimal \. @hexadecimal @bin_exponent? | @numspc @hexadecimal @bin_exponent
 
+ at gap = \\ $whitechar+ \\
+ at cntrl = $asclarge | \@ | \[ | \\ | \] | \^ | \_
+ at ascii = \^ @cntrl | "NUL" | "SOH" | "STX" | "ETX" | "EOT" | "ENQ" | "ACK"
+       | "BEL" | "BS" | "HT" | "LF" | "VT" | "FF" | "CR" | "SO" | "SI" | "DLE"
+       | "DC1" | "DC2" | "DC3" | "DC4" | "NAK" | "SYN" | "ETB" | "CAN"
+       | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
+-- N.B. ideally, we would do `@escape # \\ \&` instead of duplicating in @escapechar,
+-- which is what the Haskell Report says, but this isn't valid Alex syntax, as only
+-- character sets can be subtracted, not strings
+ at escape     = \\ ( $charesc      | @ascii | @decimal | o @octal | x @hexadecimal )
+ at escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
+ at stringchar = ($graphic # [\\ \"]) | " " | @escape     | @gap
+ at char       = ($graphic # [\\ \']) | " " | @escapechar
+
 -- normal signed numerical literals can only be explicitly negative,
 -- not explicitly positive (contrast @exponent)
 @negative = \-
@@ -460,7 +475,7 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
 
 <0> {
   "#" $idchar+ / { ifExtension OverloadedLabelsBit } { skip_one_varid_src ITlabelvarid }
-  "#" \" / { ifExtension OverloadedLabelsBit } { lex_quoted_label }
+  "#" \" @stringchar* \" / { ifExtension OverloadedLabelsBit } { tok_quoted_label }
 }
 
 <0> {
@@ -660,14 +675,44 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
 
 }
 
--- Strings and chars are lexed by hand-written code.  The reason is
--- that even if we recognise the string or char here in the regex
--- lexer, we would still have to parse the string afterward in order
--- to convert it to a String.
 <0> {
-  \'                            { lex_char_tok }
-  \"\"\" / { ifExtension MultilineStringsBit} { lex_string_tok StringTypeMulti }
-  \"                            { lex_string_tok StringTypeSingle }
+  \"\"\" / { ifExtension MultilineStringsBit }         { tok_string_multi }
+  \" @stringchar* \"                                   { tok_string }
+  \" @stringchar* \" \# / { ifExtension MagicHashBit } { tok_string }
+  \' @char        \'                                   { tok_char }
+  \' @char        \' \# / { ifExtension MagicHashBit } { tok_char }
+
+  -- Check for smart quotes and throw better errors than a plain lexical error (#21843)
+  \'              \\ $unigraphic / { isSmartQuote } { smart_quote_error }
+  \" @stringchar* \\ $unigraphic / { isSmartQuote } { smart_quote_error }
+  -- See Note [Bare smart quote error]
+  -- The valid string rule will take precedence because it'll match more
+  -- characters than this rule, so this rule will only fire if the string
+  -- could not be lexed correctly
+  \" @stringchar*    $unigraphic / { isSmartQuote } { smart_quote_error }
+}
+
+<string_multi_bol> {
+  ([\  $tab] | @gap)* { tok_string_multi_content }
+}
+
+<string_multi_content> {
+  @stringchar*               { tok_string_multi_content }
+  $nl                        { tok_string_multi_content }
+  -- allow bare quotes if it's not a triple quote
+  -- N.B. we need to explicitly check for \n in the right context because
+  -- the character set [^...] doesn't include newlines
+  (\" | \"\") / (\n | [^\"]) { tok_string_multi_content }
+}
+
+<0> {
+  \'\' / [^ $whitechar \\] { token ITtyQuote }
+
+  -- the normal character match takes precedence over this because
+  -- it matches more characters. if that pattern didn't match, then
+  -- this quote is a quoted identifier, like 'x. Here, just return
+  -- ITsimpleQuote, as the parser will lex the varid separately.
+  \'   / [^ $whitechar \\] { token ITsimpleQuote }
 }
 
 -- Note [Whitespace-sensitive operator parsing]
@@ -953,7 +998,7 @@ data Token
 
   | ITchar     SourceText Char       -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITstring   SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
-  | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+  | ITstring_multi SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITinteger  IntegralLit           -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITrational FractionalLit
 
@@ -2181,156 +2226,136 @@ lex_string_prag_comment mkTok span _buf _len _buf2
 -- -----------------------------------------------------------------------------
 -- Strings & Chars
 
--- This stuff is horrible.  I hates it.
-
-lex_string_tok :: LexStringType -> Action
-lex_string_tok strType span buf _len _buf2 = do
-  s <- lex_string strType
-
-  i <- getInput
-  case strType of
-    StringTypeSingle ->
-      lex_magic_hash i >>= \case
-        Just i' -> do
-          when (any (> '\xFF') s) $ do
-            pState <- getPState
-            let msg = PsErrPrimStringInvalidChar
-            let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
-            addError err
-
-          setInput i'
-          let (psSpan, src) = getStringLoc (buf, locStart) i'
-          pure $ L psSpan (ITprimstring src (unsafeMkByteString s))
-        Nothing -> do
-          let (psSpan, src) = getStringLoc (buf, locStart) i
-          pure $ L psSpan (ITstring src (mkFastString s))
-    StringTypeMulti -> do
-      let (psSpan, src) = getStringLoc (buf, locStart) i
-      pure $ L psSpan (ITmultilinestring src (mkFastString s))
+tok_string :: Action
+tok_string span buf len _buf2 = do
+  s <- lex_chars ("\"", "\"") span buf (if endsInHash then len - 1 else len)
+
+  if endsInHash
+    then do
+      when (any (> '\xFF') s) $ do
+        pState <- getPState
+        let msg = PsErrPrimStringInvalidChar
+        let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
+        addError err
+      pure $ L span (ITprimstring src (unsafeMkByteString s))
+    else
+      pure $ L span (ITstring src (mkFastString s))
+  where
+    src = SourceText $ lexemeToFastString buf len
+    endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
+-- | Ideally, we would define this completely with Alex syntax, like normal strings.
+-- Instead, this is defined as a hybrid solution by manually invoking lex states, which
+-- we're doing for two reasons:
+--   1. The multiline string should all be one lexical token, not multiple
+--   2. We need to allow bare quotes, which can't be done with one regex
+tok_string_multi :: Action
+tok_string_multi startSpan startBuf _len _buf2 = do
+  -- advance to the end of the multiline string
+  let startLoc = psSpanStart startSpan
+  let i@(AI _ contentStartBuf) =
+        case lexDelim $ AI startLoc startBuf of
+          Just i -> i
+          Nothing -> panic "tok_string_multi did not start with a delimiter"
+  (AI _ contentEndBuf, i'@(AI endLoc endBuf)) <- goContent i
+
+  -- build the values pertaining to the entire multiline string, including delimiters
+  let span = mkPsSpan startLoc endLoc
+  let len = byteDiff startBuf endBuf
+  let src = SourceText $ lexemeToFastString startBuf len
+
+  -- load the content of the multiline string
+  let contentLen = byteDiff contentStartBuf contentEndBuf
+  s <-
+    either (throwStringLexError (AI startLoc startBuf)) pure $
+      lexMultilineString contentLen contentStartBuf
+
+  setInput i'
+  pure $ L span $ ITstring_multi src (mkFastString s)
+  where
+    goContent i0 =
+      case alexScan i0 string_multi_content of
+        AlexToken i1 len _
+          | Just i2 <- lexDelim i1   -> pure (i1, i2)
+          | Just i2 <- lexNewline i1 -> goBOL i2
+          | isEOF i1                 -> checkSmartQuotes >> lexError LexError
+          | len == 0                 -> panic $ "parsing multiline string got into infinite loop at: " ++ show i0
+          | otherwise                -> goContent i1
+        AlexSkip i1 _ -> goContent i1
+        _ -> lexError LexError
+
+    goBOL i0 =
+      case alexScan i0 string_multi_bol of
+        AlexToken i1 _ _ -> goContent i1
+        AlexSkip i1 _ -> goBOL i1
+        _ -> lexError LexError
+
+    lexNewline (AI loc buf) =
+      case nextChar buf of
+        (c@'\n', buf') -> Just (AI (advancePsLoc loc c) buf')
+        _ -> Nothing
+
+    lexDelim =
+      let go 0 i = Just i
+          go n (AI loc buf) =
+            case nextChar buf of
+              (c@'"', buf') -> go (n - 1) (AI (advancePsLoc loc c) buf')
+              _ -> Nothing
+       in go (3 :: Int)
+
+    -- See Note [Bare smart quote error]
+    checkSmartQuotes = do
+      let findSmartQuote i0@(AI loc _) =
+            case alexGetChar' i0 of
+              Just ('\\', i1) | Just (_, i2) <- alexGetChar' i1 -> findSmartQuote i2
+              Just (c, i1)
+                | isDoubleSmartQuote c -> Just (c, loc)
+                | otherwise -> findSmartQuote i1
+              _ -> Nothing
+      case findSmartQuote (AI (psSpanStart startSpan) startBuf) of
+        Just (c, loc) -> throwSmartQuoteError c loc
+        Nothing -> pure ()
+
+-- | Dummy action that should never be called. Should only be used in lex states
+-- that are manually lexed in tok_string_multi.
+tok_string_multi_content :: Action
+tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
+
+lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+lex_chars (startDelim, endDelim) span buf len =
+  either (throwStringLexError i0) pure $
+    lexString contentLen contentBuf
   where
-    locStart = psSpanStart span
+    i0@(AI _ contentBuf) = advanceInputBytes (length startDelim) $ AI (psSpanStart span) buf
 
+    -- assumes delimiters are ASCII, with 1 byte per Char
+    contentLen = len - length startDelim - length endDelim
 
-lex_quoted_label :: Action
-lex_quoted_label span buf _len _buf2 = do
-  s <- lex_string StringTypeSingle
-  (AI end bufEnd) <- getInput
-  let
-    token = ITlabelvarid (SourceText src) (mkFastString s)
-    src = lexemeToFastString (stepOn buf) (cur bufEnd - cur buf - 1)
-    start = psSpanStart span
-
-  return $ L (mkPsSpan start end) token
-
-
-lex_string :: LexStringType -> P String
-lex_string strType = do
-  start <- getInput
-  (str, next) <- either fromStringLexError pure $ lexString strType alexGetChar' start
-  setInput next
-  pure str
-
-
-lex_char_tok :: Action
--- Here we are basically parsing character literals, such as 'x' or '\n'
--- but we additionally spot 'x and ''T, returning ITsimpleQuote and
--- ITtyQuote respectively, but WITHOUT CONSUMING the x or T part
--- (the parser does that).
--- So we have to do two characters of lookahead: when we see 'x we need to
--- see if there's a trailing quote
-lex_char_tok span buf _len _buf2 = do        -- We've seen '
-   i1 <- getInput       -- Look ahead to first character
-   let loc = psSpanStart span
-   case alexGetChar' i1 of
-        Nothing -> lit_error  i1
-
-        Just ('\'', i2@(AI end2 _)) -> do       -- We've seen ''
-                   setInput i2
-                   return (L (mkPsSpan loc end2)  ITtyQuote)
-
-        Just ('\\', i2@(AI end2 _)) -> do      -- We've seen 'backslash
-                  (lit_ch, i3) <-
-                    either fromStringLexError pure $
-                      resolveEscapeCharacter alexGetChar' i2
-                  case alexGetChar' i3 of
-                    Just ('\'', i4) -> do
-                      setInput i4
-                      finish_char_tok buf loc lit_ch
-                    Just (mc, _) | isSingleSmartQuote mc -> add_smart_quote_error mc end2
-                    _ -> lit_error i3
-
-        Just (c, i2@(AI end2 _))
-                | not (isAnyChar c) -> lit_error i1
-                | otherwise ->
-
-                -- We've seen 'x, where x is a valid character
-                --  (i.e. not newline etc) but not a quote or backslash
-           case alexGetChar' i2 of      -- Look ahead one more character
-                Just ('\'', i3) -> do   -- We've seen 'x'
-                        setInput i3
-                        finish_char_tok buf loc c
-                Just (c, _) | isSingleSmartQuote c -> add_smart_quote_error c end2
-                _other -> do            -- We've seen 'x not followed by quote
-                                        -- (including the possibility of EOF)
-                                        -- Just parse the quote only
-                        let (AI end _) = i1
-                        return (L (mkPsSpan loc end) ITsimpleQuote)
-
--- We've already seen the closing quote
--- Just need to check for trailing #
-finish_char_tok :: StringBuffer -> PsLoc -> Char -> P (PsLocated Token)
-finish_char_tok buf loc ch = do
-  i <- getInput
-  lex_magic_hash i >>= \case
-    Just i' -> do
-      setInput i'
-      -- Include the trailing # in SourceText
-      let (psSpan, src) = getStringLoc (buf, loc) i'
-      pure $ L psSpan (ITprimchar src ch)
-    Nothing -> do
-      let (psSpan, src) = getStringLoc (buf, loc) i
-      pure $ L psSpan (ITchar src ch)
+throwStringLexError :: AlexInput -> StringLexError -> P a
+throwStringLexError i (StringLexError e pos) = setInput (advanceInputTo pos i) >> lexError e
 
 
--- | Get the span and source text for a string from the given start to the given end.
-getStringLoc :: (StringBuffer, PsLoc) -> AlexInput -> (PsSpan, SourceText)
-getStringLoc (bufStart, locStart) (AI locEnd bufEnd) = (psSpan, SourceText src)
+tok_quoted_label :: Action
+tok_quoted_label span buf len _buf2 = do
+  s <- lex_chars ("#\"", "\"") span buf len
+  pure $ L span (ITlabelvarid src (mkFastString s))
   where
-    psSpan = mkPsSpan locStart locEnd
-    src = lexemeToFastString bufStart (cur bufEnd - cur bufStart)
-
-
--- Return Just if we found the magic hash, with the next input.
-lex_magic_hash :: AlexInput -> P (Maybe AlexInput)
-lex_magic_hash i = do
-  magicHash <- getBit MagicHashBit
-  if magicHash
-    then
-      case alexGetChar' i of
-        Just ('#', i') -> pure (Just i')
-        _other -> pure Nothing
-    else pure Nothing
-
-fromStringLexError :: StringLexError AlexInput -> P a
-fromStringLexError = \case
-  UnexpectedEOF i squote -> checkSQuote squote >> throw i LexStringCharLitEOF
-  BadCharInitialLex i squote -> checkSQuote squote >> throw i LexStringCharLit
-  EscapeBadChar i -> throw i LexStringCharLit
-  EscapeUnexpectedEOF i -> throw i LexStringCharLitEOF
-  EscapeNumRangeError i -> throw i LexNumEscapeRange
-  EscapeSmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
+    src = SourceText $ lexemeToFastString buf len
+
+
+tok_char :: Action
+tok_char span buf len _buf2 = do
+  c <- lex_chars ("'", "'") span buf (if endsInHash then len - 1 else len) >>= \case
+    [c] -> pure c
+    s -> panic $ "tok_char expected exactly one character, got: " ++ show s
+  pure . L span $
+    if endsInHash
+      then ITprimchar src c
+      else ITchar src c
   where
-    throw i e = setInput i >> lexError e
-    checkSQuote = \case
-      NoSmartQuote -> pure ()
-      SmartQuote c (AI loc _) -> add_nonfatal_smart_quote_error c loc
-
--- before calling lit_error, ensure that the current input is pointing to
--- the position of the error in the buffer.  This is so that we can report
--- a correct location to the user, but also so we can detect UTF-8 decoding
--- errors if they occur.
-lit_error :: AlexInput -> P a
-lit_error i = do setInput i; lexError LexStringCharLit
+    src = SourceText $ lexemeToFastString buf len
+    endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
+
 
 -- -----------------------------------------------------------------------------
 -- QuasiQuote
@@ -2389,32 +2414,28 @@ quasiquote_error start = do
 isSmartQuote :: AlexAccPred ExtsBitmap
 isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c
 
-smart_quote_error_message :: Char -> PsLoc -> MsgEnvelope PsMessage
-smart_quote_error_message c loc =
-  let (correct_char, correct_char_name) =
-         if isSingleSmartQuote c then ('\'', "Single Quote") else ('"', "Quotation Mark")
-      err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
-              PsErrUnicodeCharLooksLike c correct_char correct_char_name in
-    err
-
+throwSmartQuoteError :: Char -> PsLoc -> P a
+throwSmartQuoteError c loc = addFatalError err
+  where
+    err =
+      mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
+        PsErrUnicodeCharLooksLike c correct_char correct_char_name
+    (correct_char, correct_char_name) =
+      if isSingleSmartQuote c
+        then ('\'', "Single Quote")
+        else ('"', "Quotation Mark")
+
+-- | Throw a smart quote error, where the smart quote was the last character lexed
 smart_quote_error :: Action
-smart_quote_error span buf _len _buf2 = do
-  let c = currentChar buf
-  addFatalError (smart_quote_error_message c (psSpanStart span))
-
-add_smart_quote_error :: Char -> PsLoc -> P a
-add_smart_quote_error c loc = addFatalError (smart_quote_error_message c loc)
+smart_quote_error span _ _ buf2 = do
+  let c = prevChar buf2 (panic "smart_quote_error unexpectedly called on beginning of input")
+  throwSmartQuoteError c (psSpanStart span)
 
-add_nonfatal_smart_quote_error :: Char -> PsLoc -> P ()
-add_nonfatal_smart_quote_error c loc = addError (smart_quote_error_message c loc)
-
-advance_to_smart_quote_character :: P ()
-advance_to_smart_quote_character  = do
-  i <- getInput
-  case alexGetChar' i of
-    Just (c, _) | isDoubleSmartQuote c -> return ()
-    Just (_, i2) -> do setInput i2; advance_to_smart_quote_character
-    Nothing -> return () -- should never get here
+-- Note [Bare smart quote error]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- A smart quote inside of a string is allowed, but if a complete valid string
+-- couldn't be lexed, we want to see if there's a smart quote that the user
+-- thought ended the string, but in fact didn't.
 
 -- -----------------------------------------------------------------------------
 -- Warnings
@@ -2652,7 +2673,7 @@ getLastLocIncludingComments = P $ \s@(PState { prev_loc = prev_loc }) -> POk s p
 getLastLoc :: P PsSpan
 getLastLoc = P $ \s@(PState { last_loc = last_loc }) -> POk s last_loc
 
-data AlexInput = AI !PsLoc !StringBuffer
+data AlexInput = AI !PsLoc !StringBuffer deriving (Show)
 
 {-
 Note [Unicode in Alex]
@@ -2763,6 +2784,19 @@ alexGetChar' (AI loc s)
   where (c,s') = nextChar s
         loc'   = advancePsLoc loc c
 
+-- | Advance the given input N bytes.
+advanceInputBytes :: Int -> AlexInput -> AlexInput
+advanceInputBytes n i0@(AI _ buf0) = advanceInputTo (cur buf0 + n) i0
+
+-- | Advance the given input to the given position.
+advanceInputTo :: Int -> AlexInput -> AlexInput
+advanceInputTo pos = go
+  where
+    go i@(AI _ buf)
+      | cur buf >= pos = i
+      | Just (_, i') <- alexGetChar' i = go i'
+      | otherwise = i -- reached the end, just return the last input
+
 getInput :: P AlexInput
 getInput = P $ \s at PState{ loc=l, buffer=b } -> POk s (AI l b)
 
@@ -2770,9 +2804,10 @@ setInput :: AlexInput -> P ()
 setInput (AI l b) = P $ \s -> POk s{ loc=l, buffer=b } ()
 
 nextIsEOF :: P Bool
-nextIsEOF = do
-  AI _ s <- getInput
-  return $ atEnd s
+nextIsEOF = isEOF <$> getInput
+
+isEOF :: AlexInput -> Bool
+isEOF (AI _ buf) = atEnd buf
 
 pushLexState :: Int -> P ()
 pushLexState ls = P $ \s at PState{ lex_state=l } -> POk s{lex_state=ls:l} ()


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,284 +1,195 @@
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE PatternSynonyms #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE ViewPatterns #-}
 
 module GHC.Parser.String (
   StringLexError (..),
-  ContainsSmartQuote (..),
-  LexStringType (..),
   lexString,
+  lexMultilineString,
 
   -- * Unicode smart quote helpers
   isDoubleSmartQuote,
   isSingleSmartQuote,
-
-  -- * Other helpers
-  isAnyChar,
-  resolveEscapeCharacter,
 ) where
 
-import GHC.Prelude
+import GHC.Prelude hiding (getChar)
 
 import Control.Arrow ((>>>))
-import Control.Monad (guard, unless, when)
-import Data.Char (chr, isPrint, ord)
-import Data.List (unfoldr)
+import Control.Monad (when)
+import Data.Char (chr, ord)
+import qualified Data.Foldable as Seq (toList)
+import qualified Data.Foldable1 as Foldable1
+import qualified Data.List.NonEmpty as NonEmpty
 import Data.Maybe (listToMaybe, mapMaybe)
+import qualified Data.Sequence as Seq
+import GHC.Data.StringBuffer (StringBuffer)
+import qualified GHC.Data.StringBuffer as StringBuffer
 import GHC.Parser.CharClass (
   hexDigit,
-  is_any,
   is_decdigit,
   is_hexdigit,
   is_octdigit,
   is_space,
   octDecDigit,
  )
+import GHC.Parser.Errors.Types (LexErr (..))
 import GHC.Utils.Panic (panic)
 
-data LexStringType = StringTypeSingle | StringTypeMulti
+type BufPos = Int
+data StringLexError = StringLexError LexErr BufPos
 
--- | State to accumulate while iterating through string literal.
---
--- Fields are strict here to avoid space leak when iterating through large string literal
--- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
-data LexStringState loc = LexStringState
-  { stringAcc :: !String
-    -- ^ The string seen so far, reversed
-  , multilineCommonWsPrefix :: !Int
-    -- ^ The common prefix for multiline strings. See Note [Multiline string literals]
-  , initialLoc :: !loc
-    -- ^ The location of the beginning of the string literal
-  }
-
--- | Get the character at the given location, with the location
--- of the next character. Returns Nothing if at the end of the
--- input.
-type GetChar loc = loc -> Maybe (Char, loc)
-
-lexString :: LexStringType -> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
-lexString strType getChar initialLoc = go initialState initialLoc
+lexString :: Int -> StringBuffer -> Either StringLexError String
+lexString = lexStringWith processChars
   where
-    initialState =
-      LexStringState
-        { stringAcc = ""
-        , multilineCommonWsPrefix =
-            case strType of
-              StringTypeMulti -> maxBound
-              _ -> 0
-        , initialLoc = initialLoc
-        }
-
-    -- 's' is strict here to avoid space leak when iterating through large string literal
-    -- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
-    go !s loc0 =
-      case getChar loc0 of
-        -- found closing delimiter
-        Just ('"', _) | Just loc1 <- checkDelimiter strType getChar loc0 -> do
-          let postprocess =
-                case strType of
-                  StringTypeSingle -> id
-                  StringTypeMulti -> postprocessMultiline (multilineCommonWsPrefix s)
-          Right (postprocess . reverse $ stringAcc s, loc1)
-
-        -- found backslash
-        Just (c0@'\\', loc1) -> do
-          case getChar loc1 of
-            -- found '\&' character, which should be elided
-            Just ('&', loc2) -> go s loc2
-            -- found start of a string gap
-            Just (c1, loc2) | is_space c1 -> collapseStringGap getChar s loc2 >>= go s
-            -- some other escape character
-            Just (c1, loc2) ->
-              case strType of
-                StringTypeSingle -> do
-                  (c', loc') <- resolveEscapeCharacter getChar loc1
-                  go (addChar c' s) loc'
-                StringTypeMulti -> do
-                  -- keep escape characters unresolved until after post-processing,
-                  -- to distinguish between a user-newline and the user writing "\n".
-                  -- but still process the characters here, to find any errors
-                  _ <- resolveEscapeCharacter getChar loc1
-                  go (addChar c1 . addChar c0 $ s) loc2
-            -- backslash at end of input
-            Nothing -> Left $ BadCharInitialLex loc1 (hasSQuote getChar s)
-
-        -- found newline character in multiline string
-        Just (c0@'\n', loc1) | StringTypeMulti <- strType ->
-          uncurry go $ parseLeadingWS getChar (addChar c0 s) loc1
-
-        -- found some other character
-        Just (c0, loc1) | isAnyChar c0 -> go (addChar c0 s) loc1
-
-        -- found some unknown character
-        Just (_, _) -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-
-        -- reached EOF before finding end of string
-        Nothing -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-{-# INLINE lexString #-}
-
-checkDelimiter :: LexStringType -> GetChar loc -> loc -> Maybe loc
-checkDelimiter strType getChar loc0 =
-  case strType of
-    StringTypeSingle -> do
-      ('"', loc1) <- getChar loc0
-      Just loc1
-    StringTypeMulti -> do
-      ('"', loc1) <- getChar loc0
-      ('"', loc2) <- getChar loc1
-      ('"', loc3) <- getChar loc2
-      Just loc3
-{-# INLINE checkDelimiter #-}
-
--- | A helper for adding the given character to the lexed string.
-addChar :: Char -> LexStringState loc -> LexStringState loc
-addChar c s = s{stringAcc = c : stringAcc s}
-{-# INLINE addChar #-}
-
--- | Return whether the string we've parsed so far contains any smart quotes.
-hasSQuote :: GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
-hasSQuote getChar s
-  | any isDoubleSmartQuote (stringAcc s)
-  , (c, loc) : _ <- filter (isDoubleSmartQuote . fst) allChars =
-      SmartQuote c loc
-  | otherwise =
-      NoSmartQuote
+    processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
+    processChars =
+          collapseGaps
+      >>> resolveEscapes
+
+-- -----------------------------------------------------------------------------
+-- Lexing interface
+
+{-
+Note [Lexing strings]
+~~~~~~~~~~~~~~~~~~~~~
+
+After verifying if a string is lexically valid with Alex, we still need to do
+some post processing of the string, namely:
+1. Collapse string gaps
+2. Resolve escape characters
+
+The problem: 'lexemeToString' is more performant than manually reading
+characters from the StringBuffer. However, that completely erases the position
+of each character, which we need in order to report the correct position for
+error messages (e.g. when resolving escape characters).
+
+So what we'll do is do two passes. The first pass is optimistic; just convert
+to a plain String and process it. If this results in an error, we do a second
+pass, this time where each character is annotated with its position. Now, the
+error has all the information it needs.
+-}
+
+-- | See Note [Lexing strings]
+lexStringWith ::
+  (forall c. HasChar c => [c] -> Either (c, LexErr) [c])
+  -> Int
+  -> StringBuffer
+  -> Either StringLexError String
+lexStringWith processChars len buf =
+  case processChars $ bufferChars buf len of
+    Right s -> Right s
+    Left _ ->
+      case processChars $ bufferLocatedChars buf len of
+        Right _ -> panic "expected lex error on second pass"
+        Left ((_, pos), e) -> Left $ StringLexError e pos
+
+class HasChar c where
+  getChar :: c -> Char
+  setChar :: Char -> c -> c
+
+instance HasChar Char where
+  getChar = id
+  setChar = const
+
+instance HasChar (Char, x) where
+  getChar = fst
+  setChar c (_, x) = (c, x)
+
+pattern Char :: HasChar c => Char -> c
+pattern Char c <- (getChar -> c)
+{-# COMPLETE Char #-}
+
+bufferChars :: StringBuffer -> Int -> [Char]
+bufferChars = StringBuffer.lexemeToString
+
+bufferLocatedChars :: StringBuffer -> Int -> [(Char, BufPos)]
+bufferLocatedChars initialBuf len = go initialBuf
   where
-    allChars = unfoldr getCharWithLoc (initialLoc s)
-    getCharWithLoc loc =
-      case getChar loc of
-        Just (c, loc') -> Just ((c, loc), loc')
-        Nothing -> Nothing
-{-# INLINE hasSQuote #-}
-
--- | After parsing a backslash and a space character, consume the rest of
--- the string gap and return the next location.
-collapseStringGap :: GetChar loc -> LexStringState loc -> loc -> Either (StringLexError loc) loc
-collapseStringGap getChar s = go
+    go buf
+      | atEnd buf = []
+      | otherwise  =
+          let (c, buf') = StringBuffer.nextChar buf
+           in (c, StringBuffer.cur buf) : go buf'
+
+    atEnd buf = StringBuffer.byteDiff initialBuf buf >= len
+
+-- -----------------------------------------------------------------------------
+-- Lexing phases
+
+collapseGaps :: HasChar c => [c] -> [c]
+collapseGaps = go
   where
-    go loc0 =
-      case getChar loc0 of
-        Just ('\\', loc1) -> pure loc1
-        Just (c0, loc1) | is_space c0 -> go loc1
-        Just _ -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
-        Nothing -> Left $ UnexpectedEOF loc0 (hasSQuote getChar s)
-{-# INLINE collapseStringGap #-}
+    go = \case
+      c1@(Char '\\') : c2@(Char c) : cs
+        | is_space c -> go $ dropGap cs
+        | otherwise  -> c1 : c2 : go cs
+      c : cs -> c : go cs
+      [] -> []
 
--- | See Note [Multiline string literals]
-parseLeadingWS :: GetChar loc -> LexStringState loc -> loc -> (LexStringState loc, loc)
-parseLeadingWS getChar = go 0
+    dropGap = \case
+      Char '\\' : cs -> cs
+      _ : cs -> dropGap cs
+      [] -> panic "gap unexpectedly ended"
+
+resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c]
+resolveEscapes = go Seq.empty
   where
-    go !col s loc =
-      case getChar loc of
-        Just (c@' ', loc') -> go (col + 1) (addChar c s) loc'
-        -- expand tabs
-        Just ('\t', loc') ->
-          let fill = 8 - (col `mod` 8)
-              s' = applyN fill (addChar ' ') s
-           in go (col + fill) s' loc'
-        -- if we see a newline or string delimiter, then this line only contained whitespace, so
-        -- don't include it in the common whitespace prefix
-        Just ('\n', _) -> (s, loc)
-        Just ('"', _) | Just _ <- checkDelimiter StringTypeMulti getChar loc -> (s, loc)
-        -- found some other character, so we're done parsing leading whitespace
-        _ ->
-          let s' = s{multilineCommonWsPrefix = min col (multilineCommonWsPrefix s)}
-           in (s', loc)
-
-    applyN :: Int -> (a -> a) -> a -> a
-    applyN n f x0 = iterate f x0 !! n
-{-# INLINE parseLeadingWS #-}
-
-data StringLexError loc
-  = UnexpectedEOF !loc !(ContainsSmartQuote loc)
-    -- ^ Unexpectedly hit EOF when lexing string
-  | BadCharInitialLex !loc !(ContainsSmartQuote loc)
-    -- ^ Found invalid character when initially lexing string
-  | EscapeBadChar !loc
-    -- ^ Found invalid character when parsing an escaped character
-  | EscapeUnexpectedEOF !loc
-    -- ^ Unexpectedly hit EOF when parsing an escaped character
-  | EscapeNumRangeError !loc
-    -- ^ Escaped number exceeds range
-  | EscapeSmartQuoteError !Char !loc
-    -- ^ Found escaped smart unicode chars as `\’` or `\”`
-  deriving (Show)
-
--- | When initially lexing the string, we want to track if we've
--- seen a smart quote, to show a helpful "you might be accidentally
--- using a smart quote" error.
-data ContainsSmartQuote loc
-  = NoSmartQuote
-  | SmartQuote !Char !loc
-  deriving (Show)
+    -- FIXME.bchinn: see if dlist/reverselist improves performance
+    go !acc = \case
+      [] -> pure $ Seq.toList acc
+      Char '\\' : Char '&' : cs -> go acc cs
+      backslash@(Char '\\') : cs ->
+        case resolveEscapeChar cs of
+          Right (esc, cs') -> go (acc Seq.|> setChar esc backslash) cs'
+          Left (c, e) -> Left (c, e)
+      c : cs -> go (acc Seq.|> c) cs
 
 -- -----------------------------------------------------------------------------
 -- Escape characters
 
--- | After finding a backslash, parse the rest of the escape character, starting
--- at the given location.
-resolveEscapeCharacter :: GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
-resolveEscapeCharacter getChar loc0 = do
-  (c0, loc1) <- expectChar loc0
-  case c0 of
-    'a'  -> pure ('\a', loc1)
-    'b'  -> pure ('\b', loc1)
-    'f'  -> pure ('\f', loc1)
-    'n'  -> pure ('\n', loc1)
-    'r'  -> pure ('\r', loc1)
-    't'  -> pure ('\t', loc1)
-    'v'  -> pure ('\v', loc1)
-    '\\' -> pure ('\\', loc1)
-    '"'  -> pure ('\"', loc1)
-    '\'' -> pure ('\'', loc1)
-    -- escape codes
-    'x' -> expectNum is_hexdigit 16 hexDigit loc1
-    'o' -> expectNum is_octdigit 8 octDecDigit loc1
-    _ | is_decdigit c0 -> expectNum is_decdigit 10 octDecDigit loc0
-    -- control characters (e.g. '\^M')
-    '^' -> do
-      (c1, loc2) <- expectChar loc1
-      unless ('@' <= c1 && c1 <= '_') $ Left $ EscapeBadChar loc1
-      pure (chr $ ord c1 - ord '@', loc2)
-    -- long form escapes (e.g. '\NUL')
-    _ | Just (c1, loc2) <- parseLongEscape getChar c0 loc1 -> pure (c1, loc2)
-    -- check unicode smart quotes (#21843)
-    _ | isDoubleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
-    _ | isSingleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
-    -- unknown escape
-    _ -> Left $ EscapeBadChar loc0
+-- | Resolve a escape character, after having just lexed a backslash.
+-- Assumes escape character is valid.
+resolveEscapeChar :: HasChar c => [c] -> Either (c, LexErr) (Char, [c])
+resolveEscapeChar = \case
+  Char 'a'  : cs -> pure ('\a', cs)
+  Char 'b'  : cs -> pure ('\b', cs)
+  Char 'f'  : cs -> pure ('\f', cs)
+  Char 'n'  : cs -> pure ('\n', cs)
+  Char 'r'  : cs -> pure ('\r', cs)
+  Char 't'  : cs -> pure ('\t', cs)
+  Char 'v'  : cs -> pure ('\v', cs)
+  Char '\\' : cs -> pure ('\\', cs)
+  Char '"'  : cs -> pure ('\"', cs)
+  Char '\'' : cs -> pure ('\'', cs)
+  -- escape codes
+  Char 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
+  Char 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
+  cs@(Char c : _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit cs
+  -- control characters (e.g. '\^M')
+  Char '^' : Char c : cs -> pure (chr $ ord c - ord '@', cs)
+  -- long form escapes (e.g. '\NUL')
+  cs | Just (esc, cs') <- parseLongEscape cs -> pure (esc, cs')
+  -- shouldn't happen
+  Char c : _ -> panic $ "found unexpected escape character: " ++ show c
+  [] -> panic "escape character unexpectedly ended"
   where
-    expectChar loc =
-      case getChar loc of
-        Just x -> pure x
-        Nothing -> Left $ EscapeUnexpectedEOF loc
-
-    expectNum isDigit base toDigit loc1 = do
-      (c1, loc2) <- expectChar loc1
-      unless (isDigit c1) $ Left $ EscapeBadChar loc1
-      let parseNum x loc =
-            case getChar loc of
-              Just (c, loc') | isDigit c -> do
-                let x' = x * base + toDigit c
-                when (x' > 0x10ffff) $ Left $ EscapeNumRangeError loc
-                parseNum x' loc'
-              _ ->
-                pure (chr x, loc)
-      parseNum (toDigit c1) loc2
-{-# INLINE resolveEscapeCharacter #-}
-
-parseLongEscape :: GetChar loc -> Char -> loc -> Maybe (Char, loc)
-parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCodes
+    parseNum isDigit base toDigit =
+      let go x = \case
+            ch@(Char c) : cs | isDigit c -> do
+              let x' = x * base + toDigit c
+              when (x' > 0x10ffff) $ Left (ch, LexNumEscapeRange)
+              go x' cs
+            cs -> pure (chr x, cs)
+       in go 0
+
+parseLongEscape :: HasChar c => [c] -> Maybe (Char, [c])
+parseLongEscape cs = listToMaybe (mapMaybe tryParse longEscapeCodes)
   where
-    tryParse (prefix, c) = do
-      p0 : p <- pure prefix
-      guard (p0 == c0)          -- see if the first character matches
-      loc <- parsePrefix loc1 p -- see if the rest of the prefix matches
-      pure (c, loc)
-
-    parsePrefix loc = \case
-      [] -> pure loc
-      p : ps -> do
-        (c, loc') <- getChar loc
-        guard (p == c)
-        parsePrefix loc' ps
+    tryParse (code, esc) =
+      case splitAt (length code) cs of
+        (pre, cs') | map getChar pre == code -> Just (esc, cs')
+        _ -> Nothing
 
     longEscapeCodes =
       [ ("NUL", '\NUL')
@@ -289,14 +200,14 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
       , ("ENQ", '\ENQ')
       , ("ACK", '\ACK')
       , ("BEL", '\BEL')
-      , ("BS", '\BS')
-      , ("HT", '\HT')
-      , ("LF", '\LF')
-      , ("VT", '\VT')
-      , ("FF", '\FF')
-      , ("CR", '\CR')
-      , ("SO", '\SO')
-      , ("SI", '\SI')
+      , ("BS" , '\BS' )
+      , ("HT" , '\HT' )
+      , ("LF" , '\LF' )
+      , ("VT" , '\VT' )
+      , ("FF" , '\FF' )
+      , ("CR" , '\CR' )
+      , ("SO" , '\SO' )
+      , ("SI" , '\SI' )
       , ("DLE", '\DLE')
       , ("DC1", '\DC1')
       , ("DC2", '\DC2')
@@ -306,17 +217,16 @@ parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCode
       , ("SYN", '\SYN')
       , ("ETB", '\ETB')
       , ("CAN", '\CAN')
-      , ("EM", '\EM')
+      , ("EM" , '\EM' )
       , ("SUB", '\SUB')
       , ("ESC", '\ESC')
-      , ("FS", '\FS')
-      , ("GS", '\GS')
-      , ("RS", '\RS')
-      , ("US", '\US')
-      , ("SP", '\SP')
+      , ("FS" , '\FS' )
+      , ("GS" , '\GS' )
+      , ("RS" , '\RS' )
+      , ("US" , '\US' )
+      , ("SP" , '\SP' )
       , ("DEL", '\DEL')
       ]
-{-# INLINE parseLongEscape #-}
 
 -- -----------------------------------------------------------------------------
 -- Unicode Smart Quote detection (#21843)
@@ -337,69 +247,91 @@ isSingleSmartQuote = \case
 -- Multiline strings
 
 -- | See Note [Multiline string literals]
-postprocessMultiline :: Int -> String -> String
-postprocessMultiline commonWSPrefix =
-      rmCommonWhitespacePrefix
-  >>> collapseOnlyWsLines
-  >>> rmFirstNewline
-  >>> rmLastNewline
-  >>> resolveEscapeChars
+--
+-- Assumes string is lexically valid. Skips the steps about splitting
+-- and rejoining lines, and instead manually find newline characters,
+-- for performance.
+lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
+lexMultilineString = lexStringWith processChars
   where
-    rmCommonWhitespacePrefix =
-      let go = \case
-            '\n' : s -> '\n' : go (dropLine commonWSPrefix s)
-            c : s -> c : go s
+    processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
+    processChars =
+          collapseGaps             -- Step 1
+      >>> expandLeadingTabs        -- Step 3
+      >>> rmCommonWhitespacePrefix -- Step 4
+      >>> collapseOnlyWsLines      -- Step 5
+      >>> rmFirstNewline           -- Step 7a
+      >>> rmLastNewline            -- Step 7b
+      >>> resolveEscapes           -- Step 8
+
+    expandLeadingTabs :: HasChar c => [c] -> [c]
+    expandLeadingTabs =
+      let go !col = \case
+            c@(Char ' ')  : cs -> c : go (col + 1) cs
+            c@(Char '\t') : cs ->
+              let fill = 8 - (col `mod` 8)
+               in replicate fill (setChar ' ' c) ++ go (col + fill) cs
+            cs -> cs
+       in go 0
+
+    rmCommonWhitespacePrefix :: HasChar c => [c] -> [c]
+    rmCommonWhitespacePrefix cs0 =
+      let commonWSPrefix = getCommonWsPrefix (map getChar cs0)
+          go = \case
+            c@(Char '\n') : cs -> c : go (dropLine commonWSPrefix cs)
+            c : cs -> c : go cs
             [] -> []
           -- drop x characters from the string, or up to a newline, whichever
           -- comes first
           dropLine !x = \case
-            s | x <= 0 -> s
-            s@('\n' : _) -> s
-            _ : s -> dropLine (x - 1) s
+            cs | x <= 0 -> cs
+            cs@(Char '\n' : _) -> cs
+            _ : cs -> dropLine (x - 1) cs
             [] -> []
-       in go
+       in go cs0
 
+    collapseOnlyWsLines :: HasChar c => [c] -> [c]
     collapseOnlyWsLines =
       let go = \case
-            '\n' : s | Just s' <- checkAllWs s -> '\n' : go s'
-            c : s -> c : go s
+            c@(Char '\n') : cs | Just cs' <- checkAllWs cs -> c : go cs'
+            c : cs -> c : go cs
             [] -> []
           checkAllWs = \case
             -- got all the way to a newline or the end of the string, return
-            s@('\n' : _) -> Just s
-            s@[] -> Just s
+            cs@(Char '\n' : _) -> Just cs
+            cs@[] -> Just cs
             -- found whitespace, continue
-            c : s | is_space c -> checkAllWs s
+            Char c : cs | is_space c -> checkAllWs cs
             -- anything else, stop
             _ -> Nothing
        in go
 
+    rmFirstNewline :: HasChar c => [c] -> [c]
     rmFirstNewline = \case
-      '\n' : s -> s
-      s -> s
+      Char '\n' : cs -> cs
+      cs -> cs
 
+    rmLastNewline :: HasChar c => [c] -> [c]
     rmLastNewline =
       let go = \case
             [] -> []
-            ['\n'] -> []
+            [Char '\n'] -> []
             c : cs -> c : go cs
        in go
 
-    -- resolve escape characters, deferred from lexString. guaranteed
-    -- to not throw any errors, since we already checked them in lexString
-    resolveEscapeChars = \case
-      [] -> []
-      '\\' : s ->
-        -- concretizing 'loc' to String:
-        --   resolveEscapeCharacter :: (String -> Maybe (Char, String)) -> String -> Either _ (Char, String)
-        case resolveEscapeCharacter uncons s of
-          Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
-          Right (c, s') -> c : resolveEscapeChars s'
-      c : s -> c : resolveEscapeChars s
-
-    uncons = \case
-      c : cs -> Just (c, cs)
-      [] -> Nothing
+-- | See step 4 in Note [Multiline string literals]
+--
+-- Assumes tabs have already been expanded.
+getCommonWsPrefix :: String -> Int
+getCommonWsPrefix s =
+  case NonEmpty.nonEmpty includedLines of
+    Nothing -> 0
+    Just ls -> Foldable1.minimum $ NonEmpty.map (length . takeWhile is_space) ls
+  where
+    includedLines =
+        filter (not . all is_space) -- ignore whitespace-only lines
+      . drop 1                      -- ignore first line in calculation
+      $ lines s
 
 {-
 Note [Multiline string literals]
@@ -419,23 +351,13 @@ The canonical steps for post processing a multiline string are:
 2. Split the string by newlines
 3. Convert leading tabs into spaces
     * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
-4. Remove common whitespace prefix in every line (see below)
+4. Remove common whitespace prefix in every line except the first (see below)
 5. If a line contains only whitespace, remove all of the whitespace
 6. Join the string back with `\n` delimiters
-7. If the first character of the string is a newline, remove it
+7a. If the first character of the string is a newline, remove it
+7b. If the last character of the string is a newline, remove it
 8. Interpret escaped characters
 
-However, for performance reasons, we do as much of this in one pass as possible:
-1. As we lex the string, do the following steps as they appear:
-    a. Collapse string gaps
-    b. Keep track of the common whitespace prefix so far
-    c. Validate escaped characters
-2. At the very end, post process the lexed string:
-    a. Remove the common whitespace prefix from every line
-    b. Remove all whitespace from all-whitespace lines
-    c. Remove initial newline character
-    d. Resolve escaped characters
-
 The common whitespace prefix can be informally defined as "The longest
 prefix of whitespace shared by all lines in the string, excluding the
 first line and any whitespace-only lines".
@@ -449,11 +371,3 @@ It's more precisely defined with the following algorithm:
     * Lines with only whitespace characters
 3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
 -}
-
--- -----------------------------------------------------------------------------
--- Helpers
-
-isAnyChar :: Char -> Bool
-isAnyChar c
-  | c > '\x7f' = isPrint c
-  | otherwise  = is_any c


=====================================
testsuite/tests/ghci/prog013/prog013.stderr
=====================================
@@ -1,17 +1,12 @@
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
 
-Bad.hs:3:8: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
 
-Bad.hs:3:8: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'
-
-Bad.hs:3:8: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
 
 <interactive>:9:1: error: [GHC-58481] parse error on input ‘+’
 
-Bad.hs:3:8: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
+
+Bad.hs:3:7: error: [GHC-21231] lexical error at character '\n'
 
-Bad.hs:3:8: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'


=====================================
testsuite/tests/ghci/scripts/ghci022.stderr
=====================================
@@ -1,3 +1,2 @@
+ghci022.hs:2:5: error: [GHC-21231] lexical error at character '\n'
 
-ghci022.hs:2:6: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'


=====================================
testsuite/tests/parser/should_fail/MultilineStringsSmartQuotes.stderr
=====================================
@@ -1,6 +1,2 @@
-
 MultilineStringsSmartQuotes.hs:9:3: [GHC-31623]
     Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
-
-MultilineStringsSmartQuotes.hs:10:1: [GHC-21231]
-    lexical error in string/character literal at end of input


=====================================
testsuite/tests/parser/should_fail/T21843c.stderr
=====================================
@@ -1,6 +1,2 @@
-
-T21843c.hs:3:19: [GHC-31623]
+T21843c.hs:3:13: [GHC-31623]
     Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
-
-T21843c.hs:3:20: [GHC-21231]
-    lexical error in string/character literal at character '\n'


=====================================
testsuite/tests/parser/should_fail/T21843e.stderr
=====================================
@@ -1,3 +1,3 @@
-
-T21843e.hs:3:15: [GHC-31623]
+T21843e.hs:3:13: error: [GHC-31623]
     Unicode character '”' ('\8221') looks like '"' (Quotation Mark), but it is not
+


=====================================
testsuite/tests/parser/should_fail/T21843f.stderr
=====================================
@@ -1,3 +1,3 @@
-
-T21843f.hs:3:13: [GHC-31623]
+T21843f.hs:3:11: error: [GHC-31623]
     Unicode character '‘' ('\8216') looks like ''' (Single Quote), but it is not
+


=====================================
testsuite/tests/parser/should_fail/T3751.stderr
=====================================
@@ -1,3 +1,2 @@
+T3751.hs:3:5: error: [GHC-21231] lexical error at character '\167'
 
-T3751.hs:3:7: error: [GHC-21231]
-    lexical error in string/character literal at character '\167'


=====================================
testsuite/tests/parser/should_fail/T5425.stderr
=====================================
@@ -1,3 +1,2 @@
+T5425.hs:4:1: error: [GHC-21231] lexical error at character '\955'
 
-T5425.hs:5:2: error: [GHC-21231]
-    lexical error in string/character literal at character '\955'


=====================================
testsuite/tests/parser/should_fail/readFail002.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail002.hs:5:5: error: [GHC-21231]
+    lexical error at character '\n'
 
-readFail002.hs:5:6: error: [GHC-21231]
-    lexical error in string/character literal at character '\n'


=====================================
testsuite/tests/parser/should_fail/readFail004.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail004.hs:17:16: error: [GHC-21231]
+    lexical error at character '.'
 
-readFail004.hs:19:1: error: [GHC-21231]
-    lexical error in string/character literal at character '.'


=====================================
testsuite/tests/parser/should_fail/readFail005.stderr
=====================================
@@ -1,3 +1,2 @@
-
-readFail005.hs:4:7: error: [GHC-21231]
-    lexical error in string/character literal at character '&'
+readFail005.hs:4:5: [GHC-21231]
+    lexical error at character '&'


=====================================
testsuite/tests/parser/should_fail/readFail033.stderr
=====================================
@@ -1,3 +1,3 @@
+readFail033.hs:2:5: error: [GHC-21231]
+    lexical error at character '\t'
 
-readFail033.hs:2:6: error: [GHC-21231]
-    lexical error in string/character literal at character '\t'


=====================================
testsuite/tests/parser/unicode/utf8_010.stderr
=====================================
@@ -1,3 +1,3 @@
-
 utf8_010.hs:2:8: error: [GHC-21231]
-    lexical error in string/character literal (UTF-8 decoding error)
+    lexical error (UTF-8 decoding error)
+


=====================================
testsuite/tests/parser/unicode/utf8_011.stderr
=====================================
@@ -1,3 +1,3 @@
-
 utf8_011.hs:2:8: error: [GHC-21231]
-    lexical error in string/character literal (UTF-8 decoding error)
+    lexical error (UTF-8 decoding error)
+


=====================================
testsuite/tests/parser/unicode/utf8_020.stderr
=====================================
@@ -1,3 +1,3 @@
-
 utf8_020.hs:2:8: error: [GHC-21231]
-    lexical error in string/character literal (UTF-8 decoding error)
+    lexical error (UTF-8 decoding error)
+


=====================================
testsuite/tests/parser/unicode/utf8_021.stderr
=====================================
@@ -1,3 +1,3 @@
-
 utf8_021.hs:2:8: error: [GHC-21231]
-    lexical error in string/character literal (UTF-8 decoding error)
+    lexical error (UTF-8 decoding error)
+


=====================================
testsuite/tests/parser/unicode/utf8_022.stderr
=====================================
@@ -1,3 +1,3 @@
-
 utf8_022.hs:2:8: error: [GHC-21231]
-    lexical error in string/character literal (UTF-8 decoding error)
+    lexical error (UTF-8 decoding error)
+


=====================================
utils/haddock/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
=====================================
@@ -320,7 +320,7 @@ classify tok =
     ITlabelvarid{} -> TkUnknown
     ITchar{} -> TkChar
     ITstring{} -> TkString
-    ITmultilinestring{} -> TkString
+    ITstring_multi{} -> TkString
     ITinteger{} -> TkNumber
     ITrational{} -> TkNumber
     ITprimchar{} -> TkChar



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/bf5c535c5188b3f8bf9f1fcad027f113e4645787...c477484f91609cf8f4dd786924d50d4969caf1e1

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/bf5c535c5188b3f8bf9f1fcad027f113e4645787...c477484f91609cf8f4dd786924d50d4969caf1e1
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240825/d8607d2c/attachment-0001.html>


More information about the ghc-commits mailing list