[Git][ghc/ghc][wip/multiline-strings] Reimplement with manual iteration
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Wed Jul 17 05:24:10 UTC 2024
Brandon Chinn pushed to branch wip/multiline-strings at Glasgow Haskell Compiler / GHC
Commits:
008b9c52 by Brandon Chinn at 2024-07-16T22:23:57-07:00
Reimplement with manual iteration
- - - - -
2 changed files:
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
Changes:
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -87,7 +87,7 @@ import qualified GHC.Data.Strict as Strict
import Control.Monad
import Control.Applicative
import Data.Char
-import Data.List (stripPrefix, isInfixOf, partition, unfoldr)
+import Data.List (stripPrefix, isInfixOf, partition)
import Data.List.NonEmpty ( NonEmpty(..) )
import qualified Data.List.NonEmpty as NE
import Data.Maybe
@@ -2226,65 +2226,9 @@ lex_quoted_label span buf _len _buf2 = do
lex_string :: LexStringType -> P String
lex_string strType = do
start <- getInput
- case lexString [] start of
- Right (lexedStr, next) -> do
- setInput next
- either fromStringLexError pure $ resolveLexedString strType lexedStr
- Left (e, s, i) -> do
- -- see if we can find a smart quote in the string we've found so far.
- -- if the built-up string s contains a smart double quote character, it was
- -- likely the reason why the string literal was not lexed correctly
- case filter (\(LexedChar c _) -> isDoubleSmartQuote c) s of
- LexedChar c (AI loc _) : _ -> add_nonfatal_smart_quote_error c loc
- _ -> pure ()
-
- -- regardless whether we found a smart quote, throw a lexical error
- setInput i >> lexError e
- where
- -- Given the (reversed) string we've seen so far and the current location,
- -- return Right with the fully lexed string and the subsequent location,
- -- or Left with the string we've seen so far and the location where lexing
- -- failed.
- lexString acc0 i0 = do
- let acc = reverse acc0
- case alexGetChar' i0 of
- _ | Just i1 <- lexDelimiter i0 -> Right (acc, i1)
-
- Just (c0, i1) -> do
- let acc1 = LexedChar c0 i0 : acc0
- case c0 of
- '\\' -> do
- case alexGetChar' i1 of
- Just (c1, i2)
- | is_space c1 -> lexStringGap acc0 i2
- | otherwise -> lexString (LexedChar c1 i1 : acc1) i2
- Nothing -> Left (LexStringCharLit, acc, i1)
- _ | isAny c0 -> lexString acc1 i1
- _ | strType == StringTypeMulti && c0 `elem` ['\n', '\t'] -> lexString acc1 i1
- _ -> Left (LexStringCharLit, acc, i0)
-
- Nothing -> Left (LexStringCharLit, acc, i0)
-
- lexDelimiter i0 =
- case strType of
- StringTypeSingle -> do
- ('"', i1) <- alexGetChar' i0
- Just i1
- StringTypeMulti -> do
- ('"', i1) <- alexGetChar' i0
- ('"', i2) <- alexGetChar' i1
- ('"', i3) <- alexGetChar' i2
- Just i3
-
- lexStringGap acc0 i0 = do
- let acc = reverse acc0
- case alexGetChar' i0 of
- Just (c0, i1) ->
- case c0 of
- '\\' -> lexString acc0 i1
- _ | is_space c0 -> lexStringGap acc0 i1
- _ -> Left (LexStringCharLit, acc, i0)
- Nothing -> Left (LexStringCharLitEOF, acc, i0)
+ (str, next) <- either fromStringLexError pure $ lexString strType alexGetChar' start
+ setInput next
+ pure str
lex_char_tok :: Action
@@ -2305,13 +2249,9 @@ lex_char_tok span buf _len _buf2 = do -- We've seen '
return (L (mkPsSpan loc end2) ITtyQuote)
Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash
- (LexedChar lit_ch _, rest) <-
+ (lit_ch, i3) <-
either fromStringLexError pure $
- resolveEscapeCharacter (LexedChar '\\' i1) (asLexedString i2)
- i3 <-
- case rest of
- LexedChar _ i3 : _ -> pure i3
- [] -> lexError LexStringCharLitEOF
+ resolveEscapeCharacter alexGetChar' i2
case alexGetChar' i3 of
Just ('\'', i4) -> do
setInput i4
@@ -2320,7 +2260,7 @@ lex_char_tok span buf _len _buf2 = do -- We've seen '
_ -> lit_error i3
Just (c, i2@(AI end2 _))
- | not (isAny c) -> lit_error i1
+ | not (isAnyChar c) -> lit_error i1
| otherwise ->
-- We've seen 'x, where x is a valid character
@@ -2371,24 +2311,19 @@ lex_magic_hash i = do
_other -> pure Nothing
else pure Nothing
-isAny :: Char -> Bool
-isAny c | c > '\x7f' = isPrint c
- | otherwise = is_any c
-
--- | Returns a LexedString that, when iterated, lazily streams
--- successive characters from the AlexInput.
-asLexedString :: AlexInput -> LexedString AlexInput
-asLexedString = unfoldr toLexedChar
- where
- toLexedChar i =
- case alexGetChar' i of
- Just (c, i') -> Just (LexedChar c i, i')
- Nothing -> Nothing
-
fromStringLexError :: StringLexError AlexInput -> P a
fromStringLexError = \case
- SmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
- StringLexError _ i e -> setInput i >> lexError e
+ UnexpectedEOF i squote -> checkSQuote squote >> throw i LexStringCharLitEOF
+ BadCharInitialLex i squote -> checkSQuote squote >> throw i LexStringCharLit
+ EscapeBadChar i -> throw i LexStringCharLit
+ EscapeUnexpectedEOF i -> throw i LexStringCharLitEOF
+ EscapeNumRangeError i -> throw i LexNumEscapeRange
+ EscapeSmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
+ where
+ throw i e = setInput i >> lexError e
+ checkSQuote = \case
+ NoSmartQuote -> pure ()
+ SmartQuote c (AI loc _) -> add_nonfatal_smart_quote_error c loc
-- before calling lit_error, ensure that the current input is pointing to
-- the position of the error in the buffer. This is so that we can report
@@ -2397,12 +2332,6 @@ fromStringLexError = \case
lit_error :: AlexInput -> P a
lit_error i = do setInput i; lexError LexStringCharLit
-getCharOrFail :: AlexInput -> P Char
-getCharOrFail i = do
- case alexGetChar' i of
- Nothing -> lexError LexStringCharLitEOF
- Just (c,i) -> do setInput i; return c
-
-- -----------------------------------------------------------------------------
-- QuasiQuote
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,170 +1,277 @@
{-# LANGUAGE LambdaCase #-}
module GHC.Parser.String (
- LexedString,
- LexedChar (..),
StringLexError (..),
+ ContainsSmartQuote (..),
LexStringType (..),
- resolveLexedString,
- resolveEscapeCharacter,
+ lexString,
-- * Unicode smart quote helpers
isDoubleSmartQuote,
isSingleSmartQuote,
+
+ -- * Other helpers
+ isAnyChar,
+ resolveEscapeCharacter,
) where
import GHC.Prelude
import Control.Arrow ((>>>))
-import Control.Monad (forM_, guard, unless, when, (>=>))
-import Data.Char (chr, isSpace, ord)
-import qualified Data.Foldable1 as Foldable1
-import qualified Data.List.NonEmpty as NonEmpty
-import Data.Maybe (listToMaybe, mapMaybe, maybeToList)
+import Control.Monad (guard, unless, when)
+import Data.Char (chr, isPrint, ord)
+import Data.List (unfoldr)
+import Data.Maybe (listToMaybe, mapMaybe)
import GHC.Parser.CharClass (
hexDigit,
+ is_any,
is_decdigit,
is_hexdigit,
is_octdigit,
+ is_space,
octDecDigit,
)
-import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)
-data LexStringType = StringTypeSingle | StringTypeMulti deriving (Eq)
-
-data LexedChar loc = LexedChar !Char !loc
-type LexedString loc = [LexedChar loc]
-
-unLexedChar :: LexedChar loc -> Char
-unLexedChar (LexedChar c _) = c
+data LexStringType = StringTypeSingle | StringTypeMulti
-unLexedString :: LexedString loc -> String
-unLexedString = map unLexedChar
-
-resolveLexedString ::
- LexStringType ->
- LexedString loc ->
- Either (StringLexError loc) String
-resolveLexedString strType = fmap unLexedString . processString
+-- | State to accumulate while iterating through string literal.
+--
+-- Fields are strict here to avoid space leak when iterating through large string literal
+-- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
+data LexStringState loc = LexStringState
+ { stringAcc :: !String
+ -- ^ The string seen so far, reversed
+ , multilineCommonWsPrefix :: !Int
+ -- ^ The common prefix for multiline strings. See Note [Multiline string literals]
+ , initialLoc :: !loc
+ -- ^ The location of the beginning of the string literal
+ }
+
+-- | Get the character at the given location, with the location
+-- of the next character. Returns Nothing if at the end of the
+-- input.
+type GetChar loc = loc -> Maybe (Char, loc)
+
+lexString :: LexStringType -> GetChar loc -> loc -> Either (StringLexError loc) (String, loc)
+lexString strType getChar initialLoc = go initialState initialLoc
where
- processString =
- case strType of
- StringTypeSingle ->
- resolveEscapeCharacters
- StringTypeMulti ->
- resolveMultilineString
- >>> (\s -> checkInnerTabs s >> pure s)
- >=> resolveEscapeCharacters
-
-data StringLexError loc
- = SmartQuoteError !Char !loc
- | StringLexError !Char !loc !LexErr
-
-collapseStringGaps :: LexedString loc -> LexedString loc
-collapseStringGaps s0 = go s0
+ initialState =
+ LexStringState
+ { stringAcc = ""
+ , multilineCommonWsPrefix =
+ case strType of
+ StringTypeMulti -> maxBound
+ _ -> 0
+ , initialLoc = initialLoc
+ }
+
+ -- 's' is strict here to avoid space leak when iterating through large string literal
+ -- https://gitlab.haskell.org/ghc/ghc/-/merge_requests/12089#note_576175
+ go !s loc0 =
+ case getChar loc0 of
+ -- found closing delimiter
+ Just ('"', _) | Just loc1 <- checkDelimiter strType getChar loc0 -> do
+ let postprocess =
+ case strType of
+ StringTypeSingle -> id
+ StringTypeMulti -> postprocessMultiline (multilineCommonWsPrefix s)
+ Right (postprocess . reverse $ stringAcc s, loc1)
+
+ -- found backslash
+ Just (c0@'\\', loc1) -> do
+ case getChar loc1 of
+ -- found '\&' character, which should be elided
+ Just ('&', loc2) -> go s loc2
+ -- found start of a string gap
+ Just (c1, loc2) | is_space c1 -> collapseStringGap getChar s loc2 >>= go s
+ -- some other escape character
+ Just (c1, loc2) ->
+ case strType of
+ StringTypeSingle -> do
+ (c', loc') <- resolveEscapeCharacter getChar loc1
+ go (addChar c' s) loc'
+ StringTypeMulti -> do
+ -- keep escape characters unresolved until after post-processing,
+ -- to distinguish between a user-newline and the user writing "\n".
+ -- but still process the characters here, to find any errors
+ _ <- resolveEscapeCharacter getChar loc1
+ go (addChar c1 . addChar c0 $ s) loc2
+ -- backslash at end of input
+ Nothing -> Left $ BadCharInitialLex loc1 (hasSQuote getChar s)
+
+ -- found newline character in multiline string
+ Just (c0@'\n', loc1) | StringTypeMulti <- strType ->
+ uncurry go $ parseLeadingWS getChar (addChar c0 s) loc1
+
+ -- found some other character
+ Just (c0, loc1) | isAnyChar c0 -> go (addChar c0 s) loc1
+
+ -- found some unknown character
+ Just (_, _) -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
+
+ -- reached EOF before finding end of string
+ Nothing -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
+
+checkDelimiter :: LexStringType -> GetChar loc -> loc -> Maybe loc
+checkDelimiter strType getChar loc0 =
+ case strType of
+ StringTypeSingle -> do
+ ('"', loc1) <- getChar loc0
+ Just loc1
+ StringTypeMulti -> do
+ ('"', loc1) <- getChar loc0
+ ('"', loc2) <- getChar loc1
+ ('"', loc3) <- getChar loc2
+ Just loc3
+
+-- | A helper for adding the given character to the lexed string.
+addChar :: Char -> LexStringState loc -> LexStringState loc
+addChar c s = s{stringAcc = c : stringAcc s}
+
+-- | Return whether the string we've parsed so far contains any smart quotes.
+hasSQuote :: GetChar loc -> LexStringState loc -> ContainsSmartQuote loc
+hasSQuote getChar s
+ | any isDoubleSmartQuote (stringAcc s)
+ , (c, loc) : _ <- filter (isDoubleSmartQuote . fst) allChars =
+ SmartQuote c loc
+ | otherwise =
+ NoSmartQuote
where
- go = \case
- [] -> []
+ allChars = unfoldr getCharWithLoc (initialLoc s)
+ getCharWithLoc loc =
+ case getChar loc of
+ Just (c, loc') -> Just ((c, loc), loc')
+ Nothing -> Nothing
+
+-- | After parsing a backslash and a space character, consume the rest of
+-- the string gap and return the next location.
+collapseStringGap :: GetChar loc -> LexStringState loc -> loc -> Either (StringLexError loc) loc
+collapseStringGap getChar s = go
+ where
+ go loc0 =
+ case getChar loc0 of
+ Just ('\\', loc1) -> pure loc1
+ Just (c0, loc1) | is_space c0 -> go loc1
+ Just _ -> Left $ BadCharInitialLex loc0 (hasSQuote getChar s)
+ Nothing -> Left $ UnexpectedEOF loc0 (hasSQuote getChar s)
- backslash@(LexedChar '\\' _) : c : s
- | isLexedSpace c ->
- -- lexer should have validated that this is a valid gap,
- -- so we'll panic if we find any invalid characters
- case dropWhile isLexedSpace s of
- LexedChar '\\' _ : s -> go s
- _ -> panic $ "Invalid string gap in " ++ show (unLexedString s0)
- | otherwise ->
- backslash : c : go s
+-- | See Note [Multiline string literals]
+parseLeadingWS :: GetChar loc -> LexStringState loc -> loc -> (LexStringState loc, loc)
+parseLeadingWS getChar = go 0
+ where
+ go !col s loc =
+ case getChar loc of
+ Just (c@' ', loc') -> go (col + 1) (addChar c s) loc'
+ -- expand tabs
+ Just ('\t', loc') ->
+ let fill = 8 - (col `mod` 8)
+ s' = applyN fill (addChar ' ') s
+ in go (col + fill) s' loc'
+ -- if we see a newline or string delimiter, then this line only contained whitespace, so
+ -- don't include it in the common whitespace prefix
+ Just ('\n', _) -> (s, loc)
+ Just ('"', _) | Just _ <- checkDelimiter StringTypeMulti getChar loc -> (s, loc)
+ -- found some other character, so we're done parsing leading whitespace
+ _ ->
+ let s' = s{multilineCommonWsPrefix = min col (multilineCommonWsPrefix s)}
+ in (s', loc)
+
+ applyN :: Int -> (a -> a) -> a -> a
+ applyN n f x0 = iterate f x0 !! n
- c : s -> c : go s
+data StringLexError loc
+ = UnexpectedEOF !loc !(ContainsSmartQuote loc)
+ -- ^ Unexpectedly hit EOF when lexing string
+ | BadCharInitialLex !loc !(ContainsSmartQuote loc)
+ -- ^ Found invalid character when initially lexing string
+ | EscapeBadChar !loc
+ -- ^ Found invalid character when parsing an escaped character
+ | EscapeUnexpectedEOF !loc
+ -- ^ Unexpectedly hit EOF when parsing an escaped character
+ | EscapeNumRangeError !loc
+ -- ^ Escaped number exceeds range
+ | EscapeSmartQuoteError !Char !loc
+ -- ^ Found escaped smart unicode chars as `\’` or `\”`
+ deriving (Show)
+
+-- | When initially lexing the string, we want to track if we've
+-- seen a smart quote, to show a helpful "you might be accidentally
+-- using a smart quote" error.
+data ContainsSmartQuote loc
+ = NoSmartQuote
+ | SmartQuote !Char !loc
+ deriving (Show)
-resolveEscapeCharacters :: LexedString loc -> Either (StringLexError loc) (LexedString loc)
-resolveEscapeCharacters = go
- where
- go = \case
- [] -> pure []
- LexedChar '\\' _ : LexedChar '&' _ : s -> go s
- backslashChar@(LexedChar '\\' _) : s -> do
- (c, s') <- resolveEscapeCharacter backslashChar s
- (c :) <$> go s'
- c : s ->
- (c :) <$> go s
-
--- | After finding a backslash, parse the rest of the escape character.
-resolveEscapeCharacter ::
- LexedChar loc -> -- the backslash character
- LexedString loc -> -- the rest of the string to parse
- Either
- (StringLexError loc)
- (LexedChar loc, LexedString loc) -- the resolved escape character and the rest of the string
-resolveEscapeCharacter backslashChar s0 = do
- (firstChar@(LexedChar c loc), s1) <- expectNext backslashChar s0
- let rewrap c' = pure (LexedChar c' loc, s1)
- case c of
- 'a' -> rewrap '\a'
- 'b' -> rewrap '\b'
- 'f' -> rewrap '\f'
- 'n' -> rewrap '\n'
- 'r' -> rewrap '\r'
- 't' -> rewrap '\t'
- 'v' -> rewrap '\v'
- '\\' -> rewrap '\\'
- '"' -> rewrap '\"'
- '\'' -> rewrap '\''
+-- -----------------------------------------------------------------------------
+-- Escape characters
+
+-- | After finding a backslash, parse the rest of the escape character, starting
+-- at the given location.
+resolveEscapeCharacter :: GetChar loc -> loc -> Either (StringLexError loc) (Char, loc)
+resolveEscapeCharacter getChar loc0 = do
+ (c0, loc1) <- expectChar loc0
+ case c0 of
+ 'a' -> pure ('\a', loc1)
+ 'b' -> pure ('\b', loc1)
+ 'f' -> pure ('\f', loc1)
+ 'n' -> pure ('\n', loc1)
+ 'r' -> pure ('\r', loc1)
+ 't' -> pure ('\t', loc1)
+ 'v' -> pure ('\v', loc1)
+ '\\' -> pure ('\\', loc1)
+ '"' -> pure ('\"', loc1)
+ '\'' -> pure ('\'', loc1)
-- escape codes
- 'x' -> expectNum is_hexdigit 16 hexDigit (firstChar, s1)
- 'o' -> expectNum is_octdigit 8 octDecDigit (firstChar, s1)
- _ | is_decdigit c -> expectNum is_decdigit 10 octDecDigit (backslashChar, s0)
+ 'x' -> expectNum is_hexdigit 16 hexDigit loc1
+ 'o' -> expectNum is_octdigit 8 octDecDigit loc1
+ _ | is_decdigit c0 -> expectNum is_decdigit 10 octDecDigit loc0
-- control characters (e.g. '\^M')
'^' -> do
- (LexedChar c1 loc1, s2) <- expectNext firstChar s1
- unless (c1 >= '@' && c1 <= '_') $
- Left $ StringLexError c1 loc1 LexStringCharLit
- let c' = chr $ ord c1 - ord '@'
- pure (LexedChar c' loc, s2)
+ (c1, loc2) <- expectChar loc1
+ unless ('@' <= c1 && c1 <= '_') $ Left $ EscapeBadChar loc1
+ pure (chr $ ord c1 - ord '@', loc2)
-- long form escapes (e.g. '\NUL')
- _ | Just (c', s2) <- parseLongEscape firstChar s1 -> pure (LexedChar c' loc, s2)
+ _ | Just (c1, loc2) <- parseLongEscape getChar c0 loc1 -> pure (c1, loc2)
-- check unicode smart quotes (#21843)
- _ | isDoubleSmartQuote c -> Left $ SmartQuoteError c loc
- _ | isSingleSmartQuote c -> Left $ SmartQuoteError c loc
+ _ | isDoubleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
+ _ | isSingleSmartQuote c0 -> Left $ EscapeSmartQuoteError c0 loc0
-- unknown escape
- _ -> Left $ StringLexError c loc LexStringCharLit
+ _ -> Left $ EscapeBadChar loc0
where
- expectNext lastChar = \case
- [] -> do
- let LexedChar c loc = lastChar
- Left $ StringLexError c loc LexStringCharLitEOF
- c : cs -> pure (c, cs)
-
- expectNum isDigit base toDigit (lastChar, s0) = do
- (LexedChar c loc, s1) <- expectNext lastChar s0
- unless (isDigit c) $ Left $ StringLexError c loc LexStringCharLit
- let parseNum x = \case
- LexedChar c' loc' : s' | isDigit c' -> do
- let x' = x * base + toDigit c'
- when (x' > 0x10ffff) $ Left $ StringLexError c' loc' LexNumEscapeRange
- parseNum x' s'
- s ->
- pure (LexedChar (chr x) loc, s)
- parseNum (toDigit c) s1
-
--- | Check if the escape characters match a long escape code.
---
--- >>> parseLongEscape 'C' [LexedChar 'R', LexedChar 'X', ...s] = Just ('\CR', [LexedChar 'X', ...s])
--- >>> parseLongEscape 'X' [LexedChar 'X', LexedChar 'X', ...s] = Nothing
-parseLongEscape :: LexedChar loc -> LexedString loc -> Maybe (Char, LexedString loc)
-parseLongEscape (LexedChar c _) s = listToMaybe $ mapMaybe tryParse longEscapeCodes
+ expectChar loc =
+ case getChar loc of
+ Just x -> pure x
+ Nothing -> Left $ EscapeUnexpectedEOF loc
+
+ expectNum isDigit base toDigit loc1 = do
+ (c1, loc2) <- expectChar loc1
+ unless (isDigit c1) $ Left $ EscapeBadChar loc1
+ let parseNum x loc =
+ case getChar loc of
+ Just (c, loc') | isDigit c -> do
+ let x' = x * base + toDigit c
+ when (x' > 0x10ffff) $ Left $ EscapeNumRangeError loc
+ parseNum x' loc'
+ _ ->
+ pure (chr x, loc)
+ parseNum (toDigit c1) loc2
+
+parseLongEscape :: GetChar loc -> Char -> loc -> Maybe (Char, loc)
+parseLongEscape getChar c0 loc1 = listToMaybe $ mapMaybe tryParse longEscapeCodes
where
- tryParse (prefix, c') = do
+ tryParse (prefix, c) = do
p0 : p <- pure prefix
- guard (p0 == c) -- see if the first character matches
- s' <- parsePrefix p s -- see if the rest of the prefix matches
- pure (c', s')
+ guard (p0 == c0) -- see if the first character matches
+ loc <- parsePrefix loc1 p -- see if the rest of the prefix matches
+ pure (c, loc)
- parsePrefix (p : ps) (LexedChar t _ : ts) | p == t = parsePrefix ps ts
- parsePrefix [] s' = Just s' -- we've matched the whole prefix, return the rest
- parsePrefix _ _ = Nothing
+ parsePrefix loc = \case
+ [] -> pure loc
+ p : ps -> do
+ (c, loc') <- getChar loc
+ guard (p == c)
+ parsePrefix loc' ps
longEscapeCodes =
[ ("NUL", '\NUL')
@@ -203,17 +310,6 @@ parseLongEscape (LexedChar c _) s = listToMaybe $ mapMaybe tryParse longEscapeCo
, ("DEL", '\DEL')
]
--- | Error if string contains any tab characters.
---
--- Normal strings don't lex tab characters in the first place, but we
--- have to allow them in multiline strings for leading indentation. So
--- we allow them in the initial lexing pass, then check for any remaining
--- tabs after replacing leading tabs in resolveMultilineString.
-checkInnerTabs :: LexedString loc -> Either (StringLexError loc) ()
-checkInnerTabs s =
- forM_ s $ \(LexedChar c loc) ->
- when (c == '\t') $ Left $ StringLexError c loc LexStringCharLit
-
-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)
@@ -242,7 +338,7 @@ is, HsMultilineString will contain the post-processed string. This matches
the same behavior as HsString, which contains the normalized string
(see Note [Literal source text]).
-The string is post-processed with the following steps:
+The canonical steps for post processing a multiline string are:
1. Collapse string gaps
2. Split the string by newlines
3. Convert leading tabs into spaces
@@ -253,6 +349,17 @@ The string is post-processed with the following steps:
7. If the first character of the string is a newline, remove it
8. Interpret escaped characters
+However, for performance reasons, we do as much of this in one pass as possible:
+1. As we lex the string, do the following steps as they appear:
+ a. Collapse string gaps
+ b. Keep track of the common whitespace prefix so far
+ c. Validate escaped characters
+2. At the very end, post process the lexed string:
+ a. Remove the common whitespace prefix from every line
+ b. Remove all whitespace from all-whitespace lines
+ c. Remove initial newline character
+ d. Resolve escaped characters
+
The common whitespace prefix can be informally defined as "The longest
prefix of whitespace shared by all lines in the string, excluding the
first line and any whitespace-only lines".
@@ -267,83 +374,67 @@ It's more precisely defined with the following algorithm:
3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
-}
--- | A lexed line, with the string and the location info of the ending newline
--- character, if one exists
-data LexedLine loc = LexedLine !(LexedString loc) (Maybe loc)
-
-mapLine :: (LexedString loc -> LexedString loc) -> LexedLine loc -> LexedLine loc
-mapLine f (LexedLine line nl) = LexedLine (f line) nl
-
-mapLines :: (LexedString loc -> LexedString loc) -> [LexedLine loc] -> [LexedLine loc]
-mapLines f = map (mapLine f)
-
-filterLines :: (LexedString loc -> Bool) -> [LexedLine loc] -> [LexedLine loc]
-filterLines f = filter (\(LexedLine line _) -> f line)
-
-splitLines :: LexedString loc -> [LexedLine loc]
-splitLines =
- foldr
- ( curry $ \case
- (LexedChar '\n' loc, ls) -> LexedLine [] (Just loc) : ls
- (c, l : ls) -> mapLine (c :) l : ls
- (c, []) -> LexedLine [c] Nothing : [] -- should not happen
- )
- [emptyLine]
- where
- emptyLine = LexedLine [] Nothing
-
-joinLines :: [LexedLine loc] -> LexedString loc
-joinLines = concatMap $ \(LexedLine line nl) ->
- case nl of
- Nothing -> line
- Just nl' -> line ++ [LexedChar '\n' nl']
-
-- | See Note [Multiline string literals]
-resolveMultilineString :: LexedString loc -> LexedString loc
-resolveMultilineString = process
+postprocessMultiline :: Int -> String -> String
+postprocessMultiline commonWSPrefix =
+ rmCommonWhitespacePrefix
+ >>> collapseOnlyWsLines
+ >>> rmFirstNewline
+ >>> resolveEscapeChars
where
- process =
- splitLines
- >>> convertLeadingTabs
- >>> rmCommonWhitespacePrefix
- >>> stripOnlyWhitespace
- >>> joinLines
- >>> rmFirstNewline
-
- convertLeadingTabs =
- let convertLine col = \case
+ rmCommonWhitespacePrefix =
+ let go = \case
+ '\n' : s -> '\n' : go (dropLine commonWSPrefix s)
+ c : s -> c : go s
[] -> []
- c@(LexedChar ' ' _) : cs -> c : convertLine (col + 1) cs
- LexedChar '\t' loc : cs ->
- let fill = 8 - (col `mod` 8)
- in replicate fill (LexedChar ' ' loc) ++ convertLine (col + fill) cs
- c : cs -> c : cs
- in mapLines (convertLine 0)
-
- rmCommonWhitespacePrefix = \case
- [] -> []
- -- exclude the first line from this calculation
- firstLine : strLines ->
- let excludeWsOnlyLines = filterLines (not . all isLexedSpace)
- commonWSPrefix =
- case NonEmpty.nonEmpty (excludeWsOnlyLines strLines) of
- Nothing -> 0
- Just strLines' ->
- Foldable1.minimum $
- flip NonEmpty.map strLines' $ \(LexedLine line _) ->
- length $ takeWhile isLexedSpace line
- in firstLine : mapLines (drop commonWSPrefix) strLines
-
- stripOnlyWhitespace =
- let stripWsOnlyLine line = if all isLexedSpace line then [] else line
- in mapLines stripWsOnlyLine
+ -- drop x characters from the string, or up to a newline, whichever
+ -- comes first
+ dropLine !x = \case
+ s | x <= 0 -> s
+ s@('\n' : _) -> s
+ _ : s -> dropLine (x - 1) s
+ [] -> []
+ in go
+
+ collapseOnlyWsLines =
+ let go = \case
+ '\n' : s | Just s' <- checkAllWs s -> '\n' : go s'
+ c : s -> c : go s
+ [] -> []
+ checkAllWs = \case
+ -- got all the way to a newline or the end of the string, return
+ s@('\n' : _) -> Just s
+ s@[] -> Just s
+ -- found whitespace, continue
+ c : s | is_space c -> checkAllWs s
+ -- anything else, stop
+ _ -> Nothing
+ in go
rmFirstNewline = \case
- LexedChar '\n' _ : s -> s
+ '\n' : s -> s
s -> s
+ -- resolve escape characters, deferred from lexString. guaranteed
+ -- to not throw any errors, since we already checked them in lexString
+ resolveEscapeChars = \case
+ [] -> []
+ '\\' : s ->
+ -- concretizing 'loc' to String:
+ -- resolveEscapeCharacter :: (String -> Maybe (Char, String)) -> String -> Either _ (Char, String)
+ case resolveEscapeCharacter uncons s of
+ Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
+ Right (c, s') -> c : resolveEscapeChars s'
+ c : s -> c : resolveEscapeChars s
+
+ uncons = \case
+ c : cs -> Just (c, cs)
+ [] -> Nothing
+
-- -----------------------------------------------------------------------------
-- Helpers
-isLexedSpace :: LexedChar loc -> Bool
-isLexedSpace = isSpace . unLexedChar
+isAnyChar :: Char -> Bool
+isAnyChar c
+ | c > '\x7f' = isPrint c
+ | otherwise = is_any c
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/008b9c529887dcfd95c6c3706d98fe380b7aadbe
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/008b9c529887dcfd95c6c3706d98fe380b7aadbe
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240717/c2ad57d4/attachment-0001.html>
More information about the ghc-commits
mailing list