[Git][ghc/ghc][wip/strings] [ci skip] wip
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Sat Aug 24 23:18:42 UTC 2024
Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
8cc82093 by Brandon Chinn at 2024-08-24T16:18:32-07:00
[ci skip] wip
- - - - -
2 changed files:
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
Changes:
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -2271,7 +2271,9 @@ tok_string_multi startSpan startBuf _len _buf2 = do
-- load the content of the multiline string
let contentLen = byteDiff contentStartBuf contentEndBuf
- s <- either lexError pure . postprocessMultilineString $ lexemeToString contentStartBuf contentLen
+ s <-
+ either (throwStringLexError (AI startLoc startBuf)) pure $
+ lexMultilineString contentLen contentStartBuf
setInput i'
pure $ L span $ ITstring_multi src (mkFastString s)
@@ -2325,22 +2327,24 @@ tok_string_multi_content :: Action
tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
-lex_chars (startDelim, endDelim) span buf len = resolveEscapes' . collapseGaps $ lexemeToString content_buf content_len
+lex_chars (startDelim, endDelim) span buf len =
+ either (throwStringLexError (AI (psSpanStart span) buf)) pure $
+ lexString content_len content_buf
where
- resolveEscapes' = either throwEscapeErr pure . resolveEscapes
-
- -- match the normal lexical errors:
- -- * position = beginning of string
- -- * character = the illegal character
- throwEscapeErr e =
- let loc = psSpanStart span
- buf' = buf -- FIXME.bchinn
- in setInput (AI loc buf') >> lexError e
-
-- assumes delimiters are ASCII, with 1 byte per Char
content_len = len - length startDelim - length endDelim
content_buf = offsetBytes (length startDelim) buf
+throwStringLexError :: AlexInput -> StringLexError -> P a
+throwStringLexError i (StringLexError e off) = setInput i' >> lexError e
+ where
+ -- Match the AlexInput that happens with normal lexical errors:
+ -- * position = beginning of string literal
+ -- * character = the illegal character
+ i' =
+ let AI loc buf = i
+ in AI loc (offsetBytes off buf)
+
tok_quoted_label :: Action
tok_quoted_label span buf len _buf2 = do
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -1,11 +1,10 @@
{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}
module GHC.Parser.String (
- collapseGaps,
- resolveEscapes,
-
- -- * Multiline strings
- postprocessMultilineString,
+ StringLexError (..),
+ lexString,
+ lexMultilineString,
-- * Unicode smart quote helpers
isDoubleSmartQuote,
@@ -15,13 +14,14 @@ module GHC.Parser.String (
import GHC.Prelude
import Control.Arrow ((>>>))
-import Control.DeepSeq (deepseq)
-import Control.Exception (Exception, catch, throw)
import Control.Monad (when)
+import Data.Bifunctor (first)
import Data.Char (chr, ord)
import qualified Data.Foldable1 as Foldable1
import qualified Data.List.NonEmpty as NonEmpty
-import Data.Maybe (listToMaybe, mapMaybe)
+import Data.Maybe (mapMaybe)
+import GHC.Data.StringBuffer (StringBuffer)
+import qualified GHC.Data.StringBuffer as StringBuffer
import GHC.Parser.CharClass (
hexDigit,
is_decdigit,
@@ -32,90 +32,122 @@ import GHC.Parser.CharClass (
)
import GHC.Parser.Errors.Types (LexErr (..))
import GHC.Utils.Panic (panic)
-import System.IO.Unsafe (unsafePerformIO)
--- | Collapse string gaps. Assumes the string is lexically valid.
-collapseGaps :: String -> String
-collapseGaps = go
+-- | A lex error occurring when parsing a string, containing the error itself
+-- and the position in the buffer where the error occurred.
+data StringLexError = StringLexError LexErr Int
+
+lexString :: Int -> StringBuffer -> Either StringLexError String
+lexString = lexStringWith resolveEscapeCharBuf
+
+-- | Lex a StringBuffer into a String, with escapes resolved using the given
+-- function.
+--
+-- Ideally, this would be broken up into three functions composed together:
+-- 1. Convert StringBuffer -> String
+-- 2. Collapse gaps
+-- 3. Resolve escapes
+-- However, escape resolution needs to be able to throw errors containing the
+-- location of the error, which we lose information for after converting to
+-- String. So we have to do these three steps at the same time so we have
+-- location information when resolving escapes.
+lexStringWith ::
+ (StringBuffer -> Either StringLexError ([Char], StringBuffer))
+ -> Int
+ -> StringBuffer
+ -> Either StringLexError String
+lexStringWith resolveEscape len initialBuf = go id initialBuf
where
- go = \case
- '\\' : '\\' : cs -> '\\' : '\\' : go cs
- '\\' : c : cs | is_space c -> go $ dropGap cs
- c : cs -> c : go cs
- [] -> []
+ go acc buf0 =
+ case nextCharBuf buf0 of
+ _ | atEnd buf0 -> pure $ acc []
+ ('\\', buf1)
+ | ('&', buf2) <- nextCharBuf buf1 -> go acc buf2
+ | (c, buf2) <- nextCharBuf buf1, is_space c -> go acc $ dropGap buf2
+ | otherwise ->
+ case resolveEscape buf1 of
+ Right (cs, buf2) -> go (acc . (cs ++)) buf2
+ Left e -> Left e
+ (c, buf1) -> go (acc . (c :)) buf1
+
+ dropGap buf =
+ case nextCharBuf buf of
+ ('\\', buf') -> buf'
+ (_, buf') -> dropGap buf'
- dropGap = \case
- '\\' : cs -> cs
- _ : cs -> dropGap cs
- [] -> panic "gap unexpectedly ended"
+ nextCharBuf = StringBuffer.nextChar
+
+ endBuf = StringBuffer.offsetBytes len initialBuf
+ atEnd buf = StringBuffer.byteDiff buf endBuf <= 0 -- should never be negative, but just in case
-- -----------------------------------------------------------------------------
-- Escape characters
-newtype LexErrE = LexErrE LexErr
-instance Show LexErrE where
- show _ = "<LexErr>" -- we only need this for the resolveEscapes hack, so doesn't have to be meaningful
-instance Exception LexErrE
-
--- | Resolve escape characters. Assumes the string is lexically valid + gaps have been collapsed.
-resolveEscapes :: String -> Either LexErr String
-resolveEscapes s0 = do
- let s = resolve s0
- unsafePerformIO $ (s `deepseq` pure (Right s)) `catch` \(LexErrE e) -> pure (Left e)
+-- | 'resolveEscapeChar' specialized to StringBuffer.
+resolveEscapeCharBuf :: StringBuffer -> Either StringLexError ([Char], StringBuffer)
+resolveEscapeCharBuf = fmap (first (:[])) . resolveEscapeChar StringBuffer.nextChar mkStrLexError
where
- -- Unfortunately, `resolve` is only performant if it's pure; allocations
- -- and performance degrade when `resolve` is implemented in P or ST. So
- -- we'll throw an impure exception and catch it above with unsafePerformIO
- resolve = \case
- [] -> []
- '\\' : '&' : cs -> resolve cs
- '\\' : cs ->
- case resolveEscapeCharacter cs of
- Right (c, cs') -> c : resolve cs'
- Left e -> throw (LexErrE e)
- c : cs -> c : resolve cs
-
--- Assumes escape character is valid
-resolveEscapeCharacter :: [Char] -> Either LexErr (Char, [Char])
-resolveEscapeCharacter = \case
- 'a' : cs -> pure ('\a', cs)
- 'b' : cs -> pure ('\b', cs)
- 'f' : cs -> pure ('\f', cs)
- 'n' : cs -> pure ('\n', cs)
- 'r' : cs -> pure ('\r', cs)
- 't' : cs -> pure ('\t', cs)
- 'v' : cs -> pure ('\v', cs)
- '\\' : cs -> pure ('\\', cs)
- '"' : cs -> pure ('\"', cs)
- '\'' : cs -> pure ('\'', cs)
- -- escape codes
- 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
- 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
- c : cs | is_decdigit c -> parseNum is_decdigit 10 octDecDigit (c : cs)
- -- control characters (e.g. '\^M')
- '^' : c : cs -> pure (chr $ ord c - ord '@', cs)
- -- long form escapes (e.g. '\NUL')
- cs | Just (c, cs') <- parseLongEscape cs -> pure (c, cs')
- -- shouldn't happen
- c : _ -> panic $ "found unexpected escape character: " ++ show c
- [] -> panic $ "escape character unexpectedly ended"
+ mkStrLexError buf e = StringLexError e (StringBuffer.cur buf)
+
+-- | Resolve a escape character, after having just lexed a backslash.
+-- Assumes escape character is valid.
+resolveEscapeChar ::
+ (s -> (Char, s))
+ -> (s -> LexErr -> e)
+ -> s
+ -> Either e (Char, s)
+resolveEscapeChar nextChar mkStrLexError s0 =
+ case nextChar s0 of
+ ('a' , s1) -> pure ('\a', s1)
+ ('b' , s1) -> pure ('\b', s1)
+ ('f' , s1) -> pure ('\f', s1)
+ ('n' , s1) -> pure ('\n', s1)
+ ('r' , s1) -> pure ('\r', s1)
+ ('t' , s1) -> pure ('\t', s1)
+ ('v' , s1) -> pure ('\v', s1)
+ ('\\', s1) -> pure ('\\', s1)
+ ('"' , s1) -> pure ('\"', s1)
+ ('\'', s1) -> pure ('\'', s1)
+ -- escape codes
+ ('x', s1) -> parseNum is_hexdigit 16 hexDigit s1
+ ('o', s1) -> parseNum is_octdigit 8 octDecDigit s1
+ (c, _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit s0
+ -- control characters (e.g. '\^M')
+ ('^', s1) | (c, s2) <- nextChar s1 -> pure (chr $ ord c - ord '@', s2)
+ -- long form escapes (e.g. '\NUL')
+ _ | Just (c, s1) <- parseLongEscape nextChar s0 -> pure (c, s1)
+ -- shouldn't happen
+ (c, _) -> panic $ "found unexpected escape character: " ++ show c
where
parseNum isDigit base toDigit =
- let go x = \case
- c : cs | isDigit c -> do
- let x' = x * base + toDigit c
- when (x' > 0x10ffff) $ Left LexNumEscapeRange
- go x' cs
- cs -> pure (chr x, cs)
+ let go x s =
+ case nextChar s of
+ (c, s') | isDigit c -> do
+ let x' = x * base + toDigit c
+ when (x' > 0x10ffff) $ Left (mkStrLexError s LexNumEscapeRange)
+ go x' s'
+ _ -> pure (chr x, s)
in go 0
-parseLongEscape :: [Char] -> Maybe (Char, [Char])
-parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
+parseLongEscape :: (s -> (Char, s)) -> s -> Maybe (Char, s)
+parseLongEscape nextChar = match longEscapeCodes
where
- tryParse (code, c) =
- case splitAt (length code) cs of
- (pre, cs') | pre == code -> Just (c, cs')
- _ -> Nothing
+ -- Iteratively get the next character and filter the list of codes
+ -- by codes starting with that character, until we find a code that
+ -- completely matches.
+ -- Assumes that none of the codes are prefixed by another code
+ match [] _ = Nothing
+ match [("", c)] s = Just (c, s)
+ match codes s =
+ let (c, s') = nextChar s
+ in match (mapMaybe (fmapFst $ stripHead c) codes) s'
+
+ stripHead :: Eq a => a -> [a] -> Maybe [a]
+ stripHead x1 (x2 : xs) | x1 == x2 = Just xs
+ stripHead _ _ = Nothing
+
+ fmapFst :: Functor f => (a -> f b) -> (a, c) -> f (b, c)
+ fmapFst f (a, c) = (, c) <$> f a
longEscapeCodes =
[ ("NUL", '\NUL')
@@ -126,14 +158,14 @@ parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
, ("ENQ", '\ENQ')
, ("ACK", '\ACK')
, ("BEL", '\BEL')
- , ("BS", '\BS')
- , ("HT", '\HT')
- , ("LF", '\LF')
- , ("VT", '\VT')
- , ("FF", '\FF')
- , ("CR", '\CR')
- , ("SO", '\SO')
- , ("SI", '\SI')
+ , ("BS" , '\BS' )
+ , ("HT" , '\HT' )
+ , ("LF" , '\LF' )
+ , ("VT" , '\VT' )
+ , ("FF" , '\FF' )
+ , ("CR" , '\CR' )
+ , ("SO" , '\SO' )
+ , ("SI" , '\SI' )
, ("DLE", '\DLE')
, ("DC1", '\DC1')
, ("DC2", '\DC2')
@@ -143,14 +175,14 @@ parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
, ("SYN", '\SYN')
, ("ETB", '\ETB')
, ("CAN", '\CAN')
- , ("EM", '\EM')
+ , ("EM" , '\EM' )
, ("SUB", '\SUB')
, ("ESC", '\ESC')
- , ("FS", '\FS')
- , ("GS", '\GS')
- , ("RS", '\RS')
- , ("US", '\US')
- , ("SP", '\SP')
+ , ("FS" , '\FS' )
+ , ("GS" , '\GS' )
+ , ("RS" , '\RS' )
+ , ("US" , '\US' )
+ , ("SP" , '\SP' )
, ("DEL", '\DEL')
]
@@ -177,16 +209,24 @@ isSingleSmartQuote = \case
-- Assumes string is lexically valid. Skips the steps about splitting
-- and rejoining lines, and instead manually find newline characters,
-- for performance.
-postprocessMultilineString :: String -> Either LexErr String
-postprocessMultilineString =
- collapseGaps -- Step 1
- >>> expandLeadingTabs -- Step 3
- >>> rmCommonWhitespacePrefix -- Step 4
- >>> collapseOnlyWsLines -- Step 5
- >>> rmFirstNewline -- Step 7a
- >>> rmLastNewline -- Step 7b
- >>> resolveEscapes -- Step 8
+lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
+lexMultilineString len buf = postprocess <$> lexStringWith validateEscapeChar len buf
where
+ -- When initially lexing the string, throw any errors when parsing escape
+ -- characters, but otherwise, keep the escape characters verbatim and resolve
+ -- them at the end of postprocessing.
+ validateEscapeChar buf =
+ let (c, buf') = StringBuffer.nextChar buf
+ in (['\\', c], buf') <$ resolveEscapeCharBuf buf
+
+ postprocess =
+ expandLeadingTabs -- Step 3
+ >>> rmCommonWhitespacePrefix -- Step 4
+ >>> collapseOnlyWsLines -- Step 5
+ >>> rmFirstNewline -- Step 7a
+ >>> rmLastNewline -- Step 7b
+ >>> resolveEscapes -- Step 8
+
expandLeadingTabs =
let go !col = \case
c@' ' : cs -> c : go (col + 1) cs
@@ -237,6 +277,20 @@ postprocessMultilineString =
c : cs -> c : go cs
in go
+ resolveEscapes =
+ let uncons = \case
+ c : s -> (c, s)
+ [] -> panic "escape character unexpectedly ended"
+ go = \case
+ [] -> []
+ '\\' : s ->
+ case resolveEscapeChar uncons (\_ _ -> ()) s of
+ Right (c, s') -> c : go s'
+ -- shouldn't happen, as we've already validated escapes in validateEscapeChar
+ Left _ -> panic "unexpectedly encountered invalid escape character"
+ c : s -> c : go s
+ in go
+
-- | See step 4 in Note [Multiline string literals]
--
-- Assumes tabs have already been expanded.
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/8cc82093263c028854381e3fc8b201a5a2689ffd
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/8cc82093263c028854381e3fc8b201a5a2689ffd
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240824/17b72dc1/attachment-0001.html>
More information about the ghc-commits
mailing list