[Git][ghc/ghc][wip/strings] StringBuffer-centric `escaped_string_content`
Sebastian Graf (@sgraf812)
gitlab at gitlab.haskell.org
Wed Aug 14 09:24:46 UTC 2024
Sebastian Graf pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
27ee0e34 by Sebastian Graf at 2024-08-14T11:24:15+02:00
StringBuffer-centric `escaped_string_content`
- - - - -
3 changed files:
- compiler/GHC/Data/StringBuffer.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
Changes:
=====================================
compiler/GHC/Data/StringBuffer.hs
=====================================
@@ -38,6 +38,7 @@ module GHC.Data.StringBuffer
stepOn,
offsetBytes,
byteDiff,
+ takeBytes,
atLine,
-- * Conversion
@@ -323,6 +324,10 @@ offsetBytes i s = s { cur = cur s + i }
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff s1 s2 = cur s2 - cur s1
+-- | Restricts the length of the string buffer to the given number of bytes.
+takeBytes :: Int -> StringBuffer -> StringBuffer
+takeBytes new_len s = assert (len s >= new_len) $ s { len = new_len }
+
-- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
atEnd :: StringBuffer -> Bool
atEnd (StringBuffer _ l c) = l == c
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -84,11 +84,8 @@ import GHC.Prelude
import qualified GHC.Data.Strict as Strict
-- base
-import Control.DeepSeq (deepseq)
-import Control.Exception (catch, throw)
import Control.Monad
import Control.Applicative
-import Data.Bifunctor (first)
import Data.Char
import Data.List (stripPrefix, isInfixOf, partition)
import Data.List.NonEmpty ( NonEmpty(..) )
@@ -96,7 +93,6 @@ import qualified Data.List.NonEmpty as NE
import Data.Maybe
import Data.Word
import Debug.Trace (trace)
-import System.IO.Unsafe (unsafePerformIO)
import GHC.Data.EnumSet as EnumSet
@@ -2234,7 +2230,7 @@ lex_string_prag_comment mkTok span _buf _len _buf2
-- FIXME.bchinn: throw better error for escaped smart quotes
tok_string :: Action
tok_string span buf len _buf2 = do
- s <- lex_string ("\"", "\"") buf (if endsInHash then len - 1 else len)
+ s <- tok_string_like ("\"", "\"") span buf (if endsInHash then len - 1 else len)
if endsInHash
then do
@@ -2252,7 +2248,8 @@ tok_string span buf len _buf2 = do
tok_string_multi_line :: Action
tok_string_multi_line span buf len _buf2 = do
- s <- lex_string' resolveEsc ("", "\"\"\"") buf len
+ -- TODO:
+ -- s <- tok_string_like ("", "\"\"\"") buf len
-- this action guaranteed to only run after lexing a newline or a triple quote
case lastChar of
@@ -2260,72 +2257,57 @@ tok_string_multi_line span buf len _buf2 = do
'\n' -> popLexState >> pushLexState string_multi_line
_ -> panic $ "Got unexpected last character: " ++ show lastChar
- pure $ L span (ITstring_multi_line src (mkFastString s))
+ pure $ L span (ITstring_multi_line src s)
where
- src = SourceText $ lexemeToFastString buf len
+ s = lexemeToFastString buf len
+ src = SourceText s
lastChar = currentChar (offsetBytes (len - 1) buf)
- -- Don't resolve escape characters here, defer until postprocessMultilineString.
- -- However, we still want to validate them
- resolveEsc cs =
- let result =
- case cs of
- c : cs' -> (['\\', c], cs') -- for the sake of validation, pretend we always escape just one character
- [] -> panic "Unexpectedly resolving an empty escape character"
- in result <$ resolveEscapeCharacter cs
-
-lex_string :: (String, String) -> StringBuffer -> Int -> P String
-lex_string = lex_string' resolveEsc
+tok_string_like :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
+tok_string_like (startDelim, endDelim) span buf len = escaped_string_content loc buf_content
where
- resolveEsc = fmap (first (:[])) . resolveEscapeCharacter
+ loc = foldr (flip advancePsLoc) (psSpanStart span) startDelim
+ buf_content =
+ takeBytes (len - length endDelim) $ -- NB: ASCII => 1 byte per Char
+ offsetBytes (length startDelim) $
+ buf
newtype LexErrE = LexErrE LexErr deriving (Show)
instance Exception LexErrE
-lex_string' :: ([Char] -> Either LexErr ([Char], [Char])) -> (String, String) -> StringBuffer -> Int -> P String
-lex_string' resolveEsc (startDelim, endDelim) buf len = do
- let s = resolve . stripDelims $ lexemeToString buf len
-
- -- Unfortunately, `resolve` is only performant if it's pure; allocations
- -- and performance degrade when `resolve` is implemented in P or ST. So
- -- we'll throw an impure exception and catch it here
- unsafePerformIO $ (s `deepseq` pure (pure ())) `catch` \(LexErrE e) -> pure $ lexError e
-
- pure s
+escaped_string_content :: PsLoc -> StringBuffer -> P String
+escaped_string_content loc buf0 = resolve id buf0 -- id: empty DList for the result string
where
- stripDelims = stripPrefixOrId startDelim . stripSuffixOrId endDelim
- stripPrefixOrId pre s = fromMaybe s (stripPrefix pre s)
-
- -- assumes string was lexed correctly
- resolve = \case
- [] -> []
- '\\' : '&' : cs -> resolve cs
- '\\' : c : cs | is_space c -> resolve $ dropGap cs
- '\\' : cs ->
- case resolveEsc cs of
- Right (esc, cs') -> esc ++ resolve cs'
- Left e -> throw (LexErrE e)
- c : cs -> c : resolve cs
-
- dropGap = \case
- '\\' : cs -> cs
- _ : cs -> dropGap cs
- [] -> panic "gap unexpectedly ended"
-
- -- unlike stripPrefixOrId, we implement this manually,
- -- because implementing stripSuffix returning a Maybe
- -- has much worse performance
- -- TODO: move to Data.List?
- stripSuffixOrId :: Eq a => [a] -> [a] -> [a]
- stripSuffixOrId suf xs =
- let go (a:as) (_:bs) = a : go as bs
- go as _ = if as == suf then [] else as
- in go xs (drop (length suf) xs)
-
+ !h !: !tl = h : tl
+ -- the following FSM assumes the string was lexed correctly
+ resolve res buf = case nextChar buf of
+ _ | atEnd buf -> pure $! res []
+ ('\\', buf') -> resolve_esc res buf'
+ (c, buf') -> resolve ((c!:) . res) buf'
+
+ resolve_esc res buf = case nextChar buf of
+ ('&', buf') -> resolve res buf'
+ (c, buf') | is_space c -> resolve res (drop_gap buf')
+ _ -> case resolveEscapeCharacter buf nextChar of
+ Right (esc, s) -> resolve ((esc!:) . res) s
+ Left e -> do
+ setInput (AI loc buf0 `advance_input_to` cur buf)
+ lexError e
+
+ drop_gap buf = case nextChar buf of
+ ('\\', buf') -> buf'
+ (_, buf') -> drop_gap buf'
+
+ advance_input_to ai@(AI _ buf) offs
+ | cur buf < offs
+ , Just (_, ai') <- alexGetChar' ai
+ = advance_input_to ai' offs
+ | otherwise
+ = assert (cur buf == offs) ai
tok_quoted_label :: Action
tok_quoted_label span buf len _buf2 = do
- s <- lex_string ("#\"", "\"") buf len
+ s <- tok_string_like ("#\"", "\"") span buf len
(AI end bufEnd) <- getInput
let
token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2337,7 +2319,7 @@ tok_quoted_label span buf len _buf2 = do
tok_char :: Action
tok_char span buf len _buf2 = do
- c <- lex_string ("'", "'") buf (if endsInHash then len - 1 else len) >>= \case
+ c <- tok_string_like ("'", "'") span buf (if endsInHash then len - 1 else len) >>= \case
[c] -> pure c
s -> panic $ "tok_char expected exactly one character, got: " ++ show s
pure . L span $
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -18,7 +18,7 @@ import Control.Monad (when)
import Data.Char (chr, ord)
import qualified Data.Foldable1 as Foldable1
import qualified Data.List.NonEmpty as NonEmpty
-import Data.Maybe (listToMaybe, mapMaybe)
+import Data.Maybe (mapMaybe)
import GHC.Parser.CharClass (
hexDigit,
is_decdigit,
@@ -34,46 +34,55 @@ import GHC.Utils.Panic (panic)
-- Escape characters
-- Assumes escape character is valid
-resolveEscapeCharacter :: [Char] -> Either LexErr (Char, [Char])
-resolveEscapeCharacter = \case
- 'a' : cs -> pure ('\a', cs)
- 'b' : cs -> pure ('\b', cs)
- 'f' : cs -> pure ('\f', cs)
- 'n' : cs -> pure ('\n', cs)
- 'r' : cs -> pure ('\r', cs)
- 't' : cs -> pure ('\t', cs)
- 'v' : cs -> pure ('\v', cs)
- '\\' : cs -> pure ('\\', cs)
- '"' : cs -> pure ('\"', cs)
- '\'' : cs -> pure ('\'', cs)
+resolveEscapeCharacter :: s -> (s -> (Char, s)) -> Either LexErr (Char, s)
+-- In practice, s is a StringBuffer. We never check for `atEnd` before we
+-- call `nextChar`, which sounds dangerous.
+-- However, in practice string content is terminated by a string delimiter, so
+-- we avoid UB by never progressing beyond such a character.
+resolveEscapeCharacter s next = case next' s of
+ ('a', s') -> pure ('\a', s')
+ ('b', s') -> pure ('\b', s')
+ ('f', s') -> pure ('\f', s')
+ ('n', s') -> pure ('\n', s')
+ ('r', s') -> pure ('\r', s')
+ ('t', s') -> pure ('\t', s')
+ ('v', s') -> pure ('\v', s')
+ ('\\', s') -> pure ('\\', s')
+ ('"' , s') -> pure ('\"', s')
+ ('\'', s') -> pure ('\'', s')
-- escape codes
- 'x' : cs -> parseNum is_hexdigit 16 hexDigit cs
- 'o' : cs -> parseNum is_octdigit 8 octDecDigit cs
- c : cs | is_decdigit c -> parseNum is_decdigit 10 octDecDigit (c : cs)
+ ('x', s') -> parseNum is_hexdigit 16 hexDigit s'
+ ('o', s') -> parseNum is_octdigit 8 octDecDigit s'
+ (c, _) | is_decdigit c -> parseNum is_decdigit 10 octDecDigit s
-- control characters (e.g. '\^M')
- '^' : c : cs -> pure (chr $ ord c - ord '@', cs)
+ ('^', s') | (c, s') <- next' s' -> pure (chr $ ord c - ord '@', s')
-- long form escapes (e.g. '\NUL')
- cs | Just (c, cs') <- parseLongEscape cs -> pure (c, cs')
+ _ | Just (!c, !s') <- parseLongEscape s next' -> pure (c, s')
-- shouldn't happen
- c : _ -> panic $ "found unexpected escape character: " ++ show c
- [] -> panic $ "escape character unexpectedly ended"
+ (c, _) -> panic $ "found unexpected escape character: " ++ show c
where
+ next' s = case next s of
+ (!c, !s') -> (c, s')
parseNum isDigit base toDigit =
- let go x = \case
- c : cs | isDigit c -> do
+ let go !x s = case next' s of
+ (c, s') | isDigit c -> do
let x' = x * base + toDigit c
when (x' > 0x10ffff) $ Left LexNumEscapeRange
- go x' cs
- cs -> pure (chr x, cs)
+ go x' s'
+ _ -> pure (chr x, s)
in go 0
-parseLongEscape :: [Char] -> Maybe (Char, [Char])
-parseLongEscape cs = listToMaybe $ mapMaybe tryParse longEscapeCodes
+parseLongEscape :: s -> (s -> (Char, s)) -> Maybe (Char, s)
+parseLongEscape s next = match longEscapeCodes s
where
- tryParse (code, c) =
- case splitAt (length code) cs of
- (pre, cs') | pre == code -> Just (c, cs')
- _ -> Nothing
+ match [] !_ = Nothing
+ match [("", c)] !s = Just (c, s)
+ match codes s = case next s of
+ (c, s') -> match (mapMaybe (pop_matching_head c) codes) s'
+ pop_matching_head c1 (esc, r)
+ | c2:esc <- esc
+ , c1 == c2 = Just (esc, r)
+ | otherwise = Nothing
longEscapeCodes =
[ ("NUL", '\NUL')
@@ -195,7 +204,7 @@ postprocessMultilineString =
resolveEscapeChars = \case
[] -> []
'\\' : s ->
- case resolveEscapeCharacter s of
+ case resolveEscapeCharacter s (\s -> (head s, tail s)) of
Left e -> panic $ "resolving escape characters in multiline string unexpectedly found errors: " ++ show e
Right (c, s') -> c : resolveEscapeChars s'
c : s -> c : resolveEscapeChars s
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/27ee0e346ac5c24d41a621b843f6313fdb6defbc
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/27ee0e346ac5c24d41a621b843f6313fdb6defbc
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240814/76c6d65c/attachment-0001.html>
More information about the ghc-commits
mailing list