[Git][ghc/ghc][wip/strings] Fixes
Sebastian Graf (@sgraf812)
gitlab at gitlab.haskell.org
Wed Aug 14 12:57:42 UTC 2024
Sebastian Graf pushed to branch wip/strings at Glasgow Haskell Compiler / GHC
Commits:
b5d4cce6 by Sebastian Graf at 2024-08-14T14:57:36+02:00
Fixes
- - - - -
3 changed files:
- compiler/GHC/Data/StringBuffer.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
Changes:
=====================================
compiler/GHC/Data/StringBuffer.hs
=====================================
@@ -96,8 +96,11 @@ unsafeWithForeignPtr = withForeignPtr
data StringBuffer
= StringBuffer {
buf :: {-# UNPACK #-} !(ForeignPtr Word8),
- len :: {-# UNPACK #-} !Int, -- length
- cur :: {-# UNPACK #-} !Int -- current pos
+ len :: {-# UNPACK #-} !Int,
+ -- length, i.e., *end* position of the buffer, exclusive
+ cur :: {-# UNPACK #-} !Int
+ -- current pos. Invariant: cur < len. The true length of the represented
+ -- string is (len - cur - 1).
}
-- The buffer is assumed to be UTF-8 encoded, and furthermore
-- we add three @\'\\0\'@ bytes to the end as sentinels so that the
@@ -324,9 +327,13 @@ offsetBytes i s = s { cur = cur s + i }
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff s1 s2 = cur s2 - cur s1
--- | Restricts the length of the string buffer to the given number of bytes.
+-- | Restricts the length of the string buffer to the current position plus the
+-- given number of bytes.
takeBytes :: Int -> StringBuffer -> StringBuffer
-takeBytes new_len s = assert (len s >= new_len) $ s { len = new_len }
+takeBytes n s = assert (cur s + n < len s) s { len = cur s + n }
+
+byteLength :: StringBuffer -> Int
+byteLength s = len s - cur s - 1
-- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
atEnd :: StringBuffer -> Bool
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -41,6 +41,7 @@
-- Alex "Haskell code fragment top"
{
+{-# OPTIONS_GHC -ddump-simpl -ddump-to-file #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
@@ -2264,12 +2265,14 @@ tok_string_multi_line span buf len _buf2 = do
lastChar = currentChar (offsetBytes (len - 1) buf)
tok_string_like :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
-tok_string_like (startDelim, endDelim) span buf len = escaped_string_content loc buf_content
+tok_string_like (startDelim, endDelim) span buf bytes =
+ -- pprTrace "tok_string_like" (ppr (mkSrcSpanPs span) <+> ppr bytes <+> text startDelim <+> text endDelim $$ ppr (cur buf) <+> ppr (len buf) $$ ppr (cur buf_content) <+> ppr (len buf_content)) $
+ escaped_string_content loc buf_content
where
loc = foldr (flip advancePsLoc) (psSpanStart span) startDelim
buf_content =
- takeBytes (len - length endDelim) $ -- NB: ASCII => 1 byte per Char
- offsetBytes (length startDelim) $
+ offsetBytes (length startDelim) $ -- NB: ASCII => 1 byte per Char
+ takeBytes (bytes - length endDelim) $
buf
newtype LexErrE = LexErrE LexErr deriving (Show)
@@ -2278,21 +2281,22 @@ instance Exception LexErrE
escaped_string_content :: PsLoc -> StringBuffer -> P String
escaped_string_content loc buf0 = resolve id buf0 -- id: empty DList for the result string
where
- !h !: !tl = h : tl
+ snoc pre !h = \(!post) -> pre (h : post)
-- the following FSM assumes the string was lexed correctly
resolve res buf = case nextChar buf of
_ | atEnd buf -> pure $! res []
- ('\\', buf') -> resolve_esc res buf'
- (c, buf') -> resolve ((c!:) . res) buf'
+ ('\\', buf') -> resolve_esc res buf'
+ (c, buf') -> resolve (res `snoc` c) buf'
resolve_esc res buf = case nextChar buf of
('&', buf') -> resolve res buf'
(c, buf') | is_space c -> resolve res (drop_gap buf')
- _ -> case resolveEscapeCharacter buf nextChar of
- Right (esc, s) -> resolve ((esc!:) . res) s
- Left e -> do
- setInput (AI loc buf0 `advance_input_to` cur buf)
- lexError e
+ (_c, _) -> -- pprTrace "escape" (ppr (mkSrcSpanPs (mkPsSpan loc loc)) <+> char _c) $
+ case resolveEscapeCharacter buf nextChar of
+ Right (esc, s) -> resolve (res `snoc` esc) s
+ Left e -> do
+ setInput (AI loc buf0 `advance_input_to` cur buf)
+ lexError e
drop_gap buf = case nextChar buf of
('\\', buf') -> buf'
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -75,10 +75,14 @@ resolveEscapeCharacter s next = case next' s of
parseLongEscape :: s -> (s -> (Char, s)) -> Maybe (Char, s)
parseLongEscape s next = match longEscapeCodes s
where
- match [] !_ = Nothing
- match [("", c)] !s = Just (c, s)
- match codes s = case next s of
- (c, s') -> match (mapMaybe (pop_matching_head c) codes) s'
+ match codes s | (c, s') <- next s =
+ -- pprTrace "match" (char c <+> pprWithCommas (text . fst) codes) $
+ case mapMaybe (pop_matching_head c) codes of
+ [] -> case lookup "" codes of
+ Just c -> Just (c, s)
+ Nothing -> Nothing
+ codes' -> match codes' s'
+
pop_matching_head c1 (esc, r)
| c2:esc <- esc
, c1 == c2 = Just (esc, r)
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b5d4cce620fb238f6fbac4cca8b224e0479d5f64
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b5d4cce620fb238f6fbac4cca8b224e0479d5f64
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240814/9f717da9/attachment-0001.html>
More information about the ghc-commits
mailing list