[Git][ghc/ghc][master] 2 commits: Lexer: pass updated buffer to actions (#22201)
Marge Bot (@marge-bot)
gitlab at gitlab.haskell.org
Sun Sep 18 12:01:36 UTC 2022
Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
78037167 by Vladislav Zavialov at 2022-09-18T08:01:20-04:00
Lexer: pass updated buffer to actions (#22201)
In the lexer, predicates have the following type:
{ ... } :: user -- predicate state
-> AlexInput -- input stream before the token
-> Int -- length of the token
-> AlexInput -- input stream after the token
-> Bool -- True <=> accept the token
This is documented in the Alex manual.
There is access to the input stream both before and after the token.
But when the time comes to construct the token, GHC passes only the
initial string buffer to the lexer action. This patch fixes it:
- type Action = PsSpan -> StringBuffer -> Int -> P (PsLocated Token)
+ type Action = PsSpan -> StringBuffer -> Int -> StringBuffer -> P (PsLocated Token)
Now lexer actions have access to the string buffer both before and after
the token, just like the predicates. It's just a matter of passing an
additional function parameter throughout the lexer.
- - - - -
75746594 by Vladislav Zavialov at 2022-09-18T08:01:20-04:00
Lexer: define varsym without predicates (#22201)
Before this patch, the varsym lexing rules were defined as follows:
<0> {
@varsym / { precededByClosingToken `alexAndPred` followedByOpeningToken } { varsym_tight_infix }
@varsym / { followedByOpeningToken } { varsym_prefix }
@varsym / { precededByClosingToken } { varsym_suffix }
@varsym { varsym_loose_infix }
}
Unfortunately, this meant that the predicates 'precededByClosingToken' and
'followedByOpeningToken' were recomputed several times before we could figure
out the whitespace context.
With this patch, we check for whitespace context directly in the lexer
action:
<0> {
@varsym { with_op_ws varsym }
}
The checking for opening/closing tokens happens in 'with_op_ws' now,
which is part of the lexer action rather than the lexer predicate.
- - - - -
1 changed file:
- compiler/GHC/Parser/Lexer.x
Changes:
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -490,20 +490,12 @@ $tab { warnTab }
@conid "#"+ / { ifExtension MagicHashBit } { idtoken conid }
}
--- Operators classified into prefix, suffix, tight infix, and loose infix.
--- See Note [Whitespace-sensitive operator parsing]
-<0> {
- @varsym / { precededByClosingToken `alexAndPred` followedByOpeningToken } { varsym_tight_infix }
- @varsym / { followedByOpeningToken } { varsym_prefix }
- @varsym / { precededByClosingToken } { varsym_suffix }
- @varsym { varsym_loose_infix }
-}
-
-- ToDo: - move `var` and (sym) into lexical syntax?
-- - remove backquote from $special?
<0> {
@qvarsym { idtoken qvarsym }
@qconsym { idtoken qconsym }
+ @varsym { with_op_ws varsym }
@consym { consym }
}
@@ -707,6 +699,14 @@ $tab { warnTab }
{
+-- Operator whitespace occurrence. See Note [Whitespace-sensitive operator parsing].
+data OpWs
+ = OpWsPrefix -- a !b
+ | OpWsSuffix -- a! b
+ | OpWsTightInfix -- a!b
+ | OpWsLooseInfix -- a ! b
+ deriving Show
+
-- -----------------------------------------------------------------------------
-- The token type
@@ -1093,60 +1093,61 @@ reservedSymsFM = listToUFM $
-- -----------------------------------------------------------------------------
-- Lexer actions
-type Action = PsSpan -> StringBuffer -> Int -> P (PsLocated Token)
+type Action = PsSpan -> StringBuffer -> Int -> StringBuffer -> P (PsLocated Token)
special :: Token -> Action
-special tok span _buf _len = return (L span tok)
+special tok span _buf _len _buf2 = return (L span tok)
token, layout_token :: Token -> Action
-token t span _buf _len = return (L span t)
-layout_token t span _buf _len = pushLexState layout >> return (L span t)
+token t span _buf _len _buf2 = return (L span t)
+layout_token t span _buf _len _buf2 = pushLexState layout >> return (L span t)
idtoken :: (StringBuffer -> Int -> Token) -> Action
-idtoken f span buf len = return (L span $! (f buf len))
+idtoken f span buf len _buf2 = return (L span $! (f buf len))
qdo_token :: (Maybe FastString -> Token) -> Action
-qdo_token con span buf len = do
+qdo_token con span buf len _buf2 = do
maybe_layout token
return (L span $! token)
where
!token = con $! Just $! fst $! splitQualName buf len False
skip_one_varid :: (FastString -> Token) -> Action
-skip_one_varid f span buf len
+skip_one_varid f span buf len _buf2
= return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
skip_two_varid :: (FastString -> Token) -> Action
-skip_two_varid f span buf len
+skip_two_varid f span buf len _buf2
= return (L span $! f (lexemeToFastString (stepOn (stepOn buf)) (len-2)))
strtoken :: (String -> Token) -> Action
-strtoken f span buf len =
+strtoken f span buf len _buf2 =
return (L span $! (f $! lexemeToString buf len))
begin :: Int -> Action
-begin code _span _str _len = do pushLexState code; lexToken
+begin code _span _str _len _buf2 = do pushLexState code; lexToken
pop :: Action
-pop _span _buf _len = do _ <- popLexState
- lexToken
+pop _span _buf _len _buf2 =
+ do _ <- popLexState
+ lexToken
-- See Note [Nested comment line pragmas]
failLinePrag1 :: Action
-failLinePrag1 span _buf _len = do
+failLinePrag1 span _buf _len _buf2 = do
b <- getBit InNestedCommentBit
if b then return (L span ITcomment_line_prag)
else lexError LexErrorInPragma
-- See Note [Nested comment line pragmas]
popLinePrag1 :: Action
-popLinePrag1 span _buf _len = do
+popLinePrag1 span _buf _len _buf2 = do
b <- getBit InNestedCommentBit
if b then return (L span ITcomment_line_prag) else do
_ <- popLexState
lexToken
hopefully_open_brace :: Action
-hopefully_open_brace span buf len
+hopefully_open_brace span buf len buf2
= do relaxed <- getBit RelaxedLayoutBit
ctx <- getContext
(AI l _) <- getInput
@@ -1155,17 +1156,23 @@ hopefully_open_brace span buf len
case ctx of
Layout prev_off _ : _ -> prev_off < offset
_ -> True
- if isOK then pop_and open_brace span buf len
+ if isOK then pop_and open_brace span buf len buf2
else addFatalError $
mkPlainErrorMsgEnvelope (mkSrcSpanPs span) PsErrMissingBlock
pop_and :: Action -> Action
-pop_and act span buf len = do _ <- popLexState
- act span buf len
+pop_and act span buf len buf2 =
+ do _ <- popLexState
+ act span buf len buf2
-- See Note [Whitespace-sensitive operator parsing]
-followedByOpeningToken :: AlexAccPred ExtsBitmap
-followedByOpeningToken _ _ _ (AI _ buf)
+followedByOpeningToken, precededByClosingToken :: AlexAccPred ExtsBitmap
+followedByOpeningToken _ _ _ (AI _ buf) = followedByOpeningToken' buf
+precededByClosingToken _ (AI _ buf) _ _ = precededByClosingToken' buf
+
+-- The input is the buffer *after* the token.
+followedByOpeningToken' :: StringBuffer -> Bool
+followedByOpeningToken' buf
| atEnd buf = False
| otherwise =
case nextChar buf of
@@ -1179,9 +1186,9 @@ followedByOpeningToken _ _ _ (AI _ buf)
('⦇', _) -> True
(c, _) -> isAlphaNum c
--- See Note [Whitespace-sensitive operator parsing]
-precededByClosingToken :: AlexAccPred ExtsBitmap
-precededByClosingToken _ (AI _ buf) _ _ =
+-- The input is the buffer *before* the token.
+precededByClosingToken' :: StringBuffer -> Bool
+precededByClosingToken' buf =
case prevChar buf '\n' of
'}' -> decodePrevNChars 1 buf /= "-"
')' -> True
@@ -1193,6 +1200,19 @@ precededByClosingToken _ (AI _ buf) _ _ =
'⦈' -> True
c -> isAlphaNum c
+get_op_ws :: StringBuffer -> StringBuffer -> OpWs
+get_op_ws buf1 buf2 =
+ mk_op_ws (precededByClosingToken' buf1) (followedByOpeningToken' buf2)
+ where
+ mk_op_ws False True = OpWsPrefix
+ mk_op_ws True False = OpWsSuffix
+ mk_op_ws True True = OpWsTightInfix
+ mk_op_ws False False = OpWsLooseInfix
+
+{-# INLINE with_op_ws #-}
+with_op_ws :: (OpWs -> Action) -> Action
+with_op_ws act span buf len buf2 = act (get_op_ws buf buf2) span buf len buf2
+
{-# INLINE nextCharIs #-}
nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
@@ -1289,7 +1309,7 @@ alexOrPred p1 p2 userState in1 len in2
= p1 userState in1 len in2 || p2 userState in1 len in2
multiline_doc_comment :: Action
-multiline_doc_comment span buf _len = {-# SCC "multiline_doc_comment" #-} withLexedDocType worker
+multiline_doc_comment span buf _len _buf2 = {-# SCC "multiline_doc_comment" #-} withLexedDocType worker
where
worker input@(AI start_loc _) docType checkNextLine = go start_loc "" [] input
where
@@ -1335,11 +1355,11 @@ multiline_doc_comment span buf _len = {-# SCC "multiline_doc_comment" #-} withLe
Nothing -> input
lineCommentToken :: Action
-lineCommentToken span buf len = do
+lineCommentToken span buf len buf2 = do
b <- getBit RawTokenStreamBit
if b then do
lt <- getLastLocComment
- strtoken (\s -> ITlineComment s lt) span buf len
+ strtoken (\s -> ITlineComment s lt) span buf len buf2
else lexToken
@@ -1348,7 +1368,7 @@ lineCommentToken span buf len = do
using regular expressions.
-}
nested_comment :: Action
-nested_comment span buf len = {-# SCC "nested_comment" #-} do
+nested_comment span buf len _buf2 = {-# SCC "nested_comment" #-} do
l <- getLastLocComment
let endComment input (L _ comment) = commentEnd lexToken input (Nothing, ITblockComment comment l) buf span
input <- getInput
@@ -1357,7 +1377,7 @@ nested_comment span buf len = {-# SCC "nested_comment" #-} do
nested_comment_logic endComment start_decorator input span
nested_doc_comment :: Action
-nested_doc_comment span buf _len = {-# SCC "nested_doc_comment" #-} withLexedDocType worker
+nested_doc_comment span buf _len _buf2 = {-# SCC "nested_doc_comment" #-} withLexedDocType worker
where
worker input docType _checkNextLine = nested_comment_logic endComment "" input span
where
@@ -1496,7 +1516,7 @@ mkHdkCommentSection loc n mkDS = (HdkCommentSection n ds, ITdocComment ds loc)
-- RULES pragmas turn on the forall and '.' keywords, and we turn them
-- off again at the end of the pragma.
rulePrag :: Action
-rulePrag span buf len = do
+rulePrag span buf len _buf2 = do
setExts (.|. xbit InRulePragBit)
let !src = lexemeToString buf len
return (L span (ITrules_prag (SourceText src)))
@@ -1504,26 +1524,26 @@ rulePrag span buf len = do
-- When 'UsePosPragsBit' is not set, it is expected that we emit a token instead
-- of updating the position in 'PState'
linePrag :: Action
-linePrag span buf len = do
+linePrag span buf len buf2 = do
usePosPrags <- getBit UsePosPragsBit
if usePosPrags
- then begin line_prag2 span buf len
+ then begin line_prag2 span buf len buf2
else let !src = lexemeToString buf len
in return (L span (ITline_prag (SourceText src)))
-- When 'UsePosPragsBit' is not set, it is expected that we emit a token instead
-- of updating the position in 'PState'
columnPrag :: Action
-columnPrag span buf len = do
+columnPrag span buf len buf2 = do
usePosPrags <- getBit UsePosPragsBit
let !src = lexemeToString buf len
if usePosPrags
- then begin column_prag span buf len
+ then begin column_prag span buf len buf2
else let !src = lexemeToString buf len
in return (L span (ITcolumn_prag (SourceText src)))
endPrag :: Action
-endPrag span _buf _len = do
+endPrag span _buf _len _buf2 = do
setExts (.&. complement (xbit InRulePragBit))
return (L span ITclose_prag)
@@ -1567,11 +1587,11 @@ errBrace (AI end _) span =
(\srcLoc -> mkPlainErrorMsgEnvelope srcLoc (PsErrLexer LexUnterminatedComment LexErrKind_EOF))
open_brace, close_brace :: Action
-open_brace span _str _len = do
+open_brace span _str _len _buf2 = do
ctx <- getContext
setContext (NoLayout:ctx)
return (L span ITocurly)
-close_brace span _str _len = do
+close_brace span _str _len _buf2 = do
popContext
return (L span ITccurly)
@@ -1614,7 +1634,7 @@ splitQualName orig_buf len parens = split orig_buf orig_buf
qual_size = orig_buf `byteDiff` dot_buf
varid :: Action
-varid span buf len =
+varid span buf len _buf2 =
case lookupUFM reservedWordsFM fs of
Just (ITcase, _) -> do
lastTk <- getLastTk
@@ -1660,8 +1680,8 @@ qvarsym buf len = ITqvarsym $! splitQualName buf len False
qconsym buf len = ITqconsym $! splitQualName buf len False
-- See Note [Whitespace-sensitive operator parsing]
-varsym_prefix :: Action
-varsym_prefix = sym $ \span exts s ->
+varsym :: OpWs -> Action
+varsym OpWsPrefix = sym $ \span exts s ->
let warnExtConflict errtok =
do { addPsMessage (mkSrcSpanPs span) (PsWarnOperatorWhitespaceExtConflict errtok)
; return (ITvarsym s) }
@@ -1693,10 +1713,7 @@ varsym_prefix = sym $ \span exts s ->
(mkSrcSpanPs span)
(PsWarnOperatorWhitespace s OperatorWhitespaceOccurrence_Prefix)
; return (ITvarsym s) }
-
--- See Note [Whitespace-sensitive operator parsing]
-varsym_suffix :: Action
-varsym_suffix = sym $ \span _ s ->
+varsym OpWsSuffix = sym $ \span _ s ->
if | s == fsLit "@" -> failMsgP (\srcLoc -> mkPlainErrorMsgEnvelope srcLoc $ PsErrSuffixAT)
| s == fsLit "." -> return ITdot
| otherwise ->
@@ -1704,10 +1721,7 @@ varsym_suffix = sym $ \span _ s ->
(mkSrcSpanPs span)
(PsWarnOperatorWhitespace s OperatorWhitespaceOccurrence_Suffix)
; return (ITvarsym s) }
-
--- See Note [Whitespace-sensitive operator parsing]
-varsym_tight_infix :: Action
-varsym_tight_infix = sym $ \span exts s ->
+varsym OpWsTightInfix = sym $ \span exts s ->
if | s == fsLit "@" -> return ITat
| s == fsLit ".", OverloadedRecordDotBit `xtest` exts -> return (ITproj False)
| s == fsLit "." -> return ITdot
@@ -1716,10 +1730,7 @@ varsym_tight_infix = sym $ \span exts s ->
(mkSrcSpanPs span)
(PsWarnOperatorWhitespace s (OperatorWhitespaceOccurrence_TightInfix))
; return (ITvarsym s) }
-
--- See Note [Whitespace-sensitive operator parsing]
-varsym_loose_infix :: Action
-varsym_loose_infix = sym $ \_ _ s ->
+varsym OpWsLooseInfix = sym $ \_ _ s ->
if | s == fsLit "."
-> return ITdot
| otherwise
@@ -1729,7 +1740,7 @@ consym :: Action
consym = sym (\_span _exts s -> return $ ITconsym s)
sym :: (PsSpan -> ExtsBitmap -> FastString -> P Token) -> Action
-sym con span buf len =
+sym con span buf len _buf2 =
case lookupUFM reservedSymsFM fs of
Just (keyword, NormalSyntax, 0) ->
return $ L span keyword
@@ -1760,7 +1771,7 @@ tok_integral :: (SourceText -> Integer -> Token)
-> Int -> Int
-> (Integer, (Char -> Int))
-> Action
-tok_integral itint transint transbuf translen (radix,char_to_int) span buf len = do
+tok_integral itint transint transbuf translen (radix,char_to_int) span buf len _buf2 = do
numericUnderscores <- getBit NumericUnderscoresBit -- #14473
let src = lexemeToString buf len
when ((not numericUnderscores) && ('_' `elem` src)) $ do
@@ -1802,7 +1813,7 @@ hexadecimal = (16,hexDigit)
-- readSignificandExponentPair can understand negative rationals, exponents, everything.
tok_frac :: Int -> (String -> Token) -> Action
-tok_frac drop f span buf len = do
+tok_frac drop f span buf len _buf2 = do
numericUnderscores <- getBit NumericUnderscoresBit -- #14473
let src = lexemeToString buf (len-drop)
when ((not numericUnderscores) && ('_' `elem` src)) $ do
@@ -1837,7 +1848,7 @@ readFractionalLitX readStr b str =
-- we're at the first token on a line, insert layout tokens if necessary
do_bol :: Action
-do_bol span _str _len = do
+do_bol span _str _len _buf2 = do
-- See Note [Nested comment line pragmas]
b <- getBit InNestedCommentBit
if b then return (L span ITcomment_line_prag) else do
@@ -1888,7 +1899,7 @@ maybe_layout t = do -- If the alternative layout rule is enabled then
-- by a 'do', then we allow the new context to be at the same indentation as
-- the previous context. This is what the 'strict' argument is for.
new_layout_context :: Bool -> Bool -> Token -> Action
-new_layout_context strict gen_semic tok span _buf len = do
+new_layout_context strict gen_semic tok span _buf len _buf2 = do
_ <- popLexState
(AI l _) <- getInput
let offset = srcLocCol (psRealLoc l) - len
@@ -1907,7 +1918,7 @@ new_layout_context strict gen_semic tok span _buf len = do
return (L span tok)
do_layout_left :: Action
-do_layout_left span _buf _len = do
+do_layout_left span _buf _len _buf2 = do
_ <- popLexState
pushLexState bol -- we must be at the start of a line
return (L span ITvccurly)
@@ -1916,7 +1927,7 @@ do_layout_left span _buf _len = do
-- LINE pragmas
setLineAndFile :: Int -> Action
-setLineAndFile code (PsSpan span _) buf len = do
+setLineAndFile code (PsSpan span _) buf len _buf2 = do
let src = lexemeToString buf (len - 1) -- drop trailing quotation mark
linenumLen = length $ head $ words src
linenum = parseUnsignedInteger buf linenumLen 10 octDecDigit
@@ -1943,7 +1954,7 @@ setLineAndFile code (PsSpan span _) buf len = do
lexToken
setColumn :: Action
-setColumn (PsSpan span _) buf len = do
+setColumn (PsSpan span _) buf len _buf2 = do
let column =
case reads (lexemeToString buf len) of
[(column, _)] -> column
@@ -1969,7 +1980,7 @@ lex_string_prag mkTok = lex_string_prag_comment mkTok'
mkTok' s _ = mkTok s
lex_string_prag_comment :: (String -> PsSpan -> Token) -> Action
-lex_string_prag_comment mkTok span _buf _len
+lex_string_prag_comment mkTok span _buf _len _buf2
= do input <- getInput
start <- getParsedLoc
l <- getLastLocComment
@@ -1998,7 +2009,7 @@ lex_string_prag_comment mkTok span _buf _len
-- This stuff is horrible. I hates it.
lex_string_tok :: Action
-lex_string_tok span buf _len = do
+lex_string_tok span buf _len _buf2 = do
tok <- lex_string ""
(AI end bufEnd) <- getInput
let
@@ -2068,7 +2079,7 @@ lex_char_tok :: Action
-- (the parser does that).
-- So we have to do two characters of lookahead: when we see 'x we need to
-- see if there's a trailing quote
-lex_char_tok span buf _len = do -- We've seen '
+lex_char_tok span buf _len _buf2 = do -- We've seen '
i1 <- getInput -- Look ahead to first character
let loc = psSpanStart span
case alexGetChar' i1 of
@@ -2246,7 +2257,7 @@ getCharOrFail i = do
-- QuasiQuote
lex_qquasiquote_tok :: Action
-lex_qquasiquote_tok span buf len = do
+lex_qquasiquote_tok span buf len _buf2 = do
let (qual, quoter) = splitQualName (stepOn buf) (len - 2) False
quoteStart <- getParsedLoc
quote <- lex_quasiquote (psRealLoc quoteStart) ""
@@ -2258,7 +2269,7 @@ lex_qquasiquote_tok span buf len = do
mkPsSpan quoteStart end)))
lex_quasiquote_tok :: Action
-lex_quasiquote_tok span buf len = do
+lex_quasiquote_tok span buf len _buf2 = do
let quoter = tail (lexemeToString buf (len - 1))
-- 'tail' drops the initial '[',
-- while the -1 drops the trailing '|'
@@ -2297,14 +2308,14 @@ quasiquote_error start = do
-- Warnings
warnTab :: Action
-warnTab srcspan _buf _len = do
+warnTab srcspan _buf _len _buf2 = do
addTabWarning (psRealSpan srcspan)
lexToken
warnThen :: PsMessage -> Action -> Action
-warnThen warning action srcspan buf len = do
+warnThen warning action srcspan buf len buf2 = do
addPsMessage (RealSrcSpan (psRealSpan srcspan) Strict.Nothing) warning
- action srcspan buf len
+ action srcspan buf len buf2
-- -----------------------------------------------------------------------------
-- The Parse Monad
@@ -3401,7 +3412,7 @@ lexToken = do
let span = mkPsSpan loc1 end
let bytes = byteDiff buf buf2
span `seq` setLastToken span bytes
- lt <- t span buf bytes
+ lt <- t span buf bytes buf2
let lt' = unLoc lt
if (isComment lt') then setLastComment lt else setLastTk lt
return lt
@@ -3490,9 +3501,10 @@ twoWordPrags = Map.fromList [
]
dispatch_pragmas :: Map String Action -> Action
-dispatch_pragmas prags span buf len = case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
- Just found -> found span buf len
- Nothing -> lexError LexUnknownPragma
+dispatch_pragmas prags span buf len buf2 =
+ case Map.lookup (clean_pragma (lexemeToString buf len)) prags of
+ Just found -> found span buf len buf2
+ Nothing -> lexError LexUnknownPragma
known_pragma :: Map String Action -> AlexAccPred ExtsBitmap
known_pragma prags _ (AI _ startbuf) _ (AI _ curbuf)
@@ -3514,13 +3526,13 @@ clean_pragma prag = canon_ws (map toLower (unprefix prag))
canon_ws s = unwords (map canonical (words s))
warn_unknown_prag :: Map String Action -> Action
-warn_unknown_prag prags span buf len = do
+warn_unknown_prag prags span buf len buf2 = do
let uppercase = map toUpper
unknown_prag = uppercase (clean_pragma (lexemeToString buf len))
suggestions = map uppercase (Map.keys prags)
addPsMessage (RealSrcSpan (psRealSpan span) Strict.Nothing) $
PsWarnUnrecognisedPragma unknown_prag suggestions
- nested_comment span buf len
+ nested_comment span buf len buf2
{-
%************************************************************************
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/8a666ad2a89a8ad2aa24a6406b88f516afaec671...7574659452a864e762fa812cb38cf15f70d85617
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/8a666ad2a89a8ad2aa24a6406b88f516afaec671...7574659452a864e762fa812cb38cf15f70d85617
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20220918/5879daaf/attachment-0001.html>
More information about the ghc-commits
mailing list