[Git][ghc/ghc][wip/T25609] 3 commits: Break out GHC.Parser.Lexer.Interface
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Sun Dec 29 23:03:46 UTC 2024
Brandon Chinn pushed to branch wip/T25609 at Glasgow Haskell Compiler / GHC
Commits:
ecd519f1 by Brandon Chinn at 2024-12-29T14:58:16-08:00
Break out GHC.Parser.Lexer.Interface
- - - - -
e5fd3795 by Brandon Chinn at 2024-12-29T14:58:25-08:00
Fix #25609
- - - - -
023b6395 by Brandon Chinn at 2024-12-29T15:02:10-08:00
Allow decrease in metrics
Metric Decrease:
MultiLayerModulesRecomp
parsing001
- - - - -
9 changed files:
- compiler/GHC/Parser/HaddockLex.x
- compiler/GHC/Parser/Lexer.x
- + compiler/GHC/Parser/Lexer/Interface.hs
- + compiler/GHC/Parser/Lexer/String.x
- compiler/ghc.cabal.in
- testsuite/tests/count-deps/CountDepsParser.stdout
- + testsuite/tests/parser/should_run/T25609.hs
- + testsuite/tests/parser/should_run/T25609.stdout
- testsuite/tests/parser/should_run/all.T
Changes:
=====================================
compiler/GHC/Parser/HaddockLex.x
=====================================
@@ -8,6 +8,7 @@ import GHC.Prelude
import GHC.Data.FastString
import GHC.Hs.Doc
import GHC.Parser.Lexer
+import GHC.Parser.Lexer.Interface (adjustChar)
import GHC.Parser.Annotation
import GHC.Types.SrcLoc
import GHC.Types.SourceText
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -78,7 +78,6 @@ module GHC.Parser.Lexer (
commentToAnnotation,
HdkComment(..),
warnopt,
- adjustChar,
addPsMessage
) where
@@ -132,6 +131,8 @@ import GHC.Driver.Flags
import GHC.Parser.Errors.Basic
import GHC.Parser.Errors.Types
import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.Lexer.Interface
+import qualified GHC.Parser.Lexer.String as Lexer.String
import GHC.Parser.String
}
@@ -622,13 +623,6 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
\" @stringchar* $unigraphic / { isSmartQuote } { smart_quote_error }
}
-<string_multi_content> {
- -- Parse as much of the multiline string as possible, except for quotes
- @stringchar* ($nl ([\ $tab] | @gap)* @stringchar*)* { tok_string_multi_content }
- -- Allow bare quotes if it's not a triple quote
- (\" | \"\") / ([\n .] # \") { tok_string_multi_content }
-}
-
<0> {
\'\' { token ITtyQuote }
@@ -2171,11 +2165,23 @@ tok_string span buf len _buf2 = do
src = SourceText $ lexemeToFastString buf len
endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
--- | Ideally, we would define this completely with Alex syntax, like normal strings.
--- Instead, this is defined as a hybrid solution by manually invoking lex states, which
--- we're doing for two reasons:
--- 1. The multiline string should all be one lexical token, not multiple
--- 2. We need to allow bare quotes, which can't be done with one regex
+{- Note [Lexing multiline strings]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Ideally, we would lex multiline strings completely with Alex syntax, like
+normal strings. However, we can't because:
+
+ 1. The multiline string should all be one lexical token, not multiple
+ 2. We need to allow bare quotes, which can't be done with one regex
+
+Instead, we'll lex them with a hybrid solution in tok_string_multi by manually
+invoking lex states. This allows us to get the performance of native Alex
+syntax as much as possible, and just gluing the pieces together outside of
+Alex.
+
+Implemented in string_multi_content in GHC/Parser/Lexer/String.x
+-}
+
+-- | See Note [Lexing multiline strings]
tok_string_multi :: Action
tok_string_multi startSpan startBuf _len _buf2 = do
-- advance to the end of the multiline string
@@ -2201,17 +2207,14 @@ tok_string_multi startSpan startBuf _len _buf2 = do
pure $ L span $ ITstringMulti src (mkFastString s)
where
goContent i0 =
- case alexScan i0 string_multi_content of
- AlexToken i1 len _
+ case Lexer.String.alexScan i0 Lexer.String.string_multi_content of
+ Lexer.String.AlexToken i1 len _
| Just i2 <- lexDelim i1 -> pure (i1, i2)
| isEOF i1 -> checkSmartQuotes >> setInput i1 >> lexError LexError
- -- is the next token a tab character?
- -- need this explicitly because there's a global rule matching $tab
- | Just ('\t', _) <- alexGetChar' i1 -> setInput i1 >> lexError LexError
-- Can happen if no patterns match, e.g. an unterminated gap
| len == 0 -> setInput i1 >> lexError LexError
| otherwise -> goContent i1
- AlexSkip i1 _ -> goContent i1
+ Lexer.String.AlexSkip i1 _ -> goContent i1
_ -> setInput i0 >> lexError LexError
lexDelim =
@@ -2235,11 +2238,6 @@ tok_string_multi startSpan startBuf _len _buf2 = do
Just (c, loc) -> throwSmartQuoteError c loc
Nothing -> pure ()
--- | Dummy action that should never be called. Should only be used in lex states
--- that are manually lexed in tok_string_multi.
-tok_string_multi_content :: Action
-tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
-
lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
lex_chars (startDelim, endDelim) span buf len =
either (throwStringLexError i0) pure $
@@ -2591,105 +2589,6 @@ getLastLocIncludingComments = P $ \s@(PState { prev_loc = prev_loc }) -> POk s p
getLastLoc :: P PsSpan
getLastLoc = P $ \s@(PState { last_loc = last_loc }) -> POk s last_loc
-data AlexInput = AI !PsLoc !StringBuffer deriving (Show)
-
-{-
-Note [Unicode in Alex]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Although newer versions of Alex support unicode, this grammar is processed with
-the old style '--latin1' behaviour. This means that when implementing the
-functions
-
- alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
- alexInputPrevChar :: AlexInput -> Char
-
-which Alex uses to take apart our 'AlexInput', we must
-
- * return a latin1 character in the 'Word8' that 'alexGetByte' expects
- * return a latin1 character in 'alexInputPrevChar'.
-
-We handle this in 'adjustChar' by squishing entire classes of unicode
-characters into single bytes.
--}
-
-{-# INLINE adjustChar #-}
-adjustChar :: Char -> Word8
-adjustChar c = adj_c
- where non_graphic = 0x00
- upper = 0x01
- lower = 0x02
- digit = 0x03
- symbol = 0x04
- space = 0x05
- other_graphic = 0x06
- uniidchar = 0x07
-
- adj_c
- | c <= '\x07' = non_graphic
- | c <= '\x7f' = fromIntegral (ord c)
- -- Alex doesn't handle Unicode, so when Unicode
- -- character is encountered we output these values
- -- with the actual character value hidden in the state.
- | otherwise =
- -- NB: The logic behind these definitions is also reflected
- -- in "GHC.Utils.Lexeme"
- -- Any changes here should likely be reflected there.
-
- case generalCategory c of
- UppercaseLetter -> upper
- LowercaseLetter -> lower
- TitlecaseLetter -> upper
- ModifierLetter -> uniidchar -- see #10196
- OtherLetter -> lower -- see #1103
- NonSpacingMark -> uniidchar -- see #7650
- SpacingCombiningMark -> other_graphic
- EnclosingMark -> other_graphic
- DecimalNumber -> digit
- LetterNumber -> digit
- OtherNumber -> digit -- see #4373
- ConnectorPunctuation -> symbol
- DashPunctuation -> symbol
- OpenPunctuation -> other_graphic
- ClosePunctuation -> other_graphic
- InitialQuote -> other_graphic
- FinalQuote -> other_graphic
- OtherPunctuation -> symbol
- MathSymbol -> symbol
- CurrencySymbol -> symbol
- ModifierSymbol -> symbol
- OtherSymbol -> symbol
- Space -> space
- _other -> non_graphic
-
--- Getting the previous 'Char' isn't enough here - we need to convert it into
--- the same format that 'alexGetByte' would have produced.
---
--- See Note [Unicode in Alex] and #13986.
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (AI _ buf) = unsafeChr (fromIntegral (adjustChar pc))
- where pc = prevChar buf '\n'
-
-unsafeChr :: Int -> Char
-unsafeChr (I# c) = GHC.Exts.C# (GHC.Exts.chr# c)
-
--- backwards compatibility for Alex 2.x
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar inp = case alexGetByte inp of
- Nothing -> Nothing
- Just (b,i) -> c `seq` Just (c,i)
- where c = unsafeChr $ fromIntegral b
-
--- See Note [Unicode in Alex]
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (AI loc s)
- | atEnd s = Nothing
- | otherwise = byte `seq` loc' `seq` s' `seq`
- --trace (show (ord c)) $
- Just (byte, (AI loc' s'))
- where (c,s') = nextChar s
- loc' = advancePsLoc loc c
- byte = adjustChar c
-
{-# INLINE alexGetChar' #-}
-- This version does not squash unicode characters, it is used when
-- lexing strings.
@@ -3470,6 +3369,7 @@ topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
-- If the generated alexScan/alexScanUser functions are called multiple times
-- in this file, alexScanUser gets broken out into a separate function and
-- increases memory usage. Make sure GHC inlines this function and optimizes it.
+-- https://github.com/haskell/alex/pull/262
{-# INLINE alexScanUser #-}
lexToken :: P (PsLocated Token)
=====================================
compiler/GHC/Parser/Lexer/Interface.hs
=====================================
@@ -0,0 +1,124 @@
+{-# LANGUAGE MagicHash #-}
+
+{- |
+This module defines the types and functions necessary for an Alex-generated
+lexer.
+
+https://haskell-alex.readthedocs.io/en/latest/api.html#
+-}
+module GHC.Parser.Lexer.Interface (
+ AlexInput (..),
+ alexGetByte,
+ alexInputPrevChar,
+
+ -- * Helpers
+ alexGetChar,
+ adjustChar,
+) where
+
+import GHC.Prelude
+
+import Data.Char (GeneralCategory (..), generalCategory, ord)
+import Data.Word (Word8)
+import GHC.Data.StringBuffer (StringBuffer, atEnd, nextChar, prevChar)
+import GHC.Exts
+import GHC.Types.SrcLoc (PsLoc, advancePsLoc)
+
+data AlexInput = AI !PsLoc !StringBuffer deriving (Show)
+
+-- See Note [Unicode in Alex]
+alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+alexGetByte (AI loc s)
+ | atEnd s = Nothing
+ | otherwise = byte `seq` loc' `seq` s' `seq`
+ --trace (show (ord c)) $
+ Just (byte, (AI loc' s'))
+ where (c,s') = nextChar s
+ loc' = advancePsLoc loc c
+ byte = adjustChar c
+
+-- Getting the previous 'Char' isn't enough here - we need to convert it into
+-- the same format that 'alexGetByte' would have produced.
+--
+-- See Note [Unicode in Alex] and #13986.
+alexInputPrevChar :: AlexInput -> Char
+alexInputPrevChar (AI _ buf) = unsafeChr (fromIntegral (adjustChar pc))
+ where pc = prevChar buf '\n'
+
+-- backwards compatibility for Alex 2.x
+alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
+alexGetChar inp = case alexGetByte inp of
+ Nothing -> Nothing
+ Just (b,i) -> c `seq` Just (c,i)
+ where c = unsafeChr $ fromIntegral b
+
+unsafeChr :: Int -> Char
+unsafeChr (I# c) = GHC.Exts.C# (GHC.Exts.chr# c)
+
+{-
+Note [Unicode in Alex]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Although newer versions of Alex support unicode, this grammar is processed with
+the old style '--latin1' behaviour. This means that when implementing the
+functions
+
+ alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+ alexInputPrevChar :: AlexInput -> Char
+
+which Alex uses to take apart our 'AlexInput', we must
+
+ * return a latin1 character in the 'Word8' that 'alexGetByte' expects
+ * return a latin1 character in 'alexInputPrevChar'.
+
+We handle this in 'adjustChar' by squishing entire classes of unicode
+characters into single bytes.
+-}
+
+{-# INLINE adjustChar #-}
+adjustChar :: Char -> Word8
+adjustChar c = adj_c
+ where non_graphic = 0x00
+ upper = 0x01
+ lower = 0x02
+ digit = 0x03
+ symbol = 0x04
+ space = 0x05
+ other_graphic = 0x06
+ uniidchar = 0x07
+
+ adj_c
+ | c <= '\x07' = non_graphic
+ | c <= '\x7f' = fromIntegral (ord c)
+ -- Alex doesn't handle Unicode, so when Unicode
+ -- character is encountered we output these values
+ -- with the actual character value hidden in the state.
+ | otherwise =
+ -- NB: The logic behind these definitions is also reflected
+ -- in "GHC.Utils.Lexeme"
+ -- Any changes here should likely be reflected there.
+
+ case generalCategory c of
+ UppercaseLetter -> upper
+ LowercaseLetter -> lower
+ TitlecaseLetter -> upper
+ ModifierLetter -> uniidchar -- see #10196
+ OtherLetter -> lower -- see #1103
+ NonSpacingMark -> uniidchar -- see #7650
+ SpacingCombiningMark -> other_graphic
+ EnclosingMark -> other_graphic
+ DecimalNumber -> digit
+ LetterNumber -> digit
+ OtherNumber -> digit -- see #4373
+ ConnectorPunctuation -> symbol
+ DashPunctuation -> symbol
+ OpenPunctuation -> other_graphic
+ ClosePunctuation -> other_graphic
+ InitialQuote -> other_graphic
+ FinalQuote -> other_graphic
+ OtherPunctuation -> symbol
+ MathSymbol -> symbol
+ CurrencySymbol -> symbol
+ ModifierSymbol -> symbol
+ OtherSymbol -> symbol
+ Space -> space
+ _other -> non_graphic
=====================================
compiler/GHC/Parser/Lexer/String.x
=====================================
@@ -0,0 +1,96 @@
+{
+{- |
+This module defines lex states for strings.
+
+This needs to be separate from the normal lexer because the normal lexer
+automatically includes rules like skipping whitespace or lexing comments,
+which we don't want in these contexts.
+-}
+module GHC.Parser.Lexer.String (
+ AlexReturn (..),
+ alexScan,
+ string_multi_content,
+) where
+
+import GHC.Prelude
+
+import GHC.Parser.Lexer.Interface
+import GHC.Utils.Panic (panic)
+}
+
+-- -----------------------------------------------------------------------------
+-- Alex "Character set macros"
+-- Copied from GHC/Parser/Lexer.x
+
+$unispace = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$nl = [\n\r\f]
+$space = [\ $unispace]
+$whitechar = [$nl \v $space]
+$tab = \t
+
+$ascdigit = 0-9
+$unidigit = \x03 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$decdigit = $ascdigit -- exactly $ascdigit, no more no less.
+$digit = [$ascdigit $unidigit]
+
+$special = [\(\)\,\;\[\]\`\{\}]
+$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$symbol = [$ascsymbol $unisymbol] # [$special \_\"\']
+
+$unilarge = \x01 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$asclarge = [A-Z]
+$large = [$asclarge $unilarge]
+
+$unismall = \x02 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$ascsmall = [a-z]
+$small = [$ascsmall $unismall \_]
+
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$idchar = [$small $large $digit $uniidchar \']
+
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
+$charesc = [a b f n r t v \\ \" \' \&]
+
+$octit = 0-7
+$hexit = [$decdigit A-F a-f]
+
+-- -----------------------------------------------------------------------------
+-- Alex "Regular expression macros"
+-- Copied from GHC/Parser/Lexer.x
+
+ at numspc = _* -- numeric spacer (#14473)
+ at decimal = $decdigit(@numspc $decdigit)*
+ at octal = $octit(@numspc $octit)*
+ at hexadecimal = $hexit(@numspc $hexit)*
+ at gap = \\ $whitechar+ \\
+ at cntrl = $asclarge | \@ | \[ | \\ | \] | \^ | \_
+ at ascii = \^ @cntrl | "NUL" | "SOH" | "STX" | "ETX" | "EOT" | "ENQ" | "ACK"
+ | "BEL" | "BS" | "HT" | "LF" | "VT" | "FF" | "CR" | "SO" | "SI" | "DLE"
+ | "DC1" | "DC2" | "DC3" | "DC4" | "NAK" | "SYN" | "ETB" | "CAN"
+ | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
+ at escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
+ at stringchar = ($graphic # [\\ \"]) | $space | @escape | @gap
+
+:-
+
+-- Define an empty rule so it compiles; callers should always explicitly specify a startcode
+<0> () ;
+
+-- See Note [Lexing multiline strings]
+<string_multi_content> {
+ -- Parse as much of the multiline string as possible, except for quotes
+ @stringchar* ($nl ([\ $tab] | @gap)* @stringchar*)* { string_multi_content_action }
+ -- Allow bare quotes if it's not a triple quote
+ (\" | \"\") / ([\n .] # \") { string_multi_content_action }
+}
+
+-- -----------------------------------------------------------------------------
+-- Haskell actions
+{
+-- | Dummy action that should never be called. Should only be used in lex states
+-- that are manually lexed in tok_string_multi.
+string_multi_content_action :: a
+string_multi_content_action = panic "string_multi_content_action unexpectedly invoked"
+}
\ No newline at end of file
=====================================
compiler/ghc.cabal.in
=====================================
@@ -646,6 +646,8 @@ Library
GHC.Parser.Errors.Types
GHC.Parser.Header
GHC.Parser.Lexer
+ GHC.Parser.Lexer.Interface
+ GHC.Parser.Lexer.String
GHC.Parser.HaddockLex
GHC.Parser.PostProcess
GHC.Parser.PostProcess.Haddock
=====================================
testsuite/tests/count-deps/CountDepsParser.stdout
=====================================
@@ -127,6 +127,8 @@ GHC.Parser.Errors.Ppr
GHC.Parser.Errors.Types
GHC.Parser.HaddockLex
GHC.Parser.Lexer
+GHC.Parser.Lexer.Interface
+GHC.Parser.Lexer.String
GHC.Parser.PostProcess
GHC.Parser.PostProcess.Haddock
GHC.Parser.String
=====================================
testsuite/tests/parser/should_run/T25609.hs
=====================================
@@ -0,0 +1,26 @@
+{-# LANGUAGE MultilineStrings #-}
+
+main :: IO ()
+main = do
+ -- strings with comment tokens
+ print """{- asdf -}"""
+ print """a {- asdf -} b"""
+ print """-- asdf"""
+ print """{-"""
+
+ -- strings with haddock comments
+ print """{- | test -}"""
+ print """{- * test -}"""
+ print """{- ^ test -}"""
+ print """{- $ test -}"""
+ print """-- | test"""
+ print """-- * test"""
+ print """-- ^ test"""
+ print """-- $ test"""
+
+ -- strings with only whitespace
+ print """ """
+ print """
+
+
+ """
=====================================
testsuite/tests/parser/should_run/T25609.stdout
=====================================
@@ -0,0 +1,14 @@
+"{- asdf -}"
+"a {- asdf -} b"
+"-- asdf"
+"{-"
+"{- | test -}"
+"{- * test -}"
+"{- ^ test -}"
+"{- $ test -}"
+"-- | test"
+"-- * test"
+"-- ^ test"
+"-- $ test"
+" "
+"\n"
=====================================
testsuite/tests/parser/should_run/all.T
=====================================
@@ -21,6 +21,9 @@ test('RecordDotSyntax3', [extra_files(['RecordDotSyntaxA.hs'])], multimod_compil
test('RecordDotSyntax4', [extra_files(['RecordDotSyntaxA.hs'])], multimod_compile_and_run, ['RecordDotSyntax4', ''])
test('RecordDotSyntax5', normal, compile_and_run, [''])
test('ListTuplePunsConstraints', extra_files(['ListTuplePunsConstraints.hs']), ghci_script, ['ListTuplePunsConstraints.script'])
+
+# Multiline strings
test('MultilineStrings', normal, compile_and_run, [''])
test('MultilineStringsOverloaded', normal, compile_and_run, [''])
test('T25375', normal, compile_and_run, [''])
+test('T25609', normal, compile_and_run, [''])
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/695b2a899e4fe5650c8aa9139027b881a124e09c...023b639513fbf7456cc195fa5c09ed79a8cf5f5d
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/695b2a899e4fe5650c8aa9139027b881a124e09c...023b639513fbf7456cc195fa5c09ed79a8cf5f5d
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20241229/fe0619dd/attachment-0001.html>
More information about the ghc-commits
mailing list