[Git][ghc/ghc][wip/T25609] 2 commits: Fix #25609

Brandon Chinn (@brandonchinn178) gitlab at gitlab.haskell.org
Mon Dec 30 19:48:53 UTC 2024



Brandon Chinn pushed to branch wip/T25609 at Glasgow Haskell Compiler / GHC


Commits:
2925b014 by Brandon Chinn at 2024-12-30T11:48:36-08:00
Fix #25609

- - - - -
597f862d by Brandon Chinn at 2024-12-30T11:48:37-08:00
Allow decrease in metrics

Metric Decrease:
    MultiLayerModulesRecomp
    parsing001

- - - - -


7 changed files:

- compiler/GHC/Parser/Lexer.x
- + compiler/GHC/Parser/Lexer/String.x
- compiler/ghc.cabal.in
- testsuite/tests/count-deps/CountDepsParser.stdout
- + testsuite/tests/parser/should_run/T25609.hs
- + testsuite/tests/parser/should_run/T25609.stdout
- testsuite/tests/parser/should_run/all.T


Changes:

=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -132,6 +132,7 @@ import GHC.Parser.Errors.Basic
 import GHC.Parser.Errors.Types
 import GHC.Parser.Errors.Ppr ()
 import GHC.Parser.Lexer.Interface
+import qualified GHC.Parser.Lexer.String as Lexer.String
 import GHC.Parser.String
 }
 
@@ -622,13 +623,6 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
   \" @stringchar*    $unigraphic / { isSmartQuote } { smart_quote_error }
 }
 
-<string_multi_content> {
-  -- Parse as much of the multiline string as possible, except for quotes
-  @stringchar* ($nl ([\  $tab] | @gap)* @stringchar*)* { tok_string_multi_content }
-  -- Allow bare quotes if it's not a triple quote
-  (\" | \"\") / ([\n .] # \") { tok_string_multi_content }
-}
-
 <0> {
   \'\' { token ITtyQuote }
 
@@ -2171,11 +2165,23 @@ tok_string span buf len _buf2 = do
     src = SourceText $ lexemeToFastString buf len
     endsInHash = currentChar (offsetBytes (len - 1) buf) == '#'
 
--- | Ideally, we would define this completely with Alex syntax, like normal strings.
--- Instead, this is defined as a hybrid solution by manually invoking lex states, which
--- we're doing for two reasons:
---   1. The multiline string should all be one lexical token, not multiple
---   2. We need to allow bare quotes, which can't be done with one regex
+{- Note [Lexing multiline strings]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Ideally, we would lex multiline strings completely with Alex syntax, like
+normal strings. However, we can't because:
+
+    1. The multiline string should all be one lexical token, not multiple
+    2. We need to allow bare quotes, which can't be done with one regex
+
+Instead, we lex them with a hybrid solution in tok_string_multi, manually
+invoking lex states. This keeps as much of the performance of native Alex
+syntax as possible, while gluing the pieces together ourselves outside of
+Alex.
+
+This is implemented in string_multi_content in GHC/Parser/Lexer/String.x.
+-}
+
+-- | See Note [Lexing multiline strings]
 tok_string_multi :: Action
 tok_string_multi startSpan startBuf _len _buf2 = do
   -- advance to the end of the multiline string
@@ -2201,17 +2207,14 @@ tok_string_multi startSpan startBuf _len _buf2 = do
   pure $ L span $ ITstringMulti src (mkFastString s)
   where
     goContent i0 =
-      case alexScan i0 string_multi_content of
-        AlexToken i1 len _
+      case Lexer.String.alexScan i0 Lexer.String.string_multi_content of
+        Lexer.String.AlexToken i1 len _
           | Just i2 <- lexDelim i1 -> pure (i1, i2)
           | isEOF i1 -> checkSmartQuotes >> setInput i1 >> lexError LexError
-          -- is the next token a tab character?
-          -- need this explicitly because there's a global rule matching $tab
-          | Just ('\t', _) <- alexGetChar' i1 -> setInput i1 >> lexError LexError
           -- Can happen if no patterns match, e.g. an unterminated gap
           | len == 0  -> setInput i1 >> lexError LexError
           | otherwise -> goContent i1
-        AlexSkip i1 _ -> goContent i1
+        Lexer.String.AlexSkip i1 _ -> goContent i1
         _ -> setInput i0 >> lexError LexError
 
     lexDelim =
@@ -2235,11 +2238,6 @@ tok_string_multi startSpan startBuf _len _buf2 = do
         Just (c, loc) -> throwSmartQuoteError c loc
         Nothing -> pure ()
 
--- | Dummy action that should never be called. Should only be used in lex states
--- that are manually lexed in tok_string_multi.
-tok_string_multi_content :: Action
-tok_string_multi_content = panic "tok_string_multi_content unexpectedly invoked"
-
 lex_chars :: (String, String) -> PsSpan -> StringBuffer -> Int -> P String
 lex_chars (startDelim, endDelim) span buf len =
   either (throwStringLexError i0) pure $
@@ -3371,6 +3369,7 @@ topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
 -- If the generated alexScan/alexScanUser functions are called multiple times
 -- in this file, alexScanUser gets broken out into a separate function and
 -- increases memory usage. Make sure GHC inlines this function and optimizes it.
+-- https://github.com/haskell/alex/pull/262
 {-# INLINE alexScanUser #-}
 
 lexToken :: P (PsLocated Token)
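
A minimal sketch of the approach described in Note [Lexing multiline strings]
above (illustrative only, not the code in this patch): the body of a multiline
string is consumed in a single pass that accepts bare quotes and stops only at
the closing triple quote, so the whole literal ends up as one lexical token.
The name collectMulti is hypothetical.

    -- Illustrative sketch only: consume a multiline string body in one pass,
    -- allowing bare quotes and stopping only at the closing triple quote.
    -- Returns (body, rest after the delimiter), or Nothing if unterminated.
    collectMulti :: String -> Maybe (String, String)
    collectMulti = go []
      where
        go acc ('"':'"':'"':rest) = Just (reverse acc, rest)  -- closing delimiter
        go acc (c:rest)           = go (c:acc) rest           -- bare quotes allowed
        go _   []                 = Nothing                   -- unterminated literal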


=====================================
compiler/GHC/Parser/Lexer/String.x
=====================================
@@ -0,0 +1,96 @@
+{
+{- |
+This module defines lex states for strings.
+
+This needs to be separate from the normal lexer because the normal lexer
+automatically includes rules like skipping whitespace or lexing comments,
+which we don't want in these contexts.
+-}
+module GHC.Parser.Lexer.String (
+  AlexReturn (..),
+  alexScan,
+  string_multi_content,
+) where
+
+import GHC.Prelude
+
+import GHC.Parser.Lexer.Interface
+import GHC.Utils.Panic (panic)
+}
+
+-- -----------------------------------------------------------------------------
+-- Alex "Character set macros"
+-- Copied from GHC/Parser/Lexer.x
+
+$unispace    = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$nl          = [\n\r\f]
+$space       = [\ $unispace]
+$whitechar   = [$nl \v $space]
+$tab         = \t
+
+$ascdigit  = 0-9
+$unidigit  = \x03 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$decdigit  = $ascdigit -- exactly $ascdigit, no more no less.
+$digit     = [$ascdigit $unidigit]
+
+$special   = [\(\)\,\;\[\]\`\{\}]
+$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$symbol    = [$ascsymbol $unisymbol] # [$special \_\"\']
+
+$unilarge  = \x01 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$asclarge  = [A-Z]
+$large     = [$asclarge $unilarge]
+
+$unismall  = \x02 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$ascsmall  = [a-z]
+$small     = [$ascsmall $unismall \_]
+
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$idchar    = [$small $large $digit $uniidchar \']
+
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
+$graphic   = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
+$charesc   = [a b f n r t v \\ \" \' \&]
+
+$octit     = 0-7
+$hexit     = [$decdigit A-F a-f]
+
+-- -----------------------------------------------------------------------------
+-- Alex "Regular expression macros"
+-- Copied from GHC/Parser/Lexer.x
+
+@numspc       = _*                   -- numeric spacer (#14473)
+@decimal      = $decdigit(@numspc $decdigit)*
+@octal        = $octit(@numspc $octit)*
+@hexadecimal  = $hexit(@numspc $hexit)*
+@gap = \\ $whitechar+ \\
+@cntrl = $asclarge | \@ | \[ | \\ | \] | \^ | \_
+@ascii = \^ @cntrl | "NUL" | "SOH" | "STX" | "ETX" | "EOT" | "ENQ" | "ACK"
+       | "BEL" | "BS" | "HT" | "LF" | "VT" | "FF" | "CR" | "SO" | "SI" | "DLE"
+       | "DC1" | "DC2" | "DC3" | "DC4" | "NAK" | "SYN" | "ETB" | "CAN"
+       | "EM" | "SUB" | "ESC" | "FS" | "GS" | "RS" | "US" | "SP" | "DEL"
+@escape     = \\ ( $charesc      | @ascii | @decimal | o @octal | x @hexadecimal )
+@stringchar = ($graphic # [\\ \"]) | $space | @escape     | @gap
+
+:-
+
+-- Define an empty rule so it compiles; callers should always explicitly specify a startcode
+<0> () ;
+
+-- See Note [Lexing multiline strings]
+<string_multi_content> {
+  -- Parse as much of the multiline string as possible, except for quotes
+  @stringchar* ($nl ([\  $tab] | @gap)* @stringchar*)* { string_multi_content_action }
+  -- Allow bare quotes if it's not a triple quote
+  (\" | \"\") / ([\n .] # \") { string_multi_content_action }
+}
+
+-- -----------------------------------------------------------------------------
+-- Haskell actions
+{
+-- | Dummy action that should never be called. Should only be used in lex states
+-- that are manually lexed in tok_string_multi.
+string_multi_content_action :: a
+string_multi_content_action = panic "string_multi_content_action unexpectedly invoked"
+}
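
As a hedged illustration of what the string_multi_content rules accept (a
hypothetical usage example, not part of this patch): one or two consecutive
quotes may appear bare inside a multiline string as long as they do not form
the closing triple quote, and comment-like tokens such as -- and {- remain
literal text because this separate lexer has no comment or whitespace rules.

    {-# LANGUAGE MultilineStrings #-}

    -- Hypothetical usage example, not part of the patch.
    main :: IO ()
    main = do
      print """a "quoted" word and a ""doubled"" one"""  -- bare quotes allowed
      print """-- not a comment, {- nor this -}"""       -- no comment lexing inside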


=====================================
compiler/ghc.cabal.in
=====================================
@@ -647,6 +647,7 @@ Library
         GHC.Parser.Header
         GHC.Parser.Lexer
         GHC.Parser.Lexer.Interface
+        GHC.Parser.Lexer.String
         GHC.Parser.HaddockLex
         GHC.Parser.PostProcess
         GHC.Parser.PostProcess.Haddock


=====================================
testsuite/tests/count-deps/CountDepsParser.stdout
=====================================
@@ -128,6 +128,7 @@ GHC.Parser.Errors.Types
 GHC.Parser.HaddockLex
 GHC.Parser.Lexer
 GHC.Parser.Lexer.Interface
+GHC.Parser.Lexer.String
 GHC.Parser.PostProcess
 GHC.Parser.PostProcess.Haddock
 GHC.Parser.String


=====================================
testsuite/tests/parser/should_run/T25609.hs
=====================================
@@ -0,0 +1,26 @@
+{-# LANGUAGE MultilineStrings #-}
+
+main :: IO ()
+main = do
+  -- strings with comment tokens
+  print """{- asdf -}"""
+  print """a {- asdf -} b"""
+  print """-- asdf"""
+  print """{-"""
+
+  -- strings with haddock comments
+  print """{- | test -}"""
+  print """{- * test -}"""
+  print """{- ^ test -}"""
+  print """{- $ test -}"""
+  print """-- | test"""
+  print """-- * test"""
+  print """-- ^ test"""
+  print """-- $ test"""
+
+  -- strings with only whitespace
+  print """    """
+  print """
+
+
+        """


=====================================
testsuite/tests/parser/should_run/T25609.stdout
=====================================
@@ -0,0 +1,14 @@
+"{- asdf -}"
+"a {- asdf -} b"
+"-- asdf"
+"{-"
+"{- | test -}"
+"{- * test -}"
+"{- ^ test -}"
+"{- $ test -}"
+"-- | test"
+"-- * test"
+"-- ^ test"
+"-- $ test"
+"    "
+"\n"


=====================================
testsuite/tests/parser/should_run/all.T
=====================================
@@ -21,6 +21,9 @@ test('RecordDotSyntax3', [extra_files(['RecordDotSyntaxA.hs'])], multimod_compil
 test('RecordDotSyntax4', [extra_files(['RecordDotSyntaxA.hs'])], multimod_compile_and_run, ['RecordDotSyntax4', ''])
 test('RecordDotSyntax5', normal, compile_and_run, [''])
 test('ListTuplePunsConstraints', extra_files(['ListTuplePunsConstraints.hs']), ghci_script, ['ListTuplePunsConstraints.script'])
+
+# Multiline strings
 test('MultilineStrings', normal, compile_and_run, [''])
 test('MultilineStringsOverloaded', normal, compile_and_run, [''])
 test('T25375', normal, compile_and_run, [''])
+test('T25609', normal, compile_and_run, [''])



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/023b639513fbf7456cc195fa5c09ed79a8cf5f5d...597f862d74544fb5f809152779afaea1c64015e6

