[Git][ghc/ghc][wip/strings] 3 commits: Inline alexScanUser

Wed Sep 18 05:07:36 UTC 2024


Brandon Chinn pushed to branch wip/strings at Glasgow Haskell Compiler / GHC


Commits:
fb6c36bd by Brandon Chinn at 2024-09-17T21:17:20-07:00
Inline alexScanUser

- - - - -
c3780088 by Brandon Chinn at 2024-09-17T21:17:23-07:00
Use dlist to reduce allocations

- - - - -
4607c624 by Brandon Chinn at 2024-09-17T21:17:23-07:00
Duplicate processChars for specialization

- - - - -


2 changed files:

- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs


Changes:

=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -3551,6 +3551,11 @@ topNoLayoutContainsCommas [] = False
 topNoLayoutContainsCommas (ALRLayout _ _ : ls) = topNoLayoutContainsCommas ls
 topNoLayoutContainsCommas (ALRNoLayout b _ : _) = b
 
+-- If the generated alexScan/alexScanUser functions are called multiple times
+-- in this file, alexScanUser gets broken out into a separate function and
+-- increases memory usage. Make sure GHC inlines this function and optimizes it.
+{-# INLINE alexScanUser #-}
+
 lexToken :: P (PsLocated Token)
 lexToken = do
   inp@(AI loc1 buf) <- getInput


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -18,11 +18,9 @@ import GHC.Prelude hiding (getChar)
 import Control.Arrow ((>>>))
 import Control.Monad (when)
 import Data.Char (chr, ord)
-import qualified Data.Foldable as Seq (toList)
 import qualified Data.Foldable1 as Foldable1
 import qualified Data.List.NonEmpty as NonEmpty
 import Data.Maybe (listToMaybe, mapMaybe)
-import qualified Data.Sequence as Seq
 import GHC.Data.StringBuffer (StringBuffer)
 import qualified GHC.Data.StringBuffer as StringBuffer
 import GHC.Parser.CharClass (
@@ -40,7 +38,7 @@ type BufPos = Int
 data StringLexError = StringLexError LexErr BufPos
 
 lexString :: Int -> StringBuffer -> Either StringLexError String
-lexString = lexStringWith processChars
+lexString = lexStringWith processChars processChars
   where
     processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
     processChars =
@@ -68,19 +66,24 @@ So what we'll do is do two passes. The first pass is optimistic; just convert
 to a plain String and process it. If this results in an error, we do a second
 pass, this time where each character is annotated with its position. Now, the
 error has all the information it needs.
+
+Ideally, lexStringWith would take a single (forall c. HasChar c => ...) function,
+but to help the specializer, we pass it in twice to concretize it for the two
+types we actually use.
 -}
 
 -- | See Note [Lexing strings]
 lexStringWith ::
-  (forall c. HasChar c => [c] -> Either (c, LexErr) [c])
+  ([Char] -> Either (Char, LexErr) [Char])
+  -> ([CharPos] -> Either (CharPos, LexErr) [CharPos])
   -> Int
   -> StringBuffer
   -> Either StringLexError String
-lexStringWith processChars len buf =
+lexStringWith processChars processCharsPos len buf =
   case processChars $ bufferChars buf len of
     Right s -> Right s
     Left _ ->
-      case processChars $ bufferLocatedChars buf len of
+      case processCharsPos $ bufferLocatedChars buf len of
         Right _ -> panic "expected lex error on second pass"
         Left ((_, pos), e) -> Left $ StringLexError e pos
 
@@ -103,7 +106,9 @@ pattern Char c <- (getChar -> c)
 bufferChars :: StringBuffer -> Int -> [Char]
 bufferChars = StringBuffer.lexemeToString
 
-bufferLocatedChars :: StringBuffer -> Int -> [(Char, BufPos)]
+type CharPos = (Char, BufPos)
+
+bufferLocatedChars :: StringBuffer -> Int -> [CharPos]
 bufferLocatedChars initialBuf len = go initialBuf
   where
     go buf
@@ -133,17 +138,16 @@ collapseGaps = go
       [] -> panic "gap unexpectedly ended"
 
 resolveEscapes :: HasChar c => [c] -> Either (c, LexErr) [c]
-resolveEscapes = go Seq.empty
+resolveEscapes = go dlistEmpty
   where
-    -- FIXME.bchinn: see if dlist/reverselist improves performance
     go !acc = \case
-      [] -> pure $ Seq.toList acc
+      [] -> pure $ dlistToList acc
       Char '\\' : Char '&' : cs -> go acc cs
       backslash@(Char '\\') : cs ->
         case resolveEscapeChar cs of
-          Right (esc, cs') -> go (acc Seq.|> setChar esc backslash) cs'
+          Right (esc, cs') -> go (acc `dlistSnoc` setChar esc backslash) cs'
           Left (c, e) -> Left (c, e)
-      c : cs -> go (acc Seq.|> c) cs
+      c : cs -> go (acc `dlistSnoc` c) cs
 
 -- -----------------------------------------------------------------------------
 -- Escape characters
@@ -252,7 +256,7 @@ isSingleSmartQuote = \case
 -- and rejoining lines, and instead manually find newline characters,
 -- for performance.
 lexMultilineString :: Int -> StringBuffer -> Either StringLexError String
-lexMultilineString = lexStringWith processChars
+lexMultilineString = lexStringWith processChars processChars
   where
     processChars :: HasChar c => [c] -> Either (c, LexErr) [c]
     processChars =
@@ -373,3 +377,17 @@ It's more precisely defined with the following algorithm:
     * Lines with only whitespace characters
 3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
 -}
+
+-- -----------------------------------------------------------------------------
+-- DList
+
+newtype DList a = DList ([a] -> [a])
+
+dlistEmpty :: DList a
+dlistEmpty = DList id
+
+dlistToList :: DList a -> [a]
+dlistToList (DList f) = f []
+
+dlistSnoc :: DList a -> a -> DList a
+dlistSnoc (DList f) x = DList (f . (x :))



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/d70e41ae439b6a6f2967b43d977b8af6a8febe63...4607c624e4cf4cf27ba9a9c443b2cd203a43b883

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/d70e41ae439b6a6f2967b43d977b8af6a8febe63...4607c624e4cf4cf27ba9a9c443b2cd203a43b883
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240918/00d15070/attachment-0001.html>