[Git][ghc/ghc][master] Unicode: adding compact version of GeneralCategory (resolves #24789)
Marge Bot (@marge-bot)
gitlab at gitlab.haskell.org
Thu Jun 27 01:52:11 UTC 2024
Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
f4606ae0 by Serge S. Gulin at 2024-06-26T21:51:05-04:00
Unicode: adding compact version of GeneralCategory (resolves #24789)
The following features are applied:
1. Lookup code like Cmm-switches (draft implementation proposed by Sylvain Henry @hsyl20)
2. Nested ifs (logarithmic search vs linear search) (the idea proposed by Sylvain Henry @hsyl20)
-------------------------
Metric Decrease:
size_hello_artifact
size_hello_unicode
-------------------------
- - - - -
8 changed files:
- libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
- libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/ByteString.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
- libraries/ghc-internal/tools/ucd2haskell/exe/UCD2Haskell/ModuleGenerators.hs
- libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal
Changes:
=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
=====================================
@@ -1,6 +1,9 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedNewtypes #-}
+{-# LANGUAGE BlockArguments #-}
-----------------------------------------------------------------------------
-- |
@@ -18,20 +21,23 @@
-----------------------------------------------------------------------------
module GHC.Internal.Unicode.Bits
- ( lookupBit64,
- lookupIntN
- ) where
+ ( lookupIntN
+ , lookupBit64
+ , newByteArrayFromWord8List
+ , byteArrayLookupIntN
+ , copyAddrToWord8List
+ , UnicodeByteArray
+ )
+ where
-import GHC.Internal.Base (Bool, Int(..), Word(..), Eq(..))
import GHC.Internal.Bits (finiteBitSize, popCount)
import {-# SOURCE #-} GHC.Internal.ByteOrder
import GHC.Prim
- (Addr#,
- indexWordOffAddr#, indexWord8OffAddr#,
- andI#, uncheckedIShiftRL#,
- and#, word2Int#, uncheckedShiftL#,
- word8ToWord#, byteSwap#)
-import GHC.Internal.Num ((-))
+import GHC.Internal.ST
+import GHC.Internal.Base
+import GHC.Internal.Num
+import GHC.Internal.List
+import GHC.Internal.Word
-- | @lookup64 addr index@ looks up the bit stored at bit index @index@ using a
-- bitmap starting at the address @addr at . Looks up the 64-bit word containing
@@ -71,3 +77,41 @@ lookupIntN
lookupIntN addr# (I# index#) =
let word## = word8ToWord# (indexWord8OffAddr# addr# index#)
in I# (word2Int# word##)
+
+data UnicodeByteArray = UnicodeByteArray !ByteArray#
+
+byteArrayLookupIntN :: UnicodeByteArray -> Int -> Int
+byteArrayLookupIntN ba idx
+ = let !(UnicodeByteArray addr) = ba
+ in lookupIntN (byteArrayContents# addr) idx
+
+newByteArrayFromWord8List :: [Word8] -> UnicodeByteArray
+newByteArrayFromWord8List xs = runST $ ST \s0 ->
+ case newPinnedByteArray# len s0 of
+ !(# s1, mba #) ->
+ let s2 = fillByteArray mba 0# xs s1
+ in case unsafeFreezeByteArray# mba s2 of
+ !(# s3, fba #) -> (# s3, UnicodeByteArray fba #)
+ where
+ !(I# len) = length xs
+
+ fillByteArray _ _ [] s = s
+ fillByteArray mba i (y:ys) s =
+ let !(W8# y#) = y
+ s' = writeWord8Array# mba i y# s
+ in fillByteArray mba (i +# 1#) ys s'
+
+copyAddrToWord8List :: Addr# -> Int -> [Word8]
+copyAddrToWord8List addr !len@(I# len') = runST $ ST \s0 ->
+ case newByteArray# len' s0 of
+ !(# s1, mba #) ->
+ let s2 = copyAddrToByteArray# addr mba 0# len' s1
+ in case unsafeFreezeByteArray# mba s2 of
+ !(# s3, fba #) -> (# s3, readByteFromArray fba 0 len #)
+ where
+ readByteFromArray :: ByteArray# -> Int -> Int -> [Word8]
+ readByteFromArray ba !from@(I# from') to =
+ W8# (indexWord8Array# ba from') :
+ if from == (to - 1)
+ then []
+ else readByteFromArray ba (from + 1) to
=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
=====================================
The diff for this file was not included because it is too large.
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/ByteString.hs
=====================================
@@ -0,0 +1,19 @@
+module Generator.ByteString where
+
+import Data.List (intercalate, intersperse)
+import qualified Data.ByteString.Builder as BB
+
+replicateBB :: Int -> BB.Builder -> BB.Builder
+replicateBB indent b = mconcat (replicate indent b)
+
+unlinesBBWithIndent :: Int -> [BB.Builder] -> BB.Builder
+unlinesBBWithIndent indent =
+ let indent_replicated = replicateBB indent " "
+ in mconcat . intersperse ("\n" <> indent_replicated)
+
+unlinesBB :: [BB.Builder] -> BB.Builder
+unlinesBB = (<> "\n") . unlinesBBWithIndent 0
+
+unwordsBB :: [BB.Builder] -> BB.Builder
+unwordsBB = mconcat . intersperse " "
+
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
=====================================
@@ -0,0 +1,129 @@
+{-# LANGUAGE BlockArguments #-}
+
+module Generator.GeneralCategory (GeneralCategory (..), generateGeneralCategoryCode) where
+
+import Data.ByteString.Builder qualified as BB
+import Generator.ByteString (unlinesBB, unlinesBBWithIndent, unwordsBB)
+import Generator.RangeSwitch
+import Generator.WordEncoding
+import Unicode.CharacterDatabase.Parser.UnicodeData (GeneralCategory (Cn))
+
+genEnumBitmap ::
+ forall a.
+ (Bounded a, Enum a, Show a) =>
+ -- | Function name
+ BB.Builder ->
+ -- | Default value
+ a ->
+ -- | List of values to encode
+ [a] ->
+ BB.Builder
+genEnumBitmap funcName def as =
+ unlinesBB
+ [ "{-# INLINE " <> funcName <> " #-}",
+ funcName <> " :: Char -> Int",
+ funcName
+ <> " c = let n = ord c in if n >= "
+ <> BB.intDec (length as)
+ <> " then "
+ <> BB.intDec (fromEnum def)
+ <> " else lookup_bitmap n"
+ ]
+
+generateHaskellCode :: Int -> [GeneralCategory] -> BB.Builder
+generateHaskellCode max_char_length cats =
+ let (index_tree, all_allocs) = extract [] range_tree
+ in unlinesBB
+ [ "{-# NOINLINE lookup_bitmap #-}"
+ , "lookup_bitmap :: Int -> Int"
+ , "lookup_bitmap n ="
+ , " (" <> (genCode' index_tree 2)
+ , " )"
+ , " where"
+ , unlinesBB (fmap genDecompressed (zip all_allocs [0 ..]))
+ ]
+ where
+ range_tree = rangeCases max_char_length cats
+
+ genCode' :: (Enum a, Show a) => RangeTree (Either a Int) -> Int -> BB.Builder
+ genCode' (Leaf _ _ cat) _ = BB.string7 $ show cat
+ genCode' (Node start _ (Leaf _ endl c_l) (Leaf startr _ c_r)) _ =
+ unwordsBB
+ [ "({- 1 -} if n <"
+ , BB.intDec (endl + 1)
+ , "then"
+ , "(" <> (genResult start c_l) <> ")"
+ , "else"
+ , "(" <> (genResult startr c_r) <> "))"
+ ]
+ genCode' (Node start _ (Leaf _ endl c_l) node_r@(Node _ _ _ _)) indent =
+ unlinesBBWithIndent
+ (indent * 2)
+ [ unwordsBB ["({- 2 -} if n <", BB.intDec (endl + 1), "then", "(" <> (genResult start c_l) <> ") else ("]
+ , (genCode' node_r $ indent + 1)
+ , "{- 2 -}))"
+ ]
+ genCode' (Node _ _ node_l@(Node _ _ _ _) (Leaf startr _ c_r)) indent =
+ unlinesBBWithIndent
+ (indent * 2)
+ [ unwordsBB ["({- 3 -} if n >=", BB.intDec startr, "then", "(" <> (genResult startr c_r) <> ") else ("]
+ , (genCode' node_l $ indent + 1)
+ , "{- 3 -}))"
+ ]
+ genCode' (Node _ _ node_l@(Node _ endl _ _) node_r@(Node _ _ _ _)) indent =
+ unlinesBBWithIndent
+ (indent * 2)
+ [ unwordsBB ["({- 4 -} if n <", BB.intDec (endl + 1), "then ("]
+ , (genCode' node_l $ indent + 1)
+ , ") {- 4 -} else ("
+ , (genCode' node_r $ indent + 1)
+ , "{- 4 -} ))"
+ ]
+
+ genResult :: (Enum a, Show a) => Int -> Either a Int -> BB.Builder
+ genResult _ (Left s) = BB.string7 $ show (toWord8 s)
+ genResult mi (Right idx) = unwordsBB ["lookupIntN", "decompressed_table_" <> (BB.string7 $ show idx), "(n -", BB.string7 (show mi) <> ")"]
+
+ extract :: [[a]] -> RangeTree (Either a [a]) -> (RangeTree (Either a Int), [[a]])
+ extract acc (Leaf mi ma (Left v)) = (Leaf mi ma (Left v), acc)
+ extract acc (Leaf mi ma (Right v)) = (Leaf mi ma (Right (length acc)), acc ++ [v])
+ extract acc (Node mi ma lt rt) =
+ let (e_lt, l_acc) = extract acc lt
+ (e_rt, r_acc) = extract l_acc rt
+ in (Node mi ma e_lt e_rt, r_acc)
+
+ genDecompressed :: forall a. (Enum a, Bounded a, Show a) => ([a], Int) -> BB.Builder
+ genDecompressed (acc, idx) =
+ let fn_name = "decompressed_table_" <> (BB.string7 $ show idx)
+ in " " <> unwordsBB [fn_name, "=", "\"" <> enumMapToAddrLiteral acc "\"#"]
+
+generateGeneralCategoryCode ::
+ -- | -- How to generate module header where first arg us module name
+ (BB.Builder -> BB.Builder) ->
+ -- | -- Module name
+ BB.Builder ->
+ -- | -- Max char length
+ Int ->
+ -- | -- imported general categories for all symbol list
+ [GeneralCategory] ->
+ BB.Builder
+generateGeneralCategoryCode mkModuleHeader moduleName char_length cats =
+ unlinesBB
+ [ "{-# LANGUAGE NoImplicitPrelude #-}"
+ , "{-# LANGUAGE MagicHash #-}"
+ , "{-# LANGUAGE TypeApplications #-}"
+ , "{-# OPTIONS_HADDOCK hide #-}"
+ , ""
+ , mkModuleHeader moduleName
+ , "module " <> moduleName
+ , "(generalCategory)"
+ , "where"
+ , ""
+ , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
+ , "import GHC.Internal.Unicode.Bits (lookupIntN)"
+ , "import GHC.Internal.Num ((-))"
+ , ""
+ , generateHaskellCode char_length cats
+ , ""
+ , genEnumBitmap "generalCategory" Cn (reverse cats)
+ ]
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
=====================================
@@ -0,0 +1,62 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE LambdaCase #-}
+
+module Generator.RangeSwitch
+ ( rangeCases,
+ RangeTree (..),
+ )
+where
+
+data Case a = Case
+ { caseMin :: Int
+ , caseMax :: Int
+ , caseConstant :: Either a [a]
+ }
+ deriving stock (Show)
+
+ranges :: (Enum a, Eq a, Show a) => [a] -> [(Int, Int, a)]
+ranges = \case
+ [] -> []
+ (x : xs) -> reverse (go 0 0 x [] xs)
+ where
+ go mi ma v rs = \case
+ [] -> (mi, ma, v) : rs
+ (x : xs)
+ | x == v -> go mi (ma + 1) v rs xs
+ | otherwise -> go (ma + 1) (ma + 1) x ((mi, ma, v) : rs) xs
+
+cases :: Int -> [a] -> [(Int, Int, a)] -> [Case a]
+cases max_rep all_cats = go
+ where
+ go = \case
+ [] -> []
+ (r@(mi, ma, v) : rs)
+ | rangeSize r > max_rep -> Case mi ma (Left v) : go rs
+ | otherwise -> go_lookup mi ma (Left v) rs
+
+ go_lookup rmi rma mv = \case
+ [] -> [Case rmi rma mv]
+ (r@(mi, ma, v) : rs)
+ | rangeSize r > max_rep -> Case rmi rma mv : Case mi ma (Left v) : go rs
+ | otherwise -> go_lookup rmi ma (Right (take (ma - rmi + 1) (drop rmi all_cats))) rs
+
+ rangeSize :: (Num a) => (a, a, c) -> a
+ rangeSize (mi, ma, _) = ma - mi + 1
+
+rangeCases :: (Enum a, Eq a, Show a) => Int -> [a] -> RangeTree (Either a [a])
+rangeCases max_char_length cats = buildRangeTree $ cases max_char_length cats (ranges cats)
+
+data RangeTree a
+ = Leaf Int Int a
+ | Node Int Int (RangeTree a) (RangeTree a)
+ deriving stock (Show)
+
+buildRangeTree :: [Case a] -> RangeTree (Either a [a])
+buildRangeTree [(Case start end cat)] = Leaf start end cat
+buildRangeTree ranges' =
+ let mid = length ranges' `div` 2
+ (leftRanges, rightRanges) = splitAt mid ranges'
+ (Case startL _ _) = head leftRanges
+ (Case _ endR _) = last rightRanges
+ in Node startL endR (buildRangeTree leftRanges) (buildRangeTree rightRanges)
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
=====================================
@@ -0,0 +1,30 @@
+module Generator.WordEncoding where
+
+import Data.ByteString.Builder qualified as BB
+import Data.Word
+
+toWord8 :: (Show a, Enum a) => a -> Word8
+toWord8 a =
+ let w = fromEnum a
+ in if 0 <= w && w <= 0xff
+ then fromIntegral w
+ else error $ "Cannot convert to Word8: " <> show a
+
+-- | Encode a list of values as a byte map, using their 'Enum' instance.
+--
+-- __Note:__ 'Enum' instance must respect the following:
+--
+-- * @fromEnum minBound >= 0x00@
+-- * @fromEnum maxBound <= 0xff@
+enumMapToAddrLiteral ::
+ forall a.
+ (Bounded a, Enum a, Show a) =>
+ -- | Values to encode
+ [a] ->
+ -- | String to append
+ BB.Builder ->
+ BB.Builder
+enumMapToAddrLiteral xs cs = foldr go cs xs
+ where
+ go :: a -> BB.Builder -> BB.Builder
+ go x acc = BB.char7 '\\' <> BB.word8Dec (toWord8 x) <> acc
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/UCD2Haskell/ModuleGenerators.hs
=====================================
@@ -42,18 +42,15 @@ import qualified Unicode.CharacterDatabase.Parser.Common as C
import qualified Unicode.CharacterDatabase.Parser.UnicodeData as UD
import qualified Unicode.CharacterDatabase.Parser.Properties.Multiple as P
+import Generator.GeneralCategory (generateGeneralCategoryCode)
+import Generator.ByteString (unlinesBB, unwordsBB)
+
import Prelude hiding (pred)
--------------------------------------------------------------------------------
-- Helpers
--------------------------------------------------------------------------------
-unlinesBB :: [BB.Builder] -> BB.Builder
-unlinesBB = (<> "\n") . mconcat . intersperse "\n"
-
-unwordsBB :: [BB.Builder] -> BB.Builder
-unwordsBB = mconcat . intersperse " "
-
headerRule :: BB.Builder
headerRule = "-----------------------------------------------------------------------------"
@@ -126,57 +123,6 @@ bitMapToAddrLiteral bs cs = foldr encode cs (unfoldr mkChunks bs)
toByte :: [Bool] -> Int
toByte xs = sum $ map (\i -> if xs !! i then 1 `shiftL` i else 0) [0..7]
-genEnumBitmap ::
- forall a. (Bounded a, Enum a, Show a) =>
- -- | Function name
- BB.Builder ->
- -- | Default value
- a ->
- -- | List of values to encode
- [a] ->
- BB.Builder
-genEnumBitmap funcName def as = unlinesBB
- [ "{-# INLINE " <> funcName <> " #-}"
- , funcName <> " :: Char -> Int"
- , funcName <> " c = let n = ord c in if n >= "
- <> BB.intDec (length as)
- <> " then "
- <> BB.intDec (fromEnum def)
- <> " else lookup_bitmap n"
-
- , "{-# NOINLINE lookup_bitmap #-}"
- , "lookup_bitmap :: Int -> Int"
- , "lookup_bitmap n = lookupIntN bitmap# n"
- , " where"
- , " bitmap# = \"" <> enumMapToAddrLiteral as "\"#"
- ]
-
-{-| Encode a list of values as a byte map, using their 'Enum' instance.
-
-__Note:__ 'Enum' instance must respect the following:
-
-* @fromEnum minBound >= 0x00@
-* @fromEnum maxBound <= 0xff@
--}
-enumMapToAddrLiteral ::
- forall a. (Bounded a, Enum a, Show a) =>
- -- | Values to encode
- [a] ->
- -- | String to append
- BB.Builder ->
- BB.Builder
-enumMapToAddrLiteral xs cs = foldr go cs xs
-
- where
-
- go :: a -> BB.Builder -> BB.Builder
- go x acc = BB.char7 '\\' <> BB.word8Dec (toWord8 x) <> acc
-
- toWord8 :: a -> Word8
- toWord8 a = let w = fromEnum a in if 0 <= w && w <= 0xff
- then fromIntegral w
- else error $ "Cannot convert to Word8: " <> show a
-
genUnicodeVersion :: FilePath -> IO ()
genUnicodeVersion outdir = do
version <- catch
@@ -267,21 +213,7 @@ genGeneralCategoryModule moduleName = Fold step initial done
(replicate (ord ch2 - ord ch1 + 1) d.generalCategory <> acc)
(succ ch2)
- done (GeneralCategoryAcc acc _) = unlinesBB
- [ "{-# LANGUAGE NoImplicitPrelude #-}"
- , "{-# LANGUAGE MagicHash #-}"
- , "{-# OPTIONS_HADDOCK hide #-}"
- , ""
- , mkModuleHeader moduleName
- , "module " <> moduleName
- , "(generalCategory)"
- , "where"
- , ""
- , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
- , "import GHC.Internal.Unicode.Bits (lookupIntN)"
- , ""
- , genEnumBitmap "generalCategory" UD.Cn (reverse acc)
- ]
+ done (GeneralCategoryAcc acc _) = generateGeneralCategoryCode mkModuleHeader moduleName 50 (reverse acc)
genSimpleCaseMappingModule
:: BB.Builder
=====================================
libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal
=====================================
@@ -45,7 +45,12 @@ executable ucd2haskell
ghc-options: -O2
hs-source-dirs: exe
main-is: UCD2Haskell.hs
- other-modules: UCD2Haskell.ModuleGenerators
+ other-modules:
+ UCD2Haskell.ModuleGenerators
+ Generator.ByteString
+ Generator.GeneralCategory
+ Generator.RangeSwitch
+ Generator.WordEncoding
build-depends:
base >= 4.7 && < 5
, bytestring >= 0.11 && < 0.13
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f4606ae03ba8e5b464b3d1cabafe27fcc83439ef
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f4606ae03ba8e5b464b3d1cabafe27fcc83439ef
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240626/721d9128/attachment-0001.html>
More information about the ghc-commits
mailing list