[Git][ghc/ghc][master] Unicode: adding compact version of GeneralCategory (resolves #24789)

Marge Bot (@marge-bot) gitlab at gitlab.haskell.org
Thu Jun 27 01:52:11 UTC 2024



Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
f4606ae0 by Serge S. Gulin at 2024-06-26T21:51:05-04:00
Unicode: adding compact version of GeneralCategory (resolves #24789)

The following features are applied:
1. Lookup code like Cmm-switches (draft implementation proposed by Sylvain Henry @hsyl20)
2. Nested ifs (logarithmic search vs linear search) (the idea proposed by Sylvain Henry @hsyl20)

-------------------------
Metric Decrease:
    size_hello_artifact
    size_hello_unicode
-------------------------

- - - - -


8 changed files:

- libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
- libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/ByteString.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
- libraries/ghc-internal/tools/ucd2haskell/exe/UCD2Haskell/ModuleGenerators.hs
- libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal


Changes:

=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
=====================================
@@ -1,6 +1,9 @@
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE MagicHash #-}
 {-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedNewtypes #-}
+{-# LANGUAGE BlockArguments #-}
 
 -----------------------------------------------------------------------------
 -- |
@@ -18,20 +21,23 @@
 -----------------------------------------------------------------------------
 
 module GHC.Internal.Unicode.Bits
-    ( lookupBit64,
-      lookupIntN
-    ) where
+    ( lookupIntN
+    , lookupBit64
+    , newByteArrayFromWord8List
+    , byteArrayLookupIntN
+    , copyAddrToWord8List
+    , UnicodeByteArray
+    )
+    where
 
-import GHC.Internal.Base (Bool, Int(..), Word(..), Eq(..))
 import GHC.Internal.Bits (finiteBitSize, popCount)
 import {-# SOURCE #-} GHC.Internal.ByteOrder
 import GHC.Prim
-       (Addr#,
-        indexWordOffAddr#, indexWord8OffAddr#,
-        andI#, uncheckedIShiftRL#,
-        and#, word2Int#, uncheckedShiftL#,
-        word8ToWord#, byteSwap#)
-import GHC.Internal.Num ((-))
+import GHC.Internal.ST
+import GHC.Internal.Base
+import GHC.Internal.Num
+import GHC.Internal.List
+import GHC.Internal.Word
 
 -- | @lookup64 addr index@ looks up the bit stored at bit index @index@ using a
 -- bitmap starting at the address @addr at . Looks up the 64-bit word containing
@@ -71,3 +77,41 @@ lookupIntN
 lookupIntN addr# (I# index#) =
   let word## = word8ToWord# (indexWord8OffAddr# addr# index#)
   in I# (word2Int# word##)
+
+data UnicodeByteArray = UnicodeByteArray !ByteArray#
+
+byteArrayLookupIntN :: UnicodeByteArray -> Int -> Int
+byteArrayLookupIntN ba idx
+  = let !(UnicodeByteArray addr) = ba
+  in lookupIntN (byteArrayContents# addr) idx
+
+newByteArrayFromWord8List :: [Word8] -> UnicodeByteArray
+newByteArrayFromWord8List xs = runST $ ST \s0 ->
+  case newPinnedByteArray# len s0 of
+    !(# s1, mba #) ->
+      let s2 = fillByteArray mba 0# xs s1
+      in case unsafeFreezeByteArray# mba s2 of
+        !(# s3, fba #) -> (# s3, UnicodeByteArray fba #)
+  where
+    !(I# len) = length xs
+
+    fillByteArray _ _ [] s = s
+    fillByteArray mba i (y:ys) s =
+      let !(W8# y#) = y
+          s' = writeWord8Array# mba i y# s
+      in fillByteArray mba (i +# 1#) ys s'
+
+copyAddrToWord8List :: Addr# -> Int -> [Word8]
+copyAddrToWord8List addr !len@(I# len') = runST $ ST \s0 ->
+  case newByteArray# len' s0 of
+    !(# s1, mba #) ->
+      let s2 = copyAddrToByteArray# addr mba 0# len' s1
+      in case unsafeFreezeByteArray# mba s2 of
+        !(# s3, fba #) -> (# s3, readByteFromArray fba 0 len #)
+  where
+    readByteFromArray :: ByteArray# -> Int -> Int -> [Word8]
+    readByteFromArray ba !from@(I# from') to =
+      W8# (indexWord8Array# ba from') :
+        if from == (to - 1)
+          then []
+          else readByteFromArray ba (from + 1) to


=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
=====================================
The diff for this file was not included because it is too large.

=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/ByteString.hs
=====================================
@@ -0,0 +1,19 @@
+module Generator.ByteString where
+
+import Data.List (intercalate, intersperse)
+import qualified Data.ByteString.Builder as BB
+
+replicateBB :: Int -> BB.Builder -> BB.Builder
+replicateBB indent b = mconcat (replicate indent b)
+
+unlinesBBWithIndent :: Int -> [BB.Builder] -> BB.Builder
+unlinesBBWithIndent indent =
+  let indent_replicated = replicateBB indent " "
+  in mconcat . intersperse ("\n" <> indent_replicated)
+
+unlinesBB :: [BB.Builder] -> BB.Builder
+unlinesBB = (<> "\n") . unlinesBBWithIndent 0
+
+unwordsBB :: [BB.Builder] -> BB.Builder
+unwordsBB = mconcat . intersperse " "
+


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
=====================================
@@ -0,0 +1,129 @@
+{-# LANGUAGE BlockArguments #-}
+
+module Generator.GeneralCategory (GeneralCategory (..), generateGeneralCategoryCode) where
+
+import Data.ByteString.Builder qualified as BB
+import Generator.ByteString (unlinesBB, unlinesBBWithIndent, unwordsBB)
+import Generator.RangeSwitch
+import Generator.WordEncoding
+import Unicode.CharacterDatabase.Parser.UnicodeData (GeneralCategory (Cn))
+
+genEnumBitmap ::
+  forall a.
+  (Bounded a, Enum a, Show a) =>
+  -- | Function name
+  BB.Builder ->
+  -- | Default value
+  a ->
+  -- | List of values to encode
+  [a] ->
+  BB.Builder
+genEnumBitmap funcName def as =
+  unlinesBB
+    [ "{-# INLINE " <> funcName <> " #-}",
+      funcName <> " :: Char -> Int",
+      funcName
+        <> " c = let n = ord c in if n >= "
+        <> BB.intDec (length as)
+        <> " then "
+        <> BB.intDec (fromEnum def)
+        <> " else lookup_bitmap n"
+    ]
+
+generateHaskellCode :: Int -> [GeneralCategory] -> BB.Builder
+generateHaskellCode max_char_length cats =
+  let (index_tree, all_allocs) = extract [] range_tree
+   in unlinesBB
+        [ "{-# NOINLINE lookup_bitmap #-}"
+        , "lookup_bitmap :: Int -> Int"
+        , "lookup_bitmap n ="
+        , "  (" <> (genCode' index_tree 2)
+        , "  )"
+        , "  where"
+        , unlinesBB (fmap genDecompressed (zip all_allocs [0 ..]))
+        ]
+  where
+    range_tree = rangeCases max_char_length cats
+
+    genCode' :: (Enum a, Show a) => RangeTree (Either a Int) -> Int -> BB.Builder
+    genCode' (Leaf _ _ cat) _ = BB.string7 $ show cat
+    genCode' (Node start _ (Leaf _ endl c_l) (Leaf startr _ c_r)) _ =
+      unwordsBB
+        [ "({- 1 -} if n <"
+        , BB.intDec (endl + 1)
+        , "then"
+        , "(" <> (genResult start c_l) <> ")"
+        , "else"
+        , "(" <> (genResult startr c_r) <> "))"
+        ]
+    genCode' (Node start _ (Leaf _ endl c_l) node_r@(Node _ _ _ _)) indent =
+      unlinesBBWithIndent
+        (indent * 2)
+        [ unwordsBB ["({- 2 -} if n <", BB.intDec (endl + 1), "then", "(" <> (genResult start c_l) <> ") else ("]
+        , (genCode' node_r $ indent + 1)
+        , "{- 2 -}))"
+        ]
+    genCode' (Node _ _ node_l@(Node _ _ _ _) (Leaf startr _ c_r)) indent =
+      unlinesBBWithIndent
+        (indent * 2)
+        [ unwordsBB ["({- 3 -} if n >=", BB.intDec startr, "then", "(" <> (genResult startr c_r) <> ") else ("]
+        , (genCode' node_l $ indent + 1)
+        , "{- 3 -}))"
+        ]
+    genCode' (Node _ _ node_l@(Node _ endl _ _) node_r@(Node _ _ _ _)) indent =
+      unlinesBBWithIndent
+        (indent * 2)
+        [ unwordsBB ["({- 4 -} if n <", BB.intDec (endl + 1), "then ("]
+        , (genCode' node_l $ indent + 1)
+        , ") {- 4 -} else ("
+        , (genCode' node_r $ indent + 1)
+        , "{- 4 -} ))"
+        ]
+
+    genResult :: (Enum a, Show a) => Int -> Either a Int -> BB.Builder
+    genResult _ (Left s) = BB.string7 $ show (toWord8 s)
+    genResult mi (Right idx) = unwordsBB ["lookupIntN", "decompressed_table_" <> (BB.string7 $ show idx), "(n -", BB.string7 (show mi) <> ")"]
+
+    extract :: [[a]] -> RangeTree (Either a [a]) -> (RangeTree (Either a Int), [[a]])
+    extract acc (Leaf mi ma (Left v)) = (Leaf mi ma (Left v), acc)
+    extract acc (Leaf mi ma (Right v)) = (Leaf mi ma (Right (length acc)), acc ++ [v])
+    extract acc (Node mi ma lt rt) =
+      let (e_lt, l_acc) = extract acc lt
+          (e_rt, r_acc) = extract l_acc rt
+      in (Node mi ma e_lt e_rt, r_acc)
+
+    genDecompressed :: forall a. (Enum a, Bounded a, Show a) => ([a], Int) -> BB.Builder
+    genDecompressed (acc, idx) =
+      let fn_name = "decompressed_table_" <> (BB.string7 $ show idx)
+      in "    " <> unwordsBB [fn_name, "=", "\"" <> enumMapToAddrLiteral acc "\"#"]
+
+generateGeneralCategoryCode ::
+  -- | -- How to generate module header where first arg us module name
+  (BB.Builder -> BB.Builder) ->
+  -- | -- Module name
+  BB.Builder ->
+  -- | -- Max char length
+  Int ->
+  -- | -- imported general categories for all symbol list
+  [GeneralCategory] ->
+  BB.Builder
+generateGeneralCategoryCode mkModuleHeader moduleName char_length cats =
+  unlinesBB
+    [ "{-# LANGUAGE NoImplicitPrelude #-}"
+    , "{-# LANGUAGE MagicHash #-}"
+    , "{-# LANGUAGE TypeApplications #-}"
+    , "{-# OPTIONS_HADDOCK hide #-}"
+    , ""
+    , mkModuleHeader moduleName
+    , "module " <> moduleName
+    , "(generalCategory)"
+    , "where"
+    , ""
+    , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
+    , "import GHC.Internal.Unicode.Bits (lookupIntN)"
+    , "import GHC.Internal.Num ((-))"
+    , ""
+    , generateHaskellCode char_length cats
+    , ""
+    , genEnumBitmap "generalCategory" Cn (reverse cats)
+    ]


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
=====================================
@@ -0,0 +1,62 @@
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE LambdaCase #-}
+
+module Generator.RangeSwitch
+  ( rangeCases,
+    RangeTree (..),
+  )
+where
+
+data Case a = Case
+  { caseMin :: Int
+  , caseMax :: Int
+  , caseConstant :: Either a [a]
+  }
+  deriving stock (Show)
+
+ranges :: (Enum a, Eq a, Show a) => [a] -> [(Int, Int, a)]
+ranges = \case
+  [] -> []
+  (x : xs) -> reverse (go 0 0 x [] xs)
+  where
+    go mi ma v rs = \case
+      [] -> (mi, ma, v) : rs
+      (x : xs)
+        | x == v -> go mi (ma + 1) v rs xs
+        | otherwise -> go (ma + 1) (ma + 1) x ((mi, ma, v) : rs) xs
+
+cases :: Int -> [a] -> [(Int, Int, a)] -> [Case a]
+cases max_rep all_cats = go
+  where
+    go = \case
+      [] -> []
+      (r@(mi, ma, v) : rs)
+        | rangeSize r > max_rep -> Case mi ma (Left v) : go rs
+        | otherwise -> go_lookup mi ma (Left v) rs
+
+    go_lookup rmi rma mv = \case
+      [] -> [Case rmi rma mv]
+      (r@(mi, ma, v) : rs)
+        | rangeSize r > max_rep -> Case rmi rma mv : Case mi ma (Left v) : go rs
+        | otherwise -> go_lookup rmi ma (Right (take (ma - rmi + 1) (drop rmi all_cats))) rs
+
+    rangeSize :: (Num a) => (a, a, c) -> a
+    rangeSize (mi, ma, _) = ma - mi + 1
+
+rangeCases :: (Enum a, Eq a, Show a) => Int -> [a] -> RangeTree (Either a [a])
+rangeCases max_char_length cats = buildRangeTree $ cases max_char_length cats (ranges cats)
+
+data RangeTree a
+  = Leaf Int Int a
+  | Node Int Int (RangeTree a) (RangeTree a)
+  deriving stock (Show)
+
+buildRangeTree :: [Case a] -> RangeTree (Either a [a])
+buildRangeTree [(Case start end cat)] = Leaf start end cat
+buildRangeTree ranges' =
+  let mid = length ranges' `div` 2
+      (leftRanges, rightRanges) = splitAt mid ranges'
+      (Case startL _ _) = head leftRanges
+      (Case _ endR _) = last rightRanges
+  in Node startL endR (buildRangeTree leftRanges) (buildRangeTree rightRanges)


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
=====================================
@@ -0,0 +1,30 @@
+module Generator.WordEncoding where
+
+import Data.ByteString.Builder qualified as BB
+import Data.Word
+
+toWord8 :: (Show a, Enum a) => a -> Word8
+toWord8 a =
+  let w = fromEnum a
+  in if 0 <= w && w <= 0xff
+      then fromIntegral w
+      else error $ "Cannot convert to Word8: " <> show a
+
+-- | Encode a list of values as a byte map, using their 'Enum' instance.
+--
+-- __Note:__ 'Enum' instance must respect the following:
+--
+-- * @fromEnum minBound >= 0x00@
+-- * @fromEnum maxBound <= 0xff@
+enumMapToAddrLiteral ::
+  forall a.
+  (Bounded a, Enum a, Show a) =>
+  -- | Values to encode
+  [a] ->
+  -- | String to append
+  BB.Builder ->
+  BB.Builder
+enumMapToAddrLiteral xs cs = foldr go cs xs
+  where
+    go :: a -> BB.Builder -> BB.Builder
+    go x acc = BB.char7 '\\' <> BB.word8Dec (toWord8 x) <> acc


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/UCD2Haskell/ModuleGenerators.hs
=====================================
@@ -42,18 +42,15 @@ import qualified Unicode.CharacterDatabase.Parser.Common as C
 import qualified Unicode.CharacterDatabase.Parser.UnicodeData as UD
 import qualified Unicode.CharacterDatabase.Parser.Properties.Multiple as P
 
+import Generator.GeneralCategory (generateGeneralCategoryCode)
+import Generator.ByteString (unlinesBB, unwordsBB)
+
 import Prelude hiding (pred)
 
 --------------------------------------------------------------------------------
 -- Helpers
 --------------------------------------------------------------------------------
 
-unlinesBB :: [BB.Builder] -> BB.Builder
-unlinesBB = (<> "\n") . mconcat . intersperse "\n"
-
-unwordsBB :: [BB.Builder] -> BB.Builder
-unwordsBB = mconcat . intersperse " "
-
 headerRule :: BB.Builder
 headerRule = "-----------------------------------------------------------------------------"
 
@@ -126,57 +123,6 @@ bitMapToAddrLiteral bs cs = foldr encode cs (unfoldr mkChunks bs)
     toByte :: [Bool] -> Int
     toByte xs = sum $ map (\i -> if xs !! i then 1 `shiftL` i else 0) [0..7]
 
-genEnumBitmap ::
-  forall a. (Bounded a, Enum a, Show a) =>
-  -- | Function name
-  BB.Builder ->
-  -- | Default value
-  a ->
-  -- | List of values to encode
-  [a] ->
-  BB.Builder
-genEnumBitmap funcName def as = unlinesBB
-    [ "{-# INLINE " <> funcName <> " #-}"
-    , funcName <> " :: Char -> Int"
-    , funcName <> " c = let n = ord c in if n >= "
-               <> BB.intDec (length as)
-               <> " then "
-               <> BB.intDec (fromEnum def)
-               <> " else lookup_bitmap n"
-
-    , "{-# NOINLINE lookup_bitmap #-}"
-    , "lookup_bitmap :: Int -> Int"
-    , "lookup_bitmap n = lookupIntN bitmap# n"
-    , "  where"
-    , "    bitmap# = \"" <> enumMapToAddrLiteral as "\"#"
-    ]
-
-{-| Encode a list of values as a byte map, using their 'Enum' instance.
-
-__Note:__ 'Enum' instance must respect the following:
-
-* @fromEnum minBound >= 0x00@
-* @fromEnum maxBound <= 0xff@
--}
-enumMapToAddrLiteral ::
-  forall a. (Bounded a, Enum a, Show a) =>
-  -- | Values to encode
-  [a] ->
-  -- | String to append
-  BB.Builder ->
-  BB.Builder
-enumMapToAddrLiteral xs cs = foldr go cs xs
-
-    where
-
-    go :: a -> BB.Builder -> BB.Builder
-    go x acc = BB.char7 '\\' <> BB.word8Dec (toWord8 x) <> acc
-
-    toWord8 :: a -> Word8
-    toWord8 a = let w = fromEnum a in if 0 <= w && w <= 0xff
-        then fromIntegral w
-        else error $ "Cannot convert to Word8: " <> show a
-
 genUnicodeVersion :: FilePath -> IO ()
 genUnicodeVersion outdir = do
   version <- catch
@@ -267,21 +213,7 @@ genGeneralCategoryModule moduleName = Fold step initial done
           (replicate (ord ch2 - ord ch1 + 1) d.generalCategory <> acc)
           (succ ch2)
 
-    done (GeneralCategoryAcc acc _) = unlinesBB
-        [ "{-# LANGUAGE NoImplicitPrelude #-}"
-        , "{-# LANGUAGE MagicHash #-}"
-        , "{-# OPTIONS_HADDOCK hide #-}"
-        , ""
-        , mkModuleHeader moduleName
-        , "module " <> moduleName
-        , "(generalCategory)"
-        , "where"
-        , ""
-        , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
-        , "import GHC.Internal.Unicode.Bits (lookupIntN)"
-        , ""
-        , genEnumBitmap "generalCategory" UD.Cn (reverse acc)
-        ]
+    done (GeneralCategoryAcc acc _) = generateGeneralCategoryCode mkModuleHeader moduleName 50 (reverse acc)
 
 genSimpleCaseMappingModule
     :: BB.Builder


=====================================
libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal
=====================================
@@ -45,7 +45,12 @@ executable ucd2haskell
   ghc-options: -O2
   hs-source-dirs: exe
   main-is: UCD2Haskell.hs
-  other-modules: UCD2Haskell.ModuleGenerators
+  other-modules:
+    UCD2Haskell.ModuleGenerators
+    Generator.ByteString
+    Generator.GeneralCategory
+    Generator.RangeSwitch
+    Generator.WordEncoding
   build-depends:
       base                >= 4.7   && < 5
     , bytestring          >= 0.11  && < 0.13



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f4606ae03ba8e5b464b3d1cabafe27fcc83439ef

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f4606ae03ba8e5b464b3d1cabafe27fcc83439ef
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240626/721d9128/attachment-0001.html>


More information about the ghc-commits mailing list