[Git][ghc/ghc][wip/T24789_impl] Unicode: adding compact version of GeneralCategory

Serge S. Gulin (@gulin.serge) gitlab at gitlab.haskell.org
Sat Jun 8 22:43:14 UTC 2024



Serge S. Gulin pushed to branch wip/T24789_impl at Glasgow Haskell Compiler / GHC


Commits:
415c634c by Serge S. Gulin at 2024-06-09T01:42:50+03:00
Unicode: adding compact version of GeneralCategory

The following features are applied:
1. Lookup code like Cmm-switches (draft implementation proposed by Sylvain Henry @hsyl20)
2. Nested ifs (logarithmic search vs linear search) (the idea proposed by Sylvain Henry @hsyl20)
3. More compact representation via variable encoding by Huffman

- - - - -


12 changed files:

- libraries/ghc-internal/ghc-internal.cabal
- libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
- libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
- + libraries/ghc-internal/src/GHC/Internal/Unicode/Huffman.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/Huffman.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/HuffmanDecode.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
- libraries/ghc-internal/tools/ucd2haskell/exe/Parser/Text.hs
- libraries/ghc-internal/tools/ucd2haskell/ucd.sh
- libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal


Changes:

=====================================
libraries/ghc-internal/ghc-internal.cabal
=====================================
@@ -318,6 +318,7 @@ Library
         GHC.Internal.Event.PSQ
         GHC.Internal.Event.Unique
         -- GHC.Internal.IOPort -- TODO: hide again after debug
+        GHC.Internal.Unicode.Huffman
         GHC.Internal.Unicode.Bits
         GHC.Internal.Unicode.Char.DerivedCoreProperties
         GHC.Internal.Unicode.Char.UnicodeData.GeneralCategory


=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
=====================================
@@ -1,6 +1,9 @@
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE MagicHash #-}
 {-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedNewtypes #-}
+{-# LANGUAGE BlockArguments #-}
 
 -----------------------------------------------------------------------------
 -- |
@@ -18,20 +21,22 @@
 -----------------------------------------------------------------------------
 
 module GHC.Internal.Unicode.Bits
-    ( lookupBit64,
-      lookupIntN
-    ) where
+    ( lookupIntN
+    , lookupBit64
+    , newByteArrayFromWord8List
+    , byteArrayLookupIntN
+    , copyAddrToWord8List
+    , UnicodeByteArray
+    )
+    where
 
-import GHC.Internal.Base (Bool, Int(..), Word(..), Eq(..))
 import GHC.Internal.Bits (finiteBitSize, popCount)
-import {-# SOURCE #-} GHC.Internal.ByteOrder
 import GHC.Prim
-       (Addr#,
-        indexWordOffAddr#, indexWord8OffAddr#,
-        andI#, uncheckedIShiftRL#,
-        and#, word2Int#, uncheckedShiftL#,
-        word8ToWord#, byteSwap#)
-import GHC.Internal.Num ((-))
+import GHC.Internal.ST
+import GHC.Internal.Base
+import GHC.Internal.Num
+import GHC.Internal.List
+import GHC.Internal.Word
 
 -- | @lookup64 addr index@ looks up the bit stored at bit index @index@ using a
 -- bitmap starting at the address @addr@. Looks up the 64-bit word containing
@@ -49,9 +54,7 @@ lookupBit64 addr# (I# index#) = W# (word## `and#` bitMask##) /= 0
       _  -> popCount fbs -- this is a really weird architecture
 
     wordIndex# = index# `uncheckedIShiftRL#` logFbs#
-    word## = case targetByteOrder of
-      BigEndian    -> byteSwap# (indexWordOffAddr# addr# wordIndex#)
-      LittleEndian -> indexWordOffAddr# addr# wordIndex#
+    word## = byteSwap# (indexWordOffAddr# addr# wordIndex#)
     bitIndex# = index# `andI#` fbs#
     bitMask## = 1## `uncheckedShiftL#` bitIndex#
 
@@ -71,3 +74,38 @@ lookupIntN
 lookupIntN addr# (I# index#) =
   let word## = word8ToWord# (indexWord8OffAddr# addr# index#)
   in I# (word2Int# word##)
+
+-- | Lifted wrapper around an unlifted 'ByteArray#', so that a byte table can
+-- be bound and passed around like an ordinary value.
+data UnicodeByteArray = UnicodeByteArray !ByteArray#
+
+-- | @byteArrayLookupIntN table index@ reads the byte stored at offset @index@
+-- of @table@ and returns it as a non-negative 'Int'.
+--
+-- The byte is read with 'indexWord8Array#' straight from the array rather
+-- than through the address returned by @byteArrayContents#@: a raw 'Addr#'
+-- does not keep the array alive and is only meaningful for pinned arrays.
+--
+-- No bounds check is performed; @index@ must lie inside the table.
+byteArrayLookupIntN :: UnicodeByteArray -> Int -> Int
+byteArrayLookupIntN (UnicodeByteArray arr#) (I# index#) =
+  I# (word2Int# (word8ToWord# (indexWord8Array# arr# index#)))
+
+-- | Build a 'UnicodeByteArray' containing the bytes of the given list, in
+-- list order.
+--
+-- The list is traversed twice: once by 'length' to size the array and once
+-- to write its elements.
+newByteArrayFromWord8List :: [Word8] -> UnicodeByteArray
+newByteArrayFromWord8List xs = runST $ ST \s0 ->
+  -- Allocate a pinned mutable array of exactly @len@ bytes.
+  case newPinnedByteArray# len s0 of
+    !(# s1, mba #) ->
+      -- Write every element, threading the state token through the writes.
+      let s2 = fillByteArray mba 0# xs s1
+      -- Freeze in place; the mutable array is not used after this point.
+      in case unsafeFreezeByteArray# mba s2 of
+        !(# s3, fba #) -> (# s3, UnicodeByteArray fba #)
+  where
+    !(I# len) = length xs
+
+    -- Write the remaining bytes starting at offset @i@ and return the final
+    -- state token.
+    fillByteArray _ _ [] s = s
+    fillByteArray mba i (y:ys) s =
+      let !(W8# y#) = y
+          s' = writeWord8Array# mba i y# s
+      in fillByteArray mba (i +# 1#) ys s'
+
+-- | @copyAddrToWord8List addr len@ returns the @len@ bytes stored at @addr@
+-- as a list, lowest address first.
+--
+-- The bytes are first copied into a fresh byte array, which is then read
+-- lazily from offset @0@ up to (but excluding) @len@. @len@ must be
+-- non-negative and must not exceed the size of the memory behind @addr@.
+copyAddrToWord8List :: Addr# -> Int -> [Word8]
+copyAddrToWord8List addr !(I# len) = runST $ ST \s0 ->
+  case newByteArray# len s0 of
+    !(# s1, mba #) ->
+      let s2 = copyAddrToByteArray# addr mba 0# len s1
+      in case unsafeFreezeByteArray# mba s2 of
+        -- Start reading at offset 0: starting at @len@ would already be one
+        -- past the last valid byte.
+        !(# s3, fba #) -> (# s3, readByteFromArray fba 0# #)
+  where
+    -- Bytes of @ba@ from offset @i@ to the end of the copied region.
+    readByteFromArray :: ByteArray# -> Int# -> [Word8]
+    readByteFromArray ba i =
+      case i >=# len of
+        1# -> []
+        _  -> W8# (indexWord8Array# ba i) : readByteFromArray ba (i +# 1#)


=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
=====================================
The diff for this file was not included because it is too large.

=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Huffman.hs
=====================================
@@ -0,0 +1,53 @@
+-- DO NOT EDIT IT HERE. It is automatically copied from ucd2haskell tool's Generator.HuffmanDecode
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE ExplicitForAll #-}
+{-# LANGUAGE TypeApplications #-}
+
+module GHC.Internal.Unicode.Huffman
+    ( decodeHuffman
+    , deserializeHuffman
+    , HuffmanTree (..)
+    )
+    where
+
+import GHC.Internal.Word (Word8)
+import GHC.Internal.Bits (testBit)
+import GHC.Internal.Show (Show (..))
+import GHC.Internal.Base (Bool, Eq, Functor, (.), (++), error, map)
+import qualified GHC.Internal.List as L (concatMap)
+
+-- | Binary prefix-code tree. A leaf carries a decoded symbol; an inner node
+-- has a left and a right subtree. All fields are strict.
+data HuffmanTree a
+    = HTLeaf !a
+    | HTNode !(HuffmanTree a) !(HuffmanTree a)
+    deriving stock (Show, Eq, Functor)
+
+-- | Rebuild a 'HuffmanTree' from its byte serialisation.
+--
+-- The format is a pre-order walk: tag @0x00@ followed by one value byte is a
+-- leaf (the value is passed through @conv@); tag @0x01@ is an inner node and
+-- is followed by the left and then the right subtree. Bytes remaining after
+-- the root tree are ignored. Any other input calls 'error'.
+deserializeHuffman :: forall a . (Word8 -> a) -> [Word8] -> HuffmanTree a
+deserializeHuffman conv bytes =
+    case parse bytes of
+        (tree, _) -> tree
+  where
+    -- Parse one subtree and return it together with the unconsumed bytes.
+    parse input = case input of
+        [] -> error "Unable to process empty list"
+        0x00 : value : rest -> (HTLeaf (conv value), rest)
+        0x01 : rest ->
+            case parse rest of
+                (left, afterLeft) -> case parse afterLeft of
+                    (right, afterRight) -> (HTNode left right, afterRight)
+        other -> error ("Unknown type of Huffman tree leaf: " ++ show other)
+
+-- | Decode a Huffman-encoded byte stream.
+--
+-- Each byte is read most significant bit first. Starting at the root, a
+-- clear bit selects the left subtree and a set bit the right one; reaching a
+-- leaf emits its symbol and restarts at the root.
+--
+-- A symbol whose code ends exactly on the last input bit is emitted as well.
+-- Trailing bits that do not complete a code are dropped; padding bits that
+-- happen to form complete codes produce extra trailing symbols, so callers
+-- must know how many symbols they expect.
+--
+-- A tree that is a single leaf has no code bits to consume; the result is
+-- then the empty list.
+decodeHuffman :: HuffmanTree a -> [Word8] -> [a]
+decodeHuffman huffman_tree = decodeBits huffman_tree . unpackBits
+    where
+        word8ToBools :: Word8 -> [Bool]
+        word8ToBools w = map (testBit w) [7, 6 .. 0]
+
+        unpackBits :: [Word8] -> [Bool]
+        unpackBits = L.concatMap word8ToBools
+
+        decodeBits :: HuffmanTree a -> [Bool] -> [a]
+        decodeBits (HTLeaf _) _ = []
+        decodeBits root bits = walk root bits
+            where
+                -- The leaf case comes first so that a symbol completed by the
+                -- very last bit is not lost.
+                walk (HTLeaf c) bs = c : walk root bs
+                walk (HTNode _ _) [] = []
+                walk (HTNode l r) (b:bs) = walk (if b then r else l) bs
+


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
=====================================
@@ -0,0 +1,145 @@
+{-# LANGUAGE BlockArguments #-}
+module Generator.GeneralCategory (GeneralCategory (..), generateGeneralCategoryCode) where
+
+import Generator.RangeSwitch
+import Generator.WordEncoding
+import Data.List (intercalate)
+import Text.Printf (printf)
+import Generator.Huffman (mkHuffmanTree, serializeHuffman)
+
+-- | Unicode general categories, grouped by major class in the trailing
+-- comments. The declaration order fixes the numbering of the derived 'Enum'
+-- instance (@Lu@ is 0, @Cn@ is 29).
+data GeneralCategory =
+    Lu|Ll|Lt|             --LC
+    Lm|Lo|                --L
+    Mn|Mc|Me|             --M
+    Nd|Nl|No|             --N
+    Pc|Pd|Ps|Pe|Pi|Pf|Po| --P
+    Sm|Sc|Sk|So|          --S
+    Zs|Zl|Zp|             --Z
+    Cc|Cf|Cs|Co|Cn        --C
+    deriving (Show, Eq, Ord, Bounded, Enum, Read)
+
+-- | Emit the source of the top-level @Char -> Int@ lookup function.
+--
+-- The generated function returns the 'fromEnum' of the default value for
+-- every code point at or beyond the number of encoded values, and defers to
+-- @lookup_bitmap@ otherwise. Only the length of the value list is used here.
+genEnumBitmap ::
+  forall a. (Bounded a, Enum a, Show a) =>
+  -- | Function name
+  String ->
+  -- | Default value
+  a ->
+  -- | List of values to encode
+  [a] ->
+  String
+genEnumBitmap funcName def as = unlines [inlinePragma, signature, definition]
+  where
+    tableSize = length as
+    defaultCode = fromEnum def
+
+    inlinePragma = concat ["{-# INLINE ", funcName, " #-}"]
+    signature = funcName ++ " :: Char -> Int"
+    definition = concat
+        [ funcName
+        , " c = let n = ord c in if n >= "
+        , show tableSize
+        , " then "
+        , show defaultCode
+        , " else lookup_bitmap n"
+        ]
+
+-- | Generate the body of the lookup module as Haskell source text.
+--
+-- The output defines, in this order:
+--
+-- * @deserialized_huffman@, the Huffman tree rebuilt from an address literal;
+-- * one @decompressed_table_N@ per lookup table, each decoded from its own
+--   Huffman-compressed address literal;
+-- * @lookup_bitmap@, a nest of @if@ expressions over the index @n@ that
+--   yields either a constant or a lookup in one of the tables.
+--
+-- The first argument is forwarded to 'rangeCases'; the second is the list of
+-- categories (presumably one per code point, starting at 0 - confirm with
+-- the caller).
+generateHaskellCode :: Int -> [GeneralCategory] -> String
+generateHaskellCode max_char_length cats =
+  let (index_tree, all_allocs) = extract [] range_tree
+  in intercalate "\n"
+        [ "{-# NOINLINE deserialized_huffman #-}"
+        , "deserialized_huffman :: HuffmanTree Word8"
+        , "deserialized_huffman ="
+        , intercalate " " ["  let huffman_tree =", "\"" <> mapToAddrLiteral serialized_huffman "\"#"]
+        , printf "  in deserializeHuffman (\\x -> x) (copyAddrToWord8List huffman_tree %d)" (length serialized_huffman)
+        , intercalate "\n" (fmap genDecompressed (zip all_allocs [0..]))
+        , ""
+        , "{-# NOINLINE lookup_bitmap #-}"
+        , "lookup_bitmap :: Int -> Int"
+        , "lookup_bitmap n ="
+        , printf "  (%s)" (genCode' index_tree 2)
+        ]
+  where
+    -- Pipeline: split into ranges, build one Huffman tree over all table
+    -- values, compress the tables with it, then arrange the cases in a tree.
+    cases' = rangeCases max_char_length cats
+    huffmanTree = mkHuffmanTree $ extractLookupIntList cases'
+    cases_huffman_encoded = rangesToWord8 huffmanTree cases'
+    range_tree = buildRangeTree cases_huffman_encoded
+
+    serialized_huffman = serializeHuffman toWord8 huffmanTree
+
+    -- Put every line on a new line of its own, indented by two spaces per
+    -- indentation level.
+    prefixEachLine indent ls = concatMap (\l -> "\n" ++ replicate (indent*2) ' ' ++ l) ls
+
+    -- Render a range tree as nested @if@ expressions on @n@. The
+    -- @{- 1 -}@ .. @{- 4 -}@ markers end up in the generated source and tell
+    -- which of the four node shapes produced each test.
+    genCode' :: (Show a) => RangeTree (Either a Int) -> Int -> String
+    -- NOTE(review): a tree consisting of a single leaf is rendered with
+    -- 'show' on the 'Either' itself, not through 'genResult' - confirm that
+    -- this case cannot occur.
+    genCode' (Leaf _ _ cat) _ = show cat
+    genCode' (Node start _ (Leaf _ endl c_l) (Leaf startr _ c_r)) indent =
+        prefixEachLine indent
+            [ printf "({- 1 -} if n < %d then (%s) else (%s))" (endl+1) (genResult start c_l) (genResult startr c_r)
+            ]
+
+    genCode' (Node start _ (Leaf _ endl c_l) node_r@(Node _ _ _ _)) indent =
+        prefixEachLine indent
+            [ printf "({- 2 -}if n < %d then (%s) else (%s))" (endl+1) (genResult start c_l) (genCode' node_r $ indent + 1)
+            ]
+
+    genCode' (Node _ _ node_l@(Node _ _ _ _) (Leaf startr _ c_r)) indent =
+        prefixEachLine indent
+            [ printf "({- 3 -} if n >= %d then (%s) else (%s))" startr (genResult startr c_r) (genCode' node_l $ indent + 1)
+            ]
+
+    genCode' (Node _ _ node_l@(Node _ endl _ _) node_r@(Node _ _ _ _)) indent =
+        prefixEachLine indent
+            [ printf "({- 4 -} if n < %d then (%s) else (%s))" (endl+1) (genCode' node_l $ indent + 1) (genCode' node_r $ indent + 1)
+            ]
+
+    -- Render the result for one leaf: a constant is shown as is, a table
+    -- reference becomes a lookup at @n@ minus the first index of the range.
+    genResult :: Show a => Int -> Either a Int -> String
+    genResult _ (Left s) = show s
+    -- genResult mi (Right idx) = intercalate " " ["lookupIntN (decodeHuffman (toEnum . fromIntegral, deserialized_huffman)", "\"" <> mapToAddrLiteral as "\"#)", "(n -", show mi, ")"]
+    genResult mi (Right idx) = intercalate " " ["byteArrayLookupIntN", "decompressed_table_" <> show idx, "(n -", show mi, ")"]
+
+    -- Replace every table stored in a leaf by its position in the list of
+    -- tables collected so far, visiting the left subtree first, and return
+    -- that list alongside the rewritten tree.
+    extract :: [[a]] -> RangeTree (Either a [a]) -> (RangeTree (Either a Int), [[a]])
+    extract acc (Leaf mi ma (Left v)) = (Leaf mi ma (Left v), acc)
+    extract acc (Leaf mi ma (Right v)) = (Leaf mi ma (Right (length acc)), acc ++ [v])
+    extract acc (Node mi ma lt rt) =
+        let
+            (e_lt, l_acc) = extract acc lt
+            (e_rt, r_acc) = extract l_acc rt
+        in (Node mi ma e_lt e_rt, r_acc)
+
+    -- Emit the definition of @decompressed_table_<idx>@ for one compressed
+    -- table.
+    genDecompressed :: forall a. Show a => ([a], Int) -> String
+    genDecompressed (acc, idx) =
+        let fn_name = "decompressed_table_" <> show idx
+        in intercalate "\n"
+        [ ""
+        , "{-# NOINLINE " <> fn_name <> " #-}"
+        , fn_name <> " :: UnicodeByteArray"
+        , fn_name <> " ="
+        , intercalate " " ["  let compressed = copyAddrToWord8List", "\"" <> mapToAddrLiteral acc "\"#", show (length acc)]
+        , printf "  in newByteArrayFromWord8List (decodeHuffman deserialized_huffman compressed)"
+        ]
+
+-- | Render the complete source text of the generated general-category
+-- module: pragmas, module header, imports, the lookup tables and the
+-- exported @generalCategory@ function.
+--
+-- The category list is expected in the order in which the parser accumulates
+-- it, i.e. last code point first (each entry is prepended to the
+-- accumulator). It is reversed once here so that both the range tables and
+-- the size bound are computed over the list in ascending code point order;
+-- building the ranges from the unreversed list would index them backwards.
+generateGeneralCategoryCode
+    :: (String -> String)
+    -- ^ How to generate the module header; the argument is the module name
+    -> String
+    -- ^ Module name
+    -> Int
+    -- ^ Length threshold forwarded to 'generateHaskellCode'
+    -> [GeneralCategory]
+    -- ^ General categories of all code points, in accumulation (reverse) order
+    -> String
+generateGeneralCategoryCode mkModuleHeader moduleName char_length cats =
+    unlines
+    [ "{-# LANGUAGE NoImplicitPrelude #-}"
+    , "{-# LANGUAGE MagicHash #-}"
+    , "{-# LANGUAGE TypeApplications #-}"
+    , "{-# OPTIONS_HADDOCK hide #-}"
+    , ""
+    , mkModuleHeader moduleName
+    , "module " <> moduleName
+    , "(generalCategory)"
+    , "where"
+    , ""
+    , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
+    , "import GHC.Internal.Unicode.Bits (UnicodeByteArray, copyAddrToWord8List, newByteArrayFromWord8List, byteArrayLookupIntN)"
+    , "import GHC.Internal.Unicode.Huffman (HuffmanTree, decodeHuffman, deserializeHuffman)"
+    , "import GHC.Internal.Num ((-))"
+    , "import GHC.Internal.Word (Word8)"
+    , ""
+    , generateHaskellCode char_length orderedCats
+    , ""
+    , genEnumBitmap "generalCategory" Cn orderedCats
+    ]
+  where
+    -- Categories in ascending code point order.
+    orderedCats = reverse cats


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/Huffman.hs
=====================================
@@ -0,0 +1,83 @@
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE PackageImports #-}
+
+module Generator.Huffman
+    ( mkHuffmanTree
+    , encodeHuffman
+    , serializeHuffman
+    )
+    where
+
+import Data.List (sortBy)
+import Data.Ord (comparing)
+import Data.Maybe (fromJust)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as Map
+import Data.Word (Word8)
+import Data.Bits (shiftL, (.|.))
+import Generator.HuffmanDecode (HuffmanTree (..))
+
+-- | Huffman tree annotated with weights, used only while the tree is being
+-- built: a leaf stores its symbol and frequency, an inner node stores the
+-- sum of the frequencies of its two subtrees.
+data HuffmanTreeFreq a
+    = HTFLeaf a Int
+    | HTFNode Int (HuffmanTreeFreq a) (HuffmanTreeFreq a)
+    deriving stock (Show, Eq, Functor)
+
+-- | Build a Huffman tree from @(symbol, frequency)@ pairs.
+--
+-- The two lightest trees are merged repeatedly (the lighter one becomes the
+-- left child) until a single tree is left; the weights are then discarded.
+-- The input must not be empty.
+buildHuffmanTree :: Ord a => [(a, Int)] -> HuffmanTree a
+buildHuffmanTree freqs = stripWeights (merge leavesByWeight)
+  where
+    weight :: HuffmanTreeFreq a -> Int
+    weight node = case node of
+      HTFLeaf _ w   -> w
+      HTFNode w _ _ -> w
+
+    -- All leaves, lightest first; the sort is stable.
+    leavesByWeight = sortBy (comparing weight) (map (uncurry HTFLeaf) freqs)
+
+    merge queue = case queue of
+      [] -> error "impossible: empty list is not an appropriate input here"
+      [only] -> only
+      lightest : second : others ->
+        merge (enqueue (HTFNode (weight lightest + weight second) lightest second) others)
+
+    -- Insert into the weight-sorted queue, in front of the first element
+    -- that is not lighter than the new node.
+    enqueue node [] = [node]
+    enqueue node queue@(q : qs)
+      | weight node > weight q = q : enqueue node qs
+      | otherwise              = node : queue
+
+    stripWeights :: HuffmanTreeFreq a -> HuffmanTree a
+    stripWeights node = case node of
+      HTFLeaf value _      -> HTLeaf value
+      HTFNode _ left right -> HTNode (stripWeights left) (stripWeights right)
+
+-- | Serialise a tree in pre-order: a leaf is written as @0x00@ followed by
+-- its converted value, an inner node as @0x01@ followed by the left and then
+-- the right subtree.
+serializeHuffman :: (a -> Word8) -> HuffmanTree a -> [Word8]
+serializeHuffman conv = go
+  where
+    go (HTLeaf value)      = [0x00, conv value]
+    go (HTNode left right) = 0x01 : (go left ++ go right)
+
+-- | Build a Huffman tree for a list of symbols, weighting every distinct
+-- symbol by the number of times it occurs. The list must not be empty.
+mkHuffmanTree :: (Ord a) => [a] -> HuffmanTree a
+mkHuffmanTree symbols = buildHuffmanTree (Map.toList occurrences)
+    where
+        -- Number of occurrences of every distinct symbol.
+        occurrences = Map.fromListWith (+) (zip symbols (repeat 1))
+
+-- | Encode a list of symbols with the given Huffman tree and pack the
+-- resulting bits into bytes, most significant bit first.
+--
+-- In a code, 'False' selects the left subtree and 'True' the right one. If
+-- the number of bits is not a multiple of 8, the last byte is padded with
+-- zero bits on the right, so that the real bits stay in the leading
+-- positions where an MSB-first decoder reads them.
+--
+-- Calls 'error' if a symbol does not occur in the tree. For a tree that is a
+-- single leaf every code is empty and the output carries no information.
+encodeHuffman :: (Ord a) => HuffmanTree a -> [a] -> [Word8]
+encodeHuffman huffmanTree = packBits . encodeBits (buildHuffmanTable huffmanTree)
+    where
+        -- Pack up to 8 bits into one byte; a short chunk is completed with
+        -- trailing zero bits instead of being right-aligned.
+        boolsToWord8 :: [Bool] -> Word8
+        boolsToWord8 bits =
+            foldl (\acc b -> shiftL acc 1 .|. if b then 1 else 0) 0 (take 8 (bits ++ repeat False))
+
+        chunksOf :: Int -> [a] -> [[a]]
+        chunksOf _ [] = []
+        chunksOf n xs = take n xs : chunksOf n (drop n xs)
+
+        packBits :: [Bool] -> [Word8]
+        packBits bits = map boolsToWord8 (chunksOf 8 bits)
+
+        encodeBits :: (Ord a) => Map.Map a [Bool] -> [a] -> [Bool]
+        encodeBits huffmanTable = concatMap codeFor
+            where
+                codeFor c = case Map.lookup c huffmanTable of
+                    Just code -> code
+                    Nothing   -> error "encodeHuffman: symbol is not present in the Huffman tree"
+
+        -- Map every symbol of the tree to its root-to-leaf path.
+        buildHuffmanTable :: Ord a => HuffmanTree a -> Map a [Bool]
+        buildHuffmanTable tree = Map.fromList $ buildCodes tree []
+            where
+                buildCodes (HTLeaf s) code = [(s, code)]
+                buildCodes (HTNode l r) code = buildCodes l (code ++ [False]) ++ buildCodes r (code ++ [True])


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/HuffmanDecode.hs
=====================================
@@ -0,0 +1,52 @@
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE ExplicitForAll #-}
+{-# LANGUAGE TypeApplications #-}
+
+module Generator.HuffmanDecode
+    ( decodeHuffman
+    , deserializeHuffman
+    , HuffmanTree (..)
+    )
+    where
+
+import Data.Word (Word8)
+import Data.Bits (testBit)
+import GHC.Show (Show (..))
+import GHC.Base (Bool, Eq, Functor, (.), (++), error, map)
+import qualified GHC.List as L (concatMap)
+
+-- | Binary prefix-code tree. A leaf carries a decoded symbol; an inner node
+-- has a left and a right subtree. All fields are strict.
+data HuffmanTree a
+    = HTLeaf !a
+    | HTNode !(HuffmanTree a) !(HuffmanTree a)
+    deriving stock (Show, Eq, Functor)
+
+-- | Rebuild a 'HuffmanTree' from its byte serialisation.
+--
+-- The format is a pre-order walk: tag @0x00@ followed by one value byte is a
+-- leaf (the value is passed through @conv@); tag @0x01@ is an inner node and
+-- is followed by the left and then the right subtree. Bytes remaining after
+-- the root tree are ignored. Any other input calls 'error'.
+deserializeHuffman :: forall a . (Word8 -> a) -> [Word8] -> HuffmanTree a
+deserializeHuffman conv bytes =
+    case parse bytes of
+        (tree, _) -> tree
+  where
+    -- Parse one subtree and return it together with the unconsumed bytes.
+    parse input = case input of
+        [] -> error "Unable to process empty list"
+        0x00 : value : rest -> (HTLeaf (conv value), rest)
+        0x01 : rest ->
+            case parse rest of
+                (left, afterLeft) -> case parse afterLeft of
+                    (right, afterRight) -> (HTNode left right, afterRight)
+        other -> error ("Unknown type of Huffman tree leaf: " ++ show other)
+
+-- | Decode a Huffman-encoded byte stream.
+--
+-- Each byte is read most significant bit first. Starting at the root, a
+-- clear bit selects the left subtree and a set bit the right one; reaching a
+-- leaf emits its symbol and restarts at the root.
+--
+-- A symbol whose code ends exactly on the last input bit is emitted as well.
+-- Trailing bits that do not complete a code are dropped; padding bits that
+-- happen to form complete codes produce extra trailing symbols, so callers
+-- must know how many symbols they expect.
+--
+-- A tree that is a single leaf has no code bits to consume; the result is
+-- then the empty list.
+decodeHuffman :: HuffmanTree a -> [Word8] -> [a]
+decodeHuffman huffman_tree = decodeBits huffman_tree . unpackBits
+    where
+        word8ToBools :: Word8 -> [Bool]
+        word8ToBools w = map (testBit w) [7, 6 .. 0]
+
+        unpackBits :: [Word8] -> [Bool]
+        unpackBits = L.concatMap word8ToBools
+
+        decodeBits :: HuffmanTree a -> [Bool] -> [a]
+        decodeBits (HTLeaf _) _ = []
+        decodeBits root bits = walk root bits
+            where
+                -- The leaf case comes first so that a symbol completed by the
+                -- very last bit is not lost.
+                walk (HTLeaf c) bs = c : walk root bs
+                walk (HTNode _ _) [] = []
+                walk (HTNode l r) (b:bs) = walk (if b then r else l) bs
+


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
=====================================
@@ -0,0 +1,73 @@
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE BlockArguments #-}
+module Generator.RangeSwitch
+    where
+
+import Generator.WordEncoding
+import Data.Word
+import Data.Bifunctor (bimap)
+import Generator.Huffman (encodeHuffman)
+import Generator.HuffmanDecode (HuffmanTree)
+
+-- | A contiguous range of indices together with the way its values are
+-- represented.
+data Case a = Case
+    { caseMin :: Int
+    -- ^ First index of the range (inclusive)
+    , caseMax :: Int
+    -- ^ Last index of the range (inclusive)
+    , caseConstant :: Either a [a]
+    -- ^ 'Left': one value shared by the whole range;
+    -- 'Right': the values of the range listed index by index
+    }
+    deriving stock (Show)
+
+-- | Concatenate the value lists of all table ('Right') cases, in order;
+-- constant ('Left') cases contribute nothing.
+extractLookupIntList :: [Case a] -> [a]
+extractLookupIntList = concatMap tableValues
+  where
+    tableValues (Case _ _ payload) = either (const []) id payload
+
+-- | Run-length split a list: every maximal run of equal neighbouring
+-- elements becomes @(first index, last index, value)@, with 0-based
+-- inclusive indices. Runs are returned in list order.
+ranges :: (Enum a, Eq a, Show a) => [a] -> [(Int,Int,a)]
+ranges [] = []
+ranges (firstValue : others) = collect 0 0 firstValue others
+  where
+    -- @collect lo hi current rest@: the run of @current@ spans @lo .. hi@ so
+    -- far, and @rest@ starts at index @hi + 1@.
+    collect lo hi current rest = case rest of
+      [] -> [(lo, hi, current)]
+      next : later
+        | next == current -> collect lo (hi + 1) current later
+        | otherwise       -> (lo, hi, current) : collect (hi + 1) (hi + 1) next later
+
+-- | Turn run-length ranges into lookup cases.
+--
+-- A range longer than @max_rep@ becomes a constant ('Left') case of its own.
+-- Consecutive ranges of at most @max_rep@ elements are merged into a single
+-- case; as soon as two or more of them have been merged the case carries the
+-- corresponding slice of @all_cats@ as a table ('Right'). A short range with
+-- no short neighbour stays a constant case.
+--
+-- @all_cats@ must be the list the ranges were computed from, since slices
+-- are taken from it by index.
+cases :: Int -> [a] -> [(Int,Int,a)] -> [Case a]
+cases max_rep all_cats = go
+  where
+    -- Not currently inside a group of short ranges.
+    go = \case
+        [] -> []
+        (r@(mi,ma,v):rs)
+            | rangeSize r > max_rep -> Case mi ma (Left v) : go rs
+            | otherwise             -> go_lookup mi ma (Left v) rs
+
+    -- Inside a group of short ranges spanning @rmi .. rma@ whose payload so
+    -- far is @mv@.
+    go_lookup rmi rma mv = \case
+        [] -> [Case rmi rma mv]
+        (r@(mi,ma,v):rs)
+            -- A long range closes the group and is emitted as a constant.
+            | rangeSize r > max_rep -> Case rmi rma mv : Case mi ma (Left v) : go rs
+            -- Another short range: extend the group to @ma@ and switch to a
+            -- table holding all values from @rmi@ to @ma@.
+            | otherwise             -> go_lookup rmi ma (Right (take (ma-rmi+1) (drop rmi all_cats))) rs
+
+    -- Number of indices covered by an inclusive range.
+    rangeSize :: Num a => (a, a, c) -> a
+    rangeSize (mi, ma, _) = ma - mi + 1
+
+-- | Split a list of values into lookup cases: the list is run-length split
+-- with 'ranges' and the result is grouped with 'cases'. The first argument
+-- is the run length above which a run is kept as a constant case.
+rangeCases :: (Enum a, Eq a, Show a) => Int -> [a] -> [Case a]
+rangeCases max_char_length cats = cases max_char_length cats (ranges cats)
+
+-- | Binary tree over index ranges. Both constructors start with the first
+-- and last index they cover; a leaf then carries its payload, a node its
+-- left and right subtrees.
+data RangeTree a
+    = Leaf Int Int a
+    | Node Int Int (RangeTree a) (RangeTree a)
+    deriving stock (Show)
+
+-- | Build a balanced binary tree over a list of cases by splitting the list
+-- in the middle recursively.
+--
+-- A single case becomes a leaf. A node covers the range from the first
+-- index of its left subtree to the last index of its right subtree. The
+-- cases are presumably expected in ascending index order (confirm with the
+-- caller); the list must not be empty, otherwise 'error' is called.
+buildRangeTree :: [Case a] -> RangeTree (Either a [a])
+buildRangeTree [] = error "buildRangeTree: cannot build a tree from an empty list of cases"
+buildRangeTree [Case start end cat] = Leaf start end cat
+buildRangeTree ranges' = Node (lowerBound left) (upperBound right) left right
+  where
+    -- With two or more cases both halves are non-empty.
+    (leftRanges, rightRanges) = splitAt (length ranges' `div` 2) ranges'
+    left = buildRangeTree leftRanges
+    right = buildRangeTree rightRanges
+
+    lowerBound (Leaf lo _ _)   = lo
+    lowerBound (Node lo _ _ _) = lo
+
+    upperBound (Leaf _ hi _)   = hi
+    upperBound (Node _ hi _ _) = hi
+
+-- | Convert every case to bytes: a constant is converted with 'toWord8', a
+-- table is compressed with 'encodeHuffman' using the given tree. The index
+-- bounds are kept unchanged.
+rangesToWord8 :: (Show a, Enum a, Ord a) => HuffmanTree a -> [Case a] -> [Case Word8]
+rangesToWord8 htree = map encodeCase
+  where
+    encodeCase (Case lo hi payload) = Case lo hi (encodePayload payload)
+
+    encodePayload (Left constant) = Left (toWord8 constant)
+    encodePayload (Right table)   = Right (encodeHuffman htree table)
+


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
=====================================
@@ -0,0 +1,40 @@
+module Generator.WordEncoding where
+
+import Data.Word
+
+-- | Convert a value to a byte through its 'Enum' instance. Calls 'error'
+-- when 'fromEnum' yields a number outside @0 .. 0xff@.
+toWord8 :: (Show a, Enum a) => a -> Word8
+toWord8 a
+    | code < 0 || code > 0xff = error $ "Cannot convert to Word8: " <> show a
+    | otherwise               = fromIntegral code
+  where
+    code = fromEnum a
+
+{-| Render a list of values as the body of an address literal: every value
+is converted to a byte via its 'Enum' instance and written as a backslash
+followed by the decimal byte value. The given string is appended at the end.
+
+__Note:__ the 'Enum' instance must respect the following:
+
+* @fromEnum minBound >= 0x00@
+* @fromEnum maxBound <= 0xff@
+-}
+enumMapToAddrLiteral ::
+  forall a. (Bounded a, Enum a, Show a) =>
+  -- | Values to encode
+  [a] ->
+  -- | String to append
+  String ->
+  String
+enumMapToAddrLiteral xs cs = foldr escape cs (map toWord8 xs)
+    where
+    escape byte rest = '\\' : shows byte rest
+
+-- Like enumMapToAddrLiteral, but for values that are already bytes: every
+-- element is written as a backslash followed by its 'Show' rendering, and
+-- the given string is appended at the end.
+mapToAddrLiteral ::
+    forall a. (Show a) =>
+    -- | Values to encode
+    [a] ->
+    -- | String to append
+    String ->
+    String
+mapToAddrLiteral xs cs = foldr (\x rest -> '\\' : shows x rest) cs xs


=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Parser/Text.hs
=====================================
@@ -33,6 +33,7 @@ import Streamly.Data.Fold (Fold)
 import System.Directory (createDirectoryIfMissing)
 import System.Environment (getEnv)
 import System.FilePath ((</>), (<.>))
+import Generator.GeneralCategory (GeneralCategory(Cn), generateGeneralCategoryCode)
 
 -- import qualified Data.Set as Set
 import Streamly.Data.Stream (Stream)
@@ -51,17 +52,6 @@ import Prelude hiding (pred)
 -- Types
 -------------------------------------------------------------------------------
 
-data GeneralCategory =
-    Lu|Ll|Lt|             --LC
-    Lm|Lo|                --L
-    Mn|Mc|Me|             --M
-    Nd|Nl|No|             --N
-    Pc|Pd|Ps|Pe|Pi|Pf|Po| --P
-    Sm|Sc|Sk|So|          --S
-    Zs|Zl|Zp|             --Z
-    Cc|Cf|Cs|Co|Cn        --C
-    deriving (Show, Bounded, Enum, Read)
-
 data DecompType =
        DTCanonical | DTCompat  | DTFont
      | DTNoBreak   | DTInitial | DTMedial   | DTFinal
@@ -189,57 +179,6 @@ bitMapToAddrLiteral bs cs = foldr encode cs (unfoldr mkChunks bs)
     toByte :: [Bool] -> Int
     toByte xs = sum $ map (\i -> if xs !! i then 1 `shiftL` i else 0) [0..7]
 
-genEnumBitmap ::
-  forall a. (Bounded a, Enum a, Show a) =>
-  -- | Function name
-  String ->
-  -- | Default value
-  a ->
-  -- | List of values to encode
-  [a] ->
-  String
-genEnumBitmap funcName def as = unlines
-    [ "{-# INLINE " <> funcName <> " #-}"
-    , funcName <> " :: Char -> Int"
-    , funcName <> " c = let n = ord c in if n >= "
-               <> show (length as)
-               <> " then "
-               <> show (fromEnum def)
-               <> " else lookup_bitmap n"
-
-    , "{-# NOINLINE lookup_bitmap #-}"
-    , "lookup_bitmap :: Int -> Int"
-    , "lookup_bitmap n = lookupIntN bitmap# n"
-    , "  where"
-    , "    bitmap# = \"" <> enumMapToAddrLiteral as "\"#"
-    ]
-
-{-| Encode a list of values as a byte map, using their 'Enum' instance.
-
-__Note:__ 'Enum' instance must respect the following:
-
-* @fromEnum minBound >= 0x00@
-* @fromEnum maxBound <= 0xff@
--}
-enumMapToAddrLiteral ::
-  forall a. (Bounded a, Enum a, Show a) =>
-  -- | Values to encode
-  [a] ->
-  -- | String to append
-  String ->
-  String
-enumMapToAddrLiteral xs cs = foldr go cs xs
-
-    where
-
-    go :: a -> String -> String
-    go x acc = '\\' : shows (toWord8 x) acc
-
-    toWord8 :: a -> Word8
-    toWord8 a = let w = fromEnum a in if 0 <= w && w <= 0xff
-        then fromIntegral w
-        else error $ "Cannot convert to Word8: " <> show a
-
 {- [NOTE] Disabled generator (normalization)
 -- This bit of code is duplicated but this duplication allows us to reduce 2
 -- dependencies on the executable.
@@ -321,21 +260,7 @@ genGeneralCategoryModule moduleName =
         -- Regular entry
         else (_generalCategory a : acc, succ (_char a))
 
-    done (acc, _) = unlines
-        [ "{-# LANGUAGE NoImplicitPrelude #-}"
-        , "{-# LANGUAGE MagicHash #-}"
-        , "{-# OPTIONS_HADDOCK hide #-}"
-        , ""
-        , mkModuleHeader moduleName
-        , "module " <> moduleName
-        , "(generalCategory)"
-        , "where"
-        , ""
-        , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
-        , "import GHC.Internal.Unicode.Bits (lookupIntN)"
-        , ""
-        , genEnumBitmap "generalCategory" Cn (reverse acc)
-        ]
+    done (acc, _) = generateGeneralCategoryCode mkModuleHeader moduleName 220 acc
 
 readDecomp :: String -> (Maybe DecompType, Decomp)
 readDecomp s =


=====================================
libraries/ghc-internal/tools/ucd2haskell/ucd.sh
=====================================
@@ -71,6 +71,23 @@ run_generator() {
         #   --core-prop XID_Continue \
         #   --core-prop Pattern_Syntax \
         #   --core-prop Pattern_White_Space
+
+    echo "-- DO NOT EDIT IT HERE. It is automatically copied from ucd2haskell tool's Generator.HuffmanDecode" > "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    cat "$SCRIPT_DIR/exe/Generator/HuffmanDecode.hs" >> "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+
+    # See https://stackoverflow.com/a/22084103
+    sed -i.bak -e "s/module Generator\.HuffmanDecode/module GHC.Internal.Unicode.Huffman/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+    sed -i.bak -e "s/import Data\.Word/import GHC.Internal.Word/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+    sed -i.bak -e "s/import Data\.Bits/import GHC.Internal.Bits/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+    sed -i.bak -e "s/import GHC\.Show/import GHC.Internal.Show/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+    sed -i.bak -e "s/import GHC\.Base/import GHC.Internal.Base/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+    sed -i.bak -e "s/import qualified GHC\.List/import qualified GHC.Internal.List/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+    rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
 }
 
 # Print help text


=====================================
libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal
=====================================
@@ -50,7 +50,13 @@ executable ucd2haskell
   ghc-options: -O2
   hs-source-dirs: exe
   main-is: UCD2Haskell.hs
-  other-modules: Parser.Text
+  other-modules:
+    Parser.Text
+    Generator.RangeSwitch
+    Generator.GeneralCategory
+    Generator.WordEncoding
+    Generator.Huffman
+    Generator.HuffmanDecode
   build-depends:
       base             >= 4.7   && < 4.20
     , streamly-core    >= 0.2.2 && < 0.3
@@ -60,3 +66,4 @@ executable ucd2haskell
     , containers       >= 0.5   && < 0.7
     , directory        >= 1.3.6 && < 1.3.8
     , filepath         >= 1.4.2 && < 1.5
+    , ghc-prim         >= 0.11 && < 0.12



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/415c634cebce12e638e6a98039f0a1acf16e9b14

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/415c634cebce12e638e6a98039f0a1acf16e9b14
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240608/8ad3c57f/attachment-0001.html>


More information about the ghc-commits mailing list