[Git][ghc/ghc][wip/T24789_impl] Unicode: adding compact version of GeneralCategory
Serge S. Gulin (@gulin.serge)
gitlab at gitlab.haskell.org
Sat Jun 8 22:43:14 UTC 2024
Serge S. Gulin pushed to branch wip/T24789_impl at Glasgow Haskell Compiler / GHC
Commits:
415c634c by Serge S. Gulin at 2024-06-09T01:42:50+03:00
Unicode: adding compact version of GeneralCategory
The following features are applied:
1. Lookup code like Cmm-switches (draft implementation proposed by Sylvain Henry @hsyl20)
2. Nested ifs (logarithmic search vs linear search) (the idea proposed by Sylvain Henry @hsyl20)
3. More compact representation via variable encoding by Huffman
- - - - -
12 changed files:
- libraries/ghc-internal/ghc-internal.cabal
- libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
- libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
- + libraries/ghc-internal/src/GHC/Internal/Unicode/Huffman.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/Huffman.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/HuffmanDecode.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
- + libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
- libraries/ghc-internal/tools/ucd2haskell/exe/Parser/Text.hs
- libraries/ghc-internal/tools/ucd2haskell/ucd.sh
- libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal
Changes:
=====================================
libraries/ghc-internal/ghc-internal.cabal
=====================================
@@ -318,6 +318,7 @@ Library
GHC.Internal.Event.PSQ
GHC.Internal.Event.Unique
-- GHC.Internal.IOPort -- TODO: hide again after debug
+ GHC.Internal.Unicode.Huffman
GHC.Internal.Unicode.Bits
GHC.Internal.Unicode.Char.DerivedCoreProperties
GHC.Internal.Unicode.Char.UnicodeData.GeneralCategory
=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Bits.hs
=====================================
@@ -1,6 +1,9 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedNewtypes #-}
+{-# LANGUAGE BlockArguments #-}
-----------------------------------------------------------------------------
-- |
@@ -18,20 +21,22 @@
-----------------------------------------------------------------------------
module GHC.Internal.Unicode.Bits
- ( lookupBit64,
- lookupIntN
- ) where
+ ( lookupIntN
+ , lookupBit64
+ , newByteArrayFromWord8List
+ , byteArrayLookupIntN
+ , copyAddrToWord8List
+ , UnicodeByteArray
+ )
+ where
-import GHC.Internal.Base (Bool, Int(..), Word(..), Eq(..))
import GHC.Internal.Bits (finiteBitSize, popCount)
-import {-# SOURCE #-} GHC.Internal.ByteOrder
import GHC.Prim
- (Addr#,
- indexWordOffAddr#, indexWord8OffAddr#,
- andI#, uncheckedIShiftRL#,
- and#, word2Int#, uncheckedShiftL#,
- word8ToWord#, byteSwap#)
-import GHC.Internal.Num ((-))
+import GHC.Internal.ST
+import GHC.Internal.Base
+import GHC.Internal.Num
+import GHC.Internal.List
+import GHC.Internal.Word
-- | @lookup64 addr index@ looks up the bit stored at bit index @index@ using a
-- bitmap starting at the address @addr at . Looks up the 64-bit word containing
@@ -49,9 +54,7 @@ lookupBit64 addr# (I# index#) = W# (word## `and#` bitMask##) /= 0
_ -> popCount fbs -- this is a really weird architecture
wordIndex# = index# `uncheckedIShiftRL#` logFbs#
- word## = case targetByteOrder of
- BigEndian -> byteSwap# (indexWordOffAddr# addr# wordIndex#)
- LittleEndian -> indexWordOffAddr# addr# wordIndex#
+ word## = byteSwap# (indexWordOffAddr# addr# wordIndex#)
bitIndex# = index# `andI#` fbs#
bitMask## = 1## `uncheckedShiftL#` bitIndex#
@@ -71,3 +74,38 @@ lookupIntN
lookupIntN addr# (I# index#) =
let word## = word8ToWord# (indexWord8OffAddr# addr# index#)
in I# (word2Int# word##)
+
+data UnicodeByteArray = UnicodeByteArray !ByteArray#
+
+byteArrayLookupIntN :: UnicodeByteArray -> Int -> Int
+byteArrayLookupIntN ba idx
+ = let !(UnicodeByteArray addr) = ba
+ in lookupIntN (byteArrayContents# addr) idx
+
+newByteArrayFromWord8List :: [Word8] -> UnicodeByteArray
+newByteArrayFromWord8List xs = runST $ ST \s0 ->
+ case newPinnedByteArray# len s0 of
+ !(# s1, mba #) ->
+ let s2 = fillByteArray mba 0# xs s1
+ in case unsafeFreezeByteArray# mba s2 of
+ !(# s3, fba #) -> (# s3, UnicodeByteArray fba #)
+ where
+ !(I# len) = length xs
+
+ fillByteArray _ _ [] s = s
+ fillByteArray mba i (y:ys) s =
+ let !(W8# y#) = y
+ s' = writeWord8Array# mba i y# s
+ in fillByteArray mba (i +# 1#) ys s'
+
+copyAddrToWord8List :: Addr# -> Int -> [Word8]
+copyAddrToWord8List addr !(I# len) = runST $ ST \s0 ->
+ case newByteArray# len s0 of
+ !(# s1, mba #) ->
+ let s2 = copyAddrToByteArray# addr mba 0# len s1
+ in case unsafeFreezeByteArray# mba s2 of
+ !(# s3, fba #) -> (# s3, readByteFromArray fba len #)
+ where
+ readByteFromArray :: ByteArray# -> Int# -> [Word8]
+ readByteFromArray ba i =
+ W8# (indexWord8Array# ba i) : readByteFromArray ba (i +# 1#)
=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Char/UnicodeData/GeneralCategory.hs
=====================================
The diff for this file was not included because it is too large.
=====================================
libraries/ghc-internal/src/GHC/Internal/Unicode/Huffman.hs
=====================================
@@ -0,0 +1,53 @@
+-- DO NOT EDIT IT HERE. It is automatically copied from ucd2haskell tool's Generator.HuffmanDecode
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE ExplicitForAll #-}
+{-# LANGUAGE TypeApplications #-}
+
+module GHC.Internal.Unicode.Huffman
+ ( decodeHuffman
+ , deserializeHuffman
+ , HuffmanTree (..)
+ )
+ where
+
+import GHC.Internal.Word (Word8)
+import GHC.Internal.Bits (testBit)
+import GHC.Internal.Show (Show (..))
+import GHC.Internal.Base (Bool, Eq, Functor, (.), (++), error, map)
+import qualified GHC.Internal.List as L (concatMap)
+
+data HuffmanTree a
+ = HTLeaf !a
+ | HTNode !(HuffmanTree a) !(HuffmanTree a)
+ deriving stock (Show, Eq, Functor)
+
+deserializeHuffman :: forall a . (Word8 -> a) -> [Word8] -> HuffmanTree a
+deserializeHuffman conv = (\(a, _) -> a) . go
+ where
+ go [] = error "Unable to process empty list"
+ go (0x00:value:rest) = (HTLeaf (conv value), rest)
+ go (0x01:rest) =
+ let
+ (left, rest') = go rest
+ (right, rest'') = go rest'
+ in (HTNode left right, rest'')
+ go v = error ("Unknown type of Huffman tree leaf: " ++ show v)
+
+decodeHuffman :: HuffmanTree a -> [Word8] -> [a]
+decodeHuffman huffman_tree = decodeBits huffman_tree . unpackBits
+ where
+ word8ToBools :: Word8 -> [Bool]
+ word8ToBools w = map (testBit w) [7, 6 .. 0]
+
+ unpackBits :: [Word8] -> [Bool]
+ unpackBits = L.concatMap word8ToBools
+
+ decodeBits :: HuffmanTree a -> [Bool] -> [a]
+ decodeBits tree bits = decodeBits' tree bits tree
+ where
+ decodeBits' _ [] _ = []
+ decodeBits' (HTLeaf c) bs tree' = c : decodeBits' tree' bs tree'
+ decodeBits' (HTNode l r) (b:bs) tree' = decodeBits' next bs tree'
+ where next = if b then r else l
+
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/GeneralCategory.hs
=====================================
@@ -0,0 +1,145 @@
+{-# LANGUAGE BlockArguments #-}
+module Generator.GeneralCategory (GeneralCategory (..), generateGeneralCategoryCode) where
+
+import Generator.RangeSwitch
+import Generator.WordEncoding
+import Data.List (intercalate)
+import Text.Printf (printf)
+import Generator.Huffman (mkHuffmanTree, serializeHuffman)
+
+data GeneralCategory =
+ Lu|Ll|Lt| --LC
+ Lm|Lo| --L
+ Mn|Mc|Me| --M
+ Nd|Nl|No| --N
+ Pc|Pd|Ps|Pe|Pi|Pf|Po| --P
+ Sm|Sc|Sk|So| --S
+ Zs|Zl|Zp| --Z
+ Cc|Cf|Cs|Co|Cn --C
+ deriving (Show, Eq, Ord, Bounded, Enum, Read)
+
+genEnumBitmap ::
+ forall a. (Bounded a, Enum a, Show a) =>
+ -- | Function name
+ String ->
+ -- | Default value
+ a ->
+ -- | List of values to encode
+ [a] ->
+ String
+genEnumBitmap funcName def as = unlines
+ [ "{-# INLINE " <> funcName <> " #-}"
+ , funcName <> " :: Char -> Int"
+ , funcName <> " c = let n = ord c in if n >= "
+ <> show (length as)
+ <> " then "
+ <> show (fromEnum def)
+ <> " else lookup_bitmap n"
+ ]
+
+generateHaskellCode :: Int -> [GeneralCategory] -> String
+generateHaskellCode max_char_length cats =
+ let (index_tree, all_allocs) = extract [] range_tree
+ in intercalate "\n"
+ [ "{-# NOINLINE deserialized_huffman #-}"
+ , "deserialized_huffman :: HuffmanTree Word8"
+ , "deserialized_huffman ="
+ , intercalate " " [" let huffman_tree =", "\"" <> mapToAddrLiteral serialized_huffman "\"#"]
+ , printf " in deserializeHuffman (\\x -> x) (copyAddrToWord8List huffman_tree %d)" (length serialized_huffman)
+ , intercalate "\n" (fmap genDecompressed (zip all_allocs [0..]))
+ , ""
+ , "{-# NOINLINE lookup_bitmap #-}"
+ , "lookup_bitmap :: Int -> Int"
+ , "lookup_bitmap n ="
+ , printf " (%s)" (genCode' index_tree 2)
+ ]
+ where
+ cases' = rangeCases max_char_length cats
+ huffmanTree = mkHuffmanTree $ extractLookupIntList cases'
+ cases_huffman_encoded = rangesToWord8 huffmanTree cases'
+ range_tree = buildRangeTree cases_huffman_encoded
+
+ serialized_huffman = serializeHuffman toWord8 huffmanTree
+
+ prefixEachLine indent ls = concatMap (\l -> "\n" ++ replicate (indent*2) ' ' ++ l) ls
+
+ genCode' :: (Show a) => RangeTree (Either a Int) -> Int -> String
+ genCode' (Leaf _ _ cat) _ = show cat
+ genCode' (Node start _ (Leaf _ endl c_l) (Leaf startr _ c_r)) indent =
+ prefixEachLine indent
+ [ printf "({- 1 -} if n < %d then (%s) else (%s))" (endl+1) (genResult start c_l) (genResult startr c_r)
+ ]
+
+ genCode' (Node start _ (Leaf _ endl c_l) node_r@(Node _ _ _ _)) indent =
+ prefixEachLine indent
+ [ printf "({- 2 -}if n < %d then (%s) else (%s))" (endl+1) (genResult start c_l) (genCode' node_r $ indent + 1)
+ ]
+
+ genCode' (Node _ _ node_l@(Node _ _ _ _) (Leaf startr _ c_r)) indent =
+ prefixEachLine indent
+ [ printf "({- 3 -} if n >= %d then (%s) else (%s))" startr (genResult startr c_r) (genCode' node_l $ indent + 1)
+ ]
+
+ genCode' (Node _ _ node_l@(Node _ endl _ _) node_r@(Node _ _ _ _)) indent =
+ prefixEachLine indent
+ [ printf "({- 4 -} if n < %d then (%s) else (%s))" (endl+1) (genCode' node_l $ indent + 1) (genCode' node_r $ indent + 1)
+ ]
+
+ genResult :: Show a => Int -> Either a Int -> String
+ genResult _ (Left s) = show s
+ -- genResult mi (Right idx) = intercalate " " ["lookupIntN (decodeHuffman (toEnum . fromIntegral, deserialized_huffman)", "\"" <> mapToAddrLiteral as "\"#)", "(n -", show mi, ")"]
+ genResult mi (Right idx) = intercalate " " ["byteArrayLookupIntN", "decompressed_table_" <> show idx, "(n -", show mi, ")"]
+
+ extract :: [[a]] -> RangeTree (Either a [a]) -> (RangeTree (Either a Int), [[a]])
+ extract acc (Leaf mi ma (Left v)) = (Leaf mi ma (Left v), acc)
+ extract acc (Leaf mi ma (Right v)) = (Leaf mi ma (Right (length acc)), acc ++ [v])
+ extract acc (Node mi ma lt rt) =
+ let
+ (e_lt, l_acc) = extract acc lt
+ (e_rt, r_acc) = extract l_acc rt
+ in (Node mi ma e_lt e_rt, r_acc)
+
+ genDecompressed :: forall a. Show a => ([a], Int) -> String
+ genDecompressed (acc, idx) =
+ let fn_name = "decompressed_table_" <> show idx
+ in intercalate "\n"
+ [ ""
+ , "{-# NOINLINE " <> fn_name <> " #-}"
+ , fn_name <> " :: UnicodeByteArray"
+ , fn_name <> " ="
+ , intercalate " " [" let compressed = copyAddrToWord8List", "\"" <> mapToAddrLiteral acc "\"#", show (length acc)]
+ , printf " in newByteArrayFromWord8List (decodeHuffman deserialized_huffman compressed)"
+ ]
+
+generateGeneralCategoryCode
+ :: (String -> String)
+ -- ^-- How to generate module header where first arg us module name
+ -> String
+ -- ^-- Module name
+ -> Int
+ -- ^-- Max char length
+ -> [GeneralCategory]
+ -- ^-- imported general categories for all symbol list
+ -> String
+generateGeneralCategoryCode mkModuleHeader moduleName char_length cats =
+ unlines
+ [ "{-# LANGUAGE NoImplicitPrelude #-}"
+ , "{-# LANGUAGE MagicHash #-}"
+ , "{-# LANGUAGE TypeApplications #-}"
+ , "{-# OPTIONS_HADDOCK hide #-}"
+ , ""
+ , mkModuleHeader moduleName
+ , "module " <> moduleName
+ , "(generalCategory)"
+ , "where"
+ , ""
+ , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
+ , "import GHC.Internal.Unicode.Bits (UnicodeByteArray, copyAddrToWord8List, newByteArrayFromWord8List, byteArrayLookupIntN)"
+ , "import GHC.Internal.Unicode.Huffman (HuffmanTree, decodeHuffman, deserializeHuffman)"
+ , "import GHC.Internal.Num ((-))"
+ , "import GHC.Internal.Word (Word8)"
+ , ""
+ , generateHaskellCode char_length cats
+ , ""
+ , genEnumBitmap "generalCategory" Cn (reverse cats)
+ ]
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/Huffman.hs
=====================================
@@ -0,0 +1,83 @@
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE PackageImports #-}
+
+module Generator.Huffman
+ ( mkHuffmanTree
+ , encodeHuffman
+ , serializeHuffman
+ )
+ where
+
+import Data.List (sortBy)
+import Data.Ord (comparing)
+import Data.Maybe (fromJust)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as Map
+import Data.Word (Word8)
+import Data.Bits (shiftL, (.|.))
+import Generator.HuffmanDecode (HuffmanTree (..))
+
+data HuffmanTreeFreq a
+ = HTFLeaf a Int
+ | HTFNode Int (HuffmanTreeFreq a) (HuffmanTreeFreq a)
+ deriving stock (Show, Eq, Functor)
+
+buildHuffmanTree :: Ord a => [(a, Int)] -> HuffmanTree a
+buildHuffmanTree freqs = convertTree $ buildTree initialQueue
+ where
+ frequency :: HuffmanTreeFreq a -> Int
+ frequency (HTFLeaf _ f) = f
+ frequency (HTFNode f _ _) = f
+
+ initialQueue = sortBy (comparing frequency) [HTFLeaf s f | (s, f) <- freqs]
+
+ buildTree [] = error "impossible: empty list is not an appropriate input here"
+ buildTree [t] = t
+ buildTree (t1:t2:ts) =
+ let newNode = HTFNode (frequency t1 + frequency t2) t1 t2
+ newQueue = insertBy (comparing frequency) newNode ts
+ in buildTree newQueue
+
+ insertBy :: (a -> a -> Ordering) -> a -> [a] -> [a]
+ insertBy _ x [] = [x]
+ insertBy cmp x ys@(y:ys')
+ = case cmp x y of
+ GT -> y : insertBy cmp x ys'
+ _ -> x : ys
+
+ convertTree :: HuffmanTreeFreq a -> HuffmanTree a
+ convertTree (HTFLeaf value _) = HTLeaf value
+ convertTree (HTFNode _ left right) = HTNode (convertTree left) (convertTree right)
+
+serializeHuffman :: (a -> Word8) -> HuffmanTree a -> [Word8]
+serializeHuffman conv (HTLeaf value) = [0x00, conv value]
+serializeHuffman conv (HTNode left right) = [0x01] ++ serializeHuffman conv left ++ serializeHuffman conv right
+
+mkHuffmanTree :: (Ord a) => [a] -> HuffmanTree a
+mkHuffmanTree = buildHuffmanTree . Map.toList . huffmanStats
+ where
+ huffmanStats :: (Ord a) => [a] -> Map a Int
+ huffmanStats l = Map.fromListWith (+) [(c, 1) | c <- l]
+
+encodeHuffman :: (Ord a) => HuffmanTree a -> [a] -> [Word8]
+encodeHuffman huffmanTree = packBits . encodeBits (buildHuffmanTable huffmanTree)
+ where
+ boolsToWord8 :: [Bool] -> Word8
+ boolsToWord8 = foldl (\acc b -> shiftL acc 1 .|. if b then 1 else 0) 0
+
+ chunksOf :: Int -> [a] -> [[a]]
+ chunksOf _ [] = []
+ chunksOf n xs = take n xs : chunksOf n (drop n xs)
+
+ packBits :: [Bool] -> [Word8]
+ packBits bits = map boolsToWord8 (chunksOf 8 bits)
+
+ encodeBits :: (Ord a) => Map.Map a [Bool] -> [a] -> [Bool]
+ encodeBits huffmanTable cc = concatMap (\c -> fromJust $ Map.lookup c huffmanTable) cc
+
+ buildHuffmanTable :: Ord a => HuffmanTree a -> Map a [Bool]
+ buildHuffmanTable tree = Map.fromList $ buildCodes tree []
+ where
+ buildCodes (HTLeaf s) code = [(s, code)]
+ buildCodes (HTNode l r) code = buildCodes l (code ++ [False]) ++ buildCodes r (code ++ [True])
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/HuffmanDecode.hs
=====================================
@@ -0,0 +1,52 @@
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE ExplicitForAll #-}
+{-# LANGUAGE TypeApplications #-}
+
+module Generator.HuffmanDecode
+ ( decodeHuffman
+ , deserializeHuffman
+ , HuffmanTree (..)
+ )
+ where
+
+import Data.Word (Word8)
+import Data.Bits (testBit)
+import GHC.Show (Show (..))
+import GHC.Base (Bool, Eq, Functor, (.), (++), error, map)
+import qualified GHC.List as L (concatMap)
+
+data HuffmanTree a
+ = HTLeaf !a
+ | HTNode !(HuffmanTree a) !(HuffmanTree a)
+ deriving stock (Show, Eq, Functor)
+
+deserializeHuffman :: forall a . (Word8 -> a) -> [Word8] -> HuffmanTree a
+deserializeHuffman conv = (\(a, _) -> a) . go
+ where
+ go [] = error "Unable to process empty list"
+ go (0x00:value:rest) = (HTLeaf (conv value), rest)
+ go (0x01:rest) =
+ let
+ (left, rest') = go rest
+ (right, rest'') = go rest'
+ in (HTNode left right, rest'')
+ go v = error ("Unknown type of Huffman tree leaf: " ++ show v)
+
+decodeHuffman :: HuffmanTree a -> [Word8] -> [a]
+decodeHuffman huffman_tree = decodeBits huffman_tree . unpackBits
+ where
+ word8ToBools :: Word8 -> [Bool]
+ word8ToBools w = map (testBit w) [7, 6 .. 0]
+
+ unpackBits :: [Word8] -> [Bool]
+ unpackBits = L.concatMap word8ToBools
+
+ decodeBits :: HuffmanTree a -> [Bool] -> [a]
+ decodeBits tree bits = decodeBits' tree bits tree
+ where
+ decodeBits' _ [] _ = []
+ decodeBits' (HTLeaf c) bs tree' = c : decodeBits' tree' bs tree'
+ decodeBits' (HTNode l r) (b:bs) tree' = decodeBits' next bs tree'
+ where next = if b then r else l
+
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/RangeSwitch.hs
=====================================
@@ -0,0 +1,73 @@
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE BlockArguments #-}
+module Generator.RangeSwitch
+ where
+
+import Generator.WordEncoding
+import Data.Word
+import Data.Bifunctor (bimap)
+import Generator.Huffman (encodeHuffman)
+import Generator.HuffmanDecode (HuffmanTree)
+
+data Case a = Case
+ { caseMin :: Int
+ , caseMax :: Int
+ , caseConstant :: Either a [a]
+ }
+ deriving stock (Show)
+
+extractLookupIntList :: [Case a] -> [a]
+extractLookupIntList = concat . (fmap \(Case _ _ cc) -> either (const []) id cc)
+
+ranges :: (Enum a, Eq a, Show a) => [a] -> [(Int,Int,a)]
+ranges = \case
+ [] -> []
+ (x:xs) -> reverse (go 0 0 x [] xs)
+ where
+ go mi ma v rs = \case
+ [] -> (mi,ma,v):rs
+ (x:xs)
+ | x == v -> go mi (ma+1) v rs xs
+ | otherwise -> go (ma+1) (ma+1) x ((mi,ma,v):rs) xs
+
+cases :: Int -> [a] -> [(Int,Int,a)] -> [Case a]
+cases max_rep all_cats = go
+ where
+ go = \case
+ [] -> []
+ (r@(mi,ma,v):rs)
+ | rangeSize r > max_rep -> Case mi ma (Left v) : go rs
+ | otherwise -> go_lookup mi ma (Left v) rs
+
+ go_lookup rmi rma mv = \case
+ [] -> [Case rmi rma mv]
+ (r@(mi,ma,v):rs)
+ | rangeSize r > max_rep -> Case rmi rma mv : Case mi ma (Left v) : go rs
+ | otherwise -> go_lookup rmi ma (Right (take (ma-rmi+1) (drop rmi all_cats))) rs
+
+ rangeSize :: Num a => (a, a, c) -> a
+ rangeSize (mi, ma, _) = ma - mi + 1
+
+rangeCases :: (Enum a, Eq a, Show a) => Int -> [a] -> [Case a]
+rangeCases max_char_length cats = cases max_char_length cats (ranges cats)
+
+data RangeTree a
+ = Leaf Int Int a
+ | Node Int Int (RangeTree a) (RangeTree a)
+ deriving stock (Show)
+
+buildRangeTree :: [Case a] -> RangeTree (Either a [a])
+buildRangeTree [(Case start end cat)] = Leaf start end cat
+buildRangeTree ranges' =
+ let
+ mid = length ranges' `div` 2
+ (leftRanges, rightRanges) = splitAt mid ranges'
+ (Case startL _ _) = head leftRanges
+ (Case _ endR _) = last rightRanges
+ in Node startL endR (buildRangeTree leftRanges) (buildRangeTree rightRanges)
+
+rangesToWord8 :: (Show a, Enum a, Ord a) => HuffmanTree a -> [Case a] -> [Case Word8]
+rangesToWord8 htree = fmap \(Case mi ma c) ->
+ Case mi ma $ bimap toWord8 (encodeHuffman htree) c
+
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Generator/WordEncoding.hs
=====================================
@@ -0,0 +1,40 @@
+module Generator.WordEncoding where
+
+import Data.Word
+
+toWord8 :: (Show a, Enum a) => a -> Word8
+toWord8 a = let w = fromEnum a in if 0 <= w && w <= 0xff
+ then fromIntegral w
+ else error $ "Cannot convert to Word8: " <> show a
+
+{-| Encode a list of values as a byte map, using their 'Enum' instance.
+
+__Note:__ 'Enum' instance must respect the following:
+
+* @fromEnum minBound >= 0x00@
+* @fromEnum maxBound <= 0xff@
+-}
+enumMapToAddrLiteral ::
+ forall a. (Bounded a, Enum a, Show a) =>
+ -- | Values to encode
+ [a] ->
+ -- | String to append
+ String ->
+ String
+enumMapToAddrLiteral xs cs = foldr go cs xs
+ where
+ go :: a -> String -> String
+ go x acc = '\\' : shows (toWord8 x) acc
+
+-- Same as enumMapToAddrLiteral but for already converted to Word8
+mapToAddrLiteral ::
+ forall a. (Show a) =>
+ -- | Values to encode
+ [a] ->
+ -- | String to append
+ String ->
+ String
+mapToAddrLiteral xs cs = foldr go cs xs
+ where
+ go :: a -> String -> String
+ go x acc = '\\' : shows x acc
=====================================
libraries/ghc-internal/tools/ucd2haskell/exe/Parser/Text.hs
=====================================
@@ -33,6 +33,7 @@ import Streamly.Data.Fold (Fold)
import System.Directory (createDirectoryIfMissing)
import System.Environment (getEnv)
import System.FilePath ((</>), (<.>))
+import Generator.GeneralCategory (GeneralCategory(Cn), generateGeneralCategoryCode)
-- import qualified Data.Set as Set
import Streamly.Data.Stream (Stream)
@@ -51,17 +52,6 @@ import Prelude hiding (pred)
-- Types
-------------------------------------------------------------------------------
-data GeneralCategory =
- Lu|Ll|Lt| --LC
- Lm|Lo| --L
- Mn|Mc|Me| --M
- Nd|Nl|No| --N
- Pc|Pd|Ps|Pe|Pi|Pf|Po| --P
- Sm|Sc|Sk|So| --S
- Zs|Zl|Zp| --Z
- Cc|Cf|Cs|Co|Cn --C
- deriving (Show, Bounded, Enum, Read)
-
data DecompType =
DTCanonical | DTCompat | DTFont
| DTNoBreak | DTInitial | DTMedial | DTFinal
@@ -189,57 +179,6 @@ bitMapToAddrLiteral bs cs = foldr encode cs (unfoldr mkChunks bs)
toByte :: [Bool] -> Int
toByte xs = sum $ map (\i -> if xs !! i then 1 `shiftL` i else 0) [0..7]
-genEnumBitmap ::
- forall a. (Bounded a, Enum a, Show a) =>
- -- | Function name
- String ->
- -- | Default value
- a ->
- -- | List of values to encode
- [a] ->
- String
-genEnumBitmap funcName def as = unlines
- [ "{-# INLINE " <> funcName <> " #-}"
- , funcName <> " :: Char -> Int"
- , funcName <> " c = let n = ord c in if n >= "
- <> show (length as)
- <> " then "
- <> show (fromEnum def)
- <> " else lookup_bitmap n"
-
- , "{-# NOINLINE lookup_bitmap #-}"
- , "lookup_bitmap :: Int -> Int"
- , "lookup_bitmap n = lookupIntN bitmap# n"
- , " where"
- , " bitmap# = \"" <> enumMapToAddrLiteral as "\"#"
- ]
-
-{-| Encode a list of values as a byte map, using their 'Enum' instance.
-
-__Note:__ 'Enum' instance must respect the following:
-
-* @fromEnum minBound >= 0x00@
-* @fromEnum maxBound <= 0xff@
--}
-enumMapToAddrLiteral ::
- forall a. (Bounded a, Enum a, Show a) =>
- -- | Values to encode
- [a] ->
- -- | String to append
- String ->
- String
-enumMapToAddrLiteral xs cs = foldr go cs xs
-
- where
-
- go :: a -> String -> String
- go x acc = '\\' : shows (toWord8 x) acc
-
- toWord8 :: a -> Word8
- toWord8 a = let w = fromEnum a in if 0 <= w && w <= 0xff
- then fromIntegral w
- else error $ "Cannot convert to Word8: " <> show a
-
{- [NOTE] Disabled generator (normalization)
-- This bit of code is duplicated but this duplication allows us to reduce 2
-- dependencies on the executable.
@@ -321,21 +260,7 @@ genGeneralCategoryModule moduleName =
-- Regular entry
else (_generalCategory a : acc, succ (_char a))
- done (acc, _) = unlines
- [ "{-# LANGUAGE NoImplicitPrelude #-}"
- , "{-# LANGUAGE MagicHash #-}"
- , "{-# OPTIONS_HADDOCK hide #-}"
- , ""
- , mkModuleHeader moduleName
- , "module " <> moduleName
- , "(generalCategory)"
- , "where"
- , ""
- , "import GHC.Internal.Base (Char, Int, Ord(..), ord)"
- , "import GHC.Internal.Unicode.Bits (lookupIntN)"
- , ""
- , genEnumBitmap "generalCategory" Cn (reverse acc)
- ]
+ done (acc, _) = generateGeneralCategoryCode mkModuleHeader moduleName 220 acc
readDecomp :: String -> (Maybe DecompType, Decomp)
readDecomp s =
=====================================
libraries/ghc-internal/tools/ucd2haskell/ucd.sh
=====================================
@@ -71,6 +71,23 @@ run_generator() {
# --core-prop XID_Continue \
# --core-prop Pattern_Syntax \
# --core-prop Pattern_White_Space
+
+ echo "-- DO NOT EDIT IT HERE. It is automatically copied from ucd2haskell tool's Generator.HuffmanDecode" > "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ cat "$SCRIPT_DIR/exe/Generator/HuffmanDecode.hs" >> "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+
+ # See https://stackoverflow.com/a/22084103
+ sed -i.bak -e "s/module Generator\.HuffmanDecode/module GHC.Internal.Unicode.Huffman/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+ sed -i.bak -e "s/import Data\.Word/import GHC.Internal.Word/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+ sed -i.bak -e "s/import Data\.Bits/import GHC.Internal.Bits/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+ sed -i.bak -e "s/import GHC\.Show/import GHC.Internal.Show/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+ sed -i.bak -e "s/import GHC\.Base/import GHC.Internal.Base/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
+ sed -i.bak -e "s/import qualified GHC\.List/import qualified GHC.Internal.List/" "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs"
+ rm "$GHC_MODULE_PATH/GHC/Internal/Unicode/Huffman.hs.bak"
}
# Print help text
=====================================
libraries/ghc-internal/tools/ucd2haskell/ucd2haskell.cabal
=====================================
@@ -50,7 +50,13 @@ executable ucd2haskell
ghc-options: -O2
hs-source-dirs: exe
main-is: UCD2Haskell.hs
- other-modules: Parser.Text
+ other-modules:
+ Parser.Text
+ Generator.RangeSwitch
+ Generator.GeneralCategory
+ Generator.WordEncoding
+ Generator.Huffman
+ Generator.HuffmanDecode
build-depends:
base >= 4.7 && < 4.20
, streamly-core >= 0.2.2 && < 0.3
@@ -60,3 +66,4 @@ executable ucd2haskell
, containers >= 0.5 && < 0.7
, directory >= 1.3.6 && < 1.3.8
, filepath >= 1.4.2 && < 1.5
+ , ghc-prim >= 0.11 && < 0.12
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/415c634cebce12e638e6a98039f0a1acf16e9b14
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/415c634cebce12e638e6a98039f0a1acf16e9b14
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240608/8ad3c57f/attachment-0001.html>
More information about the ghc-commits
mailing list