[Git][ghc/ghc][wip/andreask/elem_rule_fix] Rework treatment of `elem`. Add UTF8 GHC.CString functions.
Andreas Klebinger
gitlab at gitlab.haskell.org
Fri Apr 3 17:06:19 UTC 2020
Andreas Klebinger pushed to branch wip/andreask/elem_rule_fix at Glasgow Haskell Compiler / GHC
Commits:
76eeafee by Andreas Klebinger at 2020-04-03T19:01:00+02:00
Rework treatment of `elem`. Add UTF8 GHC.CString functions.
A fusion RULE for elem was broken preventing it from firing.
Fixing this allows a call to elem on a known list to be translated
into a series of equality checks, and eventually a simple case
expression.
This now also works for unboxed string literals via a builtin rule.
As a byproduct GHC.CString functionality is now available for Ascii
and UTF8 strings. The following missing UTF8 variants were
added: unpackAppendCStringUtf8#, unpackFoldrCStringUtf8#
They work just like their ascii counterparts. Which hopefully
makes proper UTF8 support easier for library authors and was
required to support this transformation for utf8 encoded
unboxed strings.
- - - - -
13 changed files:
- compiler/GHC/Core/Make.hs
- compiler/GHC/Core/Op/ConstantFold.hs
- compiler/GHC/Core/SimpleOpt.hs
- compiler/prelude/PrelNames.hs
- libraries/base/GHC/Base.hs
- libraries/base/GHC/List.hs
- libraries/base/changelog.md
- + libraries/base/tests/perf/Makefile
- + libraries/base/tests/perf/T17752.hs
- + libraries/base/tests/perf/T17752.stdout
- + libraries/base/tests/perf/all.T
- libraries/ghc-prim/GHC/CString.hs
- libraries/ghc-prim/changelog.md
Changes:
=====================================
compiler/GHC/Core/Make.hs
=====================================
@@ -13,6 +13,7 @@ module GHC.Core.Make (
sortQuantVars, castBottomExpr,
-- * Constructing boxed literals
+ trueExpr, falseExpr,
mkWordExpr, mkWordExprWord,
mkIntExpr, mkIntExprInt,
mkIntegerExpr, mkNaturalExpr,
@@ -248,6 +249,10 @@ castBottomExpr e res_ty
************************************************************************
-}
+trueExpr, falseExpr :: CoreExpr
+trueExpr = Var trueDataConId
+falseExpr = Var falseDataConId
+
-- | Create a 'CoreExpr' which will evaluate to the given @Int@
mkIntExpr :: Platform -> Integer -> CoreExpr -- Result = I# i :: Int
mkIntExpr platform i = mkCoreConApps intDataCon [mkIntLit platform i]
=====================================
compiler/GHC/Core/Op/ConstantFold.hs
=====================================
@@ -34,7 +34,8 @@ import GHC.Core
import GHC.Core.Make
import GHC.Types.Id
import GHC.Types.Literal
-import GHC.Core.SimpleOpt ( exprIsLiteral_maybe )
+import GHC.Core.SimpleOpt ( exprIsLiteral_maybe, exprIsLambda_maybe
+ , exprIsStrippableLiteral_maybe )
import PrimOp ( PrimOp(..), tagToEnumKey )
import TysWiredIn
import TysPrim
@@ -57,6 +58,7 @@ import GHC.Types.Basic
import GHC.Platform
import Util
import GHC.Core.Coercion (mkUnbranchedAxInstCo,mkSymCo,Role(..))
+import Encoding (utf8DecodeByteString)
import Control.Applicative ( Alternative(..) )
@@ -67,6 +69,7 @@ import Data.Int
import Data.Ratio
import Data.Word
+
{-
Note [Constant folding]
~~~~~~~~~~~~~~~~~~~~~~~
@@ -1254,11 +1257,18 @@ builtinRules :: [CoreRule]
builtinRules
= [BuiltinRule { ru_name = fsLit "AppendLitString",
ru_fn = unpackCStringFoldrName,
- ru_nargs = 4, ru_try = match_append_lit },
+ ru_nargs = 4, ru_try = match_append_lit_C },
+ BuiltinRule { ru_name = fsLit "AppendLitStringUtf8",
+ ru_fn = unpackCStringFoldrUtf8Name,
+ ru_nargs = 4, ru_try = match_append_lit_utf8 },
BuiltinRule { ru_name = fsLit "EqString", ru_fn = eqStringName,
ru_nargs = 2, ru_try = match_eq_string },
BuiltinRule { ru_name = fsLit "Inline", ru_fn = inlineIdName,
ru_nargs = 2, ru_try = \_ _ _ -> match_inline },
+
+ -- See Note [Compiling `elem` on Strings] in CString.hs
+ BuiltinRule { ru_name = fsLit "ElemLitString", ru_fn = elemName,
+ ru_nargs = 4, ru_try = match_elem },
BuiltinRule { ru_name = fsLit "MagicDict", ru_fn = idName magicDictId,
ru_nargs = 4, ru_try = \_ _ _ -> match_magicDict },
@@ -1430,11 +1440,20 @@ builtinNaturalRules =
---------------------------------------------------
-- The rule is this:
--- unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n)
--- = unpackFoldrCString# "foobaz" c n
+-- unpackFoldrCString*# "foo" c (unpackFoldrCString*# "baz" c n)
+-- = unpackFoldrCString*# "foobaz" c n
+
+-- CString version
+match_append_lit_C :: RuleFun
+match_append_lit_C = match_append_lit unpackCStringFoldrIdKey
+
+-- CStringUTF8 version
+match_append_lit_utf8 :: RuleFun
+match_append_lit_utf8 = match_append_lit unpackCStringFoldrUtf8IdKey
-match_append_lit :: RuleFun
-match_append_lit _ id_unf _
+{-# INLINE match_append_lit #-}
+match_append_lit :: Unique -> RuleFun
+match_append_lit foldVariant _ id_unf _
[ Type ty1
, lit1
, c1
@@ -1447,7 +1466,7 @@ match_append_lit _ id_unf _
`App` lit2
`App` c2
`App` n) <- stripTicksTop tickishFloatable e2
- , unpk `hasKey` unpackCStringFoldrIdKey
+ , unpk `hasKey` foldVariant
, cheapEqExpr' tickishFloatable c1 c2
, (c1Ticks, c1') <- stripTicksTop tickishFloatable c1
, c2Ticks <- stripTicksTopT tickishFloatable c2
@@ -1460,23 +1479,79 @@ match_append_lit _ id_unf _
`App` mkTicks (c1Ticks ++ c2Ticks) c1'
`App` n
-match_append_lit _ _ _ _ = Nothing
+match_append_lit _ _ _ _ _ = Nothing
---------------------------------------------------
-- The rule is this:
-- eqString (unpackCString# (Lit s1)) (unpackCString# (Lit s2)) = s1==s2
+-- Also matches unpackCStringUtf8#
match_eq_string :: RuleFun
match_eq_string _ id_unf _
[Var unpk1 `App` lit1, Var unpk2 `App` lit2]
- | unpk1 `hasKey` unpackCStringIdKey
- , unpk2 `hasKey` unpackCStringIdKey
+ | unpk_key1 <- getUnique unpk1
+ , unpk_key2 <- getUnique unpk2
+ , unpk_key1 == unpk_key2
+ , unpk_key1 `elem` [unpackCStringUtf8IdKey, unpackCStringIdKey]
, Just (LitString s1) <- exprIsLiteral_maybe id_unf lit1
, Just (LitString s2) <- exprIsLiteral_maybe id_unf lit2
= Just (if s1 == s2 then trueValBool else falseValBool)
match_eq_string _ _ _ _ = Nothing
+---------------------------------------------------
+-- The rule is this:
+-- elem x <knownString>
+-- = case x of C# x'
+-- case x' of -
+-- 'a'# -> True
+-- 'b'# -> True
+-- 'c'# -> True
+-- _ -> False
+--
+-- Where knownString is one of:
+-- * (build ((unpackFoldrCString# "abc"#)
+-- * (unpackCString*# "abc"#)
+-- Limits matches to strings <=500 bytes.
+-- Also matches utf8 versions.
+-- See Note [Compiling `elem` on Strings] in CString.hs
+
+match_elem :: RuleFun
+match_elem _ id_unf _ [Type ty, _dict, x, xs]
+ -- elem on string
+ | ty `eqType` charTy
+ , ( xsTicks, xsApp) <- stripTicksTop tickishFloatable xs
+ -- Can we get the string
+ , Just (lam_ticks, litExpr) <- getLitStringExpr xsApp
+ , Just (LitString s1, litTicks) <- exprIsStrippableLiteral_maybe id_unf tickishFloatable litExpr
+ -- For really large strings we fall back to the default implementation.
+ , BS.length s1 <= 500
+ = let elem_string = nubSort $ utf8DecodeByteString s1 :: String
+ x_id = mkWildValBinder charTy :: Id
+ x_prim_id = mkWildValBinder charPrimTy :: Id
+ switch = Case (Var x_prim_id) x_prim_id boolTy
+ ((DEFAULT, [], falseExpr):foldr mkCharAlt [] elem_string)
+ mkCharAlt :: Char -> [CoreAlt] -> [CoreAlt]
+ mkCharAlt c alts = (LitAlt (mkLitChar c), [], trueExpr):alts
+ in Just $ mkTicks (xsTicks ++ lam_ticks ++ litTicks)
+ $ Case x x_id boolTy [(DataAlt charDataCon, [x_prim_id], switch)]
+ where
+ getLitStringExpr :: CoreExpr -> Maybe ([Tickish Id], CoreExpr)
+ getLitStringExpr app
+ -- (build ((unpackFoldrCString*# "abc"#)))
+ | (Var build `App` _char_ty `App` unf_str_app) <- app
+ , build `hasKey` buildIdKey
+ , Just (_arg, (Var unfold_str `App` _ty2 `App` lit1), lam_ticks)
+ <- exprIsLambda_maybe id_unf unf_str_app
+ , getUnique unfold_str `elem` [unpackCStringFoldrIdKey, unpackCStringFoldrUtf8IdKey]
+ = Just (lam_ticks, lit1)
+ -- (unpackCString*# "abc"#)
+ | (Var unpackVar `App` lit1) <- app
+ , getUnique unpackVar `elem` [unpackCStringIdKey, unpackCStringUtf8IdKey]
+ = Just ([], lit1)
+ | otherwise = Nothing
+
+match_elem _ _ _ _ = Nothing
---------------------------------------------------
-- The rule is this:
=====================================
compiler/GHC/Core/SimpleOpt.hs
=====================================
@@ -13,6 +13,7 @@ module GHC.Core.SimpleOpt (
-- ** Predicates on expressions
exprIsConApp_maybe, exprIsLiteral_maybe, exprIsLambda_maybe,
+ exprIsStrippableLiteral_maybe,
-- ** Coercions and casts
pushCoArg, pushCoValArg, pushCoTyArg, collectBindersPushingCo
@@ -1156,6 +1157,20 @@ exprIsLiteral_maybe env@(_, id_unf) e
-> exprIsLiteral_maybe env rhs
_ -> Nothing
+exprIsStrippableLiteral_maybe :: InScopeEnv -> (Tickish Id -> Bool) -> CoreExpr -> Maybe (Literal, [Tickish Id])
+-- Similar to exprIsLiteral_maybe but fails instead of dropping ticks
+-- This means Tick t (Lit l) *can return Nothing*. So use with care
+exprIsStrippableLiteral_maybe (_, id_unf) strip expr
+ = go [] expr
+ where
+ go ts e = case e of
+ Lit l -> Just (l,ts)
+ Tick t e'
+ | strip t -> go (t:ts) e'
+ Var v | Just rhs <- expandUnfolding_maybe (id_unf v)
+ -> go ts rhs
+ _ -> Nothing
+
{-
Note [exprIsLambda_maybe]
~~~~~~~~~~~~~~~~~~~~~~~~~~
=====================================
compiler/prelude/PrelNames.hs
=====================================
@@ -340,8 +340,8 @@ basicKnownKeyNames
groupWithName,
-- Strings and lists
- unpackCStringName,
- unpackCStringFoldrName, unpackCStringUtf8Name,
+ unpackCStringName, unpackCStringUtf8Name,
+ unpackCStringFoldrName, unpackCStringFoldrUtf8Name,
-- Overloaded lists
isListClassName,
@@ -352,6 +352,7 @@ basicKnownKeyNames
-- List operations
concatName, filterName, mapName,
zipName, foldrName, buildName, augmentName, appendName,
+ elemName,
-- FFI primitive types that are not wired-in.
stablePtrTyConName, ptrTyConName, funPtrTyConName,
@@ -677,9 +678,10 @@ ordClass_RDR = nameRdrName ordClassName
enumClass_RDR = nameRdrName enumClassName
monadClass_RDR = nameRdrName monadClassName
-map_RDR, append_RDR :: RdrName
+map_RDR, append_RDR, elem_RDR :: RdrName
map_RDR = nameRdrName mapName
append_RDR = nameRdrName appendName
+elem_RDR = nameRdrName elemName
foldr_RDR, build_RDR, returnM_RDR, bindM_RDR, failM_RDR
:: RdrName
@@ -712,11 +714,12 @@ ioDataCon_RDR :: RdrName
ioDataCon_RDR = nameRdrName ioDataConName
eqString_RDR, unpackCString_RDR, unpackCStringFoldr_RDR,
- unpackCStringUtf8_RDR :: RdrName
+ unpackCStringFoldrUtf8_RDR, unpackCStringUtf8_RDR :: RdrName
eqString_RDR = nameRdrName eqStringName
unpackCString_RDR = nameRdrName unpackCStringName
unpackCStringFoldr_RDR = nameRdrName unpackCStringFoldrName
unpackCStringUtf8_RDR = nameRdrName unpackCStringUtf8Name
+unpackCStringFoldrUtf8_RDR = nameRdrName unpackCStringFoldrUtf8Name
newStablePtr_RDR :: RdrName
newStablePtr_RDR = nameRdrName newStablePtrName
@@ -1006,11 +1009,12 @@ modIntName = varQual gHC_CLASSES (fsLit "modInt#") modIntIdKey
-- Base strings Strings
unpackCStringName, unpackCStringFoldrName,
- unpackCStringUtf8Name, eqStringName :: Name
+ unpackCStringUtf8Name, unpackCStringFoldrUtf8Name,eqStringName :: Name
unpackCStringName = varQual gHC_CSTRING (fsLit "unpackCString#") unpackCStringIdKey
unpackCStringFoldrName = varQual gHC_CSTRING (fsLit "unpackFoldrCString#") unpackCStringFoldrIdKey
unpackCStringUtf8Name = varQual gHC_CSTRING (fsLit "unpackCStringUtf8#") unpackCStringUtf8IdKey
eqStringName = varQual gHC_BASE (fsLit "eqString") eqStringIdKey
+unpackCStringFoldrUtf8Name = varQual gHC_CSTRING (fsLit "unpackFoldrCStringUtf8#") unpackCStringFoldrUtf8IdKey
-- The 'inline' function
inlineIdName :: Name
@@ -1083,7 +1087,7 @@ groupWithName = varQual gHC_EXTS (fsLit "groupWith") groupWithIdKey
-- Random PrelBase functions
fromStringName, otherwiseIdName, foldrName, buildName, augmentName,
- mapName, appendName, assertName,
+ mapName, appendName, elemName, assertName,
breakpointName, breakpointCondName,
opaqueTyConName, dollarName :: Name
dollarName = varQual gHC_BASE (fsLit "$") dollarIdKey
@@ -1097,6 +1101,7 @@ assertName = varQual gHC_BASE (fsLit "assert") assertIdKey
breakpointName = varQual gHC_BASE (fsLit "breakpoint") breakpointIdKey
breakpointCondName= varQual gHC_BASE (fsLit "breakpointCond") breakpointCondIdKey
opaqueTyConName = tcQual gHC_BASE (fsLit "Opaque") opaqueTyConKey
+elemName = varQual gHC_LIST (fsLit "elem") elemIdKey
fromStringName = varQual dATA_STRING (fsLit "fromString") fromStringClassOpKey
-- PrelTup
@@ -2081,13 +2086,15 @@ unsafeReflDataConKey = mkPreludeDataConUnique 114
-}
wildCardKey, absentErrorIdKey, augmentIdKey, appendIdKey,
+ elemIdKey,
buildIdKey, errorIdKey, foldrIdKey, recSelErrorIdKey,
seqIdKey, eqStringIdKey,
noMethodBindingErrorIdKey, nonExhaustiveGuardsErrorIdKey,
runtimeErrorIdKey, patErrorIdKey, voidPrimIdKey,
realWorldPrimIdKey, recConErrorIdKey,
unpackCStringUtf8IdKey, unpackCStringAppendIdKey,
- unpackCStringFoldrIdKey, unpackCStringIdKey,
+ unpackCStringFoldrIdKey, unpackCStringFoldrUtf8IdKey,
+ unpackCStringIdKey,
typeErrorIdKey, divIntIdKey, modIntIdKey,
absentSumFieldErrorIdKey :: Unique
@@ -2095,27 +2102,29 @@ wildCardKey = mkPreludeMiscIdUnique 0 -- See Note [WildCard
absentErrorIdKey = mkPreludeMiscIdUnique 1
augmentIdKey = mkPreludeMiscIdUnique 2
appendIdKey = mkPreludeMiscIdUnique 3
-buildIdKey = mkPreludeMiscIdUnique 4
-errorIdKey = mkPreludeMiscIdUnique 5
-foldrIdKey = mkPreludeMiscIdUnique 6
-recSelErrorIdKey = mkPreludeMiscIdUnique 7
-seqIdKey = mkPreludeMiscIdUnique 8
-absentSumFieldErrorIdKey = mkPreludeMiscIdUnique 9
-eqStringIdKey = mkPreludeMiscIdUnique 10
-noMethodBindingErrorIdKey = mkPreludeMiscIdUnique 11
-nonExhaustiveGuardsErrorIdKey = mkPreludeMiscIdUnique 12
-runtimeErrorIdKey = mkPreludeMiscIdUnique 13
-patErrorIdKey = mkPreludeMiscIdUnique 14
-realWorldPrimIdKey = mkPreludeMiscIdUnique 15
-recConErrorIdKey = mkPreludeMiscIdUnique 16
-unpackCStringUtf8IdKey = mkPreludeMiscIdUnique 17
-unpackCStringAppendIdKey = mkPreludeMiscIdUnique 18
-unpackCStringFoldrIdKey = mkPreludeMiscIdUnique 19
-unpackCStringIdKey = mkPreludeMiscIdUnique 20
-voidPrimIdKey = mkPreludeMiscIdUnique 21
-typeErrorIdKey = mkPreludeMiscIdUnique 22
-divIntIdKey = mkPreludeMiscIdUnique 23
-modIntIdKey = mkPreludeMiscIdUnique 24
+elemIdKey = mkPreludeMiscIdUnique 4
+buildIdKey = mkPreludeMiscIdUnique 5
+errorIdKey = mkPreludeMiscIdUnique 6
+foldrIdKey = mkPreludeMiscIdUnique 7
+recSelErrorIdKey = mkPreludeMiscIdUnique 8
+seqIdKey = mkPreludeMiscIdUnique 9
+absentSumFieldErrorIdKey = mkPreludeMiscIdUnique 10
+eqStringIdKey = mkPreludeMiscIdUnique 11
+noMethodBindingErrorIdKey = mkPreludeMiscIdUnique 12
+nonExhaustiveGuardsErrorIdKey = mkPreludeMiscIdUnique 13
+runtimeErrorIdKey = mkPreludeMiscIdUnique 14
+patErrorIdKey = mkPreludeMiscIdUnique 15
+realWorldPrimIdKey = mkPreludeMiscIdUnique 16
+recConErrorIdKey = mkPreludeMiscIdUnique 17
+unpackCStringUtf8IdKey = mkPreludeMiscIdUnique 18
+unpackCStringAppendIdKey = mkPreludeMiscIdUnique 19
+unpackCStringFoldrIdKey = mkPreludeMiscIdUnique 20
+unpackCStringFoldrUtf8IdKey = mkPreludeMiscIdUnique 21
+unpackCStringIdKey = mkPreludeMiscIdUnique 22
+voidPrimIdKey = mkPreludeMiscIdUnique 23
+typeErrorIdKey = mkPreludeMiscIdUnique 24
+divIntIdKey = mkPreludeMiscIdUnique 25
+modIntIdKey = mkPreludeMiscIdUnique 26
concatIdKey, filterIdKey, zipIdKey,
bindIOIdKey, returnIOIdKey, newStablePtrIdKey,
=====================================
libraries/base/GHC/Base.hs
=====================================
@@ -1618,7 +1618,11 @@ iShiftRL# :: Int# -> Int# -> Int#
a `iShiftRL#` b | isTrue# (b >=# WORD_SIZE_IN_BITS#) = 0#
| otherwise = a `uncheckedIShiftRL#` b
+------------------------------------------------------------------------
-- Rules for C strings (the functions themselves are now in GHC.CString)
+------------------------------------------------------------------------
+
+-- Ascii variants
{-# RULES
"unpack" [~1] forall a . unpackCString# a = build (unpackFoldrCString# a)
"unpack-list" [1] forall a . unpackFoldrCString# a (:) [] = unpackCString# a
@@ -1628,3 +1632,14 @@ a `iShiftRL#` b | isTrue# (b >=# WORD_SIZE_IN_BITS#) = 0#
-- unpackFoldr "foo" c (unpackFoldr "baz" c n) = unpackFoldr "foobaz" c n
#-}
+
+-- Utf8 variants
+{-# RULES
+"unpackUtf8" [~1] forall a . unpackCStringUtf8# a = build (unpackFoldrCStringUtf8# a)
+"unpack-listUtf8" [1] forall a . unpackFoldrCStringUtf8# a (:) [] = unpackCStringUtf8# a
+"unpack-appendUtf8" forall a n . unpackFoldrCStringUtf8# a (:) n = unpackAppendCStringUtf8# a n
+
+-- There's a built-in rule (in PrelRules.hs) for
+-- unpackFoldrUtf8 "föö" c (unpackFoldrUtf8 "bäz" c n) = unpackFoldrUtf8 "fööbäz" c n
+
+ #-}
=====================================
libraries/base/GHC/List.hs
=====================================
@@ -1147,11 +1147,12 @@ elem x = any (== x)
#else
elem _ [] = False
elem x (y:ys) = x==y || elem x ys
-{-# NOINLINE [1] elem #-}
+{-# NOINLINE elem #-}
{-# RULES
-"elem/build" forall x (g :: forall b . Eq a => (a -> b -> b) -> b -> b)
+"elem/build"[2] forall x (g :: forall b . (a -> b -> b) -> b -> b)
. elem x (build g) = g (\ y r -> (x == y) || r) False
#-}
+
#endif
-- | 'notElem' is the negation of 'elem'.
@@ -1172,9 +1173,9 @@ notElem x = all (/= x)
#else
notElem _ [] = True
notElem x (y:ys)= x /= y && notElem x ys
-{-# NOINLINE [1] notElem #-}
+{-# NOINLINE notElem #-}
{-# RULES
-"notElem/build" forall x (g :: forall b . Eq a => (a -> b -> b) -> b -> b)
+"notElem/build" forall x (g :: forall b . (a -> b -> b) -> b -> b)
. notElem x (build g) = g (\ y r -> (x /= y) && r) True
#-}
#endif
=====================================
libraries/base/changelog.md
=====================================
@@ -9,6 +9,13 @@
* Add `hGetContents'`, `getContents'`, and `readFile'` in `System.IO`:
Strict IO variants of `hGetContents`, `getContents`, and `readFile`.
+ * Add rules `unpackUtf8`, `unpack-listUtf8` and `unpack-appendUtf8` to `GHC.Base`.
+ They correspond to their ascii versions and hopefully make it easier
+ to handle utf8 encoded strings efficiently.
+
+ * An issue with list fusion and `elem` was fixed. `elem` applied to known
+ small lists will now compile to a simple case statement as expected.
+
## 4.14.0.0 *TBA*
* Bundled with GHC 8.10.1
=====================================
libraries/base/tests/perf/Makefile
=====================================
@@ -0,0 +1,17 @@
+# This Makefile runs the tests using GHC's testsuite framework. It
+# assumes the package is part of a GHC build tree with the testsuite
+# installed in ../../../testsuite.
+
+TOP=../../../../testsuite
+include $(TOP)/mk/boilerplate.mk
+include $(TOP)/mk/test.mk
+
+
+T17752:
+ '$(TEST_HC)' $(TEST_HC_OPTS) -O --make T17752 -rtsopts -ddump-simpl -ddump-to-file -dsuppress-uniques
+ # We only grep for specific patterns to minimize failures when eg core syntax changes
+ # We expect two unfolds using eqChar
+ echo $$(cat T17752.dump-simpl | grep unpackFoldrCString)
+ echo $$(cat T17752.dump-simpl | grep eqChar)
+ # And two pattern matches
+ echo $$(cat T17752.dump-simpl | grep "case x1" -A4 )
=====================================
libraries/base/tests/perf/T17752.hs
=====================================
@@ -0,0 +1,82 @@
+module T17752 where
+
+-- Should compile to unfoldCStr if the rules fire
+isElemLit x = x `elem` "theQuickShinyGHCJumpsOverTheRuleRecursion"
+isNotElemLit x = x `notElem` "_theQuickShinyGHCCompileJumpsOverTheRuleRecursion"
+
+-- Should compile to a pattern match if the rules fire
+isElemList x = x `elem` ['a','b','c']
+isNotElemList x = x `elem` ['x','y','z']
+
+{-
+
+Should the grep tests fail make sure the core still behaves
+like the one below (or better!)
+
+-- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
+isElemLit1 = "theQuickShinyGHCJumpsOverTheRuleRecursion"#
+
+-- RHS size: {terms: 20, types: 9, coercions: 0, joins: 0/0}
+isElemLit
+ = \ x ->
+ unpackFoldrCString#
+ isElemLit1
+ (\ y r ->
+ case x of { C# x1 ->
+ case y of { C# y1 ->
+ case eqChar# x1 y1 of {
+ __DEFAULT -> r;
+ 1# -> True
+ }
+ }
+ })
+ False
+
+-- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
+isNotElemLit1
+ = "_theQuickShinyGHCCompileJumpsOverTheRuleRecursion"#
+
+-- RHS size: {terms: 25, types: 10, coercions: 0, joins: 0/0}
+isNotElemLit
+ = \ x ->
+ case unpackFoldrCString#
+ isNotElemLit1
+ (\ y r ->
+ case x of { C# x1 ->
+ case y of { C# y1 ->
+ case eqChar# x1 y1 of {
+ __DEFAULT -> r;
+ 1# -> True
+ }
+ }
+ })
+ False
+ of {
+ False -> True;
+ True -> False
+ }
+
+-- RHS size: {terms: 14, types: 4, coercions: 0, joins: 0/0}
+isElemList
+ = \ x ->
+ case x of { C# x1 ->
+ case x1 of {
+ __DEFAULT -> False;
+ 'a'# -> True;
+ 'b'# -> True;
+ 'c'# -> True
+ }
+ }
+
+-- RHS size: {terms: 14, types: 4, coercions: 0, joins: 0/0}
+isNotElemList
+ = \ x ->
+ case x of { C# x1 ->
+ case x1 of {
+ __DEFAULT -> False;
+ 'x'# -> True;
+ 'y'# -> True;
+ 'z'# -> True
+ }
+ }
+-}
=====================================
libraries/base/tests/perf/T17752.stdout
=====================================
@@ -0,0 +1,25 @@
+[1 of 1] Compiling T17752 ( T17752.hs, T17752.o )
+GHC.CString.unpackFoldrCString#
+ case GHC.CString.unpackFoldrCString#
+case GHC.Prim.eqChar# x1 y1 of {
+ case GHC.Prim.eqChar# x1 y1 of {
+case x1 of {
+ __DEFAULT -> GHC.Types.False;
+ 'a'# -> GHC.Types.True;
+ 'b'# -> GHC.Types.True;
+ 'c'# -> GHC.Types.True
+ -- case x1 of {
+ __DEFAULT -> GHC.Types.False;
+ 'a'# -> GHC.Types.True;
+ 'b'# -> GHC.Types.True;
+ 'c'# -> GHC.Types.True
+ -- case x1 of {
+ __DEFAULT -> GHC.Types.False;
+ 'x'# -> GHC.Types.True;
+ 'y'# -> GHC.Types.True;
+ 'z'# -> GHC.Types.True
+ -- case x1 of {
+ __DEFAULT -> GHC.Types.False;
+ 'x'# -> GHC.Types.True;
+ 'y'# -> GHC.Types.True;
+ 'z'# -> GHC.Types.True
=====================================
libraries/base/tests/perf/all.T
=====================================
@@ -0,0 +1,5 @@
+#--------------------------------------
+# Check specialization of elem via rules
+#--------------------------------------
+
+test('T17752', [only_ways(['normal'])] , makefile_test, ['T17752'])
\ No newline at end of file
=====================================
libraries/ghc-prim/GHC/CString.hs
=====================================
@@ -17,8 +17,14 @@
-----------------------------------------------------------------------------
module GHC.CString (
+ -- * Ascii variants
unpackCString#, unpackAppendCString#, unpackFoldrCString#,
- unpackCStringUtf8#, unpackNBytes#
+
+ -- * Utf variants
+ unpackCStringUtf8#, unpackAppendCStringUtf8#, unpackFoldrCStringUtf8#,
+
+ -- * Other
+ unpackNBytes#,
) where
import GHC.Types
@@ -71,8 +77,22 @@ Moreover, we want to make it CONLIKE, so that:
All of this goes for unpackCStringUtf8# too.
-}
-{- Note [unpackCString# iterating over addr]
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+{- Note [NOINLINE for unpackFoldrCString]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#
+It also has a BuiltInRule in PrelRules.hs:
+ unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n)
+ = unpackFoldrCString# "foobaz" c n
+At one stage I had NOINLINE [0] on the grounds that, unlike
+unpackCString#, there *is* some point in inlining
+unpackFoldrCString#, because we get better code for the
+higher-order function call. BUT there may be a lot of
+literal strings, and making a separate 'unpack' loop for
+each is highly gratuitous. See nofib/real/anna/PrettyPrint.
+
+ Note [unpackCString# iterating over addr]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When unpacking unpackCString# and friends repeatedly return a cons cell
containing:
@@ -89,6 +109,32 @@ This works since these two expressions will read from the same address.
This way we avoid the need for the thunks to close over both the start of
the string and the current offset, saving a word for each character unpacked.
+
+This has the additional advantage the we can guarantee that only the
+increment will happen in the loop.
+If we use the offset start off with the increment and an addition
+to get the real address. Which might not be optimized aways.
+
+ Note [Compiling `elem` on Strings]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The pattern f c = c `elem` "<someString>" is quite common in
+parsers and often performance critical.
+
+The fastest way to process this pattern is to transform it
+into a case. That is we transform:
+
+ x `elem` "xy"
+=>
+ case x of
+ 'x' -> True
+ 'y' -> True
+ _ -> False
+
+For this reason there is a BUILTIN rule "ElemLitString" which
+performs this rewrite early in the pipeline in case the string
+is known and not excessively long.
+
-}
unpackCString# :: Addr# -> [Char]
@@ -111,28 +157,27 @@ unpackAppendCString# addr rest
-- See Note [unpackCString# iterating over addr]
!ch = indexCharOffAddr# addr 0#
-unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a
-
--- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#
-
--- It also has a BuiltInRule in GHC.Core.Op.ConstantFold:
--- unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n)
--- = unpackFoldrCString# "foobaz" c n
+-- See Note [NOINLINE for unpackFoldrCString]
+-- {-# INLINE[0] unpackFoldrCString2# #-}
+-- unpackFoldrCString2# :: Addr# -> (Char -> a -> a) -> a -> a
+-- unpackFoldrCString2# addr f z
+-- | isTrue# (ch `eqChar#` '\0'#) = z
+-- | True = C# ch `f` unpackFoldrCString# (addr `plusAddr#` 1#) f z
+-- where
+-- -- See Note [unpackCString# iterating over addr]
+-- !ch = indexCharOffAddr# addr 0#
-{-# NOINLINE unpackFoldrCString# #-}
--- At one stage I had NOINLINE [0] on the grounds that, unlike
--- unpackCString#, there *is* some point in inlining
--- unpackFoldrCString#, because we get better code for the
--- higher-order function call. BUT there may be a lot of
--- literal strings, and making a separate 'unpack' loop for
--- each is highly gratuitous. See nofib/real/anna/PrettyPrint.
-
-unpackFoldrCString# addr f z
- | isTrue# (ch `eqChar#` '\0'#) = z
- | True = C# ch `f` unpackFoldrCString# (addr `plusAddr#` 1#) f z
+{-# INLINE[0] unpackFoldrCString# #-}
+-- {-# NOINLINE unpackFoldrCString# #-}
+unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a
+unpackFoldrCString# str f z = go str z
where
- -- See Note [unpackCString# iterating over addr]
- !ch = indexCharOffAddr# addr 0#
+ go addr z
+ | isTrue# (ch `eqChar#` '\0'#) = z
+ | True = C# ch `f` go (addr `plusAddr#` 1#) z
+ where
+ -- See Note [unpackCString# iterating over addr]
+ !ch = indexCharOffAddr# addr 0#
-- There's really no point in inlining this for the same reasons as
-- unpackCString. See Note [Inlining unpackCString#] above for details.
@@ -140,26 +185,45 @@ unpackCStringUtf8# :: Addr# -> [Char]
{-# NOINLINE CONLIKE unpackCStringUtf8# #-}
unpackCStringUtf8# addr
| isTrue# (ch `eqChar#` '\0'# ) = []
- | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpackCStringUtf8# (addr `plusAddr#` 1#)
- | isTrue# (ch `leChar#` '\xDF'#) =
- let !c = C# (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +#
- (ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#)))
- in c : unpackCStringUtf8# (addr `plusAddr#` 2#)
- | isTrue# (ch `leChar#` '\xEF'#) =
- let !c = C# (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +#
- ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +#
- (ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#)))
- in c : unpackCStringUtf8# (addr `plusAddr#` 3#)
- | True =
- let !c = C# (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +#
- ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 12#) +#
- ((ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +#
- (ord# (indexCharOffAddr# (addr `plusAddr#` 3#) 0#) -# 0x80#)))
- in c : unpackCStringUtf8# (addr `plusAddr#` 4#)
+ | True =
+ let !byte_count = getByteCount ch
+ !utf_ch = unpackUtf8Char# byte_count ch addr
+ !addr' = addr `plusBytes` byte_count
+ in C# utf_ch : unpackCStringUtf8# addr'
where
-- See Note [unpackCString# iterating over addr]
!ch = indexCharOffAddr# addr 0#
+
+unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char]
+{-# NOINLINE unpackAppendCStringUtf8# #-}
+ -- See the NOINLINE note on unpackCString#
+unpackAppendCStringUtf8# addr rest
+ | isTrue# (ch `eqChar#` '\0'#) = rest
+ | True =
+ let !byte_count = getByteCount ch
+ !utf_ch = unpackUtf8Char# byte_count ch addr
+ !addr' = (addr `plusBytes` byte_count)
+ in C# utf_ch : unpackAppendCStringUtf8# addr' rest
+ where
+ -- See Note [unpackCString# iterating over addr]
+ !ch = indexCharOffAddr# addr 0#
+
+-- See Note [NOINLINE for unpackFoldrCString]
+-- {-# INLINE[0] unpackFoldrCString# #-}
+{-# INLINE[0] unpackFoldrCStringUtf8# #-}
+unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a
+unpackFoldrCStringUtf8# addr f z
+ | isTrue# (ch `eqChar#` '\0'#) = z
+ | True =
+ let !byte_count = getByteCount ch
+ !utf_ch = unpackUtf8Char# byte_count ch addr
+ !addr' = (addr `plusBytes` byte_count)
+ in C# utf_ch `f` unpackFoldrCStringUtf8# addr' f z
+ where
+ -- See Note [unpackCString# iterating over addr]
+ !ch = indexCharOffAddr# addr 0#
+
-- There's really no point in inlining this for the same reasons as
-- unpackCString. See Note [Inlining unpackCString#] above for details.
unpackNBytes# :: Addr# -> Int# -> [Char]
@@ -174,3 +238,61 @@ unpackNBytes# addr len# = unpack [] (len# -# 1#)
case indexCharOffAddr# addr i# of
ch -> unpack (C# ch : acc) (i# -# 1#)
+
+
+------------------------------
+--- UTF8 decoding utilities
+------------------------------
+--
+-- These functions make explizit the logic that was originally
+-- part of unpackCStringUtf8. Since we want the same support for ascii
+-- and non-ascii a variety of functions needs the same logic. Instead
+-- of C&P'in the decoding logic all over we have it here once, and then
+-- force GHC to inline it.
+--
+-- All the overhead of the Bytes argument and calls goes away once all is
+-- said and done. And what remains is readable code in Haskell land and
+-- performant code in the resulting binary.
+
+data Bytes = One | Two | Three | Four
+
+{-# INLINE getByteCount #-}
+getByteCount :: Char# -> Bytes
+getByteCount ch
+ | isTrue# (ch `leChar#` '\x7F'#) = One
+ | isTrue# (ch `leChar#` '\xDF'#) = Two
+ | isTrue# (ch `leChar#` '\xEF'#) = Three
+ | True = Four
+
+{-# INLINE plusBytes #-}
+plusBytes :: Addr# -> Bytes -> Addr#
+plusBytes addr bytes =
+ case bytes of
+ One -> addr `plusAddr#` 1#
+ Two -> addr `plusAddr#` 2#
+ Three -> addr `plusAddr#` 3#
+ Four -> addr `plusAddr#` 4#
+
+-- | Take the current address, read unicode char of the given size.
+-- We obviously want the number of bytes, but we have to read one
+-- byte to determine the number of bytes for the current codepoint
+-- so we might as well reuse it and avoid a read.
+--
+-- Side Note: We don't dare to decode all 4 possibilities at once.
+-- Reading past the end of the addr might trigger an exception.
+-- For this reason we really have to check the width first and only
+-- decode after.
+{-# INLINE unpackUtf8Char# #-}
+unpackUtf8Char# :: Bytes -> Char# -> Addr# -> Char#
+unpackUtf8Char# bytes ch addr =
+ case bytes of
+ One -> ch
+ Two -> (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+ (ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#)))
+ Three -> (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+ ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#)))
+ Four -> (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+ ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 12#) +#
+ ((ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ord# (indexCharOffAddr# (addr `plusAddr#` 3#) 0#) -# 0x80#)))
=====================================
libraries/ghc-prim/changelog.md
=====================================
@@ -1,3 +1,13 @@
+## 0.6.2 (edit as necessary)
+
+- Shipped with GHC 8.12.1
+
+- In order to support unicode better the following functions in `GHC.CString`
+ gained UTF8 counterparts:
+
+ unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char]
+ unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a
+
## 0.6.1 (edit as necessary)
- Shipped with GHC 8.10.1
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/76eeafeecafd30ef418c38888bdc30eae6444bf0
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/76eeafeecafd30ef418c38888bdc30eae6444bf0
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20200403/4abb2056/attachment-0001.html>
More information about the ghc-commits
mailing list