[commit: ghc] master: ghc-prim: Don't allocate a thunk for each unpacked UTF-8 character (85a295d)

git at git.haskell.org git at git.haskell.org
Mon Jul 24 23:36:44 UTC 2017


Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/85a295d5607b5f8015bb3517601ced0d1adc29ef/ghc

>---------------------------------------------------------------

commit 85a295d5607b5f8015bb3517601ced0d1adc29ef
Author: Ben Gamari <bgamari.foss at gmail.com>
Date:   Mon Jul 24 19:01:58 2017 -0400

    ghc-prim: Don't allocate a thunk for each unpacked UTF-8 character
    
    While debugging #14005 I noticed that unpackCStringUtf8# was allocating
    a thunk for each Unicode character that it unpacked. This seems hardly
    worthwhile given that the thunk's closure will be at least three words,
    whereas the Char itself will be only two and requires only a bit of bit
    twiddling to construct.
    
    Test Plan: Validate
    
    Reviewers: simonmar, austin
    
    Subscribers: dfeuer, rwbarton, thomie
    
    Differential Revision: https://phabricator.haskell.org/D3769


>---------------------------------------------------------------

85a295d5607b5f8015bb3517601ced0d1adc29ef
 libraries/ghc-prim/GHC/CString.hs | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs
index cdda2db..0e6199f 100644
--- a/libraries/ghc-prim/GHC/CString.hs
+++ b/libraries/ghc-prim/GHC/CString.hs
@@ -125,24 +125,28 @@ unpackCStringUtf8# :: Addr# -> [Char]
 unpackCStringUtf8# addr
   = unpack 0#
   where
+    -- We take care to strictly evaluate the character decoding as
+    -- indexCharOffAddr# is marked with the can_fail flag and
+    -- consequently GHC won't evaluate the expression unless it is absolutely
+    -- needed.
     unpack nh
       | isTrue# (ch `eqChar#` '\0'#  ) = []
       | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpack (nh +# 1#)
       | isTrue# (ch `leChar#` '\xDF'#) =
-          C# (chr# (((ord# ch                                  -# 0xC0#) `uncheckedIShiftL#`  6#) +#
-                     (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#))) :
-          unpack (nh +# 2#)
+          let !c = C# (chr# (((ord# ch                                  -# 0xC0#) `uncheckedIShiftL#`  6#) +#
+                              (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#)))
+          in c : unpack (nh +# 2#)
       | isTrue# (ch `leChar#` '\xEF'#) =
-          C# (chr# (((ord# ch                                  -# 0xE0#) `uncheckedIShiftL#` 12#) +#
-                    ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#`  6#) +#
-                     (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#))) :
-          unpack (nh +# 3#)
+          let !c = C# (chr# (((ord# ch                                  -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+                             ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#`  6#) +#
+                              (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#)))
+          in c : unpack (nh +# 3#)
       | True                           =
-          C# (chr# (((ord# ch                                  -# 0xF0#) `uncheckedIShiftL#` 18#) +#
-                    ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +#
-                    ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#`  6#) +#
-                     (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#))) :
-          unpack (nh +# 4#)
+          let !c = C# (chr# (((ord# ch                                  -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+                             ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +#
+                             ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#`  6#) +#
+                              (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#)))
+          in c : unpack (nh +# 4#)
       where
         !ch = indexCharOffAddr# addr nh
 



More information about the ghc-commits mailing list