[Git][ghc/ghc][master] 8 commits: Remove length field from FastString

Marge Bot gitlab at gitlab.haskell.org
Thu Jul 23 00:18:19 UTC 2020



 Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
0bf8980e by Daniel Gröber at 2020-07-22T20:18:11-04:00
Remove length field from FastString

- - - - -
1010c33b by Daniel Gröber at 2020-07-22T20:18:11-04:00
Use ShortByteString for FastString

There are multiple reasons we want this:

- Fewer allocations: ByteString has 3 fields, ShortByteString just has one.
- ByteString memory is pinned:
  - This can cause fragmentation issues (see for example #13110) but also
  - makes using FastStrings in compact regions impossible.

Metric Decrease:
    T5837
    T12150
    T12234
    T12425

- - - - -
8336ba78 by Daniel Gröber at 2020-07-22T20:18:11-04:00
Pass specialised utf8DecodeChar# to utf8DecodeLazy# for performance

Currently we're passing a indexWord8OffAddr# type function to
utf8DecodeLazy# which then passes it on to utf8DecodeChar#. By passing one
of utf8DecodeCharAddr# or utf8DecodeCharByteArray# instead we benefit from
the inlining and specialization already done for those.

- - - - -
7484a9a4 by Daniel Gröber at 2020-07-22T20:18:11-04:00
Encoding: Add comment about tricky ForeignPtr lifetime

- - - - -
5536ed28 by Daniel Gröber at 2020-07-22T20:18:11-04:00
Use IO constructor instead of `stToIO . ST`

- - - - -
5b8902e3 by Daniel Gröber at 2020-07-22T20:18:11-04:00
Encoding: Remove redundant use of withForeignPtr

- - - - -
5976a161 by Daniel Gröber at 2020-07-22T20:18:11-04:00
Encoding: Reformat utf8EncodeShortByteString to be more consistent

- - - - -
9ddf1614 by Daniel Gröber at 2020-07-22T20:18:11-04:00
FastString: Reintroduce character count cache

Metric Increase:
    ManyConstructors

Metric Decrease:
    T4029

- - - - -


5 changed files:

- compiler/GHC/Core/DataCon.hs
- compiler/GHC/Core/SimpleOpt.hs
- compiler/GHC/Data/FastString.hs
- compiler/GHC/Data/StringBuffer.hs
- compiler/GHC/Utils/Encoding.hs


Changes:

=====================================
compiler/GHC/Core/DataCon.hs
=====================================
@@ -1453,11 +1453,14 @@ dataConRepArgTys (MkData { dcRep = rep
 dataConIdentity :: DataCon -> ByteString
 -- We want this string to be UTF-8, so we get the bytes directly from the FastStrings.
 dataConIdentity dc = LBS.toStrict $ BSB.toLazyByteString $ mconcat
-   [ BSB.byteString $ bytesFS (unitFS (moduleUnit mod))
+   [ BSB.shortByteString $ fastStringToShortByteString $
+       unitFS $ moduleUnit mod
    , BSB.int8 $ fromIntegral (ord ':')
-   , BSB.byteString $ bytesFS (moduleNameFS (moduleName mod))
+   , BSB.shortByteString $ fastStringToShortByteString $
+       moduleNameFS $ moduleName mod
    , BSB.int8 $ fromIntegral (ord '.')
-   , BSB.byteString $ bytesFS (occNameFS (nameOccName name))
+   , BSB.shortByteString $ fastStringToShortByteString $
+       occNameFS $ nameOccName name
    ]
   where name = dataConName dc
         mod  = ASSERT( isExternalName name ) nameModule name


=====================================
compiler/GHC/Core/SimpleOpt.hs
=====================================
@@ -1212,7 +1212,7 @@ dealWithStringLiteral fun str co
   = let strFS = mkFastStringByteString str
 
         char = mkConApp charDataCon [mkCharLit (headFS strFS)]
-        charTail = bytesFS (tailFS strFS)
+        charTail = BS.tail (bytesFS strFS)
 
         -- In singleton strings, just add [] instead of unpackCstring# ""#.
         rest = if BS.null charTail


=====================================
compiler/GHC/Data/FastString.hs
=====================================
@@ -32,12 +32,16 @@
 module GHC.Data.FastString
        (
         -- * ByteString
-        bytesFS,            -- :: FastString -> ByteString
-        fastStringToByteString, -- = bytesFS (kept for haddock)
+        bytesFS,
+        fastStringToByteString,
         mkFastStringByteString,
         fastZStringToByteString,
         unsafeMkByteString,
 
+        -- * ShortByteString
+        fastStringToShortByteString,
+        mkFastStringShortByteString,
+
         -- * FastZString
         FastZString,
         hPutFZS,
@@ -52,7 +56,6 @@ module GHC.Data.FastString
         mkFastString,
         mkFastStringBytes,
         mkFastStringByteList,
-        mkFastStringForeignPtr,
         mkFastString#,
 
         -- ** Deconstruction
@@ -67,7 +70,6 @@ module GHC.Data.FastString
         nullFS,
         appendFS,
         headFS,
-        tailFS,
         concatFS,
         consFS,
         nilFS,
@@ -108,20 +110,19 @@ import Control.Concurrent.MVar
 import Control.DeepSeq
 import Control.Monad
 import Data.ByteString (ByteString)
+import Data.ByteString.Short (ShortByteString)
 import qualified Data.ByteString          as BS
 import qualified Data.ByteString.Char8    as BSC
-import qualified Data.ByteString.Internal as BS
 import qualified Data.ByteString.Unsafe   as BS
+import qualified Data.ByteString.Short    as SBS
+import qualified Data.ByteString.Short.Internal as SBS
 import Foreign.C
-import GHC.Exts
 import System.IO
 import Data.Data
 import Data.IORef
 import Data.Char
 import Data.Semigroup as Semi
 
-import GHC.IO
-
 import Foreign
 
 #if GHC_STAGE >= 2
@@ -131,14 +132,18 @@ import GHC.Conc.Sync    (sharedCAF)
 #if __GLASGOW_HASKELL__ < 811
 import GHC.Base (unpackCString#,unpackNBytes#)
 #endif
+import GHC.Exts
+import GHC.IO
 
 -- | Gives the UTF-8 encoded bytes corresponding to a 'FastString'
-bytesFS :: FastString -> ByteString
-bytesFS f = fs_bs f
+bytesFS, fastStringToByteString :: FastString -> ByteString
+bytesFS = fastStringToByteString
 
 {-# DEPRECATED fastStringToByteString "Use `bytesFS` instead" #-}
-fastStringToByteString :: FastString -> ByteString
-fastStringToByteString = bytesFS
+fastStringToByteString f = SBS.fromShort $ fs_sbs f
+
+fastStringToShortByteString :: FastString -> ShortByteString
+fastStringToShortByteString = fs_sbs
 
 fastZStringToByteString :: FastZString -> ByteString
 fastZStringToByteString (FastZString bs) = bs
@@ -148,9 +153,7 @@ unsafeMkByteString :: String -> ByteString
 unsafeMkByteString = BSC.pack
 
 hashFastString :: FastString -> Int
-hashFastString (FastString _ _ bs _)
-    = inlinePerformIO $ BS.unsafeUseAsCStringLen bs $ \(ptr, len) ->
-      return $ hashStr (castPtr ptr) len
+hashFastString fs = hashStr $ fs_sbs fs
 
 -- -----------------------------------------------------------------------------
 
@@ -182,7 +185,7 @@ of this string which is used by the compiler internally.
 data FastString = FastString {
       uniq    :: {-# UNPACK #-} !Int, -- unique id
       n_chars :: {-# UNPACK #-} !Int, -- number of chars
-      fs_bs   :: {-# UNPACK #-} !ByteString,
+      fs_sbs  :: {-# UNPACK #-} !ShortByteString,
       fs_zenc :: FastZString
       -- ^ Lazily computed z-encoding of this string.
       --
@@ -229,12 +232,9 @@ instance NFData FastString where
   rnf fs = seq fs ()
 
 cmpFS :: FastString -> FastString -> Ordering
-cmpFS f1@(FastString u1 _ _ _) f2@(FastString u2 _ _ _) =
-  if u1 == u2 then EQ else
-  compare (bytesFS f1) (bytesFS f2)
-
-foreign import ccall unsafe "memcmp"
-  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
+cmpFS fs1 fs2 =
+  if uniq fs1 == uniq fs2 then EQ else
+  compare (fs_sbs fs1) (fs_sbs fs2)
 
 -- -----------------------------------------------------------------------------
 -- Construction
@@ -405,12 +405,12 @@ The procedure goes like this:
 -}
 
 mkFastStringWith
-    :: (Int -> IORef Int-> IO FastString) -> Ptr Word8 -> Int -> IO FastString
-mkFastStringWith mk_fs !ptr !len = do
+    :: (Int -> IORef Int-> IO FastString) -> ShortByteString -> IO FastString
+mkFastStringWith mk_fs sbs = do
   FastStringTableSegment lock _ buckets# <- readIORef segmentRef
   let idx# = hashToIndex# buckets# hash#
   bucket <- IO $ readArray# buckets# idx#
-  res <- bucket_match bucket len ptr
+  res <- bucket_match bucket sbs
   case res of
     Just found -> return found
     Nothing -> do
@@ -424,13 +424,13 @@ mkFastStringWith mk_fs !ptr !len = do
     !(FastStringTable uid n_zencs segments#) = stringTable
     get_uid = atomicModifyIORef' uid $ \n -> (n+1,n)
 
-    !(I# hash#) = hashStr ptr len
+    !(I# hash#) = hashStr sbs
     (# segmentRef #) = indexArray# segments# (hashToSegment# hash#)
     insert fs = do
       FastStringTableSegment _ counter buckets# <- maybeResizeSegment segmentRef
       let idx# = hashToIndex# buckets# hash#
       bucket <- IO $ readArray# buckets# idx#
-      res <- bucket_match bucket len ptr
+      res <- bucket_match bucket sbs
       case res of
         -- The FastString was added by another thread after previous read and
         -- before we acquired the write lock.
@@ -442,100 +442,71 @@ mkFastStringWith mk_fs !ptr !len = do
           modifyIORef' counter succ
           return fs
 
-bucket_match :: [FastString] -> Int -> Ptr Word8 -> IO (Maybe FastString)
-bucket_match [] _ _ = return Nothing
-bucket_match (v@(FastString _ _ bs _):ls) len ptr
-      | len == BS.length bs = do
-         b <- BS.unsafeUseAsCString bs $ \buf ->
-             cmpStringPrefix ptr (castPtr buf) len
-         if b then return (Just v)
-              else bucket_match ls len ptr
-      | otherwise =
-         bucket_match ls len ptr
+bucket_match :: [FastString] -> ShortByteString -> IO (Maybe FastString)
+bucket_match [] _ = return Nothing
+bucket_match (fs@(FastString {fs_sbs=fs_sbs}) : ls) sbs
+  | fs_sbs == sbs = return (Just fs)
+  | otherwise     =  bucket_match ls sbs
 
 mkFastStringBytes :: Ptr Word8 -> Int -> FastString
 mkFastStringBytes !ptr !len =
     -- NB: Might as well use unsafeDupablePerformIO, since mkFastStringWith is
     -- idempotent.
-    unsafeDupablePerformIO $
-        mkFastStringWith (copyNewFastString ptr len) ptr len
-
--- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
--- between this and 'mkFastStringBytes' is that we don't have to copy
--- the bytes if the string is new to the table.
-mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
-mkFastStringForeignPtr ptr !fp len
-    = mkFastStringWith (mkNewFastString fp ptr len) ptr len
-
--- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
--- between this and 'mkFastStringBytes' is that we don't have to copy
--- the bytes if the string is new to the table.
+    unsafeDupablePerformIO $ do
+        sbs <- newSBSFromPtr ptr len
+        mkFastStringWith (mkNewFastStringShortByteString sbs) sbs
+
+newSBSFromPtr :: Ptr a -> Int -> IO ShortByteString
+newSBSFromPtr (Ptr src#) (I# len#) = do
+  IO $ \s ->
+    case newByteArray# len# s of { (# s, dst# #) ->
+    case copyAddrToByteArray# src# dst# 0# len# s of { s ->
+    case unsafeFreezeByteArray# dst# s of { (# s, ba# #) ->
+    (# s, SBS.SBS ba# #) }}}
+
+-- | Create a 'FastString' by copying an existing 'ByteString'
 mkFastStringByteString :: ByteString -> FastString
 mkFastStringByteString bs =
-    inlinePerformIO $
-      BS.unsafeUseAsCStringLen bs $ \(ptr, len) -> do
-        let ptr' = castPtr ptr
-        mkFastStringWith (mkNewFastStringByteString bs ptr' len) ptr' len
+  let sbs = SBS.toShort bs in
+  inlinePerformIO $
+      mkFastStringWith (mkNewFastStringShortByteString sbs) sbs
+
+-- | Create a 'FastString' from an existing 'ShortByteString' without
+-- copying.
+mkFastStringShortByteString :: ShortByteString -> FastString
+mkFastStringShortByteString sbs =
+  inlinePerformIO $ mkFastStringWith (mkNewFastStringShortByteString sbs) sbs
 
 -- | Creates a UTF-8 encoded 'FastString' from a 'String'
 mkFastString :: String -> FastString
 mkFastString str =
   inlinePerformIO $ do
-    let l = utf8EncodedLength str
-    buf <- mallocForeignPtrBytes l
-    withForeignPtr buf $ \ptr -> do
-      utf8EncodeString ptr str
-      mkFastStringForeignPtr ptr buf l
+    sbs <- utf8EncodeShortByteString str
+    mkFastStringWith (mkNewFastStringShortByteString sbs) sbs
 
 -- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
 mkFastStringByteList :: [Word8] -> FastString
-mkFastStringByteList str = mkFastStringByteString (BS.pack str)
+mkFastStringByteList str = mkFastStringShortByteString (SBS.pack str)
 
--- | Creates a (lazy) Z-encoded 'FastString' from a 'String' and account
--- the number of forced z-strings into the passed 'IORef'.
-mkZFastString :: IORef Int -> ByteString -> FastZString
-mkZFastString n_zencs bs = unsafePerformIO $ do
+-- | Creates a (lazy) Z-encoded 'FastString' from a 'ShortByteString' and
+-- account the number of forced z-strings into the passed 'IORef'.
+mkZFastString :: IORef Int -> ShortByteString -> FastZString
+mkZFastString n_zencs sbs = unsafePerformIO $ do
   atomicModifyIORef' n_zencs $ \n -> (n+1, ())
-  return $ mkFastZStringString (zEncodeString (utf8DecodeByteString bs))
-
-mkNewFastString :: ForeignPtr Word8 -> Ptr Word8 -> Int -> Int
-                -> IORef Int -> IO FastString
-mkNewFastString fp ptr len uid n_zencs = do
-  let bs = BS.fromForeignPtr fp 0 len
-      zstr = mkZFastString n_zencs bs
-  n_chars <- countUTF8Chars ptr len
-  return (FastString uid n_chars bs zstr)
-
-mkNewFastStringByteString :: ByteString -> Ptr Word8 -> Int -> Int
-                          -> IORef Int -> IO FastString
-mkNewFastStringByteString bs ptr len uid n_zencs = do
-  let zstr = mkZFastString n_zencs bs
-  n_chars <- countUTF8Chars ptr len
-  return (FastString uid n_chars bs zstr)
-
-copyNewFastString :: Ptr Word8 -> Int -> Int -> IORef Int -> IO FastString
-copyNewFastString ptr len uid n_zencs = do
-  fp <- copyBytesToForeignPtr ptr len
-  let bs = BS.fromForeignPtr fp 0 len
-      zstr = mkZFastString n_zencs bs
-  n_chars <- countUTF8Chars ptr len
-  return (FastString uid n_chars bs zstr)
-
-copyBytesToForeignPtr :: Ptr Word8 -> Int -> IO (ForeignPtr Word8)
-copyBytesToForeignPtr ptr len = do
-  fp <- mallocForeignPtrBytes len
-  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
-  return fp
-
-cmpStringPrefix :: Ptr Word8 -> Ptr Word8 -> Int -> IO Bool
-cmpStringPrefix ptr1 ptr2 len =
- do r <- memcmp ptr1 ptr2 len
-    return (r == 0)
-
-hashStr  :: Ptr Word8 -> Int -> Int
- -- use the Addr to produce a hash value between 0 & m (inclusive)
-hashStr (Ptr a#) (I# len#) = loop 0# 0#
-  where
+  return $ mkFastZStringString (zEncodeString (utf8DecodeShortByteString sbs))
+
+mkNewFastStringShortByteString :: ShortByteString -> Int
+                               -> IORef Int -> IO FastString
+mkNewFastStringShortByteString sbs uid n_zencs = do
+  let zstr = mkZFastString n_zencs sbs
+  chars <- countUTF8Chars sbs
+  return (FastString uid chars sbs zstr)
+
+hashStr  :: ShortByteString -> Int
+ -- produce a hash value between 0 & m (inclusive)
+hashStr sbs@(SBS.SBS ba#) = loop 0# 0#
+   where
+    !(I# len#) = SBS.length sbs
     loop h n =
       if isTrue# (n ==# len#) then
         I# h
@@ -544,7 +515,7 @@ hashStr (Ptr a#) (I# len#) = loop 0# 0#
           -- DO NOT move this let binding! indexCharOffAddr# reads from the
           -- pointer so we need to evaluate this based on the length check
           -- above. Not doing this right caused #17909.
-          !c = ord# (indexCharOffAddr# a# n)
+          !c = indexInt8Array# ba# n
           !h2 = (h *# 16777619#) `xorI#` c
         in
           loop h2 (n +# 1#)
@@ -554,15 +525,15 @@ hashStr (Ptr a#) (I# len#) = loop 0# 0#
 
 -- | Returns the length of the 'FastString' in characters
 lengthFS :: FastString -> Int
-lengthFS f = n_chars f
+lengthFS fs = n_chars fs
 
 -- | Returns @True@ if the 'FastString' is empty
 nullFS :: FastString -> Bool
-nullFS f = BS.null (fs_bs f)
+nullFS fs = SBS.null $ fs_sbs fs
 
 -- | Unpacks and decodes the FastString
 unpackFS :: FastString -> String
-unpackFS (FastString _ _ bs _) = utf8DecodeByteString bs
+unpackFS fs = utf8DecodeShortByteString $ fs_sbs fs
 
 -- | Returns a Z-encoded version of a 'FastString'.  This might be the
 -- original, if it was already Z-encoded.  The first time this
@@ -570,33 +541,25 @@ unpackFS (FastString _ _ bs _) = utf8DecodeByteString bs
 -- memoized.
 --
 zEncodeFS :: FastString -> FastZString
-zEncodeFS (FastString _ _ _ ref) = ref
+zEncodeFS fs = fs_zenc fs
 
 appendFS :: FastString -> FastString -> FastString
 appendFS fs1 fs2 = mkFastStringByteString
                  $ BS.append (bytesFS fs1) (bytesFS fs2)
 
 concatFS :: [FastString] -> FastString
-concatFS = mkFastStringByteString . BS.concat . map fs_bs
+concatFS = mkFastStringShortByteString . mconcat . map fs_sbs
 
 headFS :: FastString -> Char
-headFS (FastString _ 0 _ _) = panic "headFS: Empty FastString"
-headFS (FastString _ _ bs _) =
-  inlinePerformIO $ BS.unsafeUseAsCString bs $ \ptr ->
-         return (fst (utf8DecodeChar (castPtr ptr)))
-
-tailFS :: FastString -> FastString
-tailFS (FastString _ 0 _ _) = panic "tailFS: Empty FastString"
-tailFS (FastString _ _ bs _) =
-    inlinePerformIO $ BS.unsafeUseAsCString bs $ \ptr ->
-    do let (_, n) = utf8DecodeChar (castPtr ptr)
-       return $! mkFastStringByteString (BS.drop n bs)
+headFS fs
+  | SBS.null $ fs_sbs fs = panic "headFS: Empty FastString"
+headFS fs = head $ unpackFS fs
 
 consFS :: Char -> FastString -> FastString
 consFS c fs = mkFastString (c : unpackFS fs)
 
 uniqueOfFS :: FastString -> Int
-uniqueOfFS (FastString u _ _ _) = u
+uniqueOfFS fs = uniq fs
 
 nilFS :: FastString
 nilFS = mkFastString ""


=====================================
compiler/GHC/Data/StringBuffer.hs
=====================================
@@ -200,7 +200,7 @@ nextChar (StringBuffer buf len (I# cur#)) =
   -- Getting our fingers dirty a little here, but this is performance-critical
   inlinePerformIO $ do
     withForeignPtr buf $ \(Ptr a#) -> do
-        case utf8DecodeChar# (a# `plusAddr#` cur#) of
+        case utf8DecodeCharAddr# (a# `plusAddr#` cur#) 0# of
           (# c#, nBytes# #) ->
              let cur' = I# (cur# +# nBytes#) in
              return (C# c#, StringBuffer buf len cur')


=====================================
compiler/GHC/Utils/Encoding.hs
=====================================
@@ -13,14 +13,16 @@
 
 module GHC.Utils.Encoding (
         -- * UTF-8
-        utf8DecodeChar#,
+        utf8DecodeCharAddr#,
         utf8PrevChar,
         utf8CharStart,
         utf8DecodeChar,
         utf8DecodeByteString,
+        utf8DecodeShortByteString,
         utf8DecodeStringLazy,
         utf8EncodeChar,
         utf8EncodeString,
+        utf8EncodeShortByteString,
         utf8EncodedLength,
         countUTF8Chars,
 
@@ -36,14 +38,16 @@ module GHC.Utils.Encoding (
 import GHC.Prelude
 
 import Foreign
-import Foreign.ForeignPtr.Unsafe
+import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
 import Data.Char
 import qualified Data.Char as Char
 import Numeric
 import GHC.IO
+import GHC.ST
 
 import Data.ByteString (ByteString)
 import qualified Data.ByteString.Internal as BS
+import Data.ByteString.Short.Internal (ShortByteString(..))
 
 import GHC.Exts
 
@@ -60,23 +64,23 @@ import GHC.Exts
 -- before decoding them (see "GHC.Data.StringBuffer").
 
 {-# INLINE utf8DecodeChar# #-}
-utf8DecodeChar# :: Addr# -> (# Char#, Int# #)
-utf8DecodeChar# a# =
-  let !ch0 = word2Int# (indexWord8OffAddr# a# 0#) in
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+  let !ch0 = word2Int# (indexWord8# 0#) in
   case () of
     _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
 
       | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
-        let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
+        let !ch1 = word2Int# (indexWord8# 1#) in
         if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
         (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
                   (ch1 -# 0x80#)),
            2# #)
 
       | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
-        let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
+        let !ch1 = word2Int# (indexWord8# 1#) in
         if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
-        let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
+        let !ch2 = word2Int# (indexWord8# 2#) in
         if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
         (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
                  ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#)  +#
@@ -84,11 +88,11 @@ utf8DecodeChar# a# =
            3# #)
 
      | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
-        let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
+        let !ch1 = word2Int# (indexWord8# 1#) in
         if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
-        let !ch2 = word2Int# (indexWord8OffAddr# a# 2#) in
+        let !ch2 = word2Int# (indexWord8# 2#) in
         if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
-        let !ch3 = word2Int# (indexWord8OffAddr# a# 3#) in
+        let !ch3 = word2Int# (indexWord8# 3#) in
         if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
         (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
                  ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
@@ -106,9 +110,18 @@ utf8DecodeChar# a# =
         -- confusing parse error later on.  Instead we use '\0' which
         -- will signal a lexer error immediately.
 
+utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharAddr# a# off# =
+    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
+
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+
 utf8DecodeChar :: Ptr Word8 -> (Char, Int)
-utf8DecodeChar (Ptr a#) =
-  case utf8DecodeChar# a# of (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
+utf8DecodeChar !(Ptr a#) =
+  case utf8DecodeCharAddr# a# 0# of
+    (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
 
 -- UTF-8 is cleverly designed so that we can always figure out where
 -- the start of the current character is, given any position in a
@@ -124,73 +137,102 @@ utf8CharStart p = go p
                         then go (p `plusPtr` (-1))
                         else return p
 
-utf8DecodeByteString :: ByteString -> [Char]
-utf8DecodeByteString (BS.PS ptr offset len)
-  = utf8DecodeStringLazy ptr offset len
-
-utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
-utf8DecodeStringLazy fptr offset len
-  = unsafeDupablePerformIO $ unpack start
+{-# INLINE utf8DecodeLazy# #-}
+utf8DecodeLazy# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
+utf8DecodeLazy# retain decodeChar# len#
+  = unpack 0#
   where
-    !start = unsafeForeignPtrToPtr fptr `plusPtr` offset
-    !end = start `plusPtr` len
-
-    unpack p
-        | p >= end  = touchForeignPtr fptr >> return []
+    unpack i#
+        | isTrue# (i# >=# len#) = retain >> return []
         | otherwise =
-            case utf8DecodeChar# (unPtr p) of
-                (# c#, nBytes# #) -> do
-                  rest <- unsafeDupableInterleaveIO $ unpack (p `plusPtr#` nBytes#)
-                  return (C# c# : rest)
-
-countUTF8Chars :: Ptr Word8 -> Int -> IO Int
-countUTF8Chars ptr len = go ptr 0
-  where
-        !end = ptr `plusPtr` len
+            case decodeChar# i# of
+              (# c#, nBytes# #) -> do
+                rest <- unsafeDupableInterleaveIO $ unpack (i# +# nBytes#)
+                return (C# c# : rest)
 
-        go p !n
-           | p >= end = return n
-           | otherwise  = do
-                case utf8DecodeChar# (unPtr p) of
-                  (# _, nBytes# #) -> go (p `plusPtr#` nBytes#) (n+1)
-
-unPtr :: Ptr a -> Addr#
-unPtr (Ptr a) = a
-
-plusPtr# :: Ptr a -> Int# -> Ptr a
-plusPtr# ptr nBytes# = ptr `plusPtr` (I# nBytes#)
+utf8DecodeByteString :: ByteString -> [Char]
+utf8DecodeByteString (BS.PS fptr offset len)
+  = utf8DecodeStringLazy fptr offset len
 
-utf8EncodeChar :: Char -> Ptr Word8 -> IO (Ptr Word8)
-utf8EncodeChar c ptr =
+utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
+utf8DecodeStringLazy fp offset (I# len#)
+  = unsafeDupablePerformIO $ do
+      let !(Ptr a#) = unsafeForeignPtrToPtr fp `plusPtr` offset
+      utf8DecodeLazy# (touchForeignPtr fp) (utf8DecodeCharAddr# a#) len#
+-- Note that since utf8DecodeLazy# returns a thunk the lifetime of the
+-- ForeignPtr actually needs to be longer than the lexical lifetime
+-- withForeignPtr would provide here. That's why we use touchForeignPtr to
+-- keep the fp alive until the last character has actually been decoded.
+
+utf8DecodeShortByteString :: ShortByteString -> [Char]
+utf8DecodeShortByteString (SBS ba#)
+  = unsafeDupablePerformIO $
+      let len# = sizeofByteArray# ba# in
+      utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
+
+countUTF8Chars :: ShortByteString -> IO Int
+countUTF8Chars (SBS ba) = go 0# 0#
+  where
+    len# = sizeofByteArray# ba
+    go i# n#
+      | isTrue# (i# >=# len#) =
+          return (I# n#)
+      | otherwise = do
+          case utf8DecodeCharByteArray# ba i# of
+            (# _, nBytes# #) -> go (i# +# nBytes#) (n# +# 1#)
+
+{-# INLINE utf8EncodeChar #-}
+utf8EncodeChar :: (Int# -> Word# -> State# s -> State# s)
+               -> Char -> ST s Int
+utf8EncodeChar write# c =
   let x = ord c in
   case () of
     _ | x > 0 && x <= 0x007f -> do
-          poke ptr (fromIntegral x)
-          return (ptr `plusPtr` 1)
+          write 0 x
+          return 1
         -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
         -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8).
       | x <= 0x07ff -> do
-          poke ptr (fromIntegral (0xC0 .|. ((x `shiftR` 6) .&. 0x1F)))
-          pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x .&. 0x3F)))
-          return (ptr `plusPtr` 2)
+          write 0 (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))
+          write 1 (0x80 .|. (x .&. 0x3F))
+          return 2
       | x <= 0xffff -> do
-          poke ptr (fromIntegral (0xE0 .|. (x `shiftR` 12) .&. 0x0F))
-          pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x `shiftR` 6) .&. 0x3F))
-          pokeElemOff ptr 2 (fromIntegral (0x80 .|. (x .&. 0x3F)))
-          return (ptr `plusPtr` 3)
+          write 0 (0xE0 .|. (x `shiftR` 12) .&. 0x0F)
+          write 1 (0x80 .|. (x `shiftR` 6) .&. 0x3F)
+          write 2 (0x80 .|. (x .&. 0x3F))
+          return 3
       | otherwise -> do
-          poke ptr (fromIntegral (0xF0 .|. (x `shiftR` 18)))
-          pokeElemOff ptr 1 (fromIntegral (0x80 .|. ((x `shiftR` 12) .&. 0x3F)))
-          pokeElemOff ptr 2 (fromIntegral (0x80 .|. ((x `shiftR` 6) .&. 0x3F)))
-          pokeElemOff ptr 3 (fromIntegral (0x80 .|. (x .&. 0x3F)))
-          return (ptr `plusPtr` 4)
+          write 0 (0xF0 .|. (x `shiftR` 18))
+          write 1 (0x80 .|. ((x `shiftR` 12) .&. 0x3F))
+          write 2 (0x80 .|. ((x `shiftR` 6) .&. 0x3F))
+          write 3 (0x80 .|. (x .&. 0x3F))
+          return 4
+  where
+    {-# INLINE write #-}
+    write (I# off#) (I# c#) = ST $ \s ->
+      case write# off# (int2Word# c#) s of
+        s -> (# s, () #)
 
 utf8EncodeString :: Ptr Word8 -> String -> IO ()
-utf8EncodeString ptr str = go ptr str
-  where go !_   []     = return ()
-        go ptr (c:cs) = do
-          ptr' <- utf8EncodeChar c ptr
-          go ptr' cs
+utf8EncodeString (Ptr a#) str = go a# str
+  where go !_   []   = return ()
+        go a# (c:cs) = do
+          I# off# <- stToIO $ utf8EncodeChar (writeWord8OffAddr# a#) c
+          go (a# `plusAddr#` off#) cs
+
+utf8EncodeShortByteString :: String -> IO ShortByteString
+utf8EncodeShortByteString str = IO $ \s ->
+  case utf8EncodedLength str         of { I# len# ->
+  case newByteArray# len# s          of { (# s, mba# #) ->
+  case go mba# 0# str                of { ST f_go ->
+  case f_go s                        of { (# s, () #) ->
+  case unsafeFreezeByteArray# mba# s of { (# s, ba# #) ->
+  (# s, SBS ba# #) }}}}}
+  where
+    go _ _ [] = return ()
+    go mba# i# (c:cs) = do
+      I# off# <- utf8EncodeChar (\j# -> writeWord8Array# mba# (i# +# j#)) c
+      go mba# (i# +# off#) cs
 
 utf8EncodedLength :: String -> Int
 utf8EncodedLength str = go 0 str



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/f2f817e4c547657c25bb110199f6f0b6014f843b...9ddf161492194edb321b87b1977eda8264df35aa

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/f2f817e4c547657c25bb110199f6f0b6014f843b...9ddf161492194edb321b87b1977eda8264df35aa
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20200722/06969986/attachment-0001.html>


More information about the ghc-commits mailing list