[commit: ghc] master: Return nBytes instead of nextAddr from utf8DecodeChar (caf449e)
git at git.haskell.org
git at git.haskell.org
Tue Sep 16 12:59:40 UTC 2014
Repository : ssh://git@git.haskell.org/ghc
On branch : master
Link : http://ghc.haskell.org/trac/ghc/changeset/caf449e39f5e7545eeabd567349661450aa8c6e5/ghc
>---------------------------------------------------------------
commit caf449e39f5e7545eeabd567349661450aa8c6e5
Author: Thomas Miedema <thomasmiedema at gmail.com>
Date: Tue Sep 16 07:56:35 2014 -0500
Return nBytes instead of nextAddr from utf8DecodeChar
Summary:
While researching D176, I came across the following simplification
opportunity:
Not all functions that call utf8DecodeChar actually need the address
of the next char. And some need the 'number of bytes' read. So returning
nBytes instead of nextAddr should save a few addition and subtraction
operations, and makes the code a bit simpler.
Test Plan: it validates
Reviewers: simonmar, ezyang, austin
Reviewed By: austin
Subscribers: simonmar, ezyang, carter
Differential Revision: https://phabricator.haskell.org/D179
>---------------------------------------------------------------
caf449e39f5e7545eeabd567349661450aa8c6e5
compiler/utils/Encoding.hs | 50 +++++++++++++++++++++--------------------
compiler/utils/FastString.lhs | 3 +--
compiler/utils/StringBuffer.lhs | 4 ++--
3 files changed, 29 insertions(+), 28 deletions(-)
diff --git a/compiler/utils/Encoding.hs b/compiler/utils/Encoding.hs
index 115703f..5c8619b 100644
--- a/compiler/utils/Encoding.hs
+++ b/compiler/utils/Encoding.hs
@@ -47,18 +47,18 @@ import ExtsCompat46
-- before decoding them (see StringBuffer.hs).
{-# INLINE utf8DecodeChar# #-}
-utf8DecodeChar# :: Addr# -> (# Char#, Addr# #)
+utf8DecodeChar# :: Addr# -> (# Char#, Int# #)
utf8DecodeChar# a# =
let !ch0 = word2Int# (indexWord8OffAddr# a# 0#) in
case () of
- _ | ch0 <=# 0x7F# -> (# chr# ch0, a# `plusAddr#` 1# #)
+ _ | ch0 <=# 0x7F# -> (# chr# ch0, 1# #)
| ch0 >=# 0xC0# && ch0 <=# 0xDF# ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else
(# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
(ch1 -# 0x80#)),
- a# `plusAddr#` 2# #)
+ 2# #)
| ch0 >=# 0xE0# && ch0 <=# 0xEF# ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
@@ -68,7 +68,7 @@ utf8DecodeChar# a# =
(# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch2 -# 0x80#)),
- a# `plusAddr#` 3# #)
+ 3# #)
| ch0 >=# 0xF0# && ch0 <=# 0xF8# ->
let !ch1 = word2Int# (indexWord8OffAddr# a# 1#) in
@@ -81,20 +81,21 @@ utf8DecodeChar# a# =
((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
(ch3 -# 0x80#)),
- a# `plusAddr#` 4# #)
+ 4# #)
| otherwise -> fail 1#
where
-- all invalid sequences end up here:
- fail n = (# '\0'#, a# `plusAddr#` n #)
+ fail :: Int# -> (# Char#, Int# #)
+ fail nBytes# = (# '\0'#, nBytes# #)
-- '\xFFFD' would be the usual replacement character, but
-- that's a valid symbol in Haskell, so will result in a
-- confusing parse error later on. Instead we use '\0' which
-- will signal a lexer error immediately.
-utf8DecodeChar :: Ptr Word8 -> (Char, Ptr Word8)
+utf8DecodeChar :: Ptr Word8 -> (Char, Int)
utf8DecodeChar (Ptr a#) =
- case utf8DecodeChar# a# of (# c#, b# #) -> ( C# c#, Ptr b# )
+ case utf8DecodeChar# a# of (# c#, nBytes# #) -> ( C# c#, I# nBytes# )
-- UTF-8 is cleverly designed so that we can always figure out where
-- the start of the current character is, given any position in a
@@ -111,35 +112,36 @@ utf8CharStart p = go p
else return p
utf8DecodeString :: Ptr Word8 -> Int -> IO [Char]
-STRICT2(utf8DecodeString)
-utf8DecodeString (Ptr a#) (I# len#)
- = unpack a#
+utf8DecodeString ptr len
+ = unpack ptr
where
- !end# = addr2Int# (a# `plusAddr#` len#)
+ !end = ptr `plusPtr` len
- unpack p#
- | addr2Int# p# >=# end# = return []
+ unpack p
+ | p >= end = return []
| otherwise =
- case utf8DecodeChar# p# of
- (# c#, q# #) -> do
- chs <- unpack q#
+ case utf8DecodeChar# (unPtr p) of
+ (# c#, nBytes# #) -> do
+ chs <- unpack (p `plusPtr#` nBytes#)
return (C# c# : chs)
countUTF8Chars :: Ptr Word8 -> Int -> IO Int
-countUTF8Chars ptr bytes = go ptr 0
+countUTF8Chars ptr len = go ptr 0
where
- end = ptr `plusPtr` bytes
+ !end = ptr `plusPtr` len
- STRICT2(go)
- go ptr n
- | ptr >= end = return n
+ go p !n
+ | p >= end = return n
| otherwise = do
- case utf8DecodeChar# (unPtr ptr) of
- (# _, a #) -> go (Ptr a) (n+1)
+ case utf8DecodeChar# (unPtr p) of
+ (# _, nBytes# #) -> go (p `plusPtr#` nBytes#) (n+1)
unPtr :: Ptr a -> Addr#
unPtr (Ptr a) = a
+plusPtr# :: Ptr a -> Int# -> Ptr a
+plusPtr# ptr nBytes# = ptr `plusPtr` (I# nBytes#)
+
utf8EncodeChar :: Char -> Ptr Word8 -> IO (Ptr Word8)
utf8EncodeChar c ptr =
let x = ord c in
diff --git a/compiler/utils/FastString.lhs b/compiler/utils/FastString.lhs
index a38d87e..91236cc 100644
--- a/compiler/utils/FastString.lhs
+++ b/compiler/utils/FastString.lhs
@@ -529,8 +529,7 @@ tailFS :: FastString -> FastString
tailFS (FastString _ 0 _ _) = panic "tailFS: Empty FastString"
tailFS (FastString _ _ bs _) =
inlinePerformIO $ BS.unsafeUseAsCString bs $ \ptr ->
- do let (_, ptr') = utf8DecodeChar (castPtr ptr)
- n = ptr' `minusPtr` ptr
+ do let (_, n) = utf8DecodeChar (castPtr ptr)
return $! mkFastStringByteString (BS.drop n bs)
consFS :: Char -> FastString -> FastString
diff --git a/compiler/utils/StringBuffer.lhs b/compiler/utils/StringBuffer.lhs
index 50d8443..d298457 100644
--- a/compiler/utils/StringBuffer.lhs
+++ b/compiler/utils/StringBuffer.lhs
@@ -179,8 +179,8 @@ nextChar (StringBuffer buf len (I# cur#)) =
inlinePerformIO $ do
withForeignPtr buf $ \(Ptr a#) -> do
case utf8DecodeChar# (a# `plusAddr#` cur#) of
- (# c#, b# #) ->
- let cur' = I# (b# `minusAddr#` a#) in
+ (# c#, nBytes# #) ->
+ let cur' = I# (cur# +# nBytes#) in
return (C# c#, StringBuffer buf len cur')
currentChar :: StringBuffer -> Char
More information about the ghc-commits
mailing list