[commit: ghc] master: Improve isDigit, isSpace, etc. (3157127)

Tue Oct 21 21:50:41 UTC 2014

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/31571270625a690410b794b7cfe48d866c084e74/ghc

>---------------------------------------------------------------

commit 31571270625a690410b794b7cfe48d866c084e74
Author: David Feuer <David.Feuer at gmail.com>
Date:   Tue Oct 21 15:01:14 2014 -0500

    Improve isDigit, isSpace, etc.
    
    Summary:
    Make things less branchy; use unsigned comparisons for range checking.
    Eliminate non-spaces more quickly in common cases in isSpace.
    
    Reviewers: ekmett, carter, austin
    
    Reviewed By: austin
    
    Subscribers: thomie, carter, ezyang, simonmar
    
    Differential Revision: https://phabricator.haskell.org/D340
    
    GHC Trac Issues: #1473


>---------------------------------------------------------------

31571270625a690410b794b7cfe48d866c084e74
 libraries/base/Data/Char.hs   | 12 ++++++++----
 libraries/base/GHC/Unicode.hs | 37 ++++++++++++++++++++++---------------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/libraries/base/Data/Char.hs b/libraries/base/Data/Char.hs
index aa4a594..ac708ac 100644
--- a/libraries/base/Data/Char.hs
+++ b/libraries/base/Data/Char.hs
@@ -68,10 +68,14 @@ import GHC.Enum
 -- (i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@).
 digitToInt :: Char -> Int
 digitToInt c
- | isDigit c            =  ord c - ord '0'
- | c >= 'a' && c <= 'f' =  ord c - ord 'a' + 10
- | c >= 'A' && c <= 'F' =  ord c - ord 'A' + 10
- | otherwise            =  error ("Char.digitToInt: not a digit " ++ show c) -- sigh
+  | (fromIntegral dec::Word) <= 9 = dec
+  | (fromIntegral hexl::Word) <= 5 = hexl + 10
+  | (fromIntegral hexu::Word) <= 5 = hexu + 10
+  | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh
+  where
+    dec = ord c - ord '0'
+    hexl = ord c - ord 'a'
+    hexu = ord c - ord 'A'
 
 -- | Unicode General Categories (column 2 of the UnicodeData table)
 -- in the order they are listed in the Unicode standard.
diff --git a/libraries/base/GHC/Unicode.hs b/libraries/base/GHC/Unicode.hs
index dea2fb9..6277805 100644
--- a/libraries/base/GHC/Unicode.hs
+++ b/libraries/base/GHC/Unicode.hs
@@ -30,6 +30,8 @@ module GHC.Unicode (
 
 import GHC.Base
 import GHC.Char        (chr)
+import GHC.Real
+import GHC.Num
 
 #include "HsBaseConfig.h"
 
@@ -65,16 +67,16 @@ isPrint                 :: Char -> Bool
 -- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v at .
 isSpace                 :: Char -> Bool
 -- isSpace includes non-breaking space
--- Done with explicit equalities both for efficiency, and to avoid a tiresome
--- recursion with GHC.List elem
-isSpace c               =  c == ' '     ||
-                           c == '\t'    ||
-                           c == '\n'    ||
-                           c == '\r'    ||
-                           c == '\f'    ||
-                           c == '\v'    ||
-                           c == '\xa0'  ||
-                           iswspace (ord c) /= 0
+-- The magic 0x377 isn't really that magical. As of 2014, all the codepoints
+-- at or below 0x377 have been assigned, so we shouldn't have to worry about
+-- any new spaces appearing below there. It would probably be best to
+-- use branchless ||, but currently the eqLit transformation will undo that,
+-- so we'll do it like this until there's a way around that.
+isSpace c
+  | uc <= 0x377 = uc == 32 || uc - 0x9 <= 4 || uc == 0xa0
+  | otherwise = iswspace (ord c) /= 0
+  where
+    uc = fromIntegral (ord c) :: Word
 
 -- | Selects upper-case or title-case alphabetic Unicode characters (letters).
 -- Title case is used by a small number of letter ligatures like the
@@ -98,17 +100,23 @@ isAlphaNum              :: Char -> Bool
 
 -- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@.
 isDigit                 :: Char -> Bool
-isDigit c               =  c >= '0' && c <= '9'
+isDigit c               =  (fromIntegral (ord c - ord '0') :: Word) <= 9
+
+-- We use an addition and an unsigned comparison instead of two signed
+-- comparisons because it's usually faster and puts less strain on branch
+-- prediction. It likely also enables some CSE when combined with functions
+-- that follow up with an actual conversion.
 
 -- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@.
 isOctDigit              :: Char -> Bool
-isOctDigit c            =  c >= '0' && c <= '7'
+isOctDigit c            =  (fromIntegral (ord c - ord '0') :: Word) <= 7
 
 -- | Selects ASCII hexadecimal digits,
 -- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@.
 isHexDigit              :: Char -> Bool
-isHexDigit c            =  isDigit c || c >= 'A' && c <= 'F' ||
-                                        c >= 'a' && c <= 'f'
+isHexDigit c            =  isDigit c ||
+                           (fromIntegral (ord c - ord 'A')::Word) <= 5 ||
+                           (fromIntegral (ord c - ord 'a')::Word) <= 5
 
 -- | Convert a letter to the corresponding upper-case letter, if any.
 -- Any other character is returned unchanged.
@@ -132,7 +140,6 @@ toTitle                 :: Char -> Char
 
 isAlpha    c = iswalpha (ord c) /= 0
 isAlphaNum c = iswalnum (ord c) /= 0
---isSpace    c = iswspace (ord c) /= 0
 isControl  c = iswcntrl (ord c) /= 0
 isPrint    c = iswprint (ord c) /= 0
 isUpper    c = iswupper (ord c) /= 0