[commit: haddock] master: Use Alex 3's Unicode support to properly lex source files as UTF-8 (478b606)

David Waern waern at galois.com
Sun Mar 10 20:26:36 CET 2013


Repository : ssh://darcs.haskell.org//srv/darcs/haddock

On branch  : master

http://hackage.haskell.org/trac/ghc/changeset/478b606bb0d61c4fea2d72171e2e5bb881a366d2

>---------------------------------------------------------------

commit 478b606bb0d61c4fea2d72171e2e5bb881a366d2
Author: Max Bolingbroke <batterseapower at hotmail.com>
Date:   Sun Feb 3 18:56:18 2013 +0000

    Use Alex 3's Unicode support to properly lex source files as UTF-8
    
    Signed-off-by: David Waern <david.waern at gmail.com>

>---------------------------------------------------------------

 haddock.cabal     |    4 +-
 src/Haddock/Lex.x |   65 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/haddock.cabal b/haddock.cabal
index c4e33ad..ac13ae8 100644
--- a/haddock.cabal
+++ b/haddock.cabal
@@ -67,7 +67,7 @@ executable haddock
   else
     ghc-options:          -funbox-strict-fields -Wall -fwarn-tabs -O2
   if !flag(in-ghc-tree)
-    build-tools: alex >= 2.3, happy >= 1.18
+    build-tools: alex >= 3, happy >= 1.18
   build-depends:
     base >= 4.3 && < 4.8
   if flag(in-ghc-tree)
@@ -121,7 +121,7 @@ library
   -- In a GHC tree - in particular, in a source tarball - we don't
   -- require alex or happy
   if !flag(in-ghc-tree)
-    build-tools: alex >= 2.3, happy >= 1.18
+    build-tools: alex >= 3, happy >= 1.18
   build-depends:
     base >= 4.3 && < 4.8,
     filepath,
diff --git a/src/Haddock/Lex.x b/src/Haddock/Lex.x
index 0d8dd95..9e59fa4 100644
--- a/src/Haddock/Lex.x
+++ b/src/Haddock/Lex.x
@@ -30,6 +30,7 @@ import SrcLoc
 import DynFlags
 import FastString
 
+import qualified Data.Bits
 import Data.Char
 import Data.Word (Word8)
 import Numeric
@@ -149,46 +150,64 @@ tokenPos t = let AlexPn _ line col = snd t in (line, col)
 -- Unicode/UTF-8 support in Alex 3.x, and Unicode documentation will
 -- probably get mangled.
 
+-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
+utf8Encode :: Char -> [Word8]
+utf8Encode = map fromIntegral . go . ord
+ where
+  go oc
+   | oc <= 0x7f       = [oc]
+
+   | oc <= 0x7ff      = [ 0xc0 + (oc `Data.Bits.shiftR` 6)
+                        , 0x80 + oc Data.Bits..&. 0x3f
+                        ]
+
+   | oc <= 0xffff     = [ 0xe0 + (oc `Data.Bits.shiftR` 12)
+                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
+                        , 0x80 + oc Data.Bits..&. 0x3f
+                        ]
+   | otherwise        = [ 0xf0 + (oc `Data.Bits.shiftR` 18)
+                        , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f)
+                        , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f)
+                        , 0x80 + oc Data.Bits..&. 0x3f
+                        ]
+
+type Byte = Word8
+
 type AlexInput = (AlexPosn,     -- current position,
                   Char,         -- previous char
+                  [Byte],       -- pending bytes on current char
                   String)       -- current input string
 
 alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (p,c,s) = c
+alexInputPrevChar (p,c,bs,s) = c
 
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (p,c,[]) = Nothing
-alexGetByte (p,_,(c:s))  = let p' = alexMove p c
-                              in p' `seq`  Just (fromIntegral (ord c), (p', c, s))
+alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
+alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
+alexGetByte (p,c,[],[]) = Nothing
+alexGetByte (p,_,[],(c:s))  = let p' = alexMove p c
+                                  (b:bs) = utf8Encode c
+                              in p' `seq`  Just (b, (p', c, bs, s))
 
--- for compat with Alex 2.x:
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar i = case alexGetByte i of
-                  Nothing     -> Nothing
-                  Just (b,i') -> Just (chr (fromIntegral b), i')
+data AlexPosn = AlexPn !Int !Int !Int
+        deriving (Eq,Show)
 
 alexMove :: AlexPosn -> Char -> AlexPosn
 alexMove (AlexPn a l c) '\t' = AlexPn (a+1)  l     (((c+7) `div` 8)*8+1)
 alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1)   1
 alexMove (AlexPn a l c) _    = AlexPn (a+1)  l     (c+1)
 
-data AlexPosn = AlexPn !Int !Int !Int
-        deriving (Eq,Show)
-
 type StartCode = Int
 type Action = AlexPosn -> String -> StartCode -> (StartCode -> [LToken]) -> DynFlags -> [LToken]
 
 tokenise :: DynFlags -> String -> (Int, Int) -> [LToken]
-tokenise dflags str (line, col) = let toks = go (posn, '\n', eofHack str) para in {-trace (show toks)-} toks
-  where
-    posn = AlexPn 0 line col
-
-    go inp@(pos, _, str) sc =
-	  case alexScan inp sc of
-		AlexEOF -> []
-		AlexError _ -> []
-		AlexSkip  inp' _       -> go inp' sc
-		AlexToken inp'@(pos',_,_) len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags
+tokenise dflags str (line, col) = go (posn,'\n',[],eofHack str) para
+  where posn = AlexPn 0 line col
+        go inp@(pos,_,_,str) sc =
+          case alexScan inp sc of
+                AlexEOF -> []
+                AlexError _ -> []
+                AlexSkip  inp' len     -> go inp' sc
+                AlexToken inp' len act -> act pos (take len str) sc (\sc -> go inp' sc) dflags
 
 -- NB. we add a final \n to the string, (see comment in the beginning of line
 -- production above).





More information about the ghc-commits mailing list