Code for conversion to and from UTF8

George Russell ger at informatik.uni-bremen.de
Thu Feb 12 20:38:21 EST 2004


I don't know if anyone else has done this, but I thought my own
effort (attached) might be of interest.  It converts String -> String
to and from UTF8 representations.  It /should/ compile with any Haskell98
compiler which also has the (base) library Data.Bits.  I'd be happy
to see it in the standard hierarchy, if that's desired.

Good points: (1) I think I've tested it fairly thoroughly; (2) when decoding
UTF8 sequences, it rejects all ill-formatted sequences, and such errors
can be easily detected (without exception handlers).

Bad points: it's unoptimised.
-------------- next part --------------
{- This module contains functions for converting to and from the UTF8 
   representations for Strings.
   -}
module UTF8(
   toUTF8, 
      -- :: String -> String
      -- Converts a String (whose characters must all have codes <2^31) into
      -- its UTF8 representation.
   fromUTF8WE,
      -- :: Monad m => String -> m String
      -- Converts a UTF8 representation of a String back into the String,
      -- catching all possible format errors.
      --
      -- Example: With the Haskell module Control.Monad.Error, you can
      -- instantiate this as
      -- (fromUTF8WE :: String -> Either String String)
      -- to get a conversion function which either succeeds (Right) or 
      -- returns an error message (Left).
   ) where

import Char
import List

import Data.Bits

-- --------------------------------------------------------------------------
-- Encoding
-- --------------------------------------------------------------------------

-- | Encode a String as UTF8, yielding one output Char per encoded byte.
--
-- Characters with codes up to 0x7f are copied through unchanged; every
-- other character becomes a lead byte followed by one or more
-- continuation bytes of the form 10xxxxxx.  Calls 'error' if a character
-- code has any bit from bit 31 upwards set.
toUTF8 :: String -> String
toUTF8 [] = []
toUTF8 (c : cs)
   | code <= 0x7f = c : rest
   | code `shiftR` 31 /= 0 =
        error ("Huge character with code " ++ show code ++
           " detected in string being converted to UTF8.")
   | otherwise = encode code rest 0xc0 0x20
   where
      code = ord c
      rest = toUTF8 cs

      -- Emit the low six bits of the value as a continuation byte in front
      -- of the accumulated tail.  If what remains fits beside the current
      -- lead-byte marker, finish with the lead byte; otherwise widen the
      -- marker by one bit, halve the room left in the lead byte, and carry
      -- on with the remaining high bits.
      encode :: Int -> String -> Int -> Int -> String
      encode value acc leadMask leadLimit
         | high < leadLimit = chr (leadMask .|. high) : acc'
         | otherwise =
              encode high acc' (leadMask .|. leadLimit) (leadLimit `shiftR` 1)
         where
            high = value `shiftR` 6
            acc' = chr (0x80 .|. (value .&. 0x3f)) : acc

-- | Converts a UTF8 representation of a String (one Char per byte) back
-- into the String, reporting every format error through 'fail' in the
-- caller's monad.
--
-- Rejected inputs: characters with codes above 255 (not bytes), a
-- continuation byte 10xxxxxx where a sequence should start, the lead
-- bytes 11111110 and 11111111, input ending in the middle of a sequence,
-- continuation bytes not of the form 10xxxxxx, encodings longer than
-- necessary, and sequences whose value is too large for a 'Char'.
--
-- Example: With the Haskell module Control.Monad.Error, you can
-- instantiate this as
-- (fromUTF8WE :: String -> Either String String)
-- to get a conversion function which either succeeds (Right) or
-- returns an error message (Left).
fromUTF8WE :: Monad m => String -> m String
fromUTF8WE [] = return []
fromUTF8WE (x0 : xs0) =
   let
      ox = ord x0
   in
      if ox > 0xff
         then
            -- topZero8 only inspects the low 8 bits, so anything that is
            -- not a byte must be rejected before it is classified.
            fail "UTF8 string contains character with code above 255"
         else
            case topZero8 ox of
               7 ->
                  -- 0xxxxxxx: plain ASCII, copied through unchanged.
                  do
                     xs1 <- fromUTF8WE xs0
                     return (x0 : xs1)
               6 ->
                  fail "UTF8 escape sequence starts 10xxxxxx"
               0 ->
                  fail "UTF8 escape sequence starts 11111110"
               -1 ->
                  fail "UTF8 escape sequence starts 11111111"
               n ->
                  let
                     r = 6 - n -- number of 6-bit pieces
                     xtop = ox .&. ones n -- payload bits of the lead byte

                     -- Smallest code that genuinely needs r continuation
                     -- bytes; anything below it is an overlong encoding.
                     minx =
                        bit (
                           if r == 1
                              then
                                 7
                              else
                                 5*r + 1
                           )

                     -- Largest code a Char can hold; chr would raise an
                     -- exception beyond it instead of failing in the monad.
                     maxx = ord (maxBound :: Char)

                     -- Consume count0 continuation bytes, shifting each
                     -- 6-bit payload into the accumulator.
                     mkx [] _ _ =
                        fail "UTF8 string ends in middle of escape sequence"
                     mkx (ch : xs1) acc0 count0 =
                        let
                           och = ord ch
                           acc1 = (acc0 `shiftL` 6) .|. (och .&. 0x3f)
                           count1 = count0 - 1
                        in
                           -- Both top bits must be tested: checking only
                           -- the 0x80 bit would also admit 11xxxxxx bytes.
                           if och > 0xff || och .&. 0xc0 /= 0x80
                              then
                                 fail ("UTF8 escape sequence contains continuing "
                                    ++ "character not of form 10xxxxxx")
                              else
                                 if count1 == 0
                                    then
                                       return (acc1,xs1)
                                    else
                                       mkx xs1 acc1 count1
                  in
                     do
                        (x,xs1) <- mkx xs0 xtop r
                        if x < minx
                           then
                              fail ("UTF8 escape sequence contains character not "
                                 ++ "optimally encoded")
                           else
                              if x > maxx
                                 then
                                    fail ("UTF8 escape sequence encodes code "
                                       ++ show x
                                       ++ " which is too large for a Char")
                                 else
                                    do
                                       xs2 <- fromUTF8WE xs1
                                       return (chr x : xs2)
                

-- --------------------------------------------------------------------------
-- Binary utilities
-- --------------------------------------------------------------------------

-- | Index of the highest zero bit among bits 7..0 of the argument
-- (7 is the most significant of the eight), or -1 when all eight of
-- those bits are set.  Bits above bit 7 are never examined.
topZero8 :: Int -> Int
topZero8 i =
   -- Scan from the most significant bit downwards and stop at the first
   -- bit that is clear.
   case [ bn | bn <- [7,6..0], not (testBit i bn) ] of
      bn : _ -> bn
      [] -> -1

-- | @ones i@ is the number whose binary representation is the digit 1
-- written i times, i.e. a mask covering the i lowest bits.
ones :: Int -> Int
ones i = (1 `shiftL` i) - 1


More information about the Libraries mailing list