UTF8 (was Re: Hexdump)

Tue Mar 21 10:07:53 EST 2006

Dominic Steinitz <dominic.steinitz at blueyonder.co.uk> wrote:

> I finally got around to putting a "hexdump" function into a module.
> Any  suggestions as to where it should live in the hierarchy?

How about
    Codec.Hexdump
    Codec.Text.Hexdump
    Text.Codec.Hexdump

I was also thinking it would be nice to have pure Haskell
implementations of the various Unicode encodings.  Here is my attempt at
the UTF-8 codec.  Like Dominic, I am wondering where such a module
should live in the hierarchy:

    Text.Codec.UTF8
    Codec.Text.UTF8
    Text.UTF8
    Codec.UTF8

Regards,
    Malcolm
-------------- next part --------------
module UTF8
  ( fromUTF8
  , toUTF8
  ) where

import Data.Word
import Data.Bits

-- | Bitwise AND under a more readable name.
--
-- The explicit polymorphic signature matters: without it the monomorphism
-- restriction pins 'mask' to a single type, yet this module applies it to
-- both 'Word8' and 'Int' values.
mask :: Bits a => a -> a -> a
mask = (.&.)

-- | Bitwise OR under a more readable name (used to set header bits).
add :: Bits a => a -> a -> a
add = (.|.)

-- | Decode a UTF-8 byte sequence into a list of characters.
--
-- The first byte of each sequence selects its length (1 to 4 bytes, as in
-- RFC 3629); the payload bits of the following continuation bytes are then
-- accumulated six at a time.  The result is produced lazily.
--
-- Malformed input is reported by calling 'error':
--
-- * a continuation byte (@10xxxxxx@) or @0xf8@..@0xff@ in first position,
-- * input that ends in the middle of a sequence,
-- * a trailing byte that is not of the form @10xxxxxx@,
-- * an over-long (non-minimal) encoding,
-- * a decoded value above @maxBound :: Char@.
fromUTF8 :: [Word8] -> [Char]
fromUTF8 [] = []
fromUTF8 (w:ws)
    -- Guards are tested from the smallest lead byte upwards, so each range
    -- is bounded above by its own guard and below by the previous one.
    | w <  0x80  {- 0xxxxxxx -} = toEnum (fromEnum w) : fromUTF8 ws
    | w <  0xc0  {- 10xxxxxx -} = error "incorrect UTF-8 encoding: wrong 7th-bit in first byte"
    | w <  0xe0  {- 110xxxxx -} = bytes 1 0x80    (fromEnum (w .&. 0x1f)) ws
    | w <  0xf0  {- 1110xxxx -} = bytes 2 0x800   (fromEnum (w .&. 0x0f)) ws
    | w <  0xf8  {- 11110xxx -} = bytes 3 0x10000 (fromEnum (w .&. 0x07)) ws
    | otherwise  {- 11111xxx -} = error "incorrect UTF-8 encoding: invalid first byte"
  where
    -- bytes n lowest acc rest: consume n continuation bytes from rest,
    -- folding their payload into acc.  lowest is the smallest code point
    -- that genuinely needs a sequence of this length; anything below it
    -- is an over-long encoding.
    bytes :: Int -> Int -> Int -> [Word8] -> [Char]
    bytes 0 lowest acc rest
      | acc < lowest                      = error "incorrect UTF-8 encoding: non-minimal"
      | acc > fromEnum (maxBound :: Char) = error "incorrect UTF-8 encoding: code point out of range"
      | otherwise                         = toEnum acc : fromUTF8 rest
    bytes _ _ _ [] = error "incorrect UTF-8 encoding: missing bytes"
    bytes n lowest acc (t:rest)
      -- A continuation byte must have exactly the top bits 10.
      | t .&. 0xc0 == 0x80 =
          bytes (n-1) lowest ((acc `shiftL` 6) .|. fromEnum (t .&. 0x3f)) rest
      | otherwise = error "incorrect UTF-8 encoding: 8-bit not set in trailer"

-- | Encode a list of characters as UTF-8 bytes.
--
-- Each character is converted to its code point with 'fromEnum' and then
-- emitted as a single byte (below @0x80@) or as a header byte followed by
-- one or more continuation bytes carrying six payload bits each.
toUTF8 :: [Char] -> [Word8]
toUTF8 = foldr (encode . fromEnum) []
  where
    -- Encode one code point in front of the already-encoded tail.
    encode :: Int -> [Word8] -> [Word8]
    encode cp tl
      | cp < 0x80       = toEnum cp : tl
      | cp < 0x800      = multi 1 0xc0 cp tl
      | cp < 0x10000    = multi 2 0xe0 cp tl
      | cp < 0x200000   = multi 3 0xf0 cp tl
      | cp < 0x4000000  = multi 4 0xf8 cp tl
      | cp < 0x7fffffff = multi 5 0xfc cp tl
      | otherwise       = error "toUTF8: character outside permissible range"

    -- multi k hdr bits out: emit k continuation bytes, working from the
    -- least significant six bits upwards and consing each in front of out,
    -- then finish with the header byte holding the remaining high bits.
    multi :: Int -> Word8 -> Int -> [Word8] -> [Word8]
    multi 0 hdr bits out = add hdr (toEnum bits) : out
    multi k hdr bits out =
      let trailer = toEnum ((bits `mask` 0x3f) + 0x80)
      in  multi (k-1) hdr (bits `shiftR` 6) (trailer : out)