UTF8 (was Re: Hexdump)
Malcolm Wallace
Malcolm.Wallace at cs.york.ac.uk
Tue Mar 21 10:07:53 EST 2006
Dominic Steinitz <dominic.steinitz at blueyonder.co.uk> wrote:
> I finally got around to putting a "hexdump" function into a module.
> Any suggestions as to where it should live in the hierarchy?
How about
Codec.Hexdump
Codec.Text.Hexdump
Text.Codec.Hexdump
I was also thinking it would be nice to have pure Haskell
implementations of the various Unicode encodings. Here is my attempt at
the UTF-8 codec. Like Dominic, I am wondering where such a module
should live in the hierarchy?
Text.Codec.UTF8
Codec.Text.UTF8
Text.UTF8
Codec.UTF8
Regards,
Malcolm
-------------- next part --------------
module UTF8
( fromUTF8
, toUTF8
) where
import Data.Word
import Data.Bits
mask = (.&.) -- for readability
add = (.|.) -- for readability
fromUTF8 :: [Word8] -> [Char]
fromUTF8 [] = []
fromUTF8 (w:ws)
| w < 0x80 {- 0xxxxxxx -} = toEnum (fromEnum w) : fromUTF8 ws
| w >= 0xc0 {- 1111110x -} = bytes 5 (fromEnum (w`mask`0x01)) ws
| w >= 0xe0 {- 111110xx -} = bytes 4 (fromEnum (w`mask`0x03)) ws
| w >= 0xf0 {- 11110xxx -} = bytes 3 (fromEnum (w`mask`0x07)) ws
| w >= 0xf8 {- 1110xxxx -} = bytes 2 (fromEnum (w`mask`0x0f)) ws
| w >= 0xfc {- 110xxxxx -} = bytes 1 (fromEnum (w`mask`0x1f)) ws
| otherwise = error "incorrect UTF-8 encoding: wrong 7th-bit in first byte"
where
bytes :: Int -> Int -> [Word8] -> [Char]
bytes 0 acc ws = toEnum acc : fromUTF8 ws
bytes n acc [] = error "incorrect UTF-8 encoding: missing bytes"
bytes n 0 ws = error "incorrect UTF-8 encoding: non-minimal"
bytes n acc (w:ws)
| w >= 0x80 = bytes (n-1) ((acc`shiftL`6) + fromEnum (w`mask`0x3f)) ws
| otherwise = error "incorrect UTF-8 encoding: 8-bit not set in trailer"
toUTF8 :: [Char] -> [Word8]
toUTF8 = utf8 . map fromEnum
where
utf8 :: [Int] -> [Word8]
utf8 [] = []
utf8 (c:cs)
| c < 0x80 = toEnum c : utf8 cs
| c < 0x800 = bytes 1 (add 0xc0) c $ utf8 cs
| c < 0x10000 = bytes 2 (add 0xe0) c $ utf8 cs
| c < 0x200000 = bytes 3 (add 0xf0) c $ utf8 cs
| c < 0x4000000 = bytes 4 (add 0xf8) c $ utf8 cs
| c < 0x7fffffff = bytes 5 (add 0xfc) c $ utf8 cs
| otherwise = error "toUTF8: character outside permissible range"
bytes :: Int -> (Word8->Word8) -> Int -> [Word8] -> [Word8]
bytes 0 header c rest = header (toEnum c): rest
bytes n header c rest = bytes (n-1) header (c`shiftR`6) $
(toEnum ((c`mask`0x3f) + 0x80)) : rest
More information about the Libraries
mailing list