Reading/Writing Binary Data in Haskell
Axel Simon
A.Simon@kent.ac.uk
Sun, 13 Jul 2003 10:33:24 +0100
On Thu, Jul 10, 2003 at 09:28:38AM +0200, Martin Sjgren wrote:
> tor 2003-07-10 klockan 04.56 skrev Glynn Clements:
> > OTOH, existing implementations (at least GHC and Hugs) currently read
> > and write "8-bit binary", i.e. characters 0-255 get read and written
> > "as-is" and anything else breaks, and changing that would probably
> > break a fair amount of existing code.
>
> What I would like to see, is a package for converting between different
> encodings and character sets. Python has two types for strings, 'str'
> (which is just a sequence of octets) and 'unicode'.
Char in Haskell can represent Unicode and bytes are Word8. What is a bad
legacy is that functions like withCString deal with bytes not with Unicode
(or rather UTF8). I use the following as a workaround until there is some
result from the internationalization effort for Haskell
(http://sourceforge.net/projects/haskell-i18n/).
Cheers,
Axel.
{-# OPTIONS -cpp #-}
-- * This module adds CString-like functions that handle UTF8 strings.
-- Furthermore it serves as an impedance matcher for different compiler
-- versions.
module FFI(
with,
nullForeignPtr,
foreignFree,
withUTFString,
withUTFStringLen,
newUTFString,
newUTFStringLen,
peekUTFString,
peekUTFStringLen,
module Foreign,
#if __GLASGOW_HASKELL__>=504
module Foreign.C
#else
module CForeign
#endif
) where
import Monad (liftM)
import Char
import LocalData(unsafePerformIO)
#if __GLASGOW_HASKELL__>=504
import Data.Bits
import Foreign.C
import qualified Foreign
import Foreign hiding (with)
#else
import Bits
import CForeign
import qualified Foreign
import Foreign hiding (withObject)
#endif
#if __GLASGOW_HASKELL__>=504
with :: (Storable a) => a -> (Ptr a -> IO b) -> IO b
with = Foreign.with
#else
with :: (Storable a) => a -> (Ptr a -> IO b) -> IO b
with = Foreign.withObject
#endif
#if __GLASGOW_HASKELL__>=600
foreign import ccall unsafe "&free"
free' :: FinalizerPtr a
foreignFree :: Ptr a -> FinalizerPtr a
foreignFree _ = free'
nullForeignPtr :: ForeignPtr a
nullForeignPtr = unsafePerformIO $ newForeignPtr nullPtr free'
#else
nullForeignPtr :: ForeignPtr a
nullForeignPtr = unsafePerformIO $ newForeignPtr nullPtr (return ())
foreignFree :: Ptr a -> IO ()
foreignFree = free
#endif
-- Define withUTFString to emit UTF-8.
--
withUTFString :: String -> (CString -> IO a) -> IO a
withUTFString hsStr = withCString (toUTF hsStr)
-- Define withUTFStringLen to emit UTF-8.
--
withUTFStringLen :: String -> (CStringLen -> IO a) -> IO a
withUTFStringLen hsStr = withCStringLen (toUTF hsStr)
-- Define newUTFString to emit UTF-8.
--
newUTFString :: String -> IO CString
newUTFString = newCString . toUTF
-- Define newUTFStringLen to emit UTF-8.
--
newUTFStringLen :: String -> IO CStringLen
newUTFStringLen = newCStringLen . toUTF
-- Define peekUTFString to retrieve UTF-8.
--
peekUTFString :: CString -> IO String
peekUTFString strPtr = liftM fromUTF $ peekCString strPtr
-- Define peekUTFStringLen to retrieve UTF-8.
--
peekUTFStringLen :: CStringLen -> IO String
peekUTFStringLen strPtr = liftM fromUTF $ peekCStringLen strPtr
-- Convert Unicode characters to UTF-8.
--
toUTF :: String -> String
toUTF [] = []
toUTF (x:xs) | ord x<=0x007F = x:toUTF xs
| ord x<=0x07FF = chr (0xC0 .|. ((ord x `shift` (-6)) .&.
0x1F)):
chr (0x80 .|. (ord x .&. 0x3F)):
toUTF xs
| otherwise = chr (0xE0 .|. ((ord x `shift` (-12)) .&.
0x0F)):
chr (0x80 .|. ((ord x `shift` (-6)) .&.
0x3F)):
chr (0x80 .|. (ord x .&. 0x3F)):
toUTF xs
-- Convert UTF-8 to Unicode.
--
fromUTF :: String -> String
fromUTF [] = []
fromUTF (all@(x:xs)) | ord x<=0x7F = x:fromUTF xs
| ord x<=0xBF = err
| ord x<=0xDF = twoBytes all
| ord x<=0xEF = threeBytes all
| otherwise = err
where
twoBytes (x1:x2:xs) = chr (((ord x1 .&. 0x1F) `shift` 6) .|.
(ord x2 .&. 0x3F)):fromUTF xs
twoBytes _ = error "fromUTF: illegal two byte sequence"
threeBytes (x1:x2:x3:xs) = chr (((ord x1 .&. 0x0F) `shift` 12) .|.
((ord x2 .&. 0x3F) `shift` 6) .|.
(ord x3 .&. 0x3F)):fromUTF xs
threeBytes _ = error "fromUTF: illegal three byte sequence"
err = error "fromUTF: illegal UTF-8 character"