Reading/Writing Binary Data in Haskell

Axel Simon A.Simon@kent.ac.uk
Sun, 13 Jul 2003 10:33:24 +0100


On Thu, Jul 10, 2003 at 09:28:38AM +0200, Martin Sjgren wrote:
> tor 2003-07-10 klockan 04.56 skrev Glynn Clements:
> > OTOH, existing implementations (at least GHC and Hugs) currently read
> > and write "8-bit binary", i.e. characters 0-255 get read and written
> > "as-is" and anything else breaks, and changing that would probably
> > break a fair amount of existing code.
> 
> What I would like to see, is a package for converting between different
> encodings and character sets. Python has two types for strings, 'str'
> (which is just a sequence of octets) and 'unicode'.

Char in Haskell can represent Unicode and bytes are Word8. What is a bad
legacy is that functions like withCString deal with bytes not with Unicode
(or rather UTF8). I use the following as a workaround until there is some
result from the internationalization effort for Haskell
(http://sourceforge.net/projects/haskell-i18n/).

Cheers,
Axel.


{-# OPTIONS -cpp #-}
-- * This module adds CString-like functions that handle UTF8 strings.
--   Furthermore it serves as an impedance matcher for different compiler
--   versions.
module FFI(
  with,
  nullForeignPtr,
  foreignFree,
  withUTFString,
  withUTFStringLen,
  newUTFString,
  newUTFStringLen,
  peekUTFString,
  peekUTFStringLen,
  module Foreign,
#if __GLASGOW_HASKELL__>=504
  module Foreign.C
#else
  module CForeign
#endif
  ) where

import Monad    (liftM)
import Char
import LocalData(unsafePerformIO)

#if __GLASGOW_HASKELL__>=504
import Data.Bits
import Foreign.C
import qualified Foreign
import Foreign   hiding (with)
#else
import Bits
import CForeign
import qualified Foreign
import Foreign   hiding (withObject)
#endif


#if __GLASGOW_HASKELL__>=504
with :: (Storable a) => a -> (Ptr a -> IO b) -> IO b
with = Foreign.with
#else
with :: (Storable a) => a -> (Ptr a -> IO b) -> IO b
with = Foreign.withObject
#endif

#if __GLASGOW_HASKELL__>=600
foreign import ccall unsafe "&free"
  free' :: FinalizerPtr a

foreignFree :: Ptr a -> FinalizerPtr a
foreignFree _ = free'

nullForeignPtr :: ForeignPtr a
nullForeignPtr = unsafePerformIO $ newForeignPtr nullPtr free'
#else
nullForeignPtr :: ForeignPtr a
nullForeignPtr = unsafePerformIO $ newForeignPtr nullPtr (return ())

foreignFree :: Ptr a -> IO ()
foreignFree = free
#endif


-- Define withUTFString to emit UTF-8.
--
withUTFString :: String -> (CString -> IO a) -> IO a
withUTFString hsStr = withCString (toUTF hsStr)

-- Define withUTFStringLen to emit UTF-8.
--
withUTFStringLen :: String -> (CStringLen -> IO a) -> IO a
withUTFStringLen hsStr = withCStringLen (toUTF hsStr)

-- Define newUTFString to emit UTF-8.
--
newUTFString :: String -> IO CString
newUTFString = newCString . toUTF

-- Define newUTFStringLen to emit UTF-8.
--
newUTFStringLen :: String -> IO CStringLen
newUTFStringLen = newCStringLen . toUTF

-- Define peekUTFString to retrieve UTF-8.
--
peekUTFString :: CString -> IO String
peekUTFString strPtr = liftM fromUTF $ peekCString strPtr

-- Define peekUTFStringLen to retrieve UTF-8.
--
peekUTFStringLen :: CStringLen -> IO String
peekUTFStringLen strPtr = liftM fromUTF $ peekCStringLen strPtr

-- Convert Unicode characters to UTF-8.
--
toUTF :: String -> String
toUTF [] = []
toUTF (x:xs) | ord x<=0x007F = x:toUTF xs
             | ord x<=0x07FF = chr (0xC0 .|. ((ord x `shift` (-6)) .&. 
0x1F)):
                               chr (0x80 .|. (ord x .&. 0x3F)):
                               toUTF xs
             | otherwise     = chr (0xE0 .|. ((ord x `shift` (-12)) .&. 
0x0F)):
                               chr (0x80 .|. ((ord x `shift` (-6)) .&. 
0x3F)):
                               chr (0x80 .|. (ord x .&. 0x3F)):
                               toUTF xs

-- Convert UTF-8 to Unicode.
--
fromUTF :: String -> String
fromUTF [] = []
fromUTF (all@(x:xs)) | ord x<=0x7F = x:fromUTF xs
                     | ord x<=0xBF = err
                     | ord x<=0xDF = twoBytes all
                     | ord x<=0xEF = threeBytes all
                     | otherwise   = err
  where
    twoBytes (x1:x2:xs) = chr (((ord x1 .&. 0x1F) `shift` 6) .|.
                               (ord x2 .&. 0x3F)):fromUTF xs
    twoBytes _ = error "fromUTF: illegal two byte sequence"

    threeBytes (x1:x2:x3:xs) = chr (((ord x1 .&. 0x0F) `shift` 12) .|.
                                    ((ord x2 .&. 0x3F) `shift` 6) .|.
                                    (ord x3 .&. 0x3F)):fromUTF xs
    threeBytes _ = error "fromUTF: illegal three byte sequence" 
    
    err = error "fromUTF: illegal UTF-8 character"