CWString

Mon Aug 25 20:04:28 EDT 2003

Attached is a properly internationalized implementation of
Foreign.C.String, along with some other routines which I feel would be
very at home in the FFI standard.

Note that I am trying to solve a simpler problem than full generic i18n.
I just want the ability to work within the current locale, whatever it
might be. I have tested these routines in utf8, latin1, greek, korean
and a few other locales. they seem to work well.

in addition to properly localeizing withCString, peek/pokeCString and
friends I feel it is important to provide routines to work on wchar_t *
strings. there are a number of reasons:

 * if __STDC_ISO_10646__ is defined (which is almost always),
   conversions can be incredibly optimized, in particular an array 
   of Chars can be implemented directly as an array of wchar_t's
 * many c libraries nativly take wchar_t *s, converting to and from a 
   multibyte encoding would be wasteful
 * wchar_t and the assosiated charset routines has been part of C since C90.1 
   and are quite stable.
 * and mainly, is is LIKELY that the current multibyte encoding is lossy
   compared to wchar_t. forcing people to go through char *s will break
   encodings for no good reason.
 * multiple locales are very common nowadays, not just for
   internationalization, but to differentiate between a utf8 based unix
   and a latin1 one.

the routines I propose adding are:

    withCWString, withCWStringLen, newCWString, newCWStringLen,
    peekCWString, peekCWStringLen, -- same as CString counterparts
    charIsRepresentable :: Char -> IO Bool -- returns true if the
      argument can be represented in the current locale

and the types CWChar, CWString, CWStringLen with CWChar == wchar_t and
the others defined analogously to the CString versions.

also, to a lesser extent I propose we add explicit utf8 routines:

    withUTF8String, withUTF8StringLen, newUTF8String,
    newUTF8StringLen, peekUTF8String, peekUTF8StringLen

there are several libraries (X11 being a major one) which export an
explicit utf8 based interface, it would be nice to be able to call their
routines directly without worrying about the current locale. these might
be more at home in a seperate library and not the FFI spec since they
can be implemented independently, but can probably benefit from compiler
dependent optimization.

my implementation is pretty hairy, but can be improved. it currently
only works on systems where __STDC_ISO_10646__ is defined, but
fortunatly, I have never come across a system where it was not defined
and it implemented wchar_t at all.
the localized versions of the CString routines are named with LCString,
which stands for localized C string...

-------------- next part --------------
{-# OPTIONS -fglasgow-exts -ffi  -#include <wchar.h> #-}

module CWString (
    -- utf8 versions
    withUTF8String,
    withUTF8StringLen,
    newUTF8String,
    newUTF8StringLen,
    peekUTF8String,
    peekUTF8StringLen,
    -- wchar stuff
#if defined(__STDC_ISO_10646__)
    withCWString,
    withCWStringLen,
    newCWString,
    newCWStringLen,
    peekCWString,
    peekCWStringLen,
#endif
    wcharIsUnicode,
    CWChar, 
    CWString, 
    CWStringLen,
    -- locale versions 
    withLCString,
    withLCStringLen,
    newLCString,
    newLCStringLen,
    peekLCStringLen,
    peekLCString,
    charIsRepresentable

    ) where

import Data.Bits
import Foreign.C.String
import Foreign.C.Types
import Char
import Foreign
import Monad
import qualified CForeign
import GHC.Exts
import IO

#ifndef CONFIG_INCLUDED
#define CONFIG_INCLUDED
#include <config.h>
#endif
#include <wchar.h>
#include <limits.h>

type CWChar = (#type wchar_t)
type CWString = Ptr CWChar
type CWStringLen = (CWString, Int)

fi x = fromIntegral x

-------------------
-- CWChar functions
-------------------

{-# INLINE wcharIsUnicode #-}
wcharIsUnicode :: Bool

#if defined(__STDC_ISO_10646__)

wcharIsUnicode = True

-- support functions 
wNUL :: CWChar
wNUL = 0
#ifndef __GLASGOW_HASKELL__
pairLength :: String -> CString -> CStringLen
pairLength  = flip (,) . length

cwCharsToChars :: [CWChar] -> [Char]
cwCharsToChars xs  = map castCWCharToChar xs
charsToCWChars :: [Char] -> [CWChar]
charsToCWChars xs  = map castCharToCWChar xs

#endif

castCWCharToChar :: CWChar -> Char
castCWCharToChar ch = chr (fromIntegral ch )

castCharToCWChar :: Char -> CWChar
castCharToCWChar ch = fromIntegral (ord ch)

-- exported functions
peekCWString    :: CWString -> IO String
#ifndef __GLASGOW_HASKELL__
peekCString cp  = do cs <- peekArray0 wNUL cp; return (cwCharsToChars cs)
#else
peekCWString cp = loop 0
  where
    loop i = do
        val <- peekElemOff cp i
        if val == wNUL then return [] else do
            rest <- loop (i+1)
            return (castCWCharToChar val : rest)
#endif

peekCWStringLen           :: CWStringLen -> IO String
#ifndef __GLASGOW_HASKELL__
peekCWStringLen (cp, len)  = do cs <- peekArray len cp; return (cwCharsToChars cs)
#else
peekCWStringLen (cp, len) = loop 0
  where
    loop i | i == len  = return []
	   | otherwise = do
        	val <- peekElemOff cp i
        	rest <- loop (i+1)
        	return (castCWCharToChar val : rest)
#endif

newCWString :: String -> IO CWString
#ifndef __GLASGOW_HASKELL__
newCWString  = newArray0 wNUL . charsToCWChars
#else
newCWString str = do
  ptr <- mallocArray0 (length str)
  let
	go [] n##   = pokeElemOff ptr (I## n##) wNUL
    	go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
  go str 0##
  return ptr
#endif

newCWStringLen     :: String -> IO CWStringLen
#ifndef __GLASGOW_HASKELL__
newCWStringLen str  = do a <- newArray (charsToCWChars str)
			return (pairLength str a)
#else
newCWStringLen str = do
  ptr <- mallocArray0 len
  let
	go [] n##     = return ()
    	go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
  go str 0##
  return (ptr, len)
  where
    len = length str
#endif

withCWString :: String -> (CWString -> IO a) -> IO a
#ifndef __GLASGOW_HASKELL__
withCWString  = withArray0 wNUL . charsToCWChars
#else
withCWString str f =
  allocaArray0 (length str) $ \ptr ->
      let
	go [] n##     = pokeElemOff ptr (I## n##) wNUL
    	go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
      in do
      go str 0##
      f ptr
#endif

withCWStringLen         :: String -> (CWStringLen -> IO a) -> IO a
#ifndef __GLASGOW_HASKELL__
withCWStringLen str act  = withArray (charsToCWChars str) $ act . pairLength str
#else
withCWStringLen str f =
  allocaArray len $ \ptr ->
      let
	go [] n##     = return ()
    	go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
      in do
      go str 0##
      f (ptr,len)
  where
    len = length str
#endif

#else
wcharIsUnicode = False
#endif

#if defined(__STDC_ISO_10646__)

newtype MBState = MBState { _mbstate :: (Ptr MBState)}

withMBState :: (MBState -> IO a) -> IO a
withMBState act = allocaBytes (#const sizeof(mbstate_t)) (\mb -> c_memset mb 0 (#const sizeof(mbstate_t)) >> act (MBState mb)) 

clearMBState :: MBState -> IO ()
clearMBState (MBState mb) = c_memset mb 0 (#const sizeof(mbstate_t)) >> return ()

wcsrtombs :: CWString -> (CString, CSize) -> IO CSize
wcsrtombs wcs (cs,len) = alloca (\p -> poke p wcs >> withMBState (\mb -> wcsrtombs' p cs len mb)) where
    wcsrtombs'  p cs len mb = c_wcsrtombs cs p len mb >>= \x -> case x of 
        -1 -> do
            sp <- peek p 
            poke sp ((fi (ord '?'))::CWChar)
            poke p wcs
            clearMBState mb
            wcsrtombs' p cs len mb
        _ -> return x

#def inline HsInt hs_get_mb_cur_max () { return MB_CUR_MAX; }
foreign import ccall unsafe hs_get_mb_cur_max :: IO Int 

mb_cur_max :: Int
mb_cur_max = unsafePerformIO hs_get_mb_cur_max 

charIsRepresentable :: Char -> IO Bool
charIsRepresentable ch = fmap (/= -1) $ allocaBytes mb_cur_max (\cs -> c_wctomb cs (fi $ ord ch)) 

foreign import ccall unsafe "stdlib.h wctomb" c_wctomb :: CString -> CWChar -> IO CInt
foreign import ccall unsafe "stdlib.h wcsrtombs" c_wcsrtombs :: CString -> (Ptr (Ptr CWChar)) -> CSize -> MBState -> IO CSize
foreign import ccall unsafe "string.h memset" c_memset :: Ptr a -> CInt -> CSize -> IO (Ptr a)

foreign import ccall unsafe "stdlib.h mbstowcs" c_mbstowcs :: CWString -> CString -> CSize -> IO CSize

mbstowcs a b s = throwIf (== -1) (const "mbstowcs") $ c_mbstowcs a b s 

peekLCString    :: CString -> IO String
peekLCString cp  = do 
    sz <- mbstowcs nullPtr cp 0
    allocaArray (fi $ sz + 1) (\wcp -> mbstowcs wcp cp (sz + 1) >> peekCWString wcp)

-- TODO fix for embeded NULs
peekLCStringLen           :: CStringLen -> IO String
peekLCStringLen (cp, len)  =  allocaBytes (len + 1) $ \ncp -> do
    copyBytes ncp cp len
    pokeElemOff ncp len 0
    peekLCString ncp

newLCString :: String -> IO CString
newLCString s = withCWString s $ \wcs -> do mallocArray0 alen >>= \cs -> wcsrtombs wcs (cs, fi alen) >> return cs where
    alen = mb_cur_max * length s

newLCStringLen     :: String -> IO CStringLen
newLCStringLen str  = newLCString str >>= \cs -> return (pairLength1 str cs)

withLCString :: String -> (CString -> IO a) -> IO a
withLCString s a = withCWString s $ \wcs -> allocaArray0 alen (\cs -> wcsrtombs wcs (cs,fi alen) >> a cs) where
    alen = mb_cur_max * length s

withLCStringLen         :: String -> (CStringLen -> IO a) -> IO a
withLCStringLen s a = withCWString s $ \wcs -> allocaArray0 alen (\cs -> wcsrtombs wcs (cs,fi alen) >>= \sz -> a (cs,fi sz)) where
    alen = mb_cur_max * length s

pairLength1 :: String -> CString -> CStringLen
pairLength1  = flip (,) . length

#else

charIsRepresentable :: Char -> IO Bool
charIsRepresentable ch = return $ isLatin1 ch

withLCString = withCString
withLCStringLen = withCStringLen
newLCString = newCString
newLCStringLen = newCStringLen
peekLCString = peekCString
peekLCStringLen = peekCStringLen

#endif

-----------------
-- UTF8 versions
-----------------

withUTF8String :: String -> (CString -> IO a) -> IO a
withUTF8String hsStr = CForeign.withCString (toUTF hsStr)

withUTF8StringLen :: String -> (CStringLen -> IO a) -> IO a
withUTF8StringLen hsStr = CForeign.withCStringLen (toUTF hsStr)

newUTF8String :: String -> IO CString
newUTF8String = CForeign.newCString . toUTF

newUTF8StringLen :: String -> IO CStringLen
newUTF8StringLen = CForeign.newCStringLen . toUTF

peekUTF8String :: CString -> IO String
peekUTF8String strPtr = fmap fromUTF $ CForeign.peekCString strPtr

peekUTF8StringLen :: CStringLen -> IO String
peekUTF8StringLen strPtr = fmap fromUTF $ CForeign.peekCStringLen strPtr

-- these should read and write directly from/to memory.
-- A first pass will be needed to determine the size of the allocated region

toUTF :: String -> String
toUTF [] = []
toUTF (x:xs) | ord x<=0x007F = x:toUTF xs
	     | ord x<=0x07FF = chr (0xC0 .|. ((ord x `shift` (-6)) .&. 0x1F)):
			       chr (0x80 .|. (ord x .&. 0x3F)):
			       toUTF xs
	     | otherwise     = chr (0xE0 .|. ((ord x `shift` (-12)) .&. 0x0F)):
			       chr (0x80 .|. ((ord x `shift` (-6)) .&. 0x3F)):
			       chr (0x80 .|. (ord x .&. 0x3F)):
			       toUTF xs

fromUTF :: String -> String
fromUTF [] = []
fromUTF (all@(x:xs)) | ord x<=0x7F = x:fromUTF xs
		     | ord x<=0xBF = err
		     | ord x<=0xDF = twoBytes all
		     | ord x<=0xEF = threeBytes all
		     | otherwise   = err
  where
    twoBytes (x1:x2:xs) = chr (((ord x1 .&. 0x1F) `shift` 6) .|.
			       (ord x2 .&. 0x3F)):fromUTF xs
    twoBytes _ = error "fromUTF: illegal two byte sequence"

    threeBytes (x1:x2:x3:xs) = chr (((ord x1 .&. 0x0F) `shift` 12) .|.
				    ((ord x2 .&. 0x3F) `shift` 6) .|.
				    (ord x3 .&. 0x3F)):fromUTF xs
    threeBytes _ = error "fromUTF: illegal three byte sequence" 

    err = error "fromUTF: illegal UTF-8 character"