CWString
John Meacham
john at repetae.net
Mon Aug 25 20:04:28 EDT 2003
Attached is a properly internationalized implementation of
Foreign.C.String, along with some other routines which I feel would be
very at home in the FFI standard.
Note that I am trying to solve a simpler problem than full generic i18n.
I just want the ability to work within the current locale, whatever it
might be. I have tested these routines in utf8, latin1, greek, korean
and a few other locales. they seem to work well.
in addition to properly localeizing withCString, peek/pokeCString and
friends I feel it is important to provide routines to work on wchar_t *
strings. there are a number of reasons:
* if __STDC_ISO_10646__ is defined (which is almost always),
conversions can be incredibly optimized, in particular an array
of Chars can be implemented directly as an array of wchar_t's
* many c libraries nativly take wchar_t *s, converting to and from a
multibyte encoding would be wasteful
* wchar_t and the assosiated charset routines has been part of C since C90.1
and are quite stable.
* and mainly, is is LIKELY that the current multibyte encoding is lossy
compared to wchar_t. forcing people to go through char *s will break
encodings for no good reason.
* multiple locales are very common nowadays, not just for
internationalization, but to differentiate between a utf8 based unix
and a latin1 one.
the routines I propose adding are:
withCWString, withCWStringLen, newCWString, newCWStringLen,
peekCWString, peekCWStringLen, -- same as CString counterparts
charIsRepresentable :: Char -> IO Bool -- returns true if the
argument can be represented in the current locale
and the types CWChar, CWString, CWStringLen with CWChar == wchar_t and
the others defined analogously to the CString versions.
also, to a lesser extent I propose we add explicit utf8 routines:
withUTF8String, withUTF8StringLen, newUTF8String,
newUTF8StringLen, peekUTF8String, peekUTF8StringLen
there are several libraries (X11 being a major one) which export an
explicit utf8 based interface, it would be nice to be able to call their
routines directly without worrying about the current locale. these might
be more at home in a seperate library and not the FFI spec since they
can be implemented independently, but can probably benefit from compiler
dependent optimization.
my implementation is pretty hairy, but can be improved. it currently
only works on systems where __STDC_ISO_10646__ is defined, but
fortunatly, I have never come across a system where it was not defined
and it implemented wchar_t at all.
the localized versions of the CString routines are named with LCString,
which stands for localized C string...
-------------- next part --------------
{-# OPTIONS -fglasgow-exts -ffi -#include <wchar.h> #-}
module CWString (
-- utf8 versions
withUTF8String,
withUTF8StringLen,
newUTF8String,
newUTF8StringLen,
peekUTF8String,
peekUTF8StringLen,
-- wchar stuff
#if defined(__STDC_ISO_10646__)
withCWString,
withCWStringLen,
newCWString,
newCWStringLen,
peekCWString,
peekCWStringLen,
#endif
wcharIsUnicode,
CWChar,
CWString,
CWStringLen,
-- locale versions
withLCString,
withLCStringLen,
newLCString,
newLCStringLen,
peekLCStringLen,
peekLCString,
charIsRepresentable
) where
import Data.Bits
import Foreign.C.String
import Foreign.C.Types
import Char
import Foreign
import Monad
import qualified CForeign
import GHC.Exts
import IO
#ifndef CONFIG_INCLUDED
#define CONFIG_INCLUDED
#include <config.h>
#endif
#include <wchar.h>
#include <limits.h>
type CWChar = (#type wchar_t)
type CWString = Ptr CWChar
type CWStringLen = (CWString, Int)
fi x = fromIntegral x
-------------------
-- CWChar functions
-------------------
{-# INLINE wcharIsUnicode #-}
wcharIsUnicode :: Bool
#if defined(__STDC_ISO_10646__)
wcharIsUnicode = True
-- support functions
wNUL :: CWChar
wNUL = 0
#ifndef __GLASGOW_HASKELL__
pairLength :: String -> CString -> CStringLen
pairLength = flip (,) . length
cwCharsToChars :: [CWChar] -> [Char]
cwCharsToChars xs = map castCWCharToChar xs
charsToCWChars :: [Char] -> [CWChar]
charsToCWChars xs = map castCharToCWChar xs
#endif
castCWCharToChar :: CWChar -> Char
castCWCharToChar ch = chr (fromIntegral ch )
castCharToCWChar :: Char -> CWChar
castCharToCWChar ch = fromIntegral (ord ch)
-- exported functions
peekCWString :: CWString -> IO String
#ifndef __GLASGOW_HASKELL__
peekCString cp = do cs <- peekArray0 wNUL cp; return (cwCharsToChars cs)
#else
peekCWString cp = loop 0
where
loop i = do
val <- peekElemOff cp i
if val == wNUL then return [] else do
rest <- loop (i+1)
return (castCWCharToChar val : rest)
#endif
peekCWStringLen :: CWStringLen -> IO String
#ifndef __GLASGOW_HASKELL__
peekCWStringLen (cp, len) = do cs <- peekArray len cp; return (cwCharsToChars cs)
#else
peekCWStringLen (cp, len) = loop 0
where
loop i | i == len = return []
| otherwise = do
val <- peekElemOff cp i
rest <- loop (i+1)
return (castCWCharToChar val : rest)
#endif
newCWString :: String -> IO CWString
#ifndef __GLASGOW_HASKELL__
newCWString = newArray0 wNUL . charsToCWChars
#else
newCWString str = do
ptr <- mallocArray0 (length str)
let
go [] n## = pokeElemOff ptr (I## n##) wNUL
go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
go str 0##
return ptr
#endif
newCWStringLen :: String -> IO CWStringLen
#ifndef __GLASGOW_HASKELL__
newCWStringLen str = do a <- newArray (charsToCWChars str)
return (pairLength str a)
#else
newCWStringLen str = do
ptr <- mallocArray0 len
let
go [] n## = return ()
go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
go str 0##
return (ptr, len)
where
len = length str
#endif
withCWString :: String -> (CWString -> IO a) -> IO a
#ifndef __GLASGOW_HASKELL__
withCWString = withArray0 wNUL . charsToCWChars
#else
withCWString str f =
allocaArray0 (length str) $ \ptr ->
let
go [] n## = pokeElemOff ptr (I## n##) wNUL
go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
in do
go str 0##
f ptr
#endif
withCWStringLen :: String -> (CWStringLen -> IO a) -> IO a
#ifndef __GLASGOW_HASKELL__
withCWStringLen str act = withArray (charsToCWChars str) $ act . pairLength str
#else
withCWStringLen str f =
allocaArray len $ \ptr ->
let
go [] n## = return ()
go (c:cs) n## = do pokeElemOff ptr (I## n##) (castCharToCWChar c); go cs (n## +## 1##)
in do
go str 0##
f (ptr,len)
where
len = length str
#endif
#else
wcharIsUnicode = False
#endif
#if defined(__STDC_ISO_10646__)
newtype MBState = MBState { _mbstate :: (Ptr MBState)}
withMBState :: (MBState -> IO a) -> IO a
withMBState act = allocaBytes (#const sizeof(mbstate_t)) (\mb -> c_memset mb 0 (#const sizeof(mbstate_t)) >> act (MBState mb))
clearMBState :: MBState -> IO ()
clearMBState (MBState mb) = c_memset mb 0 (#const sizeof(mbstate_t)) >> return ()
wcsrtombs :: CWString -> (CString, CSize) -> IO CSize
wcsrtombs wcs (cs,len) = alloca (\p -> poke p wcs >> withMBState (\mb -> wcsrtombs' p cs len mb)) where
wcsrtombs' p cs len mb = c_wcsrtombs cs p len mb >>= \x -> case x of
-1 -> do
sp <- peek p
poke sp ((fi (ord '?'))::CWChar)
poke p wcs
clearMBState mb
wcsrtombs' p cs len mb
_ -> return x
#def inline HsInt hs_get_mb_cur_max () { return MB_CUR_MAX; }
foreign import ccall unsafe hs_get_mb_cur_max :: IO Int
mb_cur_max :: Int
mb_cur_max = unsafePerformIO hs_get_mb_cur_max
charIsRepresentable :: Char -> IO Bool
charIsRepresentable ch = fmap (/= -1) $ allocaBytes mb_cur_max (\cs -> c_wctomb cs (fi $ ord ch))
foreign import ccall unsafe "stdlib.h wctomb" c_wctomb :: CString -> CWChar -> IO CInt
foreign import ccall unsafe "stdlib.h wcsrtombs" c_wcsrtombs :: CString -> (Ptr (Ptr CWChar)) -> CSize -> MBState -> IO CSize
foreign import ccall unsafe "string.h memset" c_memset :: Ptr a -> CInt -> CSize -> IO (Ptr a)
foreign import ccall unsafe "stdlib.h mbstowcs" c_mbstowcs :: CWString -> CString -> CSize -> IO CSize
mbstowcs a b s = throwIf (== -1) (const "mbstowcs") $ c_mbstowcs a b s
peekLCString :: CString -> IO String
peekLCString cp = do
sz <- mbstowcs nullPtr cp 0
allocaArray (fi $ sz + 1) (\wcp -> mbstowcs wcp cp (sz + 1) >> peekCWString wcp)
-- TODO fix for embeded NULs
peekLCStringLen :: CStringLen -> IO String
peekLCStringLen (cp, len) = allocaBytes (len + 1) $ \ncp -> do
copyBytes ncp cp len
pokeElemOff ncp len 0
peekLCString ncp
newLCString :: String -> IO CString
newLCString s = withCWString s $ \wcs -> do mallocArray0 alen >>= \cs -> wcsrtombs wcs (cs, fi alen) >> return cs where
alen = mb_cur_max * length s
newLCStringLen :: String -> IO CStringLen
newLCStringLen str = newLCString str >>= \cs -> return (pairLength1 str cs)
withLCString :: String -> (CString -> IO a) -> IO a
withLCString s a = withCWString s $ \wcs -> allocaArray0 alen (\cs -> wcsrtombs wcs (cs,fi alen) >> a cs) where
alen = mb_cur_max * length s
withLCStringLen :: String -> (CStringLen -> IO a) -> IO a
withLCStringLen s a = withCWString s $ \wcs -> allocaArray0 alen (\cs -> wcsrtombs wcs (cs,fi alen) >>= \sz -> a (cs,fi sz)) where
alen = mb_cur_max * length s
pairLength1 :: String -> CString -> CStringLen
pairLength1 = flip (,) . length
#else
charIsRepresentable :: Char -> IO Bool
charIsRepresentable ch = return $ isLatin1 ch
withLCString = withCString
withLCStringLen = withCStringLen
newLCString = newCString
newLCStringLen = newCStringLen
peekLCString = peekCString
peekLCStringLen = peekCStringLen
#endif
-----------------
-- UTF8 versions
-----------------
withUTF8String :: String -> (CString -> IO a) -> IO a
withUTF8String hsStr = CForeign.withCString (toUTF hsStr)
withUTF8StringLen :: String -> (CStringLen -> IO a) -> IO a
withUTF8StringLen hsStr = CForeign.withCStringLen (toUTF hsStr)
newUTF8String :: String -> IO CString
newUTF8String = CForeign.newCString . toUTF
newUTF8StringLen :: String -> IO CStringLen
newUTF8StringLen = CForeign.newCStringLen . toUTF
peekUTF8String :: CString -> IO String
peekUTF8String strPtr = fmap fromUTF $ CForeign.peekCString strPtr
peekUTF8StringLen :: CStringLen -> IO String
peekUTF8StringLen strPtr = fmap fromUTF $ CForeign.peekCStringLen strPtr
-- these should read and write directly from/to memory.
-- A first pass will be needed to determine the size of the allocated region
toUTF :: String -> String
toUTF [] = []
toUTF (x:xs) | ord x<=0x007F = x:toUTF xs
| ord x<=0x07FF = chr (0xC0 .|. ((ord x `shift` (-6)) .&. 0x1F)):
chr (0x80 .|. (ord x .&. 0x3F)):
toUTF xs
| otherwise = chr (0xE0 .|. ((ord x `shift` (-12)) .&. 0x0F)):
chr (0x80 .|. ((ord x `shift` (-6)) .&. 0x3F)):
chr (0x80 .|. (ord x .&. 0x3F)):
toUTF xs
fromUTF :: String -> String
fromUTF [] = []
fromUTF (all@(x:xs)) | ord x<=0x7F = x:fromUTF xs
| ord x<=0xBF = err
| ord x<=0xDF = twoBytes all
| ord x<=0xEF = threeBytes all
| otherwise = err
where
twoBytes (x1:x2:xs) = chr (((ord x1 .&. 0x1F) `shift` 6) .|.
(ord x2 .&. 0x3F)):fromUTF xs
twoBytes _ = error "fromUTF: illegal two byte sequence"
threeBytes (x1:x2:x3:xs) = chr (((ord x1 .&. 0x0F) `shift` 12) .|.
((ord x2 .&. 0x3F) `shift` 6) .|.
(ord x3 .&. 0x3F)):fromUTF xs
threeBytes _ = error "fromUTF: illegal three byte sequence"
err = error "fromUTF: illegal UTF-8 character"
More information about the FFI
mailing list