[Haskell-cafe] regex-pcre and ghc-7.4.2 is not working with UTF-8

Thu Aug 23 22:39:47 CEST 2012

On Thu, Aug 23, 2012 at 08:59:52AM -0300, José Romildo Malaquias wrote:
> Hello.
> 
> I think I have an explanation for the problem with regex-pcre, ghc-7.4.2
> and UTF Strings.
> 
> The Text.Regex.PCRE.String module uses the withCString and
> withCStringLen from the module Foreign.C.String to pass a Haskell string
> to the C library pcre functions that compile regular expressions, and
> execute regular expressions to match some text.
> 
> Recent versions of ghc have withCString and withCStringLen definitions
> that uses the current system locale to define the marshalling of a
> Haskell string into a NUL terminated C string using temporary storage.
> 
> With a UTF-8 locale the length of the C string will be greater than the
> length of the corresponding Haskell string in the presence with
> characters outside of the ASCII range. Therefore positions of
> corresponding characters in both strings do not match.
> 
> In order to compute matching positions, regex-pcre functions use C
> strings. But to compute matching strings they use those positions with
> Haskell strings.
> 
> That gives the mismatch shown earlier and repeated here with the
> attached program run on a system with a UTF-8 locale:
> 
> 
>    $ LANG=en_US.UTF-8 && ./test1
>    getForeignEncoding: UTF-8
> 
>    regex            : país:(.*):(.*)
>    text             : país:Brasília:Brasil
>    String matchOnce : Just (array (0,2) [(0,(0,22)),(1,(6,9)),(2,(16,6))])
>    String match     : [["pa\237s:Bras\237lia:Brasil","ras\237lia:B","asil"]]
> 
>    $ LANG=en_US.ISO-8859-1 && ./test1
>    getForeignEncoding: ISO-8859-1
> 
>    regex            : pa�s:(.*):(.*)
>    text             : pa�s:Bras�lia:Brasil
>    String matchOnce : Just (array (0,2) [(0,(0,20)),(1,(5,8)),(2,(14,6))])
>    String match     : [["pa\237s:Bras\237lia:Brasil","Bras\237lia","Brasil"]]
> 
> 
> I see two ways of fixing this bug:
> 
> 1. make the matching functions compute the text using the C string and
>    the positions calculated by the C function, and convert the text back
>    to a Haskell string.
> 
> 2. map the positions in the C string (if possible) to the corresponding
>    positions in the Haskell string; this way the current definitions of
>    the matching functions returning text will just work.
> 
> I hope this would help fixing the issue.


I have a fix for this bug and it would be nice if others take a look at
it and see if it is ok. It is based on the second way presented above.

Romildo
-------------- next part --------------
diff -ur regex-pcre-0.94.4.orig/Text/Regex/PCRE/String.hs regex-pcre-0.94.4/Text/Regex/PCRE/String.hs

--- regex-pcre-0.94.4.orig/Text/Regex/PCRE/String.hs	2012-05-30 18:44:14.000000000 -0300
+++ regex-pcre-0.94.4/Text/Regex/PCRE/String.hs	2012-08-23 17:22:14.114641657 -0300
@@ -46,11 +46,16 @@
   ) where
 
 import Text.Regex.PCRE.Wrap -- all
-import Foreign.C.String(withCStringLen,withCString)
-import Data.Array(Array,listArray)
+import Foreign.C.String(CStringLen,withCStringLen,withCString)
+import Foreign.Storable(peekByteOff)
+import Data.Word(Word8)
+import Data.Array.IO(IOUArray,newArray,readArray,writeArray)
+import Data.Array(Array,listArray,bounds,elems)
 import System.IO.Unsafe(unsafePerformIO)
-import Text.Regex.Base.RegexLike(RegexMaker(..),RegexLike(..),RegexContext(..),MatchLength,MatchOffset)
+import Text.Regex.Base.RegexLike(RegexMaker(..),RegexLike(..),RegexContext(..),MatchLength,MatchOffset,MatchArray)
 import Text.Regex.Base.Impl(polymatch,polymatchM)
+import GHC.IO.Encoding(getForeignEncoding,textEncodingName)
+import Control.Monad(forM)
 
 instance RegexContext Regex String String where
   match = polymatch
@@ -72,7 +77,7 @@
   matchOnce regex str = unsafePerformIO $
     execute regex str >>= unwrap
   matchAll regex str = unsafePerformIO $ 
-    withCStringLen str (wrapMatchAll regex) >>= unwrap
+    withCStringLen str (wrapMatchAllFixPos regex) >>= unwrap
   matchCount regex str = unsafePerformIO $ 
     withCStringLen str (wrapCount regex) >>= unwrap
 
@@ -91,7 +96,7 @@
                 -- string, or:
                 --   'Just' an array of (offset,length) pairs where index 0 is whole match, and the rest are the captured subexpressions.
 execute regex str = do
-  maybeStartEnd <- withCStringLen str (wrapMatch 0 regex)
+  maybeStartEnd <- withCStringLen str (wrapMatchFixPos 0 regex)
   case maybeStartEnd of
     Right Nothing -> return (Right Nothing)
 --  Right (Just []) -> fail "got [] back!" -- should never happen
@@ -115,9 +120,94 @@
         ,getSub matchedStartStop
         ,drop stop str
         ,map getSub subStartStop)
-  maybeStartEnd <- withCStringLen str (wrapMatch 0 regex)
+  maybeStartEnd <- withCStringLen str (wrapMatchFixPos 0 regex)
   case maybeStartEnd of
     Right Nothing -> return (Right Nothing)
 --  Right (Just []) -> fail "got [] back!" -- should never happen
     Right (Just parts) -> return . Right . Just . matchedParts $ parts
     Left err -> return (Left err)
+
+
+
+-- | wrapMatchFixPos calls wrapMatch and fixes the string offsets
+-- in the result so that they are valid in the original Haskell string
+-- 
+-- 
+wrapMatchFixPos :: StartOffset
+                -> Regex
+                -> CStringLen
+                -> IO (Either WrapError (Maybe [(StartOffset,EndOffset)]))
+wrapMatchFixPos startOffset regex cstr_len = do
+  maybeStartEnd <- wrapMatch startOffset regex cstr_len
+  case maybeStartEnd of
+    Right (Just parts) -> do maybeMapPos <- decode_positions cstr_len
+                             case maybeMapPos of
+                               Just mapPos -> fmap (Right . Just) $ forM parts $ \(s,e)-> do
+                                                s' <- readArray mapPos s
+                                                e' <- readArray mapPos (pred e)
+                                                return (s',succ e')
+                               Nothing -> return maybeStartEnd
+    _ -> return maybeStartEnd
+
+-- | wrapMatchAllFixPos calls wrapMatchAll and fixes the string offsets
+-- in the result so that they are valid in the original Haskell string
+-- 
+-- 
+wrapMatchAllFixPos :: Regex
+                   -> CStringLen
+                   -> IO (Either WrapError [ MatchArray ])
+wrapMatchAllFixPos regex cstr_len = do
+  putStrLn "wrapMatchAllFixPos"
+  maybeStartLen <- wrapMatchAll regex cstr_len
+  case maybeStartLen of
+    Right parts -> do maybeMapPos <- decode_positions cstr_len
+                      case maybeMapPos of
+                        Just mapPos -> fmap Right $ forM parts $ \arr ->
+                                         fmap (listArray (bounds arr)) $ forM (elems arr) $ \(s,n) -> do
+                                                                           s' <- readArray mapPos s
+                                                                           return (s',n)
+                        Nothing -> return maybeStartLen
+    _ -> return maybeStartLen
+
+
+-- | utf8_range determines how many bytes are needed to represent a UTF-8
+-- character given its first byte
+-- 
+-- 
+utf8_range :: Word8 -> Int
+utf8_range c | c <= 0x7f = 1
+             | c <= 0xdf = 2
+             | c <= 0xef = 3
+             | otherwise = 4
+
+
+-- | utf8_decode_pos_array constructs an array that maps positions in an
+-- UTF-8 encoded C string to the related positions in the corresponding
+-- unencoded Haskell string.
+-- 
+-- 
+utf8_decode_pos_array :: CStringLen -> IO (IOUArray Int Int)
+utf8_decode_pos_array (cstr,len) =
+  do arr <- newArray (0,pred len) 0 :: IO (IOUArray Int Int)
+
+     let loop x i n
+           | i == len  = return arr
+           | n == 0    = do c <- peekByteOff cstr i :: IO Word8
+                            loop (succ x) i (utf8_range c)
+           | otherwise = do writeArray arr i x
+                            loop x (succ i) (pred n)
+
+     loop (-1) 0 0
+
+-- | decode_positions constructs an array that maps positions in an
+-- encoded C string to the related positions in the corresponding unencoded
+-- Haskell string. Currently it works only for UTF-8 strings.
+-- 
+-- 
+decode_positions :: CStringLen -> IO (Maybe (IOUArray Int Int))
+decode_positions cstr_len = do
+  enc <- getForeignEncoding
+  case textEncodingName enc of
+    "UTF-8" -> fmap Just $ utf8_decode_pos_array cstr_len
+    _       -> return Nothing
+