[commit: packages/text] master: fix utf8 error recovery (a125908)

git at git.haskell.org git at git.haskell.org
Tue Aug 8 15:23:15 UTC 2017


Repository : ssh://git@git.haskell.org/text

On branch  : master
Link       : http://git.haskell.org/packages/text.git/commitdiff/a125908830a974870e835298fd35807b7a529574

>---------------------------------------------------------------

commit a125908830a974870e835298fd35807b7a529574
Author: Kubo Kovac <kuko at fb.com>
Date:   Mon May 22 16:27:20 2017 +0100

    fix utf8 error recovery


>---------------------------------------------------------------

a125908830a974870e835298fd35807b7a529574
 cbits/cbits.c             |  9 +++------
 tests/Tests/Properties.hs | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/cbits/cbits.c b/cbits/cbits.c
index 9aec02a..029d7e8 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -222,12 +222,9 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
 {
   uint32_t codepoint;
   uint32_t state = UTF8_ACCEPT;
-  uint8_t const *ret = _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
-						&codepoint, &state);
-  /* Back up if we have an incomplete or invalid encoding */
-  if (state != UTF8_ACCEPT)
-    ret -= 1;
-  return ret;
+  _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
+                          &codepoint, &state);
+  return src;
 }
 
 void
diff --git a/tests/Tests/Properties.hs b/tests/Tests/Properties.hs
index 2193562..d490438 100644
--- a/tests/Tests/Properties.hs
+++ b/tests/Tests/Properties.hs
@@ -180,6 +180,19 @@ genInvalidUTF8 = B.pack <$> oneof [
       k <- choose (0,n)
       vectorOf k gen
 
+-- See http://unicode.org/faq/utf_bom.html#gen8
+-- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
+-- When faced with this illegal byte sequence ... a UTF-8 conformant process
+-- must treat the first byte 110xxxxx2 as an illegal termination error
+-- (e.g. filter it out or replace by 0xFFFD) ...
+-- ... and continue processing at the second byte 0xxxxxxx2
+t_decode_with_error2 =
+  E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) === "xa"
+t_decode_with_error3 =
+  E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xE0, 97, 97]) === "xaa"
+t_decode_with_error4 =
+  E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa"
+
 s_Eq s            = (s==)    `eq` ((S.streamList s==) . S.streamList)
     where _types = s :: String
 sf_Eq p s =
@@ -955,6 +968,11 @@ tests =
       testGroup "errors" [
         testProperty "t_utf8_err" t_utf8_err,
         testProperty "t_utf8_err'" t_utf8_err'
+      ],
+      testGroup "error recovery" [
+        testProperty "t_decode_with_error2" t_decode_with_error2,
+        testProperty "t_decode_with_error3" t_decode_with_error3,
+        testProperty "t_decode_with_error4" t_decode_with_error4
       ]
     ],
 



More information about the ghc-commits mailing list