[commit: packages/text] master: fix utf8 error recovery (a125908)
git at git.haskell.org
git at git.haskell.org
Tue Aug 8 15:23:15 UTC 2017
Repository : ssh://git@git.haskell.org/text
On branch : master
Link : http://git.haskell.org/packages/text.git/commitdiff/a125908830a974870e835298fd35807b7a529574
>---------------------------------------------------------------
commit a125908830a974870e835298fd35807b7a529574
Author: Kubo Kovac <kuko at fb.com>
Date: Mon May 22 16:27:20 2017 +0100
fix utf8 error recovery
>---------------------------------------------------------------
a125908830a974870e835298fd35807b7a529574
cbits/cbits.c | 9 +++------
tests/Tests/Properties.hs | 18 ++++++++++++++++++
2 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/cbits/cbits.c b/cbits/cbits.c
index 9aec02a..029d7e8 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -222,12 +222,9 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
{
uint32_t codepoint;
uint32_t state = UTF8_ACCEPT;
- uint8_t const *ret = _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
- &codepoint, &state);
- /* Back up if we have an incomplete or invalid encoding */
- if (state != UTF8_ACCEPT)
- ret -= 1;
- return ret;
+ _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
+ &codepoint, &state);
+ return src;
}
void
diff --git a/tests/Tests/Properties.hs b/tests/Tests/Properties.hs
index 2193562..d490438 100644
--- a/tests/Tests/Properties.hs
+++ b/tests/Tests/Properties.hs
@@ -180,6 +180,19 @@ genInvalidUTF8 = B.pack <$> oneof [
k <- choose (0,n)
vectorOf k gen
+-- See http://unicode.org/faq/utf_bom.html#gen8
+-- A sequence such as <110xxxxx 0xxxxxxx> (bytes written in binary) is illegal ...
+-- When faced with this illegal byte sequence ... a UTF-8 conformant process
+-- must treat the first byte 110xxxxx as an illegal termination error
+-- (e.g. filter it out or replace it with U+FFFD) ...
+-- ... and continue processing at the second byte 0xxxxxxx
+t_decode_with_error2 =
+ E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xC2, 97]) === "xa"
+t_decode_with_error3 =
+ E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xE0, 97, 97]) === "xaa"
+t_decode_with_error4 =
+ E.decodeUtf8With (\_ _ -> Just 'x') (B.pack [0xF0, 97, 97, 97]) === "xaaa"
+
s_Eq s = (s==) `eq` ((S.streamList s==) . S.streamList)
where _types = s :: String
sf_Eq p s =
@@ -955,6 +968,11 @@ tests =
testGroup "errors" [
testProperty "t_utf8_err" t_utf8_err,
testProperty "t_utf8_err'" t_utf8_err'
+ ],
+ testGroup "error recovery" [
+ testProperty "t_decode_with_error2" t_decode_with_error2,
+ testProperty "t_decode_with_error3" t_decode_with_error3,
+ testProperty "t_decode_with_error4" t_decode_with_error4
]
],
More information about the ghc-commits
mailing list