[commit: ghc] master: Always use native-Haskell de/encoders for ASCII and latin1 (1319363)

Tue May 24 21:45:59 UTC 2016

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/1319363f7633c441bdb1f659616d71ecd700076d/ghc

>---------------------------------------------------------------

commit 1319363f7633c441bdb1f659616d71ecd700076d
Author: Thomas Miedema <thomasmiedema at gmail.com>
Date:   Tue May 24 11:31:45 2016 +0200

    Always use native-Haskell de/encoders for ASCII and latin1
    
    This fixes test encoding005 on Windows (#10623).
    
    Reviewed by: austin, bgamari
    
    Differential Revision: https://phabricator.haskell.org/D2262


>---------------------------------------------------------------

1319363f7633c441bdb1f659616d71ecd700076d
 libraries/base/GHC/IO/Encoding.hs          | 34 +++++++++++++++++-------------
 libraries/base/tests/IO/all.T              |  3 +--
 libraries/base/tests/IO/encoding005.hs     | 16 +++++++-------
 libraries/base/tests/IO/encoding005.stdout |  2 +-
 4 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs
index 18b5432..578a420 100644
--- a/libraries/base/GHC/IO/Encoding.hs
+++ b/libraries/base/GHC/IO/Encoding.hs
@@ -245,8 +245,16 @@ mkTextEncoding' cfm enc =
     "UTF32"   -> return $ UTF32.mkUTF32 cfm
     "UTF32LE" -> return $ UTF32.mkUTF32le cfm
     "UTF32BE" -> return $ UTF32.mkUTF32be cfm
-  -- ISO8859-1 we can handle ourselves as well
-    "ISO88591" -> return $ Latin1.mkLatin1 cfm
+    -- On AIX, we want to avoid iconv, because it is either
+    -- a) totally broken, or b) non-reentrant, or c) actually works.
+    -- Detecting b) is difficult as you'd have to trigger the reentrancy
+    -- corruption.
+    -- Therefore, on AIX, we handle the popular ASCII and latin1 encodings
+    -- ourselves. For consistency, we do the same on other platforms.
+    -- We use `mkLatin1_checked` instead of `mkLatin1`, since the latter
+    -- completely ignores the CodingFailureMode (TEST=encoding005).
+    _ | isAscii -> return (Latin1.mkAscii cfm)
+    _ | isLatin1 -> return (Latin1.mkLatin1_checked cfm)
 #if defined(mingw32_HOST_OS)
     'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
     _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
@@ -256,25 +264,21 @@ mkTextEncoding' cfm enc =
     -- Unfortunately there is no good way to determine whether iconv is actually
     -- functional without telling it to do something.
     _ -> do res <- Iconv.mkIconvEncoding cfm enc
-            let isAscii = any (== enc) ansiEncNames
             case res of
               Just e -> return e
-              -- At this point we know that we can't count on iconv to work
-              -- (see, for instance, Trac #10298). However, we still want to do
-              --  what we can to work with what we have. For instance, ASCII is
-              -- easy. We match on ASCII encodings directly using several
-              -- possible aliases (specified by RFC 1345 & Co) and for this use
-              -- the 'ascii' encoding
-              Nothing
-                | isAscii   -> return (Latin1.mkAscii cfm)
-                | otherwise ->
-                    unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
+              Nothing -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
+#endif
   where
-    ansiEncNames = -- ASCII aliases
+    isAscii = enc `elem` asciiEncNames
+    isLatin1 = enc `elem` latin1EncNames
+    asciiEncNames = -- ASCII aliases specified by RFC 1345 and RFC 3808.
       [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991"
       , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US"
       ]
-#endif
+    latin1EncNames = -- latin1 aliases specified by RFC 1345 and RFC 3808.
+      [ "ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1",
+        "l1", "IBM819", "CP819", "csISOLatin1"
+      ]
 
 
 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
diff --git a/libraries/base/tests/IO/all.T b/libraries/base/tests/IO/all.T
index d04f3c4..295553f 100644
--- a/libraries/base/tests/IO/all.T
+++ b/libraries/base/tests/IO/all.T
@@ -138,8 +138,7 @@ test('encoding001',
 test('encoding002', normal, compile_and_run, [''])
 test('encoding003', normal, compile_and_run, [''])
 test('encoding004', normal, compile_and_run, [''])
-test('encoding005', when(opsys('mingw32'), expect_broken(10623)),
-     compile_and_run, [''])
+test('encoding005', normal, compile_and_run, [''])
 
 test('environment001',
      [extra_clean(['environment001'])],
diff --git a/libraries/base/tests/IO/encoding005.hs b/libraries/base/tests/IO/encoding005.hs
index 99db84a..b4ee381 100644
--- a/libraries/base/tests/IO/encoding005.hs
+++ b/libraries/base/tests/IO/encoding005.hs
@@ -44,9 +44,9 @@ test_latin1 cfm enc = do
     ErrorOnCodingFailure -> Nothing
     IgnoreCodingFailure -> Just [0xfe,0xff,0xff,0xfe]
     TransliterateCodingFailure -> Just [0xfe,0xff,0x3f,0x3f,0x3f,0xff,0xfe]
-    -- N.B. The argument "LATIN1//TRANSLIT" to mkTextEncoding does not
-    -- correspond to "LATIN1//TRANSLIT" in iconv! Instead GHC asks iconv
-    -- to encode to "LATIN1" and uses its own "evil hack" to insert '?'
+    -- N.B. The argument "latin1//TRANSLIT" to mkTextEncoding does not
+    -- correspond to "latin1//TRANSLIT" in iconv! Instead GHC encodes to
+    -- latin1 natively and uses its own "evil hack" to insert '?'
     -- (ASCII 0x3f) in place of failures. See GHC.IO.Encoding.recoverEncode.
     --
     -- U+0100 is LATIN CAPITAL LETTER A WITH MACRON, which iconv would
@@ -108,8 +108,8 @@ main = do
   test_ascii TransliterateCodingFailure =<< mkTextEncoding "ASCII//TRANSLIT"
   test_ascii RoundtripFailure =<< mkTextEncoding "ASCII//ROUNDTRIP"
 
-  putStrLn "mkTextEncoding LATIN1 tests"
-  test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "LATIN1"
-  test_latin1 IgnoreCodingFailure =<< mkTextEncoding "LATIN1//IGNORE"
-  test_latin1 TransliterateCodingFailure =<< mkTextEncoding "LATIN1//TRANSLIT"
-  test_latin1 RoundtripFailure =<< mkTextEncoding "LATIN1//ROUNDTRIP"
+  putStrLn "mkTextEncoding latin1 tests"
+  test_latin1 ErrorOnCodingFailure =<< mkTextEncoding "latin1"
+  test_latin1 IgnoreCodingFailure =<< mkTextEncoding "latin1//IGNORE"
+  test_latin1 TransliterateCodingFailure =<< mkTextEncoding "latin1//TRANSLIT"
+  test_latin1 RoundtripFailure =<< mkTextEncoding "latin1//ROUNDTRIP"
diff --git a/libraries/base/tests/IO/encoding005.stdout b/libraries/base/tests/IO/encoding005.stdout
index 664a193..e7995b1 100644
--- a/libraries/base/tests/IO/encoding005.stdout
+++ b/libraries/base/tests/IO/encoding005.stdout
@@ -2,4 +2,4 @@ char8 tests
 Latin1.ascii tests
 Latin1.latin1_checked tests
 mkTextEncoding ASCII tests
-mkTextEncoding LATIN1 tests
+mkTextEncoding latin1 tests