[commit: ghc] master: Fix self-contained handling of ASCII encoding (d69dfba)

git at git.haskell.org git at git.haskell.org
Fri Jul 10 17:54:59 UTC 2015


Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/d69dfba4e27c4ec33459906fd87c9a56a371f510/ghc

>---------------------------------------------------------------

commit d69dfba4e27c4ec33459906fd87c9a56a371f510
Author: Ben Gamari <bgamari.foss at gmail.com>
Date:   Fri Jul 10 19:49:29 2015 +0200

    Fix self-contained handling of ASCII encoding
    
    D898 was primarily intended to fix hangs in the event that iconv was
    unavailable (namely #10298 and #7695). In addition to this fix, it also
    introduced self-contained handling of ANSI terminals to allow compiled
    executables to run in minimal environments lacking iconv.
    
    However, the behavior that the patch introduced is highly suspicious.
    Specifically, it gives the user a UTF-8 encoding even if they requested
    ASCII.
    
    This has the potential to break quite a lot of code. At very least it
    breaks GHC's Unicode terminal detection logic, which attempts to catch
    an invalid character when encoding a pair of smart-quotes. Of course,
    this exception will never be thrown if a UTF-8 encoder is used.
    
    Here we use the `char8` encoding to handle requests for ASCII encodings
    in the event that we find iconv to be non-functional.
    
    Fixes #10623.
    
    Test Plan: Validate with T8959a
    
    Reviewers: rwbarton, hvr, austin, hsyl20
    
    Subscribers: thomie
    
    Differential Revision: https://phabricator.haskell.org/D1059
    
    GHC Trac Issues: #10623


>---------------------------------------------------------------

d69dfba4e27c4ec33459906fd87c9a56a371f510
 libraries/base/GHC/IO/Encoding.hs       | 33 +++++++++++++++++++++++----------
 libraries/base/GHC/IO/Encoding/Iconv.hs | 27 ++++++++++++++++++++-------
 libraries/base/GHC/TopHandler.hs        |  7 ++++++-
 testsuite/tests/driver/Makefile         |  4 ++++
 testsuite/tests/driver/T2507.stderr     |  4 ++--
 testsuite/tests/driver/T8959a.stderr    |  4 ++--
 6 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs
index 014b61b..76c7f55 100644
--- a/libraries/base/GHC/IO/Encoding.hs
+++ b/libraries/base/GHC/IO/Encoding.hs
@@ -235,14 +235,9 @@ mkTextEncoding e = case mb_coding_failure_mode of
         _             -> Nothing
 
 mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
-mkTextEncoding' cfm enc
-  -- First, specifically match on ASCII encodings directly using
-  -- several possible aliases (specified by RFC 1345 & co), which
-  -- allows us to handle ASCII conversions without iconv at all (see
-  -- trac #10298).
-  | any (== enc) ansiEncNames = return (UTF8.mkUTF8 cfm)
-  -- Otherwise, handle other encoding needs via iconv.
-  | otherwise = case [toUpper c | c <- enc, c /= '-'] of
+mkTextEncoding' cfm enc =
+  case [toUpper c | c <- enc, c /= '-'] of
+  -- UTF-8 and friends we can handle ourselves
     "UTF8"    -> return $ UTF8.mkUTF8 cfm
     "UTF16"   -> return $ UTF16.mkUTF16 cfm
     "UTF16LE" -> return $ UTF16.mkUTF16le cfm
@@ -254,13 +249,31 @@ mkTextEncoding' cfm enc
     'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
     _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
 #else
-    _ -> Iconv.mkIconvEncoding cfm enc
-#endif
+    -- Otherwise, handle other encoding needs via iconv.
+
+    -- Unfortunately there is no good way to determine whether iconv is actually
+    -- functional without telling it to do something.
+    _ -> do res <- Iconv.mkIconvEncoding cfm enc
+            let isAscii = any (== enc) ansiEncNames
+            case res of
+              Just e -> return e
+              -- At this point we know that we can't count on iconv to work
+              -- (see, for instance, Trac #10298). However, we still want to do
+              --  what we can to work with what we have. For instance, ASCII is
+              -- easy. We match on ASCII encodings directly using several
+              -- possible aliases (specified by RFC 1345 & Co) and for this use
+              -- the 'char8' encoding
+              Nothing
+                | isAscii   -> return char8
+                | otherwise ->
+                    unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
   where
     ansiEncNames = -- ASCII aliases
       [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991"
       , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US"
       ]
+#endif
+
 
 latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
 latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8
diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs
index 89ca71e..061bd60 100644
--- a/libraries/base/GHC/IO/Encoding/Iconv.hs
+++ b/libraries/base/GHC/IO/Encoding/Iconv.hs
@@ -34,9 +34,10 @@ import GHC.Base () -- For build ordering
 #else
 
 import Foreign
-import Foreign.C
+import Foreign.C hiding (charIsRepresentable)
 import Data.Maybe
 import GHC.Base
+import GHC.Foreign (charIsRepresentable)
 import GHC.IO.Buffer
 import GHC.IO.Encoding.Failure
 import GHC.IO.Encoding.Types
@@ -96,15 +97,27 @@ char_shift :: Int
 char_shift | charSize == 2 = 1
            | otherwise     = 2
 
-iconvEncoding :: String -> IO TextEncoding
+iconvEncoding :: String -> IO (Maybe TextEncoding)
 iconvEncoding = mkIconvEncoding ErrorOnCodingFailure
 
-mkIconvEncoding :: CodingFailureMode -> String -> IO TextEncoding
+-- | Construct an iconv-based 'TextEncoding' for the given character set and
+-- 'CodingFailureMode'.
+--
+-- As iconv is missing in some minimal environments (e.g. #10298), this
+-- checks to ensure that iconv is working properly before returning the
+-- encoding, returning 'Nothing' if not.
+mkIconvEncoding :: CodingFailureMode -> String -> IO (Maybe TextEncoding)
 mkIconvEncoding cfm charset = do
-  return (TextEncoding {
-                textEncodingName = charset,
-                mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix) (recoverDecode cfm) iconvDecode,
-                mkTextEncoder = newIConv haskellChar charset                 (recoverEncode cfm) iconvEncode})
+    let enc = TextEncoding {
+                  textEncodingName = charset,
+                  mkTextDecoder = newIConv raw_charset (haskellChar ++ suffix)
+                                           (recoverDecode cfm) iconvDecode,
+                  mkTextEncoder = newIConv haskellChar charset
+                                           (recoverEncode cfm) iconvEncode}
+    good <- charIsRepresentable enc 'a'
+    return $ if good
+               then Just enc
+               else Nothing
   where
     -- An annoying feature of GNU iconv is that the //PREFIXES only take
     -- effect when they appear on the tocode parameter to iconv_open:
diff --git a/libraries/base/GHC/TopHandler.hs b/libraries/base/GHC/TopHandler.hs
index e725196..05c905f 100644
--- a/libraries/base/GHC/TopHandler.hs
+++ b/libraries/base/GHC/TopHandler.hs
@@ -176,7 +176,12 @@ disasterHandler exit _ =
   withCAString "%s" $ \fmt ->
     withCAString msgStr $ \msg ->
       errorBelch fmt msg >> exit 1
-  where msgStr = "encountered an exception while trying to report an exception"
+  where
+    msgStr =
+        "encountered an exception while trying to report an exception." ++
+        "One possible reason for this is that we failed while trying to " ++
+        "encode an error message. Check that your locale is configured " ++
+        "properly."
 
 {- Note [Disaster with iconv]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/testsuite/tests/driver/Makefile b/testsuite/tests/driver/Makefile
index d5ce683..dc1238c 100644
--- a/testsuite/tests/driver/Makefile
+++ b/testsuite/tests/driver/Makefile
@@ -550,6 +550,10 @@ T7130:
 T7563:
 	-"$(TEST_HC)" $(TEST_HC_OPTS_NO_RECOMP) -C T7563.hs
 
+# Below we set LC_ALL=C to request standard ASCII output in the resulting error
+# messages. Unfortunately, Mac OS X still uses a Unicode encoding even with
+# LC_ALL=C, so we expect these tests to fail there.
+
 .PHONY: T6037
 T6037:
 	-LC_ALL=C "$(TEST_HC)" $(TEST_HC_OPTS_NO_RECOMP) -c T6037.hs
diff --git a/testsuite/tests/driver/T2507.stderr b/testsuite/tests/driver/T2507.stderr
index eb0878f..1a6e6f3 100644
--- a/testsuite/tests/driver/T2507.stderr
+++ b/testsuite/tests/driver/T2507.stderr
@@ -1,5 +1,5 @@
 
 T2507.hs:5:7: error:
-    Couldn't match expected type ‘Int’ with actual type ‘()’
+    Couldn't match expected type `Int' with actual type `()'
     In the expression: ()
-    In an equation for ‘foo’: foo = ()
+    In an equation for `foo': foo = ()
diff --git a/testsuite/tests/driver/T8959a.stderr b/testsuite/tests/driver/T8959a.stderr
index defb34b..476b9ee 100644
--- a/testsuite/tests/driver/T8959a.stderr
+++ b/testsuite/tests/driver/T8959a.stderr
@@ -1,5 +1,5 @@
 
 T8959a.hs:5:7: error:
-    Couldn't match expected type ‘Int -> Int’ with actual type ‘()’
+    Couldn't match expected type `Int -> Int' with actual type `()'
     In the expression: ()
-    In an equation for ‘foo’: foo = ()
+    In an equation for `foo': foo = ()



More information about the ghc-commits mailing list