[commit: ghc] master: StringBuffer should not contain initial byte-order mark (BOM) (9e93940)

Mon Sep 1 20:15:02 UTC 2014

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/9e939403241b758a685834c9ff62edcd3172a2cf/ghc

>---------------------------------------------------------------

commit 9e939403241b758a685834c9ff62edcd3172a2cf
Author: Thomas Miedema <thomasmiedema at gmail.com>
Date:   Mon Sep 1 15:11:50 2014 -0500

    StringBuffer should not contain initial byte-order mark (BOM)
    
    Summary:
    Just skipping over a BOM, but leaving it in the Stringbuffer, is not
    sufficient. The Lexer calls prevChar when a regular expression starts
    with '^' (which is a shorthand for '\n^'). It would never match on the
    first line, since instead of '\n', prevChar would still return '\xfeff'.
    
    Test Plan: validate
    
    Reviewers: austin, ezyang
    
    Reviewed By: austin, ezyang
    
    Subscribers: simonmar, ezyang, carter
    
    Differential Revision: https://phabricator.haskell.org/D176
    
    GHC Trac Issues: #6016


>---------------------------------------------------------------

9e939403241b758a685834c9ff62edcd3172a2cf
 compiler/utils/StringBuffer.lhs         | 45 +++++++++++++++++++++++----------
 testsuite/.gitignore                    |  2 ++
 testsuite/tests/parser/unicode/T6016.hs | 34 +++++++++++++++++++++++++
 testsuite/tests/parser/unicode/all.T    |  1 +
 4 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/compiler/utils/StringBuffer.lhs b/compiler/utils/StringBuffer.lhs
index a54f45f..50d8443 100644
--- a/compiler/utils/StringBuffer.lhs
+++ b/compiler/utils/StringBuffer.lhs
@@ -47,9 +47,12 @@ import Encoding
 import FastString
 import FastTypes
 import FastFunctions
+import Outputable
+import Util
 
-import System.IO                ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
-                                , Handle, hTell, openBinaryFile )
+import Data.Maybe
+import Control.Exception
+import System.IO
 import System.IO.Unsafe         ( unsafePerformIO )
 
 import GHC.Exts
@@ -89,7 +92,8 @@ hGetStringBuffer :: FilePath -> IO StringBuffer
 hGetStringBuffer fname = do
    h <- openBinaryFile fname ReadMode
    size_i <- hFileSize h
-   let size = fromIntegral size_i
+   offset_i <- skipBOM h size_i 0  -- offset is 0 initially
+   let size = fromIntegral $ size_i - offset_i
    buf <- mallocForeignPtrArray (size+3)
    withForeignPtr buf $ \ptr -> do
      r <- if size == 0 then return 0 else hGetBuf h ptr size
@@ -101,7 +105,7 @@ hGetStringBuffer fname = do
 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
 hGetStringBufferBlock handle wanted
     = do size_i <- hFileSize handle
-         offset_i <- hTell handle
+         offset_i <- hTell handle >>= skipBOM handle size_i
          let size = min wanted (fromIntegral $ size_i-offset_i)
          buf <- mallocForeignPtrArray (size+3)
          withForeignPtr buf $ \ptr ->
@@ -110,19 +114,34 @@ hGetStringBufferBlock handle wanted
                    then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
                    else newUTF8StringBuffer buf ptr size
 
+-- | Skip the byte-order mark if there is one (see #1744 and #6016),
+-- and return the new position of the handle in bytes.
+--
+-- This is better than treating #FEFF as whitespace,
+-- because that would mess up layout.  We don't have a concept
+-- of zero-width whitespace in Haskell: all whitespace codepoints
+-- have a width of one column.
+skipBOM :: Handle -> Integer -> Integer -> IO Integer
+skipBOM h size offset =
+  -- Only skip BOM at the beginning of a file.
+  if size > 0 && offset == 0
+    then do
+      -- Validate assumption that handle is in binary mode.
+      ASSERTM( hGetEncoding h >>= return . isNothing )
+      -- Temporarily select text mode to make `hLookAhead` and
+      -- `hGetChar` return full Unicode characters.
+      bracket_ (hSetBinaryMode h False) (hSetBinaryMode h True) $ do
+        c <- hLookAhead h
+        if c == '\xfeff'
+          then hGetChar h >> hTell h
+          else return offset
+    else return offset
+
 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
 newUTF8StringBuffer buf ptr size = do
   pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
   -- sentinels for UTF-8 decoding
-  let
-      sb0 = StringBuffer buf size 0
-      (first_char, sb1) = nextChar sb0
-        -- skip the byte-order mark if there is one (see #1744)
-        -- This is better than treating #FEFF as whitespace,
-        -- because that would mess up layout.  We don't have a concept
-        -- of zero-width whitespace in Haskell: all whitespace codepoints
-        -- have a width of one column.
-  return (if first_char == '\xfeff' then sb1 else sb0)
+  return $ StringBuffer buf size 0
 
 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
 appendStringBuffers sb1 sb2
diff --git a/testsuite/.gitignore b/testsuite/.gitignore
index 591545c..4f8ac87 100644
--- a/testsuite/.gitignore
+++ b/testsuite/.gitignore
@@ -1074,6 +1074,8 @@ mk/ghcconfig_*_inplace_bin_ghc-stage2.exe.mk
 /tests/parser/should_run/readRun004
 /tests/parser/unicode/1744
 /tests/parser/unicode/T1744
+/tests/parser/unicode/T6016
+/tests/parser/unicode/T6016-twoBOMs
 /tests/parser/unicode/utf8_024
 /tests/patsyn/should_run/bidir-explicit
 /tests/patsyn/should_run/bidir-explicit-scope
diff --git a/testsuite/tests/parser/unicode/T6016.hs b/testsuite/tests/parser/unicode/T6016.hs
new file mode 100644
index 0000000..5783a72
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T6016.hs
@@ -0,0 +1,34 @@
+module Main where
+
+import Control.Exception
+import Data.Char
+import System.IO
+
+import StringBuffer
+
+twoBOMs = "T6016-twoBOMs"
+
+ignoreFirstBOM = do
+  -- StringBuffer should not contain initial byte-order mark.
+  --
+  -- Just skipping over it, but leaving it in the Stringbuffer, is not
+  -- sufficient. The Lexer calls prevChar when a regular expression
+  -- starts with '^' (which is a shorthand for '\n^'). It would never
+  -- match on the first line, since instead of '\n', prevChar would
+  -- still return '\xfeff'.
+  s <- hGetStringBuffer twoBOMs
+  assert (prevChar s '\n' == '\n') return ()
+
+dontIgnoreSecondBOM = do
+  -- U+FEFF is considered a BOM only if it appears as the first
+  -- character of a file.
+  h <- openBinaryFile twoBOMs ReadMode
+  hSeek h AbsoluteSeek 3
+  s <- hGetStringBufferBlock h 3
+  hClose h
+  assert (currentChar s == '\xfeff') return ()
+
+main = do
+  writeFile twoBOMs "\xfeff\xfeff"
+  ignoreFirstBOM
+  dontIgnoreSecondBOM
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index a8e19eb..2ff7edf 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -20,4 +20,5 @@ test('T1744', normal, compile_and_run, [''])
 test('T1103', normal, compile, [''])
 test('T2302', only_ways(['normal']), compile_fail, [''])
 test('T4373', normal, compile, [''])
+test('T6016', extra_clean('T6016-twoBOMs'), compile_and_run, ['-package ghc'])
 test('T7671', normal, compile, [''])