[commit: ghc] master: StringBuffer should not contain initial byte-order mark (BOM) (9e93940)
git at git.haskell.org
git at git.haskell.org
Mon Sep 1 20:15:02 UTC 2014
Repository : ssh://git@git.haskell.org/ghc
On branch : master
Link : http://ghc.haskell.org/trac/ghc/changeset/9e939403241b758a685834c9ff62edcd3172a2cf/ghc
>---------------------------------------------------------------
commit 9e939403241b758a685834c9ff62edcd3172a2cf
Author: Thomas Miedema <thomasmiedema at gmail.com>
Date: Mon Sep 1 15:11:50 2014 -0500
StringBuffer should not contain initial byte-order mark (BOM)
Summary:
Just skipping over a BOM, but leaving it in the Stringbuffer, is not
sufficient. The Lexer calls prevChar when a regular expression starts
with '^' (which is a shorthand for '\n^'). It would never match on the
first line, since instead of '\n', prevChar would still return '\xfeff'.
Test Plan: validate
Reviewers: austin, ezyang
Reviewed By: austin, ezyang
Subscribers: simonmar, ezyang, carter
Differential Revision: https://phabricator.haskell.org/D176
GHC Trac Issues: #6016
>---------------------------------------------------------------
9e939403241b758a685834c9ff62edcd3172a2cf
compiler/utils/StringBuffer.lhs | 45 +++++++++++++++++++++++----------
testsuite/.gitignore | 2 ++
testsuite/tests/parser/unicode/T6016.hs | 34 +++++++++++++++++++++++++
testsuite/tests/parser/unicode/all.T | 1 +
4 files changed, 69 insertions(+), 13 deletions(-)
diff --git a/compiler/utils/StringBuffer.lhs b/compiler/utils/StringBuffer.lhs
index a54f45f..50d8443 100644
--- a/compiler/utils/StringBuffer.lhs
+++ b/compiler/utils/StringBuffer.lhs
@@ -47,9 +47,12 @@ import Encoding
import FastString
import FastTypes
import FastFunctions
+import Outputable
+import Util
-import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
- , Handle, hTell, openBinaryFile )
+import Data.Maybe
+import Control.Exception
+import System.IO
import System.IO.Unsafe ( unsafePerformIO )
import GHC.Exts
@@ -89,7 +92,8 @@ hGetStringBuffer :: FilePath -> IO StringBuffer
hGetStringBuffer fname = do
h <- openBinaryFile fname ReadMode
size_i <- hFileSize h
- let size = fromIntegral size_i
+ offset_i <- skipBOM h size_i 0 -- offset is 0 initially
+ let size = fromIntegral $ size_i - offset_i
buf <- mallocForeignPtrArray (size+3)
withForeignPtr buf $ \ptr -> do
r <- if size == 0 then return 0 else hGetBuf h ptr size
@@ -101,7 +105,7 @@ hGetStringBuffer fname = do
hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
hGetStringBufferBlock handle wanted
= do size_i <- hFileSize handle
- offset_i <- hTell handle
+ offset_i <- hTell handle >>= skipBOM handle size_i
let size = min wanted (fromIntegral $ size_i-offset_i)
buf <- mallocForeignPtrArray (size+3)
withForeignPtr buf $ \ptr ->
@@ -110,19 +114,34 @@ hGetStringBufferBlock handle wanted
then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
else newUTF8StringBuffer buf ptr size
+-- | Skip the byte-order mark if there is one (see #1744 and #6016),
+-- and return the new position of the handle in bytes.
+--
+-- This is better than treating #FEFF as whitespace,
+-- because that would mess up layout. We don't have a concept
+-- of zero-width whitespace in Haskell: all whitespace codepoints
+-- have a width of one column.
+skipBOM :: Handle -> Integer -> Integer -> IO Integer
+skipBOM h size offset =
+ -- Only skip BOM at the beginning of a file.
+ if size > 0 && offset == 0
+ then do
+ -- Validate assumption that handle is in binary mode.
+ ASSERTM( hGetEncoding h >>= return . isNothing )
+ -- Temporarily select text mode to make `hLookAhead` and
+ -- `hGetChar` return full Unicode characters.
+ bracket_ (hSetBinaryMode h False) (hSetBinaryMode h True) $ do
+ c <- hLookAhead h
+ if c == '\xfeff'
+ then hGetChar h >> hTell h
+ else return offset
+ else return offset
+
newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
newUTF8StringBuffer buf ptr size = do
pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
-- sentinels for UTF-8 decoding
- let
- sb0 = StringBuffer buf size 0
- (first_char, sb1) = nextChar sb0
- -- skip the byte-order mark if there is one (see #1744)
- -- This is better than treating #FEFF as whitespace,
- -- because that would mess up layout. We don't have a concept
- -- of zero-width whitespace in Haskell: all whitespace codepoints
- -- have a width of one column.
- return (if first_char == '\xfeff' then sb1 else sb0)
+ return $ StringBuffer buf size 0
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2
diff --git a/testsuite/.gitignore b/testsuite/.gitignore
index 591545c..4f8ac87 100644
--- a/testsuite/.gitignore
+++ b/testsuite/.gitignore
@@ -1074,6 +1074,8 @@ mk/ghcconfig_*_inplace_bin_ghc-stage2.exe.mk
/tests/parser/should_run/readRun004
/tests/parser/unicode/1744
/tests/parser/unicode/T1744
+/tests/parser/unicode/T6016
+/tests/parser/unicode/T6016-twoBOMs
/tests/parser/unicode/utf8_024
/tests/patsyn/should_run/bidir-explicit
/tests/patsyn/should_run/bidir-explicit-scope
diff --git a/testsuite/tests/parser/unicode/T6016.hs b/testsuite/tests/parser/unicode/T6016.hs
new file mode 100644
index 0000000..5783a72
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T6016.hs
@@ -0,0 +1,34 @@
+module Main where
+
+import Control.Exception
+import Data.Char
+import System.IO
+
+import StringBuffer
+
+twoBOMs = "T6016-twoBOMs"
+
+ignoreFirstBOM = do
+ -- StringBuffer should not contain initial byte-order mark.
+ --
+ -- Just skipping over it, but leaving it in the Stringbuffer, is not
+ -- sufficient. The Lexer calls prevChar when a regular expression
+ -- starts with '^' (which is a shorthand for '\n^'). It would never
+ -- match on the first line, since instead of '\n', prevChar would
+ -- still return '\xfeff'.
+ s <- hGetStringBuffer twoBOMs
+ assert (prevChar s '\n' == '\n') return ()
+
+dontIgnoreSecondBOM = do
+ -- U+FEFF is considered a BOM only if it appears as the first
+ -- character of a file.
+ h <- openBinaryFile twoBOMs ReadMode
+ hSeek h AbsoluteSeek 3
+ s <- hGetStringBufferBlock h 3
+ hClose h
+ assert (currentChar s == '\xfeff') return ()
+
+main = do
+ writeFile twoBOMs "\xfeff\xfeff"
+ ignoreFirstBOM
+ dontIgnoreSecondBOM
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index a8e19eb..2ff7edf 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -20,4 +20,5 @@ test('T1744', normal, compile_and_run, [''])
test('T1103', normal, compile, [''])
test('T2302', only_ways(['normal']), compile_fail, [''])
test('T4373', normal, compile, [''])
+test('T6016', extra_clean('T6016-twoBOMs'), compile_and_run, ['-package ghc'])
test('T7671', normal, compile, [''])
More information about the ghc-commits
mailing list