[commit: ghc] master: Fix a bug in 'alexInputPrevChar' (821adee)
git at git.haskell.org
git at git.haskell.org
Wed Oct 25 20:44:47 UTC 2017
Repository : ssh://git@git.haskell.org/ghc
On branch : master
Link : http://ghc.haskell.org/trac/ghc/changeset/821adee12e89dbd0a52fde872b633e4e2e9051dc/ghc
>---------------------------------------------------------------
commit 821adee12e89dbd0a52fde872b633e4e2e9051dc
Author: Alec Theriault <alec.theriault at gmail.com>
Date: Wed Oct 25 15:52:38 2017 -0400
Fix a bug in 'alexInputPrevChar'
The lexer hacks around unicode by squishing any character into a 'Word8'
and then storing the actual character in its state. This happens at
'alexGetByte'.
That is all and well, but we ought to be careful that the characters we
retrieve via 'alexInputPrevChar' also fit this convention.
In fact, #13986 exposes nicely what can go wrong: the regex in the left
context of the type application rule uses the '$idchar' character set
which relies on the unicode hack. However, a left context corresponds
to a call to 'alexInputPrevChar', and we end up passing full blown
unicode characters to '$idchar', despite it not being equipped to deal
with these.
Test Plan: Added a regression test case
Reviewers: austin, bgamari
Reviewed By: bgamari
Subscribers: rwbarton, thomie
GHC Trac Issues: #13986
Differential Revision: https://phabricator.haskell.org/D4105
>---------------------------------------------------------------
821adee12e89dbd0a52fde872b633e4e2e9051dc
compiler/parser/Lexer.x | 78 +++++++++++++++++--------
testsuite/tests/parser/should_compile/T13986.hs | 5 ++
testsuite/tests/parser/should_compile/all.T | 1 +
3 files changed, 59 insertions(+), 25 deletions(-)
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 8c17315..3bf249b 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -129,38 +129,38 @@ import ApiAnnotation
-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
-- Any changes here should likely be reflected there.
-$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte.
+$unispace = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$nl = [\n\r\f]
$whitechar = [$nl\v\ $unispace]
$white_no_nl = $whitechar # \n -- TODO #8424
$tab = \t
$ascdigit = 0-9
-$unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetByte.
+$unidigit = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$decdigit = $ascdigit -- for now, should really be $digit (ToDo)
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
-$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte.
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$symbol = [$ascsymbol $unisymbol] # [$special \_\"\']
-$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetByte.
+$unilarge = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$asclarge = [A-Z]
$large = [$asclarge $unilarge]
-$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetByte.
+$unismall = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
-$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte.
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$graphic = [$small $large $symbol $digit $special $unigraphic \"\']
$binit = 0-1
$octit = 0-7
$hexit = [$decdigit A-F a-f]
-$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
$idchar = [$small $large $digit $uniidchar \']
$pragmachar = [$small $large $digit]
@@ -1968,27 +1968,29 @@ getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk
data AlexInput = AI RealSrcLoc StringBuffer
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (AI _ buf) = prevChar buf '\n'
+{-
+Note [Unicode in Alex]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Although newer versions of Alex support unicode, this grammar is processed with
+the old style '--latin1' behaviour. This means that when implementing the
+functions
--- backwards compatibility for Alex 2.x
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar inp = case alexGetByte inp of
- Nothing -> Nothing
- Just (b,i) -> c `seq` Just (c,i)
- where c = chr $ fromIntegral b
+ alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+ alexInputPrevChar :: AlexInput -> Char
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (AI loc s)
- | atEnd s = Nothing
- | otherwise = byte `seq` loc' `seq` s' `seq`
- --trace (show (ord c)) $
- Just (byte, (AI loc' s'))
- where (c,s') = nextChar s
- loc' = advanceSrcLoc loc c
- byte = fromIntegral $ ord adj_c
+which Alex uses to to take apart our 'AlexInput', we must
+
+ * return a latin1 character in the 'Word8' that 'alexGetByte' expects
+ * return a latin1 character in 'alexInputPrevChar'.
+
+We handle this in 'adjustChar' by squishing entire classes of unicode
+characters into single bytes.
+-}
- non_graphic = '\x00'
+{-# INLINE adjustChar #-}
+adjustChar :: Char -> Word8
+adjustChar c = fromIntegral $ ord adj_c
+ where non_graphic = '\x00'
upper = '\x01'
lower = '\x02'
digit = '\x03'
@@ -2034,6 +2036,32 @@ alexGetByte (AI loc s)
Space -> space
_other -> non_graphic
+-- Getting the previous 'Char' isn't enough here - we need to convert it into
+-- the same format that 'alexGetByte' would have produced.
+--
+-- See Note [Unicode in Alex] and #13986.
+alexInputPrevChar :: AlexInput -> Char
+alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc))
+ where pc = prevChar buf '\n'
+
+-- backwards compatibility for Alex 2.x
+alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
+alexGetChar inp = case alexGetByte inp of
+ Nothing -> Nothing
+ Just (b,i) -> c `seq` Just (c,i)
+ where c = chr $ fromIntegral b
+
+-- See Note [Unicode in Alex]
+alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+alexGetByte (AI loc s)
+ | atEnd s = Nothing
+ | otherwise = byte `seq` loc' `seq` s' `seq`
+ --trace (show (ord c)) $
+ Just (byte, (AI loc' s'))
+ where (c,s') = nextChar s
+ loc' = advanceSrcLoc loc c
+ byte = adjustChar c
+
-- This version does not squash unicode characters, it is used when
-- lexing strings.
alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
diff --git a/testsuite/tests/parser/should_compile/T13986.hs b/testsuite/tests/parser/should_compile/T13986.hs
new file mode 100644
index 0000000..b1b4882
--- /dev/null
+++ b/testsuite/tests/parser/should_compile/T13986.hs
@@ -0,0 +1,5 @@
+{-# LANGUAGE TypeApplications #-}
+
+module T13986 where
+
+foo x₁@True = 10
diff --git a/testsuite/tests/parser/should_compile/all.T b/testsuite/tests/parser/should_compile/all.T
index c008bd4..e2f68f6 100644
--- a/testsuite/tests/parser/should_compile/all.T
+++ b/testsuite/tests/parser/should_compile/all.T
@@ -109,3 +109,4 @@ test('DumpRenamedAst', normal, compile, ['-dsuppress-uniques -ddump-rn-ast']
test('DumpTypecheckedAst', normal, compile, ['-dsuppress-uniques -ddump-tc-ast'])
test('T13747', normal, compile, [''])
test('T14189', normal, compile, ['-dsuppress-uniques -ddump-rn-ast'])
+test('T13986', normal, compile, [''])
More information about the ghc-commits
mailing list