[commit: ghc] master: Fix a bug in 'alexInputPrevChar' (821adee)

Wed Oct 25 20:44:47 UTC 2017

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/821adee12e89dbd0a52fde872b633e4e2e9051dc/ghc

>---------------------------------------------------------------

commit 821adee12e89dbd0a52fde872b633e4e2e9051dc
Author: Alec Theriault <alec.theriault at gmail.com>
Date:   Wed Oct 25 15:52:38 2017 -0400

    Fix a bug in 'alexInputPrevChar'
    
    The lexer hacks around unicode by squishing any character into a 'Word8'
    and then storing the actual character in its state. This happens at
    'alexGetByte'.
    
    That is all and well, but we ought to be careful that the characters we
    retrieve via 'alexInputPrevChar' also fit this convention.
    
    In fact, #13986 exposes nicely what can go wrong: the regex in the left
    context of the type application rule uses the '$idchar' character set
    which relies on the unicode hack. However, a left context corresponds
    to a call to 'alexInputPrevChar', and we end up passing full blown
    unicode characters to '$idchar', despite it not being equipped to deal
    with these.
    
    Test Plan: Added a regression test case
    
    Reviewers: austin, bgamari
    
    Reviewed By: bgamari
    
    Subscribers: rwbarton, thomie
    
    GHC Trac Issues: #13986
    
    Differential Revision: https://phabricator.haskell.org/D4105


>---------------------------------------------------------------

821adee12e89dbd0a52fde872b633e4e2e9051dc
 compiler/parser/Lexer.x                         | 78 +++++++++++++++++--------
 testsuite/tests/parser/should_compile/T13986.hs |  5 ++
 testsuite/tests/parser/should_compile/all.T     |  1 +
 3 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 8c17315..3bf249b 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -129,38 +129,38 @@ import ApiAnnotation
 
 -- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
 -- Any changes here should likely be reflected there.
-$unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetByte.
+$unispace    = \x05 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $nl          = [\n\r\f]
 $whitechar   = [$nl\v\ $unispace]
 $white_no_nl = $whitechar # \n -- TODO #8424
 $tab         = \t
 
 $ascdigit  = 0-9
-$unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetByte.
+$unidigit  = \x03 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
 $digit     = [$ascdigit $unidigit]
 
 $special   = [\(\)\,\;\[\]\`\{\}]
 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
-$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetByte.
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $symbol    = [$ascsymbol $unisymbol] # [$special \_\"\']
 
-$unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetByte.
+$unilarge  = \x01 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $asclarge  = [A-Z]
 $large     = [$asclarge $unilarge]
 
-$unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetByte.
+$unismall  = \x02 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $ascsmall  = [a-z]
 $small     = [$ascsmall $unismall \_]
 
-$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetByte.
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $graphic   = [$small $large $symbol $digit $special $unigraphic \"\']
 
 $binit     = 0-1
 $octit     = 0-7
 $hexit     = [$decdigit A-F a-f]
 
-$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See [Unicode in Alex].
 $idchar    = [$small $large $digit $uniidchar \']
 
 $pragmachar = [$small $large $digit]
@@ -1968,27 +1968,29 @@ getLastTk = P $ \s@(PState { last_tk = last_tk }) -> POk s last_tk
 
 data AlexInput = AI RealSrcLoc StringBuffer
 
-alexInputPrevChar :: AlexInput -> Char
-alexInputPrevChar (AI _ buf) = prevChar buf '\n'
+{-
+Note [Unicode in Alex]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Although newer versions of Alex support unicode, this grammar is processed with
+the old style '--latin1' behaviour. This means that when implementing the
+functions
 
--- backwards compatibility for Alex 2.x
-alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
-alexGetChar inp = case alexGetByte inp of
-                    Nothing    -> Nothing
-                    Just (b,i) -> c `seq` Just (c,i)
-                       where c = chr $ fromIntegral b
+    alexGetByte       :: AlexInput -> Maybe (Word8,AlexInput)
+    alexInputPrevChar :: AlexInput -> Char
 
-alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
-alexGetByte (AI loc s)
-  | atEnd s   = Nothing
-  | otherwise = byte `seq` loc' `seq` s' `seq`
-                --trace (show (ord c)) $
-                Just (byte, (AI loc' s'))
-  where (c,s') = nextChar s
-        loc'   = advanceSrcLoc loc c
-        byte   = fromIntegral $ ord adj_c
+which Alex uses to to take apart our 'AlexInput', we must
+
+  * return a latin1 character in the 'Word8' that 'alexGetByte' expects
+  * return a latin1 character in 'alexInputPrevChar'.
+
+We handle this in 'adjustChar' by squishing entire classes of unicode
+characters into single bytes.
+-}
 
-        non_graphic     = '\x00'
+{-# INLINE adjustChar #-}
+adjustChar :: Char -> Word8
+adjustChar c = fromIntegral $ ord adj_c
+  where non_graphic     = '\x00'
         upper           = '\x01'
         lower           = '\x02'
         digit           = '\x03'
@@ -2034,6 +2036,32 @@ alexGetByte (AI loc s)
                   Space                 -> space
                   _other                -> non_graphic
 
+-- Getting the previous 'Char' isn't enough here - we need to convert it into
+-- the same format that 'alexGetByte' would have produced.
+--
+-- See Note [Unicode in Alex] and #13986.
+alexInputPrevChar :: AlexInput -> Char
+alexInputPrevChar (AI _ buf) = chr (fromIntegral (adjustChar pc))
+  where pc = prevChar buf '\n'
+
+-- backwards compatibility for Alex 2.x
+alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
+alexGetChar inp = case alexGetByte inp of
+                    Nothing    -> Nothing
+                    Just (b,i) -> c `seq` Just (c,i)
+                       where c = chr $ fromIntegral b
+
+-- See Note [Unicode in Alex]
+alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
+alexGetByte (AI loc s)
+  | atEnd s   = Nothing
+  | otherwise = byte `seq` loc' `seq` s' `seq`
+                --trace (show (ord c)) $
+                Just (byte, (AI loc' s'))
+  where (c,s') = nextChar s
+        loc'   = advanceSrcLoc loc c
+        byte   = adjustChar c
+
 -- This version does not squash unicode characters, it is used when
 -- lexing strings.
 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
diff --git a/testsuite/tests/parser/should_compile/T13986.hs b/testsuite/tests/parser/should_compile/T13986.hs
new file mode 100644
index 0000000..b1b4882
--- /dev/null
+++ b/testsuite/tests/parser/should_compile/T13986.hs
@@ -0,0 +1,5 @@
+{-# LANGUAGE TypeApplications #-}
+
+module T13986 where
+
+foo x₁@True = 10
diff --git a/testsuite/tests/parser/should_compile/all.T b/testsuite/tests/parser/should_compile/all.T
index c008bd4..e2f68f6 100644
--- a/testsuite/tests/parser/should_compile/all.T
+++ b/testsuite/tests/parser/should_compile/all.T
@@ -109,3 +109,4 @@ test('DumpRenamedAst',     normal, compile, ['-dsuppress-uniques -ddump-rn-ast']
 test('DumpTypecheckedAst', normal, compile, ['-dsuppress-uniques -ddump-tc-ast'])
 test('T13747', normal, compile, [''])
 test('T14189',     normal, compile, ['-dsuppress-uniques -ddump-rn-ast'])
+test('T13986', normal, compile, [''])