[commit: ghc] master: Allow combining characters in identifiers (#7650) (2aee419)

git at git.haskell.org git at git.haskell.org
Tue Feb 23 11:27:38 UTC 2016


Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/2aee41960aa00fe09a2cd1983e02c15e06013037/ghc

>---------------------------------------------------------------

commit 2aee41960aa00fe09a2cd1983e02c15e06013037
Author: Thomas Miedema <thomasmiedema at gmail.com>
Date:   Sat Feb 20 23:50:28 2016 +0100

    Allow combining characters in identifiers (#7650)
    
    Reviewed by: austin, rwbarton
    
    Differential Revision: https://phabricator.haskell.org/D1938


>---------------------------------------------------------------

2aee41960aa00fe09a2cd1983e02c15e06013037
 compiler/basicTypes/Lexeme.hs                                |  5 +++--
 compiler/parser/Lexer.x                                      | 12 ++++++------
 testsuite/tests/parser/unicode/T7650.hs                      | 11 +++++++++++
 .../tests/parser/unicode/T7650.stdout                        |  0
 testsuite/tests/parser/unicode/all.T                         |  1 +
 5 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs
index 9e75376..22515c1 100644
--- a/compiler/basicTypes/Lexeme.hs
+++ b/compiler/basicTypes/Lexeme.hs
@@ -194,9 +194,10 @@ okIdChar c = case generalCategory c of
   LowercaseLetter -> True
   TitlecaseLetter -> True
   ModifierLetter  -> True -- See #10196
-  OtherLetter     -> True
+  OtherLetter     -> True -- See #1103
+  NonSpacingMark  -> True -- See #7650
   DecimalNumber   -> True
-  OtherNumber     -> True
+  OtherNumber     -> True -- See #4373
   _               -> c == '\'' || c == '_'
 
 -- | Is this character acceptable in a symbol (after the first char)?
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 5f3bdee..3f959f2 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -155,8 +155,8 @@ $binit     = 0-1
 $octit     = 0-7
 $hexit     = [$decdigit A-F a-f]
 
-$modifier  = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
-$idchar    = [$small $large $digit $modifier \']
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+$idchar    = [$small $large $digit $uniidchar \']
 
 $pragmachar = [$small $large $digit]
 
@@ -1874,10 +1874,10 @@ alexGetByte (AI loc s)
         symbol          = '\x04'
         space           = '\x05'
         other_graphic   = '\x06'
-        modifier        = '\x07'
+        uniidchar       = '\x07'
 
         adj_c
-          | c <= '\x06' = non_graphic
+          | c <= '\x07' = non_graphic
           | c <= '\x7f' = c
           -- Alex doesn't handle Unicode, so when Unicode
           -- character is encountered we output these values
@@ -1891,9 +1891,9 @@ alexGetByte (AI loc s)
                   UppercaseLetter       -> upper
                   LowercaseLetter       -> lower
                   TitlecaseLetter       -> upper
-                  ModifierLetter        -> modifier -- see #10196
+                  ModifierLetter        -> uniidchar -- see #10196
                   OtherLetter           -> lower -- see #1103
-                  NonSpacingMark        -> other_graphic
+                  NonSpacingMark        -> uniidchar -- see #7650
                   SpacingCombiningMark  -> other_graphic
                   EnclosingMark         -> other_graphic
                   DecimalNumber         -> digit
diff --git a/testsuite/tests/parser/unicode/T7650.hs b/testsuite/tests/parser/unicode/T7650.hs
new file mode 100644
index 0000000..c474bc0
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T7650.hs
@@ -0,0 +1,11 @@
+main = print spın̈alTap
+    where spın̈alTap = 11
+
+-- n̈ is a combining character sequence. We now allow it to be used in
+-- identifiers (#7650).
+--
+-- > map generalCategory "n̈"
+-- [LowercaseLetter,NonSpacingMark]
+--
+-- > map show "n̈"
+-- ["'n'","'\776'"]
diff --git a/libraries/ghc-prim/tests/T6026.stdout b/testsuite/tests/parser/unicode/T7650.stdout
similarity index 100%
copy from libraries/ghc-prim/tests/T6026.stdout
copy to testsuite/tests/parser/unicode/T7650.stdout
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index 6972a0d..36554cc 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -25,3 +25,4 @@ test('T7671', normal, compile, [''])
 # TODO: This test ought to be run in a non-UTF8 locale, but this is not yet
 # supported by the test suite (see 10907)
 test('T10907', normal, compile, [''])
+test('T7650', normal, compile, [''])



More information about the ghc-commits mailing list