[commit: ghc] master: Allow combining characters in identifiers (#7650) (2aee419)
git at git.haskell.org
git at git.haskell.org
Tue Feb 23 11:27:38 UTC 2016
Repository : ssh://git@git.haskell.org/ghc
On branch : master
Link : http://ghc.haskell.org/trac/ghc/changeset/2aee41960aa00fe09a2cd1983e02c15e06013037/ghc
>---------------------------------------------------------------
commit 2aee41960aa00fe09a2cd1983e02c15e06013037
Author: Thomas Miedema <thomasmiedema at gmail.com>
Date: Sat Feb 20 23:50:28 2016 +0100
Allow combining characters in identifiers (#7650)
Reviewed by: austin, rwbarton
Differential Revision: https://phabricator.haskell.org/D1938
>---------------------------------------------------------------
2aee41960aa00fe09a2cd1983e02c15e06013037
compiler/basicTypes/Lexeme.hs | 5 +++--
compiler/parser/Lexer.x | 12 ++++++------
testsuite/tests/parser/unicode/T7650.hs | 11 +++++++++++
.../tests/parser/unicode/T7650.stdout | 0
testsuite/tests/parser/unicode/all.T | 1 +
5 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs
index 9e75376..22515c1 100644
--- a/compiler/basicTypes/Lexeme.hs
+++ b/compiler/basicTypes/Lexeme.hs
@@ -194,9 +194,10 @@ okIdChar c = case generalCategory c of
LowercaseLetter -> True
TitlecaseLetter -> True
ModifierLetter -> True -- See #10196
- OtherLetter -> True
+ OtherLetter -> True -- See #1103
+ NonSpacingMark -> True -- See #7650
DecimalNumber -> True
- OtherNumber -> True
+ OtherNumber -> True -- See #4373
_ -> c == '\'' || c == '_'
-- | Is this character acceptable in a symbol (after the first char)?
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 5f3bdee..3f959f2 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -155,8 +155,8 @@ $binit = 0-1
$octit = 0-7
$hexit = [$decdigit A-F a-f]
-$modifier = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
-$idchar = [$small $large $digit $modifier \']
+$uniidchar = \x07 -- Trick Alex into handling Unicode. See alexGetByte.
+$idchar = [$small $large $digit $uniidchar \']
$pragmachar = [$small $large $digit]
@@ -1874,10 +1874,10 @@ alexGetByte (AI loc s)
symbol = '\x04'
space = '\x05'
other_graphic = '\x06'
- modifier = '\x07'
+ uniidchar = '\x07'
adj_c
- | c <= '\x06' = non_graphic
+ | c <= '\x07' = non_graphic
| c <= '\x7f' = c
-- Alex doesn't handle Unicode, so when Unicode
-- character is encountered we output these values
@@ -1891,9 +1891,9 @@ alexGetByte (AI loc s)
UppercaseLetter -> upper
LowercaseLetter -> lower
TitlecaseLetter -> upper
- ModifierLetter -> modifier -- see #10196
+ ModifierLetter -> uniidchar -- see #10196
OtherLetter -> lower -- see #1103
- NonSpacingMark -> other_graphic
+ NonSpacingMark -> uniidchar -- see #7650
SpacingCombiningMark -> other_graphic
EnclosingMark -> other_graphic
DecimalNumber -> digit
diff --git a/testsuite/tests/parser/unicode/T7650.hs b/testsuite/tests/parser/unicode/T7650.hs
new file mode 100644
index 0000000..c474bc0
--- /dev/null
+++ b/testsuite/tests/parser/unicode/T7650.hs
@@ -0,0 +1,11 @@
+main = print spın̈alTap
+ where spın̈alTap = 11
+
+-- n̈ is a combining character sequence. We now allow it to be used in
+-- identifiers (#7650).
+--
+-- > map generalCategory "n̈"
+-- [LowercaseLetter,NonSpacingMark]
+--
+-- > map show "n̈"
+-- ["'n'","'\776'"]
diff --git a/libraries/ghc-prim/tests/T6026.stdout b/testsuite/tests/parser/unicode/T7650.stdout
similarity index 100%
copy from libraries/ghc-prim/tests/T6026.stdout
copy to testsuite/tests/parser/unicode/T7650.stdout
diff --git a/testsuite/tests/parser/unicode/all.T b/testsuite/tests/parser/unicode/all.T
index 6972a0d..36554cc 100644
--- a/testsuite/tests/parser/unicode/all.T
+++ b/testsuite/tests/parser/unicode/all.T
@@ -25,3 +25,4 @@ test('T7671', normal, compile, [''])
# TODO: This test ought to be run in a non-UTF8 locale, but this is not yet
# supported by the test suite (see 10907)
test('T10907', normal, compile, [''])
+test('T7650', normal, compile, [''])
More information about the ghc-commits mailing list