[Git][ghc/ghc][master] Give better errors for code corrupted by Unicode smart quotes (#21843)
Marge Bot (@marge-bot)
gitlab at gitlab.haskell.org
Sat Nov 19 08:22:34 UTC 2022
Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
b0ac3813 by Lawton Nichols at 2022-11-19T03:22:14-05:00
Give better errors for code corrupted by Unicode smart quotes (#21843)
Previously, we emitted a generic and potentially confusing error during lexical
analysis on programs containing smart quotes (“/”/‘/’). This commit adds
smart quote-aware lexer errors.
- - - - -
17 changed files:
- compiler/GHC/Parser/Errors/Ppr.hs
- compiler/GHC/Parser/Errors/Types.hs
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Types/Error/Codes.hs
- + testsuite/tests/parser/should_fail/T21843a.hs
- + testsuite/tests/parser/should_fail/T21843a.stderr
- + testsuite/tests/parser/should_fail/T21843b.hs
- + testsuite/tests/parser/should_fail/T21843b.stderr
- + testsuite/tests/parser/should_fail/T21843c.hs
- + testsuite/tests/parser/should_fail/T21843c.stderr
- + testsuite/tests/parser/should_fail/T21843d.hs
- + testsuite/tests/parser/should_fail/T21843d.stderr
- + testsuite/tests/parser/should_fail/T21843e.hs
- + testsuite/tests/parser/should_fail/T21843e.stderr
- + testsuite/tests/parser/should_fail/T21843f.hs
- + testsuite/tests/parser/should_fail/T21843f.stderr
- testsuite/tests/parser/should_fail/all.T
Changes:
=====================================
compiler/GHC/Parser/Errors/Ppr.hs
=====================================
@@ -513,6 +513,16 @@ instance Diagnostic PsMessage where
, nest 2 $ text "but" <+> quotes (ppr tycon) <+> text "has" <+> speakN n ]
, text "In the newtype declaration for" <+> quotes (ppr tycon) ]
+ PsErrUnicodeCharLooksLike bad_char looks_like_char looks_like_char_name
+ -> mkSimpleDecorated $
+ hsep [ text "Unicode character"
+ -- purposefully not using `quotes (text [bad_char])`, because the quotes function adds smart quotes,
+ -- and smart quotes may be the topic of this error message
+ , text "'" <> text [bad_char] <> text "' (" <> text (show bad_char) <> text ")"
+ , text "looks like"
+ , text "'" <> text [looks_like_char] <> text "' (" <> text looks_like_char_name <> text ")" <> comma
+ , text "but it is not" ]
+
diagnosticReason = \case
PsUnknownMessage m -> diagnosticReason m
PsHeaderMessage m -> psHeaderMessageReason m
@@ -630,6 +640,7 @@ instance Diagnostic PsMessage where
PsErrIllegalGadtRecordMultiplicity{} -> ErrorWithoutFlag
PsErrInvalidCApiImport {} -> ErrorWithoutFlag
PsErrMultipleConForNewtype {} -> ErrorWithoutFlag
+ PsErrUnicodeCharLooksLike{} -> ErrorWithoutFlag
diagnosticHints = \case
PsUnknownMessage m -> diagnosticHints m
@@ -800,6 +811,7 @@ instance Diagnostic PsMessage where
PsErrIllegalGadtRecordMultiplicity{} -> noHints
PsErrInvalidCApiImport {} -> noHints
PsErrMultipleConForNewtype {} -> noHints
+ PsErrUnicodeCharLooksLike{} -> noHints
diagnosticCode = constructorCode
=====================================
compiler/GHC/Parser/Errors/Types.hs
=====================================
@@ -466,6 +466,11 @@ data PsMessage
| PsErrMultipleConForNewtype !RdrName !Int
+ | PsErrUnicodeCharLooksLike
+ Char -- ^ the problematic character
+ Char -- ^ the character it looks like
+ String -- ^ the name of the character that it looks like
+
deriving Generic
-- | Extra details about a parse error, which helps
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -274,6 +274,9 @@ $tab { warnTab }
"-- " / { atEOL } { lineCommentToken }
+-- Everywhere: check for smart quotes--they are not allowed outside of strings
+$unigraphic / { isSmartQuote } { smart_quote_error }
+
-- 'bol' state: beginning of a line. Slurp up all the whitespace (including
-- blank lines) until we find a non-whitespace character, then do layout
-- processing.
@@ -2037,7 +2040,8 @@ lex_string_tok span buf _len _buf2 = do
lex_quoted_label :: Action
lex_quoted_label span _buf _len _buf2 = do
- s <- lex_string_helper ""
+ start <- getInput
+ s <- lex_string_helper "" start
(AI end _) <- getInput
let
token = ITlabelvarid (mkFastString s)
@@ -2050,7 +2054,8 @@ data LexedString = LexedRegularString String | LexedPrimString String
lex_string :: P LexedString
lex_string = do
- s <- lex_string_helper ""
+ start <- getInput
+ s <- lex_string_helper "" start
magicHash <- getBit MagicHashBit
if magicHash
then do
@@ -2070,8 +2075,8 @@ lex_string = do
return $ LexedRegularString s
-lex_string_helper :: String -> P String
-lex_string_helper s = do
+lex_string_helper :: String -> AlexInput -> P String
+lex_string_helper s start = do
i <- getInput
case alexGetChar' i of
Nothing -> lit_error i
@@ -2082,26 +2087,36 @@ lex_string_helper s = do
Just ('\\',i)
| Just ('&',i) <- next -> do
- setInput i; lex_string_helper s
+ setInput i; lex_string_helper s start
| Just (c,i) <- next, c <= '\x7f' && is_space c -> do
-- is_space only works for <= '\x7f' (#3751, #5425)
- setInput i; lex_stringgap s
+ setInput i; lex_stringgap s start
where next = alexGetChar' i
Just (c, i1) -> do
case c of
- '\\' -> do setInput i1; c' <- lex_escape; lex_string_helper (c':s)
- c | isAny c -> do setInput i1; lex_string_helper (c:s)
+ '\\' -> do setInput i1; c' <- lex_escape; lex_string_helper (c':s) start
+ c | isAny c -> do setInput i1; lex_string_helper (c:s) start
+ _other | any isDoubleSmartQuote s -> do
+ -- if the built-up string s contains a smart double quote character, it was
+ -- likely the reason why the string literal was not lexed correctly
+ setInput start -- rewind to the first character in the string literal
+ -- so we can find the smart quote character's location
+ advance_to_smart_quote_character
+ i2@(AI loc _) <- getInput
+ case alexGetChar' i2 of
+ Just (c, _) -> do add_nonfatal_smart_quote_error c loc; lit_error i
+ Nothing -> lit_error i -- should never get here
_other -> lit_error i
-lex_stringgap :: String -> P String
-lex_stringgap s = do
+lex_stringgap :: String -> AlexInput -> P String
+lex_stringgap s start = do
i <- getInput
c <- getCharOrFail i
case c of
- '\\' -> lex_string_helper s
- c | c <= '\x7f' && is_space c -> lex_stringgap s
+ '\\' -> lex_string_helper s start
+ c | c <= '\x7f' && is_space c -> lex_stringgap s start
-- is_space only works for <= '\x7f' (#3751, #5425)
_other -> lit_error i
@@ -2123,15 +2138,16 @@ lex_char_tok span buf _len _buf2 = do -- We've seen '
setInput i2
return (L (mkPsSpan loc end2) ITtyQuote)
- Just ('\\', i2@(AI _end2 _)) -> do -- We've seen 'backslash
+ Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash
setInput i2
lit_ch <- lex_escape
i3 <- getInput
mc <- getCharOrFail i3 -- Trailing quote
if mc == '\'' then finish_char_tok buf loc lit_ch
- else lit_error i3
+ else if isSingleSmartQuote mc then add_smart_quote_error mc end2
+ else lit_error i3
- Just (c, i2@(AI _end2 _))
+ Just (c, i2@(AI end2 _))
| not (isAny c) -> lit_error i1
| otherwise ->
@@ -2141,6 +2157,7 @@ lex_char_tok span buf _len _buf2 = do -- We've seen '
Just ('\'', i3) -> do -- We've seen 'x'
setInput i3
finish_char_tok buf loc c
+ Just (c, _) | isSingleSmartQuote c -> add_smart_quote_error c end2
_other -> do -- We've seen 'x not followed by quote
-- (including the possibility of EOF)
-- Just parse the quote only
@@ -2171,7 +2188,7 @@ isAny c | c > '\x7f' = isPrint c
lex_escape :: P Char
lex_escape = do
- i0 <- getInput
+ i0@(AI loc _) <- getInput
c <- getCharOrFail i0
case c of
'a' -> return '\a'
@@ -2184,6 +2201,11 @@ lex_escape = do
'\\' -> return '\\'
'"' -> return '\"'
'\'' -> return '\''
+ -- the next two patterns build up a Unicode smart quote error (#21843)
+ smart_double_quote | isDoubleSmartQuote smart_double_quote ->
+ add_smart_quote_error smart_double_quote loc
+ smart_single_quote | isSingleSmartQuote smart_single_quote ->
+ add_smart_quote_error smart_single_quote loc
'^' -> do i1 <- getInput
c <- getCharOrFail i1
if c >= '@' && c <= '_'
@@ -2338,6 +2360,49 @@ quasiquote_error start = do
reportLexError start (psRealLoc end) buf
(\k srcLoc -> mkPlainErrorMsgEnvelope srcLoc (PsErrLexer LexUnterminatedQQ k))
+-- -----------------------------------------------------------------------------
+-- Unicode Smart Quote detection (#21843)
+
+isDoubleSmartQuote :: Char -> Bool
+isDoubleSmartQuote '“' = True
+isDoubleSmartQuote '”' = True
+isDoubleSmartQuote _ = False
+
+isSingleSmartQuote :: Char -> Bool
+isSingleSmartQuote '‘' = True
+isSingleSmartQuote '’' = True
+isSingleSmartQuote _ = False
+
+isSmartQuote :: AlexAccPred ExtsBitmap
+isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c
+
+smart_quote_error_message :: Char -> PsLoc -> MsgEnvelope PsMessage
+smart_quote_error_message c loc =
+ let (correct_char, correct_char_name) =
+ if isSingleSmartQuote c then ('\'', "Single Quote") else ('"', "Quotation Mark")
+ err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $
+ PsErrUnicodeCharLooksLike c correct_char correct_char_name in
+ err
+
+smart_quote_error :: Action
+smart_quote_error span buf _len _buf2 = do
+ let c = currentChar buf
+ addFatalError (smart_quote_error_message c (psSpanStart span))
+
+add_smart_quote_error :: Char -> PsLoc -> P a
+add_smart_quote_error c loc = addFatalError (smart_quote_error_message c loc)
+
+add_nonfatal_smart_quote_error :: Char -> PsLoc -> P ()
+add_nonfatal_smart_quote_error c loc = addError (smart_quote_error_message c loc)
+
+advance_to_smart_quote_character :: P ()
+advance_to_smart_quote_character = do
+ i <- getInput
+ case alexGetChar' i of
+ Just (c, _) | isDoubleSmartQuote c -> return ()
+ Just (_, i2) -> do setInput i2; advance_to_smart_quote_character
+ Nothing -> return () -- should never get here
+
-- -----------------------------------------------------------------------------
-- Warnings
=====================================
compiler/GHC/Types/Error/Codes.hs
=====================================
@@ -267,6 +267,7 @@ type family GhcDiagnosticCode c = n | n -> c where
GhcDiagnosticCode "PsErrIllegalGadtRecordMultiplicity" = 37475
GhcDiagnosticCode "PsErrInvalidCApiImport" = 72744
GhcDiagnosticCode "PsErrMultipleConForNewtype" = 05380
+ GhcDiagnosticCode "PsErrUnicodeCharLooksLike" = 31623
-- Driver diagnostic codes
GhcDiagnosticCode "DriverMissingHomeModules" = 32850
=====================================
testsuite/tests/parser/should_fail/T21843a.hs
=====================================
@@ -0,0 +1,3 @@
+module UnicodeSmartQuotes where
+
+badString = “hello”
=====================================
testsuite/tests/parser/should_fail/T21843a.stderr
=====================================
@@ -0,0 +1,4 @@
+
+T21843a.hs:3:13: [GHC-31623]
+ Unicode character '“' ('/8220') looks like '"' (Quotation Mark), but it is not
+
=====================================
testsuite/tests/parser/should_fail/T21843b.hs
=====================================
@@ -0,0 +1,3 @@
+module UnicodeSmartQuotes where
+
+badChar = ‘x’
=====================================
testsuite/tests/parser/should_fail/T21843b.stderr
=====================================
@@ -0,0 +1,3 @@
+
+T21843b.hs:3:11: [GHC-31623]
+ Unicode character '‘' ('/8216') looks like ''' (Single Quote), but it is not
=====================================
testsuite/tests/parser/should_fail/T21843c.hs
=====================================
@@ -0,0 +1,3 @@
+module UnicodeSmartQuotes where
+
+badString = "hello”
=====================================
testsuite/tests/parser/should_fail/T21843c.stderr
=====================================
@@ -0,0 +1,6 @@
+
+T21843c.hs:3:19: [GHC-31623]
+ Unicode character '”' ('/8221') looks like '"' (Quotation Mark), but it is not
+
+T21843c.hs:3:20: [GHC-21231]
+ lexical error in string/character literal at character '/n'
=====================================
testsuite/tests/parser/should_fail/T21843d.hs
=====================================
@@ -0,0 +1,4 @@
+module UnicodeSmartQuotes where
+
+badChar = 'x’
+
=====================================
testsuite/tests/parser/should_fail/T21843d.stderr
=====================================
@@ -0,0 +1,3 @@
+
+T21843d.hs:3:13: [GHC-31623]
+ Unicode character '’' ('/8217') looks like ''' (Single Quote), but it is not
=====================================
testsuite/tests/parser/should_fail/T21843e.hs
=====================================
@@ -0,0 +1,3 @@
+module UnicodeSmartQuotes where
+
+badString = "\”"
=====================================
testsuite/tests/parser/should_fail/T21843e.stderr
=====================================
@@ -0,0 +1,3 @@
+
+T21843e.hs:3:15: [GHC-31623]
+ Unicode character '”' ('/8221') looks like '"' (Quotation Mark), but it is not
=====================================
testsuite/tests/parser/should_fail/T21843f.hs
=====================================
@@ -0,0 +1,3 @@
+module UnicodeSmartQuotes where
+
+badChar = '\‘'
=====================================
testsuite/tests/parser/should_fail/T21843f.stderr
=====================================
@@ -0,0 +1,3 @@
+
+T21843f.hs:3:13: [GHC-31623]
+ Unicode character '‘' ('/8216') looks like ''' (Single Quote), but it is not
=====================================
testsuite/tests/parser/should_fail/all.T
=====================================
@@ -209,3 +209,9 @@ test('T20385A', normal, compile_fail, [''])
test('T20385B', normal, compile_fail, [''])
test('T16999', normal, compile_fail, [''])
test('T22070', normal, compile_fail, [''])
+test('T21843a', normal, compile_fail, [''])
+test('T21843b', normal, compile_fail, [''])
+test('T21843c', normal, compile_fail, [''])
+test('T21843d', normal, compile_fail, [''])
+test('T21843e', normal, compile_fail, [''])
+test('T21843f', normal, compile_fail, [''])
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b0ac38133767a8ca7de63112f39436241ff435a0
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b0ac38133767a8ca7de63112f39436241ff435a0
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20221119/0faef5e7/attachment-0001.html>
More information about the ghc-commits
mailing list