[Git][ghc/ghc][wip/multiline-strings] 3 commits: Factor out string processing functions
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Wed Feb 14 06:17:13 UTC 2024
Brandon Chinn pushed to branch wip/multiline-strings at Glasgow Haskell Compiler / GHC
Commits:
9000b889 by Brandon Chinn at 2024-02-10T18:18:25-08:00
Factor out string processing functions
- - - - -
f8812c6b by Brandon Chinn at 2024-02-13T22:10:50-08:00
Implement MultilineStrings
- - - - -
bfd3f99e by Brandon Chinn at 2024-02-13T22:16:59-08:00
Add docs for MultilineStrings
- - - - -
10 changed files:
- compiler/GHC/Hs/Lit.hs
- compiler/GHC/Hs/Syn/Type.hs
- compiler/GHC/HsToCore/Match/Literal.hs
- compiler/GHC/Parser/Lexer.x
- + compiler/GHC/Parser/String.hs
- compiler/Language/Haskell/Syntax/Extension.hs
- compiler/Language/Haskell/Syntax/Lit.hs
- compiler/ghc.cabal.in
- + docs/users_guide/exts/multiline_strings.rst
- utils/check-exact/ExactPrint.hs
Changes:
=====================================
compiler/GHC/Hs/Lit.hs
=====================================
@@ -46,6 +46,7 @@ import Language.Haskell.Syntax.Lit
type instance XHsChar (GhcPass _) = SourceText
type instance XHsCharPrim (GhcPass _) = SourceText
type instance XHsString (GhcPass _) = SourceText
+type instance XHsMultilineString (GhcPass _) = SourceText
type instance XHsStringPrim (GhcPass _) = SourceText
type instance XHsInt (GhcPass _) = NoExtField
type instance XHsIntPrim (GhcPass _) = SourceText
@@ -132,6 +133,7 @@ hsLitNeedsParens p = go
go (HsChar {}) = False
go (HsCharPrim {}) = False
go (HsString {}) = False
+ go (HsMultilineString {}) = False
go (HsStringPrim {}) = False
go (HsInt _ x) = p > topPrec && il_neg x
go (HsInteger _ x _) = p > topPrec && x < 0
@@ -155,6 +157,7 @@ convertLit :: HsLit (GhcPass p1) -> HsLit (GhcPass p2)
convertLit (HsChar a x) = HsChar a x
convertLit (HsCharPrim a x) = HsCharPrim a x
convertLit (HsString a x) = HsString a x
+convertLit (HsMultilineString a x) = HsMultilineString a x
convertLit (HsStringPrim a x) = HsStringPrim a x
convertLit (HsInt a x) = HsInt a x
convertLit (HsIntPrim a x) = HsIntPrim a x
@@ -192,6 +195,7 @@ instance Outputable (HsLit (GhcPass p)) where
ppr (HsChar st c) = pprWithSourceText st (pprHsChar c)
ppr (HsCharPrim st c) = pprWithSourceText st (pprPrimChar c)
ppr (HsString st s) = pprWithSourceText st (pprHsString s)
+ ppr (HsMultilineString st s) = pprWithSourceText st (pprHsString s)
ppr (HsStringPrim st s) = pprWithSourceText st (pprHsBytes s)
ppr (HsInt _ i)
= pprWithSourceText (il_text i) (integer (il_value i))
@@ -231,6 +235,7 @@ pmPprHsLit :: HsLit (GhcPass x) -> SDoc
pmPprHsLit (HsChar _ c) = pprHsChar c
pmPprHsLit (HsCharPrim _ c) = pprHsChar c
pmPprHsLit (HsString st s) = pprWithSourceText st (pprHsString s)
+pmPprHsLit (HsMultilineString st s) = pprWithSourceText st (pprHsString s)
pmPprHsLit (HsStringPrim _ s) = pprHsBytes s
pmPprHsLit (HsInt _ i) = integer (il_value i)
pmPprHsLit (HsIntPrim _ i) = integer i
=====================================
compiler/GHC/Hs/Syn/Type.hs
=====================================
@@ -74,6 +74,7 @@ hsLitType :: HsLit (GhcPass p) -> Type
hsLitType (HsChar _ _) = charTy
hsLitType (HsCharPrim _ _) = charPrimTy
hsLitType (HsString _ _) = stringTy
+hsLitType (HsMultilineString _ _) = stringTy
hsLitType (HsStringPrim _ _) = addrPrimTy
hsLitType (HsInt _ _) = intTy
hsLitType (HsIntPrim _ _) = intPrimTy
=====================================
compiler/GHC/HsToCore/Match/Literal.hs
=====================================
@@ -121,6 +121,7 @@ dsLit l = do
HsDoublePrim _ fl -> return (Lit (LitDouble (rationalFromFractionalLit fl)))
HsChar _ c -> return (mkCharExpr c)
HsString _ str -> mkStringExprFS str
+ HsMultilineString _ str -> mkStringExprFS str
HsInteger _ i _ -> return (mkIntegerExpr platform i)
HsInt _ i -> return (mkIntExpr platform (il_value i))
HsRat _ fl ty -> dsFractionalLitToRational fl ty
@@ -474,6 +475,7 @@ getSimpleIntegralLit (HsInteger _ i ty) = Just (i, ty)
getSimpleIntegralLit HsChar{} = Nothing
getSimpleIntegralLit HsCharPrim{} = Nothing
getSimpleIntegralLit HsString{} = Nothing
+getSimpleIntegralLit HsMultilineString{} = Nothing
getSimpleIntegralLit HsStringPrim{} = Nothing
getSimpleIntegralLit HsRat{} = Nothing
getSimpleIntegralLit HsFloatPrim{} = Nothing
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -87,7 +87,7 @@ import qualified GHC.Data.Strict as Strict
import Control.Monad
import Control.Applicative
import Data.Char
-import Data.List (stripPrefix, isInfixOf, partition)
+import Data.List (stripPrefix, isInfixOf, partition, unfoldr)
import Data.List.NonEmpty ( NonEmpty(..) )
import qualified Data.List.NonEmpty as NE
import Data.Maybe
@@ -130,6 +130,7 @@ import GHC.Driver.Flags
import GHC.Parser.Errors.Basic
import GHC.Parser.Errors.Types
import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.String
}
-- -----------------------------------------------------------------------------
@@ -662,7 +663,8 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
-- to convert it to a String.
<0> {
\' { lex_char_tok }
- \" { lex_string_tok }
+ \"\"\" / { ifExtension MultilineStringsBit} { lex_string_tok StringTypeMulti }
+ \" { lex_string_tok StringTypeSingle }
}
-- Note [Whitespace-sensitive operator parsing]
@@ -948,6 +950,7 @@ data Token
| ITchar SourceText Char -- Note [Literal source text] in "GHC.Types.SourceText"
| ITstring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
| ITinteger IntegralLit -- Note [Literal source text] in "GHC.Types.SourceText"
| ITrational FractionalLit
@@ -2160,33 +2163,37 @@ lex_string_prag_comment mkTok span _buf _len _buf2
-- This stuff is horrible. I hates it.
-lex_string_tok :: Action
-lex_string_tok span buf _len _buf2 = do
- s <- lex_string
+lex_string_tok :: LexStringType -> Action
+lex_string_tok strType span buf _len _buf2 = do
+ s <- lex_string strType
i <- getInput
- lex_magic_hash i >>= \case
- Just i' -> do
- when (any (> '\xFF') s) $ do
- pState <- getPState
- let msg = PsErrPrimStringInvalidChar
- let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
- addError err
-
- setInput i'
- let (psSpan, src) = getStringLoc (buf, locStart) i'
- pure $ L psSpan (ITprimstring src (unsafeMkByteString s))
- Nothing -> do
+ case strType of
+ StringTypeSingle ->
+ lex_magic_hash i >>= \case
+ Just i' -> do
+ when (any (> '\xFF') s) $ do
+ pState <- getPState
+ let msg = PsErrPrimStringInvalidChar
+ let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
+ addError err
+
+ setInput i'
+ let (psSpan, src) = getStringLoc (buf, locStart) i'
+ pure $ L psSpan (ITprimstring src (unsafeMkByteString s))
+ Nothing -> do
+ let (psSpan, src) = getStringLoc (buf, locStart) i
+ pure $ L psSpan (ITstring src (mkFastString s))
+ StringTypeMulti -> do
let (psSpan, src) = getStringLoc (buf, locStart) i
- pure $ L psSpan (ITstring src (mkFastString s))
+ pure $ L psSpan (ITmultilinestring src (mkFastString s))
where
locStart = psSpanStart span
lex_quoted_label :: Action
lex_quoted_label span buf _len _buf2 = do
- start <- getInput
- s <- lex_string_helper "" start
+ s <- lex_string StringTypeSingle
(AI end bufEnd) <- getInput
let
token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2196,54 +2203,69 @@ lex_quoted_label span buf _len _buf2 = do
return $ L (mkPsSpan start end) token
-lex_string :: P String
-lex_string = getInput >>= lex_string_helper ""
-
-
-lex_string_helper :: String -> AlexInput -> P String
-lex_string_helper s start = do
- i <- getInput
- case alexGetChar' i of
- Nothing -> lit_error i
-
- Just ('"',i) -> do
- setInput i
- return (reverse s)
-
- Just ('\\',i)
- | Just ('&',i) <- next -> do
- setInput i; lex_string_helper s start
- | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
- -- is_space only works for <= '\x7f' (#3751, #5425)
- setInput i; lex_stringgap s start
- where next = alexGetChar' i
-
- Just (c, i1) -> do
- case c of
- '\\' -> do setInput i1; c' <- lex_escape; lex_string_helper (c':s) start
- c | isAny c -> do setInput i1; lex_string_helper (c:s) start
- _other | any isDoubleSmartQuote s -> do
- -- if the built-up string s contains a smart double quote character, it was
- -- likely the reason why the string literal was not lexed correctly
- setInput start -- rewind to the first character in the string literal
- -- so we can find the smart quote character's location
- advance_to_smart_quote_character
- i2@(AI loc _) <- getInput
- case alexGetChar' i2 of
- Just (c, _) -> do add_nonfatal_smart_quote_error c loc; lit_error i
- Nothing -> lit_error i -- should never get here
- _other -> lit_error i
-
-
-lex_stringgap :: String -> AlexInput -> P String
-lex_stringgap s start = do
- i <- getInput
- c <- getCharOrFail i
- case c of
- '\\' -> lex_string_helper s start
- c | c <= '\x7f' && is_space c -> lex_stringgap s start
- -- is_space only works for <= '\x7f' (#3751, #5425)
- _other -> lit_error i
+lex_string :: LexStringType -> P String
+lex_string strType = do
+ start <- getInput
+ case lexString [] start of
+ Right (lexedStr, next) -> do
+ setInput next
+ either fromStringLexError pure $ resolveLexedString strType lexedStr
+ Left (e, s, i) -> do
+ -- see if we can find a smart quote in the string we've found so far.
+ -- if the built-up string s contains a smart double quote character, it was
+ -- likely the reason why the string literal was not lexed correctly
+ case filter (\(LexedChar c _) -> isDoubleSmartQuote c) s of
+ LexedChar c (AI loc _) : _ -> add_nonfatal_smart_quote_error c loc
+ _ -> pure ()
+
+ -- regardless whether we found a smart quote, throw a lexical error
+ setInput i >> lexError e
+ where
+ -- Given the (reversed) string we've seen so far and the current location,
+ -- return Right with the fully lexed string and the subsequent location,
+ -- or Left with the string we've seen so far and the location where lexing
+ -- failed.
+ lexString acc0 i0 = do
+ let acc = reverse acc0
+ case alexGetChar' i0 of
+ _ | Just i1 <- lexDelimiter i0 -> Right (acc, i1)
+
+ Just (c0, i1) -> do
+ let acc1 = LexedChar c0 i0 : acc0
+ case c0 of
+ '\\' -> do
+ case alexGetChar' i1 of
+ Just (c1, i2)
+ | is_space' c1 -> lexStringGap (LexedChar c1 i1 : acc1) i2
+ | otherwise -> lexString (LexedChar c1 i1 : acc1) i2
+ Nothing -> Left (LexStringCharLit, acc, i1)
+ _ | isAny c0 -> lexString acc1 i1
+ _ | strType == StringTypeMulti && c0 `elem` ['\n', '\t'] -> lexString acc1 i1
+ _ -> Left (LexStringCharLit, acc, i1)
+
+ Nothing -> Left (LexStringCharLit, acc, i0)
+
+ lexDelimiter i0 =
+ case strType of
+ StringTypeSingle -> do
+ ('"', i1) <- alexGetChar' i0
+ Just i1
+ StringTypeMulti -> do
+ ('"', i1) <- alexGetChar' i0
+ ('"', i2) <- alexGetChar' i1
+ ('"', i3) <- alexGetChar' i2
+ Just i3
+
+ lexStringGap acc0 i0 = do
+ let acc = reverse acc0
+ case alexGetChar' i0 of
+ Just (c0, i1) -> do
+ let acc1 = LexedChar c0 i0 : acc0
+ case c0 of
+ '\\' -> lexString acc1 i1
+ _ | is_space' c0 -> lexStringGap acc1 i1
+ _ -> Left (LexStringCharLit, acc, i1)
+ Nothing -> Left (LexStringCharLitEOF, acc, i0)
lex_char_tok :: Action
@@ -2264,13 +2286,16 @@ lex_char_tok span buf _len _buf2 = do -- We've seen '
return (L (mkPsSpan loc end2) ITtyQuote)
Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash
- setInput i2
- lit_ch <- lex_escape
- i3 <- getInput
- mc <- getCharOrFail i3 -- Trailing quote
- if mc == '\'' then finish_char_tok buf loc lit_ch
- else if isSingleSmartQuote mc then add_smart_quote_error mc end2
- else lit_error i3
+ (LexedChar lit_ch _, i3, _) <-
+ either fromStringLexError pure $
+ resolveEscapeCharacter (LexedChar '\\' i1) (asLexedString i2)
+ case alexGetChar' i3 of
+ Just (mc, i4)
+ | '\'' <- mc -> do
+ setInput i4
+ finish_char_tok buf loc lit_ch
+ | isSingleSmartQuote mc -> add_smart_quote_error mc end2
+ _ -> lit_error i3
Just (c, i2@(AI end2 _))
| not (isAny c) -> lit_error i1
@@ -2328,115 +2353,24 @@ isAny :: Char -> Bool
isAny c | c > '\x7f' = isPrint c
| otherwise = is_any c
-lex_escape :: P Char
-lex_escape = do
- i0@(AI loc _) <- getInput
- c <- getCharOrFail i0
- case c of
- 'a' -> return '\a'
- 'b' -> return '\b'
- 'f' -> return '\f'
- 'n' -> return '\n'
- 'r' -> return '\r'
- 't' -> return '\t'
- 'v' -> return '\v'
- '\\' -> return '\\'
- '"' -> return '\"'
- '\'' -> return '\''
- -- the next two patterns build up a Unicode smart quote error (#21843)
- smart_double_quote | isDoubleSmartQuote smart_double_quote ->
- add_smart_quote_error smart_double_quote loc
- smart_single_quote | isSingleSmartQuote smart_single_quote ->
- add_smart_quote_error smart_single_quote loc
- '^' -> do i1 <- getInput
- c <- getCharOrFail i1
- if c >= '@' && c <= '_'
- then return (chr (ord c - ord '@'))
- else lit_error i1
-
- 'x' -> readNum is_hexdigit 16 hexDigit
- 'o' -> readNum is_octdigit 8 octDecDigit
- x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
-
- c1 -> do
- i <- getInput
- case alexGetChar' i of
- Nothing -> lit_error i0
- Just (c2,i2) ->
- case alexGetChar' i2 of
- Nothing -> do lit_error i0
- Just (c3,i3) ->
- let str = [c1,c2,c3] in
- case [ (c,rest) | (p,c) <- silly_escape_chars,
- Just rest <- [stripPrefix p str] ] of
- (escape_char,[]):_ -> do
- setInput i3
- return escape_char
- (escape_char,_:_):_ -> do
- setInput i2
- return escape_char
- [] -> lit_error i0
-
-readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
-readNum is_digit base conv = do
- i <- getInput
- c <- getCharOrFail i
- if is_digit c
- then readNum2 is_digit base conv (conv c)
- else lit_error i
+-- is_space only works for <= '\x7f' (#3751, #5425)
+--
+-- TODO: why not put this logic in is_space directly?
+is_space' :: Char -> Bool
+is_space' c | c > '\x7f' = False
+ | otherwise = is_space c
+
+-- | Returns a LexedString that, when iterated, lazily queries the
+-- next character from the AlexInput.
+asLexedString :: AlexInput -> LexedString AlexInput
+asLexedString = unfoldr (fmap toLexedChar . alexGetChar')
+ where
+ toLexedChar (c, i) = (LexedChar c i, i)
-readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
-readNum2 is_digit base conv i = do
- input <- getInput
- read i input
- where read i input = do
- case alexGetChar' input of
- Just (c,input') | is_digit c -> do
- let i' = i*base + conv c
- if i' > 0x10ffff
- then setInput input >> lexError LexNumEscapeRange
- else read i' input'
- _other -> do
- setInput input; return (chr i)
-
-
-silly_escape_chars :: [(String, Char)]
-silly_escape_chars = [
- ("NUL", '\NUL'),
- ("SOH", '\SOH'),
- ("STX", '\STX'),
- ("ETX", '\ETX'),
- ("EOT", '\EOT'),
- ("ENQ", '\ENQ'),
- ("ACK", '\ACK'),
- ("BEL", '\BEL'),
- ("BS", '\BS'),
- ("HT", '\HT'),
- ("LF", '\LF'),
- ("VT", '\VT'),
- ("FF", '\FF'),
- ("CR", '\CR'),
- ("SO", '\SO'),
- ("SI", '\SI'),
- ("DLE", '\DLE'),
- ("DC1", '\DC1'),
- ("DC2", '\DC2'),
- ("DC3", '\DC3'),
- ("DC4", '\DC4'),
- ("NAK", '\NAK'),
- ("SYN", '\SYN'),
- ("ETB", '\ETB'),
- ("CAN", '\CAN'),
- ("EM", '\EM'),
- ("SUB", '\SUB'),
- ("ESC", '\ESC'),
- ("FS", '\FS'),
- ("GS", '\GS'),
- ("RS", '\RS'),
- ("US", '\US'),
- ("SP", '\SP'),
- ("DEL", '\DEL')
- ]
+fromStringLexError :: StringLexError AlexInput -> P a
+fromStringLexError = \case
+ SmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
+ StringLexError _ i e -> setInput i >> lexError e
-- before calling lit_error, ensure that the current input is pointing to
-- the position of the error in the buffer. This is so that we can report
@@ -2505,16 +2439,6 @@ quasiquote_error start = do
-- -----------------------------------------------------------------------------
-- Unicode Smart Quote detection (#21843)
-isDoubleSmartQuote :: Char -> Bool
-isDoubleSmartQuote '“' = True
-isDoubleSmartQuote '”' = True
-isDoubleSmartQuote _ = False
-
-isSingleSmartQuote :: Char -> Bool
-isSingleSmartQuote '‘' = True
-isSingleSmartQuote '’' = True
-isSingleSmartQuote _ = False
-
isSmartQuote :: AlexAccPred ExtsBitmap
isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c
@@ -3043,6 +2967,7 @@ data ExtBits
| OverloadedRecordDotBit
| OverloadedRecordUpdateBit
| ExtendedLiteralsBit
+ | MultilineStringsBit
-- Flags that are updated once parsing starts
| InRulePragBit
@@ -3123,6 +3048,7 @@ mkParserOpts extensionFlags diag_opts supported
.|. OverloadedRecordDotBit `xoptBit` LangExt.OverloadedRecordDot
.|. OverloadedRecordUpdateBit `xoptBit` LangExt.OverloadedRecordUpdate -- Enable testing via 'getBit OverloadedRecordUpdateBit' in the parser (RecordDotSyntax parsing uses that information).
.|. ExtendedLiteralsBit `xoptBit` LangExt.ExtendedLiterals
+ .|. MultilineStringsBit `xoptBit` LangExt.MultilineStrings
optBits =
HaddockBit `setBitIf` isHaddock
.|. RawTokenStreamBit `setBitIf` rawTokStream
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -0,0 +1,341 @@
+{-# LANGUAGE LambdaCase #-}
+
+module GHC.Parser.String (
+ LexedString,
+ LexedChar (..),
+ StringLexError (..),
+ LexStringType (..),
+ resolveLexedString,
+ resolveEscapeCharacter,
+
+ -- * Unicode smart quote helpers
+ isDoubleSmartQuote,
+ isSingleSmartQuote,
+) where
+
+import GHC.Prelude
+
+import Control.Monad (guard, unless, when, (>=>))
+import Data.Char (chr, isSpace, ord)
+import qualified Data.Foldable1 as Foldable1
+import qualified Data.List.NonEmpty as NonEmpty
+import Data.Maybe (listToMaybe, mapMaybe, maybeToList)
+import GHC.Parser.CharClass (
+ hexDigit,
+ is_decdigit,
+ is_hexdigit,
+ is_octdigit,
+ octDecDigit,
+ )
+import GHC.Parser.Errors.Types (LexErr (..))
+import GHC.Utils.Panic (panic)
+
+data LexStringType = StringTypeSingle | StringTypeMulti deriving (Eq)
+
+data LexedChar loc = LexedChar !Char !loc
+type LexedString loc = [LexedChar loc]
+
+unLexedChar :: LexedChar loc -> Char
+unLexedChar (LexedChar c _) = c
+
+unLexedString :: LexedString loc -> String
+unLexedString = map unLexedChar
+
+-- | Apply the given StringProcessors to the given LexedString left-to-right,
+-- and return the processed string.
+resolveLexedString ::
+ LexStringType ->
+ LexedString loc ->
+ Either (StringLexError loc) String
+resolveLexedString strType = fmap unLexedString . foldr (>=>) pure processString
+ where
+ processString =
+ case strType of
+ StringTypeSingle ->
+ [ collapseStringGaps
+ , resolveEscapeCharacters
+ ]
+ StringTypeMulti ->
+ [ collapseStringGaps
+ , resolveMultilineString
+ , resolveEscapeCharacters
+ ]
+
+data StringLexError loc
+ = SmartQuoteError !Char !loc
+ | StringLexError !Char !loc !LexErr
+
+type StringProcessor loc = LexedString loc -> Either (StringLexError loc) (LexedString loc)
+
+collapseStringGaps :: StringProcessor loc
+collapseStringGaps s0 = pure (go s0)
+ where
+ go = \case
+ [] -> []
+
+ backslash@(LexedChar '\\' _) : c : s
+ | isLexedSpace c ->
+ -- lexer should have validated that this is a valid gap,
+ -- so we'll panic if we find any invalid characters
+ case dropWhile isLexedSpace s of
+ LexedChar '\\' _ : s -> go s
+ _ -> panic $ "Invalid string gap in " ++ show (unLexedString s0)
+ | otherwise ->
+ backslash : c : go s
+
+ c : s -> c : go s
+
+resolveEscapeCharacters :: StringProcessor loc
+resolveEscapeCharacters = go
+ where
+ go = \case
+ [] -> pure []
+ backslashChar@(LexedChar '\\' _) : s -> do
+ (c, _, s') <- resolveEscapeCharacter backslashChar s
+ (c :) <$> go s'
+ c : s ->
+ (c :) <$> go s
+
+-- | After finding a backslash, parse the rest of the escape character.
+resolveEscapeCharacter ::
+ LexedChar loc -> -- the backslash character
+ LexedString loc -> -- the rest of the string to parse
+ Either
+ (StringLexError loc)
+ (LexedChar loc, loc, LexedString loc) -- the resolved escape character,
+ -- the location of the last character we parsed,
+ -- and the rest of the string
+resolveEscapeCharacter backslashChar s0 = do
+ (firstChar@(LexedChar c loc), s1) <- expectNext backslashChar s0
+ let rewrap c' = pure (LexedChar c' loc, loc, s1)
+ case c of
+ 'a' -> rewrap '\a'
+ 'b' -> rewrap '\b'
+ 'f' -> rewrap '\f'
+ 'n' -> rewrap '\n'
+ 'r' -> rewrap '\r'
+ 't' -> rewrap '\t'
+ 'v' -> rewrap '\v'
+ '\\' -> rewrap '\\'
+ '"' -> rewrap '\"'
+ '\'' -> rewrap '\''
+ -- escape codes
+ 'x' -> expectNum is_hexdigit 16 hexDigit (firstChar, s1)
+ 'o' -> expectNum is_octdigit 8 octDecDigit (firstChar, s1)
+ _ | is_decdigit c -> expectNum is_decdigit 10 octDecDigit (backslashChar, s0)
+ -- control characters (e.g. '\^M')
+ '^' -> do
+ (LexedChar c1 loc1, s2) <- expectNext firstChar s1
+ unless (c1 >= '@' && c1 <= '_') $
+ Left $ StringLexError c1 loc1 LexStringCharLit
+ let c' = chr $ ord c1 - ord '@'
+ pure (LexedChar c' loc, loc1, s2)
+ -- long form escapes (e.g. '\NUL')
+ _ | Just (c', loc', s2) <- parseLongEscape firstChar s1 -> pure (LexedChar c' loc, loc', s2)
+ -- check unicode smart quotes (#21843)
+ _ | isDoubleSmartQuote c -> Left $ SmartQuoteError c loc
+ _ | isSingleSmartQuote c -> Left $ SmartQuoteError c loc
+ -- unknown escape
+ _ -> Left $ StringLexError c loc LexStringCharLit
+ where
+ expectNext lastChar = \case
+ [] -> do
+ let LexedChar c loc = lastChar
+ Left $ StringLexError c loc LexStringCharLitEOF
+ c : cs -> pure (c, cs)
+
+ expectNum isDigit base toDigit (lastChar, s0) = do
+ (LexedChar c loc, s1) <- expectNext lastChar s0
+ unless (isDigit c) $ Left $ StringLexError c loc LexStringCharLit
+ let parseNum lastLoc x = \case
+ LexedChar c' loc' : s' | isDigit c' -> do
+ let x' = x * base + toDigit c'
+ when (x' > 0x10ffff) $ Left $ StringLexError c' loc' LexNumEscapeRange
+ parseNum loc' x' s'
+ s ->
+ pure (LexedChar (chr x) loc, lastLoc, s)
+ parseNum loc (toDigit c) s1
+
+-- | Check if the escape characters match a long escape code.
+--
+-- >>> parseLongEscape 'C' [LexedChar 'R', LexedChar 'X', ...s] = Just ('\CR', [LexedChar 'X', ...s])
+-- >>> parseLongEscape 'X' [LexedChar 'X', LexedChar 'X', ...s] = Nothing
+parseLongEscape :: LexedChar loc -> LexedString loc -> Maybe (Char, loc, LexedString loc)
+parseLongEscape (LexedChar c loc) s = listToMaybe $ mapMaybe tryParse longEscapeCodes
+ where
+ tryParse (prefix, c') = do
+ p0 : p <- pure prefix
+ guard (p0 == c) -- see if the first character matches
+ (loc', s') <- parsePrefix loc p s -- see if the rest of the prefix matches
+ pure (c', loc', s')
+
+ parsePrefix _ (p : ps) (LexedChar t loc' : ts) | p == t = parsePrefix loc' ps ts
+ parsePrefix loc' [] s' = Just (loc', s') -- we've matched the whole prefix, return the rest
+ parsePrefix _ _ _ = Nothing
+
+ longEscapeCodes =
+ [ ("NUL", '\NUL')
+ , ("SOH", '\SOH')
+ , ("STX", '\STX')
+ , ("ETX", '\ETX')
+ , ("EOT", '\EOT')
+ , ("ENQ", '\ENQ')
+ , ("ACK", '\ACK')
+ , ("BEL", '\BEL')
+ , ("BS", '\BS')
+ , ("HT", '\HT')
+ , ("LF", '\LF')
+ , ("VT", '\VT')
+ , ("FF", '\FF')
+ , ("CR", '\CR')
+ , ("SO", '\SO')
+ , ("SI", '\SI')
+ , ("DLE", '\DLE')
+ , ("DC1", '\DC1')
+ , ("DC2", '\DC2')
+ , ("DC3", '\DC3')
+ , ("DC4", '\DC4')
+ , ("NAK", '\NAK')
+ , ("SYN", '\SYN')
+ , ("ETB", '\ETB')
+ , ("CAN", '\CAN')
+ , ("EM", '\EM')
+ , ("SUB", '\SUB')
+ , ("ESC", '\ESC')
+ , ("FS", '\FS')
+ , ("GS", '\GS')
+ , ("RS", '\RS')
+ , ("US", '\US')
+ , ("SP", '\SP')
+ , ("DEL", '\DEL')
+ ]
+
+-- -----------------------------------------------------------------------------
+-- Unicode Smart Quote detection (#21843)
+
+isDoubleSmartQuote :: Char -> Bool
+isDoubleSmartQuote = \case
+ '“' -> True
+ '”' -> True
+ _ -> False
+
+isSingleSmartQuote :: Char -> Bool
+isSingleSmartQuote = \case
+ '‘' -> True
+ '’' -> True
+ _ -> False
+
+{-
+Note [Multiline string literals]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Multiline string literals were added following the acceptance of the
+proposal: https://github.com/ghc-proposals/ghc-proposals/pull/569
+
+Multiline string literals are syntax sugar for normal string literals,
+with an extra post processing step. This all happens in the Lexer; that
+is, HsMultilineString will contain the post-processed string. This matches
+the same behavior as HsString, which contains the normalized string
+(see Note [Literal source text]).
+
+The string is post-processed with the following steps:
+1. Collapse string gaps
+2. Split the string by newlines
+3. Convert leading tabs into spaces
+ * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
+4. Remove common whitespace prefix in every line
+ * See below
+5. Join the string back with `\n` delimiters
+6. If the first character of the string is a newline, remove it
+7. Interpret escaped characters
+
+The common whitespace prefix can be informally defined as "The longest
+prefix of whitespace shared by all lines in the string, excluding the
+first line and any whitespace-only lines".
+
+It's more precisely defined with the following algorithm:
+
+1. Take a list representing the lines in the string
+2. Ignore the following elements in the list:
+ * The first line (we want to ignore everything before the first newline)
+ * Empty lines
+ * Lines with only whitespace characters
+3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
+-}
+
+-- | A lexed line, with the string and the location info of the ending newline
+-- character, if one exists
+data LexedLine loc = LexedLine !(LexedString loc) (Maybe loc)
+
+mapLine :: (LexedString loc -> LexedString loc) -> LexedLine loc -> LexedLine loc
+mapLine f (LexedLine line nl) = LexedLine (f line) nl
+
+mapLines :: (LexedString loc -> LexedString loc) -> [LexedLine loc] -> [LexedLine loc]
+mapLines f = map (mapLine f)
+
+filterLines :: (LexedString loc -> Bool) -> [LexedLine loc] -> [LexedLine loc]
+filterLines f = filter (\(LexedLine line _) -> f line)
+
+splitLines :: LexedString loc -> [LexedLine loc]
+splitLines =
+ foldr
+ ( curry $ \case
+ (LexedChar '\n' loc, ls) -> LexedLine [] (Just loc) : ls
+ (c, l : ls) -> mapLine (c :) l : ls
+ (c, []) -> LexedLine [c] Nothing : [] -- should not happen
+ )
+ [emptyLine]
+ where
+ emptyLine = LexedLine [] Nothing
+
+joinLines :: [LexedLine loc] -> LexedString loc
+joinLines = concatMap (\(LexedLine line nl) -> line ++ maybeToList (LexedChar '\n' <$> nl))
+
+-- | See Note [Multiline string literals]
+resolveMultilineString :: StringProcessor loc
+resolveMultilineString = pure . process
+ where
+ (.>) :: (a -> b) -> (b -> c) -> (a -> c)
+ (.>) = flip (.)
+
+ process =
+ splitLines
+ .> convertLeadingTabs
+ .> rmCommonWhitespacePrefix
+ .> joinLines
+ .> rmFirstNewline
+
+ convertLeadingTabs =
+ let convertLine col = \case
+ [] -> []
+ c@(LexedChar ' ' _) : cs -> c : convertLine (col + 1) cs
+ LexedChar '\t' loc : cs ->
+ let fill = 8 - (col `mod` 8)
+ in replicate fill (LexedChar ' ' loc) ++ convertLine (col + fill) cs
+ c : cs -> c : cs
+ in mapLines (convertLine 0)
+
+ rmCommonWhitespacePrefix strLines =
+ let
+ excludeLines =
+ drop 1 -- ignore first line
+ .> filterLines (not . all isLexedSpace) -- ignore lines that are all whitespace
+ commonWSPrefix =
+ case NonEmpty.nonEmpty (excludeLines strLines) of
+ Nothing -> 0
+ Just strLines' ->
+ Foldable1.minimum $
+ flip NonEmpty.map strLines' $ \(LexedLine line _) ->
+ length $ takeWhile isLexedSpace line
+ in
+ mapLines (drop commonWSPrefix) strLines
+
+ rmFirstNewline = \case
+ LexedChar '\n' _ : s -> s
+ s -> s
+
+-- -----------------------------------------------------------------------------
+-- Helpers
+
+isLexedSpace :: LexedChar loc -> Bool
+isLexedSpace = isSpace . unLexedChar
=====================================
compiler/Language/Haskell/Syntax/Extension.hs
=====================================
@@ -565,6 +565,7 @@ type family XXApplicativeArg x
type family XHsChar x
type family XHsCharPrim x
type family XHsString x
+type family XHsMultilineString x
type family XHsStringPrim x
type family XHsInt x
type family XHsIntPrim x
=====================================
compiler/Language/Haskell/Syntax/Lit.hs
=====================================
@@ -54,6 +54,8 @@ data HsLit x
-- ^ Unboxed character
| HsString (XHsString x) {- SourceText -} FastString
-- ^ String
+ | HsMultilineString (XHsMultilineString x) {- SourceText -} FastString
+ -- ^ String
| HsStringPrim (XHsStringPrim x) {- SourceText -} !ByteString
-- ^ Packed bytes
| HsInt (XHsInt x) IntegralLit
=====================================
compiler/ghc.cabal.in
=====================================
@@ -609,6 +609,7 @@ Library
GHC.Parser.HaddockLex
GHC.Parser.PostProcess
GHC.Parser.PostProcess.Haddock
+ GHC.Parser.String
GHC.Parser.Types
GHC.Parser.Utils
GHC.Platform
=====================================
docs/users_guide/exts/multiline_strings.rst
=====================================
@@ -0,0 +1,17 @@
+.. _multiline-strings:
+
+Multiline string literals
+-------------------------
+
+.. extension:: MultilineStrings
+ :shortdesc: Enable multiline string literals.
+
+ :since: 9.10.1
+
+ Enable multiline string literals.
+
+With this extension, GHC now recognizes multiline string literals with ``"""`` delimiters. Indentation is automatically stripped, and gets desugared to normal string literals, so it works as expected for ``OverloadedStrings`` and any other functionality.
+
+TODO: explain removing common whitespace prefix
+TODO: add full spec
+TODO: add examples
=====================================
utils/check-exact/ExactPrint.hs
=====================================
@@ -4923,6 +4923,7 @@ hsLit2String lit =
HsChar src v -> toSourceTextWithSuffix src v ""
HsCharPrim src p -> toSourceTextWithSuffix src p ""
HsString src v -> toSourceTextWithSuffix src v ""
+ HsMultilineString src v -> toSourceTextWithSuffix src v ""
HsStringPrim src v -> toSourceTextWithSuffix src v ""
HsInt _ (IL src _ v) -> toSourceTextWithSuffix src v ""
HsIntPrim src v -> toSourceTextWithSuffix src v ""
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/3dc71170704814a77172d2a5655a930c97afe17b...bfd3f99e84734f697253b2e226cf1a47ded46ee2
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/3dc71170704814a77172d2a5655a930c97afe17b...bfd3f99e84734f697253b2e226cf1a47ded46ee2
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240214/e540ce8d/attachment-0001.html>
More information about the ghc-commits
mailing list