[Git][ghc/ghc][wip/interpolated-strings] Implement interpolated strings
Brandon Chinn (@brandonchinn178)
gitlab at gitlab.haskell.org
Sun Nov 24 00:25:38 UTC 2024
Brandon Chinn pushed to branch wip/interpolated-strings at Glasgow Haskell Compiler / GHC
Commits:
f7cdd5a8 by Brandon Chinn at 2024-11-23T16:22:10-08:00
Implement interpolated strings
- - - - -
7 changed files:
- compiler/GHC/Hs/Expr.hs
- compiler/GHC/Parser.y
- compiler/GHC/Parser/Lexer.x
- compiler/GHC/Parser/String.hs
- compiler/GHC/Rename/Expr.hs
- compiler/Language/Haskell/Syntax/Expr.hs
- compiler/Language/Haskell/Syntax/Extension.hs
Changes:
=====================================
compiler/GHC/Hs/Expr.hs
=====================================
@@ -254,6 +254,9 @@ type instance XIPVar GhcRn = NoExtField
type instance XIPVar GhcTc = DataConCantHappen
type instance XOverLitE (GhcPass _) = NoExtField
type instance XLitE (GhcPass _) = NoExtField
+type instance XInterString (GhcPass _) = NoExtField
+type instance XInterStringRaw (GhcPass _) = NoExtField
+type instance XInterStringExp (GhcPass _) = NoExtField
type instance XLam (GhcPass _) = EpAnnLam
type instance XApp (GhcPass _) = NoExtField
=====================================
compiler/GHC/Parser.y
=====================================
@@ -729,6 +729,11 @@ are the most common patterns, rewritten as regular expressions for clarity:
CHAR { L _ (ITchar _ _) }
STRING { L _ (ITstring _ _ StringTypeSingle) }
STRING_MULTI { L _ (ITstring _ _ StringTypeMulti) }
+ STRING_INTER_BEGIN { L _ (ITstringInterBegin _) }
+ STRING_INTER_RAW { L _ (ITstringInterRaw _ _) }
+ STRING_INTER_EXP_OPEN { L _ ITstringInterExpOpen }
+ STRING_INTER_EXP_CLOSE { L _ ITstringInterExpClose }
+ STRING_INTER_END { L _ (ITstringInterEnd _) }
INTEGER { L _ (ITinteger _) }
RATIONAL { L _ (ITrational _) }
@@ -3113,6 +3118,8 @@ aexp2 :: { ECP }
-- into HsOverLit when -XOverloadedStrings is on.
-- | STRING { sL (getLoc $1) (HsOverLit $! mkHsIsString (getSTRINGs $1)
-- (getSTRING $1) noExtField) }
+ | stringInter {% fmap ecpFromExp
+ (ams1 $1 (HsInterString NoExtField $! unLoc $1)) }
| INTEGER { ECP $ mkHsOverLitPV (sL1a $1 $ mkHsIntegral (getINTEGER $1)) }
| RATIONAL { ECP $ mkHsOverLitPV (sL1a $1 $ mkHsFractional (getRATIONAL $1)) }
@@ -3707,6 +3714,14 @@ ipvar :: { Located HsIPName }
overloaded_label :: { Located (SourceText, FastString) }
: LABELVARID { sL1 $1 (getLABELVARIDs $1, getLABELVARID $1) }
+-----------------------------------------------------------------------------
+-- Interpolated strings
+-- See Note [Interpolated strings] in GHC.Parser.String
+
+stringInter :: { [Either FastString (LHsExpr GhcPs)] }
+ -- TODO(bchinn): break out recursive stringInterPart rule
+ : STRING_INTER_BEGIN (STRING_INTER_RAW | STRING_INTER_EXP_OPEN exp STRING_INTER_EXP_CLOSE)* STRING_INTER_END { undefined }
+
-----------------------------------------------------------------------------
-- Warnings and deprecations
=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -170,7 +170,7 @@ $idchar = [$small $large $digit $uniidchar \']
$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
-$charesc = [a b f n r t v \\ \" \' \&]
+$charesc = [a b f n r t v \\ \" \' \& \$]
$binit = 0-1
$octit = 0-7
@@ -228,8 +228,9 @@ $docsym = [\| \^ \* \$]
-- character sets can be subtracted, not strings
@escape = \\ ( $charesc | @ascii | @decimal | o @octal | x @hexadecimal )
@escapechar = \\ ( $charesc # \& | @ascii | @decimal | o @octal | x @hexadecimal )
- at stringchar = ($graphic # [\\ \"]) | $space | @escape | @gap
- at char = ($graphic # [\\ \']) | $space | @escapechar
+ at stringchar = ($graphic # [\\ \"]) | $space | @escape | @gap
+ at char = ($graphic # [\\ \']) | $space | @escapechar
+ at stringinterchar = ($graphic # [\\ \" \$]) | $space | @escape | @gap
-- normal signed numerical literals can only be explicitly negative,
-- not explicitly positive (contrast @exponent)
@@ -629,6 +630,21 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
(\" | \"\") / ([\n .] # \") { tok_string_multi_content }
}
+-- See Note [Interpolated strings]
+<0> {
+ s \" { \span _ _ _ -> pushLexState string_inter_content >> pure (L span (ITstringInterBegin StringTypeSingle)) }
+ -- TODO(bchinn): interpolated multiline strings
+}
+
+-- TODO(bchinn): add string_inter state to all <0> states that can be in an interpolated string
+<string_inter_content> {
+ @stringinterchar* { tok_string_inter_raw }
+ \$ \{ { \span _ _ _ -> pushLexState string_inter >> pure (L span ITstringInterExpOpen) }
+ \" { \span _ _ _ -> popLexState >> pure (L span (ITstringInterEnd StringTypeSingle)) }
+
+ -- TODO(bchinn): check for smart quotes
+}
+
<0> {
\'\' { token ITtyQuote }
@@ -923,6 +939,13 @@ data Token
| ITchar SourceText Char -- Note [Literal source text] in "GHC.Types.SourceText"
| ITstring SourceText FastString StringType -- Note [Literal source text] in "GHC.Types.SourceText"
+ -- See Note [Interpolated strings]
+ | ITstringInterBegin StringType
+ | ITstringInterRaw SourceText RawLexedString -- Note [Literal source text] in "GHC.Types.SourceText"
+ | ITstringInterExpOpen
+ | ITstringInterExpClose
+ | ITstringInterEnd StringType
+
| ITinteger IntegralLit -- Note [Literal source text] in "GHC.Types.SourceText"
| ITrational FractionalLit
@@ -1672,8 +1695,11 @@ open_brace span _str _len _buf2 = do
setContext (NoLayout:ctx)
return (L span ITocurly)
close_brace span _str _len _buf2 = do
- popContext
- return (L span ITccurly)
+ ctx <- getContext
+ sc <- getLexState
+ if null ctx && sc == string_inter
+ then popLexState >> pure (L span ITstringInterExpClose)
+ else popContext >> pure (L span ITccurly)
qvarid, qconid :: StringBuffer -> Int -> Token
qvarid buf len = ITqvarid $! splitQualName buf len False
=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -8,6 +8,10 @@ module GHC.Parser.String (
StringType (..),
lexString,
+ -- * Raw strings
+ RawLexedString,
+ lexStringRaw,
+
-- * Unicode smart quote helpers
isDoubleSmartQuote,
isSingleSmartQuote,
@@ -17,8 +21,11 @@ import GHC.Prelude hiding (getChar)
import Control.Arrow ((>>>))
import Control.Monad (when)
+import Data.Bifunctor (first)
import Data.Char (chr, ord)
import qualified Data.Foldable1 as Foldable1
+import Data.Functor.Identity (Identity (..))
+import Data.List (unsnoc)
import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (listToMaybe, mapMaybe)
import GHC.Data.StringBuffer (StringBuffer)
@@ -259,6 +266,75 @@ isSingleSmartQuote = \case
'’' -> True
_ -> False
+-- -----------------------------------------------------------------------------
+-- Interpolated strings
+
+-- | A string that's been validated to be lexically correct, but still
+-- contains the raw string lexed, without anything resolved.
+newtype RawLexedString = RawLexedString {unRawLexedString :: String}
+
+-- | Load and validate the string in the given StringBuffer.
+--
+-- e.g. Lexing "a\nb" will return RawLexedString ['a', '\\', 'n', 'b'].
+lexStringRaw :: Int -> StringBuffer -> Either StringLexError RawLexedString
+lexStringRaw len buf = RawLexedString (bufferChars len buf) <$ validateString len buf
+
+fromRawLexedStringSingle :: RawLexedString -> String
+fromRawLexedStringSingle (RawLexedString s) =
+ case processCharsSingle s of
+ Right s' -> s'
+ Left _ -> panic "Unexpectedly got an error when re-lexing the string"
+
+fromRawLexedStringMulti :: (RawLexedString, [(x, RawLexedString)]) -> (String, [(x, String)])
+fromRawLexedStringMulti s =
+ case processCharsMulti (to s) of
+ Just s' -> from s'
+ Nothing -> panic "Unexpectedly got an error when re-lexing the string"
+ where
+ to (pre, parts) = InterMultiString pre parts
+ from (InterMultiString pre parts) = (pre, parts)
+
+{-
+Note [Interpolated strings]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Interpolated string syntax was accepted in this proposal:
+https://github.com/ghc-proposals/ghc-proposals/pull/570
+
+Interpolated strings are syntax sugar for <TODO(bchinn)>
+
+Interpolated strings are implemented in the following manner:
+
+1. Lexer takes the string as input:
+
+ s"Hello ${Text.toUpper name}!"
+
+ and outputs the following tokens:
+
+ [ ITstringInterBegin src StringTypeSingle
+ , ITstringInterRaw src "Hello "
+ , ITstringInterExpOpen src
+ , ITqvarid ("Text.toUpper", "name")
+ , ITvarid "name"
+ , ITstringInterExpClose src
+ , ITstringInterRaw src "!"
+ , ITstringInterEnd src StringTypeSingle
+ ]
+
+2. The parser will then parse the tokens into the following HsExpr:
+
+ HsInterString ext
+ [ HsInterRaw ext "Hello "
+ , HsInterExp ext $
+ HsApp ext
+ (HsVar ext 'Text.toUpper)
+ (HsVar ext 'name)
+ , HsInterRaw ext "!"
+ ]
+
+3. This will then be desugared into <TODO(bchinn)>
+-}
+
-- -----------------------------------------------------------------------------
-- Multiline strings
@@ -268,20 +344,54 @@ isSingleSmartQuote = \case
-- and rejoining lines, and instead manually find newline characters,
-- for performance.
processCharsMulti :: String -> Maybe String
-processCharsMulti =
- collapseGaps -- Step 1
- >>> normalizeEOL
- >>> expandLeadingTabs -- Step 3
- >>> rmCommonWhitespacePrefix -- Step 4
- >>> collapseOnlyWsLines -- Step 5
- >>> rmFirstNewline -- Step 7a
- >>> rmLastNewline -- Step 7b
- >>> resolveEscapesMaybe -- Step 8
+processCharsMulti = fmap from . processCharsMulti' . to
+ where
+ -- Convert a normal multiline string to/from an interpolated multiline string
+ -- with no interpolated expressions.
+ to s = InterMultiString s []
+ from = \case
+ InterMultiString s [] -> s
+ _ -> panic "Got unexpected result when processing characters in multiline string"
+
+-- | An interpolated, multiline string to be processed.
+--
+-- `x` here will only ever be instantiated as `HsExpr`, but we'll leave it general to ensure
+-- we never modify it, we only ever propagate it.
+--
+-- We represent this as a list of (x, String) tuples instead of [Either x String] to guarantee
+-- that we don't have to handle two raw Strings next to each other.
+data InterMultiString x =
+ InterMultiString
+ String -- ^ beginning of the string before the first interpolated expr
+ [(x, String)] -- ^ (expr, raw string) interleaved groups
+
+-- Run the given function over all raw strings, ignoring expressions
+overRaw :: (String -> String) -> InterMultiString x -> InterMultiString x
+overRaw f = runIdentity . overRawM (Identity . f)
+
+overRawM :: Monad m => (String -> m String) -> InterMultiString x -> m (InterMultiString x)
+overRawM f (InterMultiString pre parts) = InterMultiString <$> f pre <*> (traverse . traverse) f parts
+
+-- | Process multiline characters generally, for both normal multiline strings and interpolated
+-- multiline strings.
+processCharsMulti' :: InterMultiString x -> Maybe (InterMultiString x)
+processCharsMulti' =
+ overRaw collapseGaps -- Step 1
+ >>> overRaw normalizeEOL
+ >>> expandLeadingTabs -- Step 3
+ >>> rmCommonWhitespacePrefix -- Step 4
+ >>> collapseOnlyWsLines -- Step 5
+ >>> rmFirstNewline -- Step 7a
+ >>> rmLastNewline -- Step 7b
+ >>> overRawM resolveEscapesMaybe -- Step 8
-- | Expands all tabs blindly, since the lexer will verify that tabs can only appear
-- as leading indentation
-expandLeadingTabs :: String -> String
-expandLeadingTabs = go 0
+expandLeadingTabs :: InterMultiString x -> InterMultiString x
+expandLeadingTabs =
+ -- we can expand each raw string part independently, because leading
+ -- indentation will never contain an interpolated expression
+ overRaw $ go 0
where
go !col = \case
c@'\t' : cs ->
@@ -303,21 +413,27 @@ normalizeEOL = go
c : cs -> c : go cs
[] -> []
-rmCommonWhitespacePrefix :: String -> String
-rmCommonWhitespacePrefix cs0 = go cs0
+rmCommonWhitespacePrefix :: InterMultiString x -> InterMultiString x
+rmCommonWhitespacePrefix s0 =
+ -- Whitespace prefix, by definition, only comes after newline characters, and there can
+ -- never be an interpolated expr within a whitespace prefix (since the expr would end
+ -- the prefix). So we can use a plain `map` to just process the string parts, because
+ -- the "drop prefix" logic will never span over multiple parts.
+ map (first go) parts
where
- commonWSPrefix = getCommonWsPrefix cs0
+ -- treat interpolated exprs as a single, non-space character string
+ commonWSPrefix = getCommonWsPrefix $ case s0 of InterMultiString pre parts -> pre ++ concatMap snd parts
go = \case
- c@'\n' : cs -> c : go (dropLine commonWSPrefix cs)
+ c@'\n' : cs -> c : go (dropPrefix commonWSPrefix cs)
c : cs -> c : go cs
[] -> []
-- drop x characters from the string, or up to a newline, whichever comes first
- dropLine !x = \case
+ dropPrefix !x = \case
cs | x <= 0 -> cs
cs@('\n' : _) -> cs
- _ : cs -> dropLine (x - 1) cs
+ _ : cs -> dropPrefix (x - 1) cs
[] -> []
-- | See step 4 in Note [Multiline string literals]
@@ -334,30 +450,42 @@ getCommonWsPrefix s =
. drop 1 -- ignore first line in calculation
$ lines s
-collapseOnlyWsLines :: String -> String
-collapseOnlyWsLines = go
+collapseOnlyWsLines :: InterMultiString x -> InterMultiString x
+collapseOnlyWsLines (InterMultiString pre parts) =
+ let pre' = go (null parts) pre
+ parts' = [(expr, go isLast s) | ((expr, s), isLast) <- withIsLast parts]
+ in InterMultiString pre' parts'
where
- go = \case
- c@'\n' : cs | Just cs' <- checkAllWs cs -> c : go cs'
+ go isLast = \case
+ c@'\n' : cs | Just cs' <- checkAllWs isLast cs -> c : go cs'
c : cs -> c : go cs
[] -> []
- checkAllWs = \case
+ checkAllWs isLast = \case
-- got all the way to a newline or the end of the string, return
cs@('\n' : _) -> Just cs
- cs@[] -> Just cs
+ cs@[] | isLast -> Just cs
-- found whitespace, continue
c : cs | is_space c -> checkAllWs cs
-- anything else, stop
_ -> Nothing
-rmFirstNewline :: String -> String
-rmFirstNewline = \case
- '\n' : cs -> cs
- cs -> cs
+ -- annotate every element with a Bool indicating if it's the last element
+ withIsLast :: [a] -> [(a, Bool)]
+ withIsLast xs = zip xs $ (False <$ init xs) ++ [True]
-rmLastNewline :: String -> String
-rmLastNewline = go
+rmFirstNewline :: InterMultiString x -> InterMultiString x
+rmFirstNewline = \case
+ InterMultiString ('\n' : pre) parts -> InterMultiString pre parts
+ s -> s
+
+rmLastNewline :: InterMultiString x -> InterMultiString x
+rmLastNewline (InterMultiString pre parts) =
+ case unsnoc parts of
+ Nothing ->
+ InterMultiString (go pre) parts
+ Just (parts0, (x, lastLine)) ->
+ InterMultiString pre (parts0 ++ [(x, go lastLine)])
where
go = \case
[] -> []
=====================================
compiler/GHC/Rename/Expr.hs
=====================================
@@ -390,6 +390,9 @@ rnExpr (HsOverLit x lit)
return (HsApp noExtField (noLocA neg) (noLocA (HsOverLit x lit'))
, fvs ) }
+rnExpr (HsInterString x ty parts)
+ = undefined -- TODO(bchinn)
+
rnExpr (HsApp x fun arg)
= do { (fun',fvFun) <- rnLExpr fun
; (arg',fvArg) <- rnLExpr arg
=====================================
compiler/Language/Haskell/Syntax/Expr.hs
=====================================
@@ -346,19 +346,24 @@ data HsExpr p
-- erroring expression will be written after
-- solving. See Note [Holes] in GHC.Tc.Types.Constraint.
-
-
| HsOverLabel (XOverLabel p) FastString
-- ^ Overloaded label (Note [Overloaded labels] in GHC.OverloadedLabels)
| HsIPVar (XIPVar p)
HsIPName -- ^ Implicit parameter (not in use after typechecking)
+
| HsOverLit (XOverLitE p)
(HsOverLit p) -- ^ Overloaded literals
| HsLit (XLitE p)
(HsLit p) -- ^ Simple (non-overloaded) literals
+ | -- | See Note [Interpolated strings]
+ HsInterString
+ (XInterString p)
+ HsStringType
+ [Either FastString (LHsExpr p)]
+
-- | Lambda, Lambda-case, and Lambda-cases
| HsLam (XLam p)
HsLamVariant -- ^ Tells whether this is for lambda, \case, or \cases
=====================================
compiler/Language/Haskell/Syntax/Extension.hs
=====================================
@@ -410,6 +410,9 @@ type family XOverLabel x
type family XIPVar x
type family XOverLitE x
type family XLitE x
+type family XInterString x
+type family XInterStringRaw x
+type family XInterStringExp x
type family XLam x
type family XLamCase x
type family XApp x
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f7cdd5a88193dee78d9603217645f99ff8be9727
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f7cdd5a88193dee78d9603217645f99ff8be9727
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20241123/8c2ea72c/attachment-0001.html>
More information about the ghc-commits
mailing list