[Git][ghc/ghc][wip/multiline-strings] 3 commits: Factor out string processing functions

Brandon Chinn (@brandonchinn178) gitlab at gitlab.haskell.org
Fri Feb 9 05:45:24 UTC 2024



Brandon Chinn pushed to branch wip/multiline-strings at Glasgow Haskell Compiler / GHC


Commits:
198669ca by Brandon Chinn at 2024-02-08T21:42:47-08:00
Factor out string processing functions

- - - - -
cf0a62e2 by Brandon Chinn at 2024-02-08T21:45:07-08:00
Implement MultilineStrings

- - - - -
51875771 by Brandon Chinn at 2024-02-08T21:45:08-08:00
Add docs for MultilineStrings

- - - - -


5 changed files:

- compiler/GHC/Hs/Lit.hs
- compiler/GHC/Parser/Lexer.x
- + compiler/GHC/Parser/String.hs
- compiler/Language/Haskell/Syntax/Lit.hs
- + docs/users_guide/exts/multiline_strings.rst


Changes:

=====================================
compiler/GHC/Hs/Lit.hs
=====================================
@@ -46,6 +46,7 @@ import Language.Haskell.Syntax.Lit
 type instance XHsChar       (GhcPass _) = SourceText
 type instance XHsCharPrim   (GhcPass _) = SourceText
 type instance XHsString     (GhcPass _) = SourceText
+type instance XHsMultilineString (GhcPass _) = SourceText
 type instance XHsStringPrim (GhcPass _) = SourceText
 type instance XHsInt        (GhcPass _) = NoExtField
 type instance XHsIntPrim    (GhcPass _) = SourceText


=====================================
compiler/GHC/Parser/Lexer.x
=====================================
@@ -130,6 +130,7 @@ import GHC.Driver.Flags
 import GHC.Parser.Errors.Basic
 import GHC.Parser.Errors.Types
 import GHC.Parser.Errors.Ppr ()
+import GHC.Parser.String
 }
 
 -- -----------------------------------------------------------------------------
@@ -662,7 +663,8 @@ $unigraphic / { isSmartQuote } { smart_quote_error }
 -- to convert it to a String.
 <0> {
   \'                            { lex_char_tok }
-  \"                            { lex_string_tok }
+  \"\"\" / { ifExtension MultilineStringsBit} { lex_string_tok StringTypeMulti }
+  \"                            { lex_string_tok StringTypeSingle }
 }
 
 -- Note [Whitespace-sensitive operator parsing]
@@ -948,6 +950,7 @@ data Token
 
   | ITchar     SourceText Char       -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITstring   SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
+  | ITmultilinestring SourceText FastString -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITinteger  IntegralLit           -- Note [Literal source text] in "GHC.Types.SourceText"
   | ITrational FractionalLit
 
@@ -2160,33 +2163,36 @@ lex_string_prag_comment mkTok span _buf _len _buf2
 
 -- This stuff is horrible.  I hates it.
 
-lex_string_tok :: Action
-lex_string_tok span buf _len _buf2 = do
-  s <- lex_string
+lex_string_tok :: LexStringType -> Action
+lex_string_tok strType span buf _len _buf2 = do
+  s <- lex_string strType
 
   i@(AI end bufEnd) <- getInput
   let src = lexemeToFastString buf (cur bufEnd - cur buf)
 
   tok <-
-    lex_magic_hash i >>= \case
-      Just i' -> do
-        setInput i'
-        when (any (> '\xFF') s) $ do
-          pState <- getPState
-          let msg = PsErrPrimStringInvalidChar
-          let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
-          addError err
-        pure $ ITprimstring (SourceText src) (unsafeMkByteString s)
-      Nothing -> do
-        pure $ ITstring (SourceText src) (mkFastString s)
+    case strType of
+      StringTypeSingle ->
+        lex_magic_hash i >>= \case
+          Just i' -> do
+            setInput i'
+            when (any (> '\xFF') s) $ do
+              pState <- getPState
+              let msg = PsErrPrimStringInvalidChar
+              let err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (last_loc pState)) msg
+              addError err
+            pure $ ITprimstring (SourceText src) (unsafeMkByteString s)
+          Nothing -> do
+            pure $ ITstring (SourceText src) (mkFastString s)
+      StringTypeMulti ->
+        pure $ ITmultilinestring (SourceText src) (mkFastString s)
 
   return $ L (mkPsSpan (psSpanStart span) end) tok
 
 
 lex_quoted_label :: Action
 lex_quoted_label span buf _len _buf2 = do
-  start <- getInput
-  s <- lex_string_helper "" start
+  s <- lex_string StringTypeSingle
   (AI end bufEnd) <- getInput
   let
     token = ITlabelvarid (SourceText src) (mkFastString s)
@@ -2196,56 +2202,74 @@ lex_quoted_label span buf _len _buf2 = do
   return $ L (mkPsSpan start end) token
 
 
-lex_string :: P String
-lex_string = getInput >>= lex_string_helper ""
-
-
-lex_string_helper :: String -> AlexInput -> P String
-lex_string_helper s start = do
-  i <- getInput
-  case alexGetChar' i of
-    Nothing -> lit_error i
-
-    Just ('"',i)  -> do
-      setInput i
-      return (reverse s)
-
-    Just ('\\',i)
-        | Just ('&',i) <- next -> do
-                setInput i; lex_string_helper s start
-        | Just (c,i) <- next, c <= '\x7f' && is_space c -> do
-                           -- is_space only works for <= '\x7f' (#3751, #5425)
-                setInput i; lex_stringgap s start
-        where next = alexGetChar' i
-
-    Just (c, i1) -> do
-        case c of
-          '\\' -> do setInput i1; c' <- lex_escape; lex_string_helper (c':s) start
-          c | isAny c -> do setInput i1; lex_string_helper (c:s) start
-          _other | any isDoubleSmartQuote s -> do
-            -- if the built-up string s contains a smart double quote character, it was
-            -- likely the reason why the string literal was not lexed correctly
-            setInput start -- rewind to the first character in the string literal
-                           -- so we can find the smart quote character's location
-            advance_to_smart_quote_character
-            i2@(AI loc _) <- getInput
-            case alexGetChar' i2 of
-              Just (c, _) -> do add_nonfatal_smart_quote_error c loc; lit_error i
-              Nothing -> lit_error i -- should never get here
-          _other -> lit_error i
-
-
-lex_stringgap :: String -> AlexInput -> P String
-lex_stringgap s start = do
-  i <- getInput
-  c <- getCharOrFail i
-  case c of
-    '\\' -> lex_string_helper s start
-    c | c <= '\x7f' && is_space c -> lex_stringgap s start
-                           -- is_space only works for <= '\x7f' (#3751, #5425)
-    _other -> lit_error i
-
-
+lex_string :: StringType -> P String
+lex_string strType = do
+  start <- getInput
+  lexedStr <-
+    case lexString [] start of
+      Right (lexedStr, next) -> do
+        setInput next
+        pure lexedStr
+      Left (e, s, i) -> do
+        -- see if we can find a smart quote in the string we've found so far.
+        -- if the built-up string s contains a smart double quote character, it was
+        -- likely the reason why the string literal was not lexed correctly
+        case filter (\(LexedChar c _) -> isDoubleSmartQuote c) s of
+          LexedChar c (AI loc _) : _ -> add_nonfatal_smart_quote_error c loc
+          _ -> pure ()
+
+        -- regardless of whether we found a smart quote, throw a lexical error
+        setInput i >> lexError e
+
+  case resolveLexedString strType lexedStr of
+    Right s -> pure s
+    Left e ->
+      case e of
+        SmartQuoteError c (AI loc _) -> add_smart_quote_error c loc
+        StringLexError _ i e -> setInput i >> lexError e
+  where
+    -- Given the (reversed) string we've seen so far and the current location,
+    -- return Right with the fully lexed string and the subsequent location,
+    -- or Left with the string we've seen so far and the location where lexing
+    -- failed.
+    lexString ::
+      LexedString AlexInput ->
+      AlexInput ->
+      Either (LexedString AlexInput, AlexInput) (LexError, LexedString AlexInput, AlexInput)
+    lexString acc0 i0 = do
+      let acc = reverse acc0
+      case alexGetChar' i0 of
+        -- TODO: change delimiter if MultilineString
+        Just ('"', i1) -> Right (acc, i1)
+
+        Just (c0, i1) -> do
+          let acc1 = LexedChar c0 i0 : acc0
+          case c0 of
+            '\\' -> do
+              case alexGetChar' i1 of
+                Just (c1, i2)
+                  | is_space' c1 -> lexStringGap (LexedChar c1 i1 : acc1) i2
+                  | otherwise -> lexString (LexedChar c1 i1 : acc1) i2
+                Nothing -> Left (LexStringCharLit, acc, i1)
+            -- TODO: allow newlines and tabs if MultilineString
+            _ | isAny c0 -> lexString acc1 i1
+            _ -> Left (LexStringCharLit, acc, i1)
+
+        Nothing -> Left (LexStringCharLit, acc, i0)
+
+    lexStringGap acc0 i0 = do
+      let acc = reverse acc0
+      case alexGetChar' i0 of
+        Just (c0, i1) -> do
+          let acc1 = LexedChar c0 i0 : acc0
+          case c0 of
+            '\\' -> lexString acc1 i1
+            _ | is_space' c0 -> lexStringGap acc1 i1
+            _ -> Left (LexStringCharLit, acc, i1)
+        Nothing -> Left (LexStringCharLitEOF, acc, i0)
+
+
+-- TODO: refactor to use new resolveEscapeCharacter function
 lex_char_tok :: Action
 -- Here we are basically parsing character literals, such as 'x' or '\n'
 -- but we additionally spot 'x and ''T, returning ITsimpleQuote and
@@ -2322,115 +2346,12 @@ isAny :: Char -> Bool
 isAny c | c > '\x7f' = isPrint c
         | otherwise  = is_any c
 
-lex_escape :: P Char
-lex_escape = do
-  i0@(AI loc _) <- getInput
-  c <- getCharOrFail i0
-  case c of
-        'a'   -> return '\a'
-        'b'   -> return '\b'
-        'f'   -> return '\f'
-        'n'   -> return '\n'
-        'r'   -> return '\r'
-        't'   -> return '\t'
-        'v'   -> return '\v'
-        '\\'  -> return '\\'
-        '"'   -> return '\"'
-        '\''  -> return '\''
-        -- the next two patterns build up a Unicode smart quote error (#21843)
-        smart_double_quote | isDoubleSmartQuote smart_double_quote ->
-          add_smart_quote_error smart_double_quote loc
-        smart_single_quote | isSingleSmartQuote smart_single_quote ->
-          add_smart_quote_error smart_single_quote loc
-        '^'   -> do i1 <- getInput
-                    c <- getCharOrFail i1
-                    if c >= '@' && c <= '_'
-                        then return (chr (ord c - ord '@'))
-                        else lit_error i1
-
-        'x'   -> readNum is_hexdigit 16 hexDigit
-        'o'   -> readNum is_octdigit  8 octDecDigit
-        x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
-
-        c1 ->  do
-           i <- getInput
-           case alexGetChar' i of
-            Nothing -> lit_error i0
-            Just (c2,i2) ->
-              case alexGetChar' i2 of
-                Nothing -> do lit_error i0
-                Just (c3,i3) ->
-                   let str = [c1,c2,c3] in
-                   case [ (c,rest) | (p,c) <- silly_escape_chars,
-                                     Just rest <- [stripPrefix p str] ] of
-                          (escape_char,[]):_ -> do
-                                setInput i3
-                                return escape_char
-                          (escape_char,_:_):_ -> do
-                                setInput i2
-                                return escape_char
-                          [] -> lit_error i0
-
-readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
-readNum is_digit base conv = do
-  i <- getInput
-  c <- getCharOrFail i
-  if is_digit c
-        then readNum2 is_digit base conv (conv c)
-        else lit_error i
-
-readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
-readNum2 is_digit base conv i = do
-  input <- getInput
-  read i input
-  where read i input = do
-          case alexGetChar' input of
-            Just (c,input') | is_digit c -> do
-               let i' = i*base + conv c
-               if i' > 0x10ffff
-                  then setInput input >> lexError LexNumEscapeRange
-                  else read i' input'
-            _other -> do
-              setInput input; return (chr i)
-
-
-silly_escape_chars :: [(String, Char)]
-silly_escape_chars = [
-        ("NUL", '\NUL'),
-        ("SOH", '\SOH'),
-        ("STX", '\STX'),
-        ("ETX", '\ETX'),
-        ("EOT", '\EOT'),
-        ("ENQ", '\ENQ'),
-        ("ACK", '\ACK'),
-        ("BEL", '\BEL'),
-        ("BS", '\BS'),
-        ("HT", '\HT'),
-        ("LF", '\LF'),
-        ("VT", '\VT'),
-        ("FF", '\FF'),
-        ("CR", '\CR'),
-        ("SO", '\SO'),
-        ("SI", '\SI'),
-        ("DLE", '\DLE'),
-        ("DC1", '\DC1'),
-        ("DC2", '\DC2'),
-        ("DC3", '\DC3'),
-        ("DC4", '\DC4'),
-        ("NAK", '\NAK'),
-        ("SYN", '\SYN'),
-        ("ETB", '\ETB'),
-        ("CAN", '\CAN'),
-        ("EM", '\EM'),
-        ("SUB", '\SUB'),
-        ("ESC", '\ESC'),
-        ("FS", '\FS'),
-        ("GS", '\GS'),
-        ("RS", '\RS'),
-        ("US", '\US'),
-        ("SP", '\SP'),
-        ("DEL", '\DEL')
-        ]
+-- is_space only works for <= '\x7f' (#3751, #5425)
+--
+-- TODO: why not put this logic in is_space directly?
+is_space' :: Char -> Bool
+is_space' c | c > '\x7f' = False
+            | otherwise  = is_space c
 
 -- before calling lit_error, ensure that the current input is pointing to
 -- the position of the error in the buffer.  This is so that we can report
@@ -2499,16 +2420,6 @@ quasiquote_error start = do
 -- -----------------------------------------------------------------------------
 -- Unicode Smart Quote detection (#21843)
 
-isDoubleSmartQuote :: Char -> Bool
-isDoubleSmartQuote '“' = True
-isDoubleSmartQuote '”' = True
-isDoubleSmartQuote _ = False
-
-isSingleSmartQuote :: Char -> Bool
-isSingleSmartQuote '‘' = True
-isSingleSmartQuote '’' = True
-isSingleSmartQuote _ = False
-
 isSmartQuote :: AlexAccPred ExtsBitmap
 isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c
 
@@ -3037,6 +2948,7 @@ data ExtBits
   | OverloadedRecordDotBit
   | OverloadedRecordUpdateBit
   | ExtendedLiteralsBit
+  | MultilineStringsBit
 
   -- Flags that are updated once parsing starts
   | InRulePragBit
@@ -3117,6 +3029,7 @@ mkParserOpts extensionFlags diag_opts supported
       .|. OverloadedRecordDotBit      `xoptBit` LangExt.OverloadedRecordDot
       .|. OverloadedRecordUpdateBit   `xoptBit` LangExt.OverloadedRecordUpdate  -- Enable testing via 'getBit OverloadedRecordUpdateBit' in the parser (RecordDotSyntax parsing uses that information).
       .|. ExtendedLiteralsBit         `xoptBit` LangExt.ExtendedLiterals
+      .|. MultilineStringsBit         `xoptBit` LangExt.MultilineStrings
     optBits =
           HaddockBit        `setBitIf` isHaddock
       .|. RawTokenStreamBit `setBitIf` rawTokStream


=====================================
compiler/GHC/Parser/String.hs
=====================================
@@ -0,0 +1,298 @@
+{-# LANGUAGE LambdaCase #-}
+
+module GHC.Parser.String (
+  LexedString,
+  LexedChar (..),
+  StringLexError (..),
+  LexStringType (..),
+  resolveLexedString,
+
+  -- * Unicode smart quote helpers
+  isDoubleSmartQuote,
+  isSingleSmartQuote,
+) where
+
+import Control.Monad (guard, unless, when, (>=>))
+import Data.Char (chr, ord)
+import Data.Maybe (listToMaybe, mapMaybe)
+import GHC.Parser.CharClass (
+  hexDigit,
+  is_decdigit,
+  is_hexdigit,
+  is_octdigit,
+  octDecDigit,
+ )
+import GHC.Parser.Errors.Types (LexErr (..))
+import GHC.Prelude
+
+data LexStringType = StringTypeSingle | StringTypeMulti deriving (Eq)
+
+data LexedChar loc = LexedChar !Char !loc
+type LexedString loc = [LexedChar loc]
+
+-- | Apply the given StringProcessors to the given LexedString left-to-right,
+-- and return the processed string.
+resolveLexedString ::
+  LexStringType ->
+  LexedString loc ->
+  Either (StringLexError loc) String
+resolveLexedString strType = fmap toString . foldr (>=>) pure processString
+  where
+    toString = map (\(LexedChar c _) -> c)
+    processString =
+      case strType of
+        StringTypeSingle ->
+          [ collapseStringGaps
+          , resolveEscapeCharacters
+          ]
+        StringTypeMulti ->
+          [ _
+          ]
+
+data StringLexError loc
+  = SmartQuoteError !Char !loc
+  | StringLexError !Char !loc !LexErr
+
+type StringProcessor loc = LexedString loc -> Either (StringLexError loc) (LexedString loc)
+
+-- TODO
+collapseStringGaps :: StringProcessor loc
+collapseStringGaps = _
+
+resolveEscapeCharacters :: StringProcessor loc
+resolveEscapeCharacters = go
+  where
+    go = \case
+      [] -> pure []
+      backslashChar@(LexedChar '\\' _) : s -> do
+        (c, s') <- resolveEscapeCharacter backslashChar s
+        (c :) <$> go s'
+      c : s ->
+        (c :) <$> go s
+
+-- | After finding a backslash, parse the rest of the escape character.
+resolveEscapeCharacter ::
+  LexedChar loc ->   -- the backslash character
+  LexedString loc -> -- the rest of the string to parse
+  Either
+    (StringLexError loc)
+    (LexedChar loc, LexedString loc) -- the resolved escape character and the rest of the string
+resolveEscapeCharacter backslashChar s0 = do
+  (firstChar@(LexedChar c loc), s1) <- expectNext backslashChar s0
+  let rewrap c' = pure (LexedChar c' loc, s1)
+  case c of
+    'a'  -> rewrap '\a'
+    'b'  -> rewrap '\b'
+    'f'  -> rewrap '\f'
+    'n'  -> rewrap '\n'
+    'r'  -> rewrap '\r'
+    't'  -> rewrap '\t'
+    'v'  -> rewrap '\v'
+    '\\' -> rewrap '\\'
+    '"'  -> rewrap '\"'
+    '\'' -> rewrap '\''
+    -- escape codes
+    'x' -> expectNum is_hexdigit 16 hexDigit (firstChar, s1)
+    'o' -> expectNum is_octdigit 8 octDecDigit (firstChar, s1)
+    _ | is_decdigit c -> expectNum is_decdigit 10 octDecDigit (backslashChar, s0)
+    -- control characters (e.g. '\^M')
+    '^' -> do
+      (LexedChar c1 loc1, s2) <- expectNext firstChar s1
+      if c1 >= '@' && c1 <= '_'
+        then do
+          let c' = chr $ ord c1 - ord '@'
+          pure (LexedChar c' loc, s2)
+        else do
+          Left $ StringLexError c1 loc1 LexStringCharLit
+    -- long form escapes (e.g. '\NUL')
+    _ | Just (c', s2) <- parseLongEscape c s1 -> pure (LexedChar c' loc, s2)
+    -- check unicode smart quotes (#21843)
+    _ | isDoubleSmartQuote c -> Left $ SmartQuoteError c loc
+    _ | isSingleSmartQuote c -> Left $ SmartQuoteError c loc
+    -- unknown escape
+    _ -> Left $ StringLexError c loc LexStringCharLit
+  where
+    expectNext lastChar = \case
+      [] -> do
+        let LexedChar c loc = lastChar
+        Left $ StringLexError c loc LexStringCharLitEOF
+      c : cs -> pure (c, cs)
+
+    expectNum isDigit base toDigit (lastChar, s0) = do
+      (LexedChar c loc, s1) <- expectNext lastChar s0
+      unless (isDigit c) $ Left $ StringLexError c loc LexStringCharLit
+      let parseNum x = \case
+            LexedChar c' loc' : s' | isDigit c' -> do
+              let x' = x * base + toDigit c'
+              when (x' > 0x10ffff) $ Left $ StringLexError c' loc' LexNumEscapeRange
+              parseNum x' s'
+            s ->
+              pure (LexedChar (chr x) loc, s)
+      parseNum (toDigit c) s1
+
+-- | Check if the escape characters match a long escape code.
+--
+-- >>> parseLongEscape 'C' [LexedChar 'R', LexedChar 'X', ...s] = Just ('\CR', [LexedChar 'X', ...s])
+-- >>> parseLongEscape 'X' [LexedChar 'X', LexedChar 'X', ...s] = Nothing
+parseLongEscape :: Char -> LexedString loc -> Maybe (Char, LexedString loc)
+parseLongEscape c s = listToMaybe $ mapMaybe tryParse longEscapeCodes
+  where
+    tryParse (prefix, c') = do
+      p0 : p <- pure prefix
+      guard (p0 == c)       -- see if the first character matches
+      s' <- parsePrefix p s -- see if the rest of the prefix matches
+      pure (c', s')
+
+    parsePrefix (p : ps) (LexedChar t _ : ts) | p == t = parsePrefix ps ts
+    parsePrefix [] s' = Just s' -- we've matched the whole prefix, return the rest
+    parsePrefix _ _ = Nothing
+
+    longEscapeCodes =
+      [ ("NUL", '\NUL')
+      , ("SOH", '\SOH')
+      , ("STX", '\STX')
+      , ("ETX", '\ETX')
+      , ("EOT", '\EOT')
+      , ("ENQ", '\ENQ')
+      , ("ACK", '\ACK')
+      , ("BEL", '\BEL')
+      , ("BS", '\BS')
+      , ("HT", '\HT')
+      , ("LF", '\LF')
+      , ("VT", '\VT')
+      , ("FF", '\FF')
+      , ("CR", '\CR')
+      , ("SO", '\SO')
+      , ("SI", '\SI')
+      , ("DLE", '\DLE')
+      , ("DC1", '\DC1')
+      , ("DC2", '\DC2')
+      , ("DC3", '\DC3')
+      , ("DC4", '\DC4')
+      , ("NAK", '\NAK')
+      , ("SYN", '\SYN')
+      , ("ETB", '\ETB')
+      , ("CAN", '\CAN')
+      , ("EM", '\EM')
+      , ("SUB", '\SUB')
+      , ("ESC", '\ESC')
+      , ("FS", '\FS')
+      , ("GS", '\GS')
+      , ("RS", '\RS')
+      , ("US", '\US')
+      , ("SP", '\SP')
+      , ("DEL", '\DEL')
+      ]
+
+-- -----------------------------------------------------------------------------
+-- Unicode Smart Quote detection (#21843)
+
+isDoubleSmartQuote :: Char -> Bool
+isDoubleSmartQuote = \case
+  '“' -> True
+  '”' -> True
+  _ -> False
+
+isSingleSmartQuote :: Char -> Bool
+isSingleSmartQuote = \case
+  '‘' -> True
+  '’' -> True
+  _ -> False
+
+{-
+Note [Multiline string literals]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Multiline string literals were added following the acceptance of the
+proposal: https://github.com/ghc-proposals/ghc-proposals/pull/569
+
+Multiline string literals are syntax sugar for normal string literals,
+with an extra post processing step on the SourceText. We do this on
+the SourceText instead of the parsed output because the lexer resolves
+escaped characters, but we need the actual escaped characters here.
+
+The string is post-processed with the following steps:
+1. Collapse string gaps
+2. Split the string by newlines
+3. Convert leading tabs into spaces
+    * In each line, any tabs preceding non-whitespace characters are replaced with spaces up to the next tab stop
+4. Remove common whitespace prefix in every line
+    * See below
+5. Join the string back with `\n` delimiters
+6. If the first character of the string is a newline, remove it
+7. Interpret escaped characters
+
+The common whitespace prefix can be informally defined as "The longest
+prefix of whitespace shared by all lines in the string, excluding the
+first line and any whitespace-only lines".
+
+It's more precisely defined with the following algorithm:
+
+1. Take a list representing the lines in the string
+2. Ignore the following elements in the list:
+    * The first line (we want to ignore everything before the first newline)
+    * Empty lines
+    * Lines with only whitespace characters
+3. Calculate the longest prefix of whitespace shared by all lines in the remaining list
+-}
+
+-- | See Note [Multiline string literals]
+processMultilineStringLiteral :: SourceText -> FastString -> FastString
+processMultilineStringLiteral = \case
+  SourceText s | Just s' <- fromSourceText s -> \_ -> mkFastString $ process s'
+  -- if we don't get a valid SourceText, be safe and don't post-process
+  _ -> id
+  where
+    (.>) :: (a -> b) -> (b -> c) -> (a -> c)
+    (.>) = flip (.)
+
+    fromSourceText s =
+      let stripSuffix x = fmap reverse . stripPrefix x . reverse
+       in stripSuffix "\"\"\"" =<< stripPrefix "\"\"\"" (unpackFS s)
+
+    process =
+         collapseStringGaps
+      .> splitLines
+      .> convertLeadingTabs
+      .> rmCommonWhitespacePrefix
+      .> joinLines
+      .> rmFirstNewline
+
+    -- avoid `lines` because it treats a trailing newline the same as no trailing newline
+    splitLines =
+      foldr
+        ( curry $ \case
+            ('\n', ls) -> "" : ls
+            (c, l : ls) -> (c:l) : ls
+            (c, []) -> [c] : [] -- should not happen
+        )
+        [""]
+
+    convertLeadingTabs =
+      let convertLine col = \case
+            [] -> ""
+            ' ' : cs -> ' ' : convertLine (col + 1) cs
+            '\t' : cs ->
+              let fill = 8 - (col `mod` 8)
+               in replicate fill ' ' ++ convertLine (col + fill) cs
+            c : cs -> c : cs
+       in map (convertLine 0)
+
+    rmCommonWhitespacePrefix strLines =
+      let
+        excludeLines =
+             drop 1                      -- ignore first line
+          .> filter (not . all (== ' ')) -- ignore lines that are all whitespace
+        commonWSPrefix =
+          case NonEmpty.nonEmpty (excludeLines strLines) of
+            Nothing -> 0
+            Just strLines' -> Foldable1.minimum $ NonEmpty.map (length . takeWhile (== ' ')) strLines'
+      in
+        map (drop commonWSPrefix) strLines
+        -- map (drop commonWSPrefix) . (\s -> traceShow ("rmCommonWhitespacePrefix", commonWSPrefix, excludeLines strLines, s) s) $ strLines
+
+    joinLines = intercalate "\n"
+
+    rmFirstNewline = \case
+      '\n' : s -> s
+      s -> s


=====================================
compiler/Language/Haskell/Syntax/Lit.hs
=====================================
@@ -54,6 +54,8 @@ data HsLit x
       -- ^ Unboxed character
   | HsString (XHsString x) {- SourceText -} FastString
       -- ^ String
+  | HsMultilineString (XHsMultilineString x) {- SourceText -} FastString
+      -- ^ Multiline string
   | HsStringPrim (XHsStringPrim x) {- SourceText -} !ByteString
       -- ^ Packed bytes
   | HsInt (XHsInt x)  IntegralLit


=====================================
docs/users_guide/exts/multiline_strings.rst
=====================================
@@ -0,0 +1,17 @@
+.. _multiline-strings:
+
+Multiline string literals
+-------------------------
+
+.. extension:: MultilineStrings
+    :shortdesc: Enable multiline string literals.
+
+    :since: 9.10.1
+
+    Enable multiline string literals.
+
+With this extension, GHC now recognizes multiline string literals with ``"""`` delimiters. Indentation is automatically stripped, and the literal is desugared to a normal string literal, so it works as expected with ``OverloadedStrings`` and any other functionality.
+
+TODO: explain removing common whitespace prefix
+TODO: add full spec
+TODO: add examples



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/9eb5a9eceba4ac57e7d4bccece829623b012efae...518757714a3f73f2cbffe25bd7d603169e2a7b64

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/9eb5a9eceba4ac57e7d4bccece829623b012efae...518757714a3f73f2cbffe25bd7d603169e2a7b64
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240209/0e5f3574/attachment-0001.html>


More information about the ghc-commits mailing list