[Git][ghc/ghc][wip/ncg-simd] 4 commits: Fix C calls with SIMD vectors

sheaf (@sheaf) gitlab at gitlab.haskell.org
Fri Sep 20 15:20:48 UTC 2024



sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
be70f8c0 by sheaf at 2024-09-20T17:20:18+02:00
Fix C calls with SIMD vectors

This commit fixes the code generation for C calls, to take into account
the calling convention.

This is particularly tricky on Windows, where all vectors are expected
to be passed by reference. See Note [The Windows X64 C calling convention]
in GHC.CmmToAsm.X86.CodeGen.

- - - - -
970a1b81 by sheaf at 2024-09-20T17:20:18+02:00
X86 CodeGen: refactor getRegister CmmLit

This refactors the code dealing with loading literals into registers,
removing duplication and putting all the code in a single place.
It also changes which XOR instruction is used to place a zero value
into a register, so that we use VPXOR for a 128-bit integer vector
when AVX is supported.

- - - - -
d61d6f99 by sheaf at 2024-09-20T17:20:18+02:00
X86 genCCall64: simplify loadArg code

This commit simplifies the argument loading code by making the
assumption that it is safe to directly load the argument into a register,
because doing so will not clobber any previous assignments.

This assumption is borne out by the use of 'evalArgs', which evaluates
any arguments that might necessitate non-trivial code generation into
separate temporary registers.

- - - - -
ad8c5c64 by sheaf at 2024-09-20T17:20:19+02:00
LLVM: propagate GlobalRegUse information

This commit ensures we keep track of how any particular global register
is being used in the LLVM backend. This informs the LLVM type
annotations, and avoids type mismatches of the following form:

  argument is not of expected type '<2 x double>'
    call ccc <2 x double> (<2 x double>)
      (<4 x i32> arg)

- - - - -


12 changed files:

- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToLlvm.hs
- compiler/GHC/CmmToLlvm/Base.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/CmmToLlvm/Ppr.hs
- compiler/GHC/CmmToLlvm/Regs.hs
- testsuite/tests/simd/should_run/simd013.hs
- testsuite/tests/simd/should_run/simd013.stdout
- testsuite/tests/simd/should_run/simd013C.c


Changes:

=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -28,6 +28,7 @@ module GHC.CmmToAsm.Format (
     scalarWidth,
     formatInBytes,
     isFloatScalarFormat,
+    isFloatOrFloatVecFormat,
     floatScalarFormat,
     scalarFormatFormat,
     VirtualRegWithFormat(..),
@@ -134,6 +135,11 @@ isFloatScalarFormat = \case
   FmtDouble -> True
   _ -> False
 
+isFloatOrFloatVecFormat :: Format -> Bool
+isFloatOrFloatVecFormat = \case
+  VecFormat _ sFmt -> isFloatScalarFormat sFmt
+  fmt -> isFloatFormat fmt
+
 floatScalarFormat :: Width -> ScalarFormat
 floatScalarFormat W32 = FmtFloat
 floatScalarFormat W64 = FmtDouble


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1,6 +1,9 @@
+{-# LANGUAGE DuplicateRecordFields #-}
+{-# LANGUAGE GADTs #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiWayIf #-}
-{-# LANGUAGE GADTs #-}
+{-# LANGUAGE ParallelListComp #-}
+{-# LANGUAGE TupleSections #-}
 {-# LANGUAGE NondecreasingIndentation #-}
 
 {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -73,22 +76,29 @@ import GHC.Types.Tickish ( GenTickish(..) )
 import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
 
 -- The rest:
+import GHC.Data.Maybe ( expectJust )
 import GHC.Types.ForeignCall ( CCallConv(..) )
 import GHC.Data.OrdList
 import GHC.Utils.Outputable
 import GHC.Utils.Constants (debugIsOn)
+import GHC.Utils.Monad ( foldMapM )
 import GHC.Utils.Panic
 import GHC.Data.FastString
 import GHC.Utils.Misc
 import GHC.Types.Unique.Supply ( getUniqueM )
 
+import qualified Data.Semigroup as S
+
 import Control.Monad
+import Control.Monad.Trans.State.Strict
+  ( StateT, evalStateT, get, put )
+import Control.Monad.Trans.Class (lift)
 import Data.Foldable (fold)
 import Data.Int
 import Data.Maybe
 import Data.Word
 
-import qualified Data.Map as M
+import qualified Data.Map as Map
 
 is32BitPlatform :: NatM Bool
 is32BitPlatform = do
@@ -227,7 +237,7 @@ addSpUnwindings instr@(DELTA d) = do
     let platform = ncgPlatform config
     if ncgDwarfUnwindings config
         then do lbl <- mkAsmTempLabel <$> getUniqueM
-                let unwind = M.singleton MachSp (Just $ UwReg (GlobalRegUse MachSp (bWord platform)) $ negate d)
+                let unwind = Map.singleton MachSp (Just $ UwReg (GlobalRegUse MachSp (bWord platform)) $ negate d)
                 return $ toOL [ instr, UNWIND lbl unwind ]
         else return (unitOL instr)
 addSpUnwindings instr = return $ unitOL instr
@@ -328,10 +338,10 @@ stmtToInstrs bid stmt = do
 
       CmmUnwind regs -> do
         let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
-            to_unwind_entry (reg, expr) = M.singleton reg (fmap (toUnwindExpr platform) expr)
+            to_unwind_entry (reg, expr) = Map.singleton reg (fmap (toUnwindExpr platform) expr)
         case foldMap to_unwind_entry regs of
-          tbl | M.null tbl -> return nilOL
-              | otherwise  -> do
+          tbl | Map.null tbl -> return nilOL
+              | otherwise    -> do
                   lbl <- mkAsmTempLabel <$> getUniqueM
                   return $ unitOL $ UNWIND lbl tbl
 
@@ -836,12 +846,12 @@ iselExpr64ParallelBin op e1 e2 = do
 -- targetted for any particular type like Int8, Int32 etc
 data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | VA_Min | VA_Max
 
-getRegister :: CmmExpr -> NatM Register
+getRegister :: HasDebugCallStack => CmmExpr -> NatM Register
 getRegister e = do platform <- getPlatform
                    is32Bit <- is32BitPlatform
                    getRegister' platform is32Bit e
 
-getRegister' :: Platform -> Bool -> CmmExpr -> NatM Register
+getRegister' :: HasDebugCallStack => Platform -> Bool -> CmmExpr -> NatM Register
 
 getRegister' platform is32Bit (CmmReg reg)
   = case reg of
@@ -903,20 +913,6 @@ getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W16) [x])
   ro <- getNewRegNat II16
   return $ Fixed II16 ro (code `appOL` toOL [ MOVZxL II16 (OpReg rlo) (OpReg ro) ])
 
-getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
-  float_const_sse2  where
-  float_const_sse2
-    | f == 0.0 = do
-      -- TODO: this mishandles negative zero floating point literals.
-      let
-          format = floatFormat w
-          code dst = unitOL  (XOR format (OpReg dst) (OpReg dst))
-        -- I don't know why there are xorpd, xorps, and pxor instructions.
-        -- They all appear to do the same thing --SDM
-      return (Any format code)
-
-   | otherwise = getFloatLitRegister lit
-
 -- catch simple cases of zero- or sign-extended load
 getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _ _]) = do
   code <- intLoadCode (MOVZxL II8) addr
@@ -1922,7 +1918,7 @@ getRegister' platform is32Bit load@(CmmLoad mem ty _)
   | isFloatType ty
   = do
     Amode addr mem_code <- getAmode mem
-    loadFloatAmode width addr mem_code
+    loadAmode (floatFormat width) addr mem_code
 
   | is32Bit && not (isWord64 ty)
   = do
@@ -1950,20 +1946,6 @@ getRegister' platform is32Bit load@(CmmLoad mem ty _)
     format = cmmTypeFormat ty
     width = typeWidth ty
 
-getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
-  = let
-        format = intFormat width
-
-        -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
-        format1 = if is32Bit then format
-                           else case format of
-                                II64 -> II32
-                                _ -> format
-        code dst
-           = unitOL (XOR format1 (OpReg dst) (OpReg dst))
-    in
-        return (Any format code)
-
 -- Handle symbol references with LEA and %rip-relative addressing.
 -- See Note [%rip-relative addressing on x86-64].
 getRegister' platform is32Bit (CmmLit lit)
@@ -1980,80 +1962,102 @@ getRegister' platform is32Bit (CmmLit lit)
     is_label (CmmLabelDiffOff {}) = True
     is_label _                    = False
 
-  -- optimisation for loading small literals on x86_64: take advantage
-  -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
-  -- instruction forms are shorter.
-getRegister' platform is32Bit (CmmLit lit)
-  | not is32Bit, isWord64 (cmmLitType platform lit), not (isBigLit lit)
-  = let
-        imm = litToImm lit
-        code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
-    in
-        return (Any II64 code)
-  where
-   isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
-   isBigLit _ = False
+getRegister' platform is32Bit (CmmLit lit) = do
+  avx <- avxEnabled
+
+  -- NB: it is important that the code produced here (to load a literal into
+  -- a register) doesn't clobber any registers other than the destination
+  -- register; the code for generating C calls relies on this property.
+  --
+  -- In particular, we have:
+  --
+  -- > loadIntoRegMightClobberOtherReg (CmmLit _) = False
+  --
+  -- which means that we assume that loading a literal into a register
+  -- will not clobber any other registers.
+
+  -- TODO: this function mishandles floating-point negative zero,
+  -- because -0.0 == 0.0 returns True and because we represent CmmFloat as
+  -- Rational, which can't properly represent negative zero.
+
+  if
+    -- Zero: use XOR.
+    | isZeroLit lit
+    -> let code dst
+             | isIntFormat fmt
+             = let fmt'
+                     | is32Bit
+                     = fmt
+                     | otherwise
+                     -- x86_64: 32-bit xor is one byte shorter,
+                     -- and zero-extends to 64 bits
+                     = case fmt of
+                         II64 -> II32
+                         _ -> fmt
+               in unitOL (XOR fmt' (OpReg dst) (OpReg dst))
+             | avx
+             = if float_or_floatvec
+               then unitOL (VXOR fmt (OpReg dst) dst dst)
+               else unitOL (VPXOR fmt dst dst dst)
+             | otherwise
+             = if float_or_floatvec
+               then unitOL (XOR fmt (OpReg dst) (OpReg dst))
+               else unitOL (PXOR fmt (OpReg dst) dst)
+       in return $ Any fmt code
+
+    -- Constant vector: use broadcast.
+    | VecFormat l sFmt <- fmt
+    , CmmVec (f:fs) <- lit
+    , all (== f) fs
+    -> do let w = scalarWidth sFmt
+              broadcast = if isFloatScalarFormat sFmt
+                          then MO_VF_Broadcast l w
+                          else MO_V_Broadcast l w
+          valCode <- getAnyReg (CmmMachOp broadcast [CmmLit f])
+          return $ Any fmt valCode
+
+    -- Optimisation for loading small literals on x86_64: take advantage
+    -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
+    -- instruction forms are shorter.
+    | not is32Bit, isWord64 cmmTy, not (isBigLit lit)
+    -> let
+          imm = litToImm lit
+          code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
+      in
+          return (Any II64 code)
+
+    -- Scalar integer: use an immediate.
+    | isIntFormat fmt
+    -> let imm = litToImm lit
+           code dst = unitOL (MOV fmt (OpImm imm) (OpReg dst))
+       in return (Any fmt code)
+
+    -- General case: load literal from data address.
+    | otherwise
+    -> do let w = formatToWidth fmt
+          Amode addr addr_code <- memConstant (mkAlignment $ widthInBytes w) lit
+          loadAmode fmt addr addr_code
+
+    where
+      cmmTy = cmmLitType platform lit
+      fmt = cmmTypeFormat cmmTy
+      float_or_floatvec = isFloatOrFloatVecFormat fmt
+      isZeroLit (CmmInt i _) = i == 0
+      isZeroLit (CmmFloat f _) = f == 0 -- TODO: mishandles negative zero
+      isZeroLit (CmmVec fs) = all isZeroLit fs
+      isZeroLit _ = False
+
+      isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
+      isBigLit _ = False
         -- note1: not the same as (not.is32BitLit), because that checks for
         -- signed literals that fit in 32 bits, but we want unsigned
         -- literals here.
         -- note2: all labels are small, because we're assuming the
         -- small memory model. See Note [%rip-relative addressing on x86-64].
 
-getRegister' platform _ (CmmLit lit) = do
-  avx <- avxEnabled
-  case fmt of
-    VecFormat l sFmt
-      | CmmVec fs <- lit
-      , all is_zero fs
-      -> let code dst
-               | avx
-               = if isFloatScalarFormat sFmt
-                 then unitOL (VXOR fmt (OpReg dst) dst dst)
-                 else unitOL (VPXOR fmt dst dst dst)
-               | otherwise
-               = unitOL (XOR fmt (OpReg dst) (OpReg dst))
-         in return (Any fmt code)
-      | CmmVec (f:fs) <- lit
-      , all (== f) fs
-      -- TODO: mishandles negative zero (because -0.0 == 0.0 returns True), and because we
-      -- represent CmmFloat as Rational which can't properly represent negative zero.
-      -> do let w = scalarWidth sFmt
-                broadcast = if isFloatScalarFormat sFmt
-                            then MO_VF_Broadcast l w
-                            else MO_V_Broadcast l w
-            valCode <- getAnyReg (CmmMachOp broadcast [CmmLit f])
-            return $ Any fmt valCode
-
-      | otherwise
-      -> do
-           let w = formatToWidth fmt
-           config <- getConfig
-           Amode addr addr_code <- memConstant (mkAlignment $ widthInBytes w) lit
-           let code dst = addr_code `snocOL`
-                            movInstr config fmt (OpAddr addr) (OpReg dst)
-           return (Any fmt code)
-       where
-        is_zero (CmmInt i _) = i == 0
-        is_zero (CmmFloat f _) = f == 0 -- TODO: mishandles negative zero
-        is_zero _ = False
-
-    _ -> let imm = litToImm lit
-             code dst = unitOL (MOV fmt (OpImm imm) (OpReg dst))
-         in return (Any fmt code)
-  where
-    cmmTy = cmmLitType platform lit
-    fmt = cmmTypeFormat cmmTy
-
 getRegister' platform _ slot@(CmmStackSlot {}) =
   pprPanic "getRegister(x86) CmmStackSlot" (pdoc platform slot)
 
-getFloatLitRegister :: CmmLit -> NatM Register
-getFloatLitRegister lit = do
-  let w :: Width
-      w = case lit of { CmmInt _ w -> w; CmmFloat _ w -> w; _ -> panic "getFloatLitRegister" (ppr lit) }
-  Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
-  loadFloatAmode w addr code
-
 intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
    -> NatM (Reg -> InstrBlock)
 intLoadCode instr mem = do
@@ -2264,7 +2268,7 @@ getNonClobberedOperand (CmmLit lit) =
     return (OpAddr addr, code)
   else do
     platform <- getPlatform
-    if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+    if is32BitLit platform lit && isIntFormat (cmmTypeFormat (cmmLitType platform lit))
     then return (OpImm (litToImm lit), nilOL)
     else getNonClobberedOperand_generic (CmmLit lit)
 
@@ -2321,13 +2325,13 @@ getOperand (CmmLit lit) = do
     else do
 
   platform <- getPlatform
-  if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+  if is32BitLit platform lit && (isIntFormat $ cmmTypeFormat (cmmLitType platform lit))
     then return (OpImm (litToImm lit), nilOL)
     else getOperand_generic (CmmLit lit)
 
 getOperand (CmmLoad mem ty _) = do
   is32Bit <- is32BitPlatform
-  if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+  if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
      then do
        Amode src mem_code <- getAmode mem
        return (OpAddr src, mem_code)
@@ -2358,7 +2362,7 @@ addAlignmentCheck align reg =
   where
     check :: Format -> Reg -> InstrBlock
     check fmt reg =
-        assert (not $ isFloatFormat fmt) $
+        assert (isIntFormat fmt) $
         toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
              , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
              ]
@@ -2382,15 +2386,12 @@ memConstant align lit = do
         `consOL` addr_code
   return (Amode addr code)
 
-
-loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
-loadFloatAmode w addr addr_code = do
-  let format = floatFormat w
-      code dst = addr_code `snocOL`
-                    MOV format (OpAddr addr) (OpReg dst)
-
-  return (Any format code)
-
+-- | Load the value at the given address into any register.
+loadAmode :: Format -> AddrMode -> InstrBlock -> NatM Register
+loadAmode fmt addr addr_code = do
+  config <- getConfig
+  let load dst = movInstr config fmt (OpAddr addr) (OpReg dst)
+  return $ Any fmt (\ dst -> addr_code `snocOL` load dst)
 
 -- if we want a floating-point literal as an operand, we can
 -- use it directly from memory.  However, if the literal is
@@ -2403,7 +2404,7 @@ isSuitableFloatingPointLit _ = False
 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
 getRegOrMem e@(CmmLoad mem ty _) = do
   is32Bit <- is32BitPlatform
-  if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+  if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
      then do
        Amode src mem_code <- getAmode mem
        return (OpAddr src, mem_code)
@@ -3090,10 +3091,8 @@ genSimplePrim _   op                   dst     args           = do
   platform <- ncgPlatform <$> getConfig
   pprPanic "genSimplePrim: unhandled primop" (ppr (pprCallishMachOp op, dst, fmap (pdoc platform) args))
 
-{-
-Note [Evaluate C-call arguments before placing in destination registers]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
+{- Note [Evaluate C-call arguments before placing in destination registers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 When producing code for C calls we must take care when placing arguments
 in their final registers. Specifically, we must ensure that temporary register
 usage due to evaluation of one argument does not clobber a register in which we
@@ -3144,15 +3143,11 @@ genForeignCall{32,64}.
 -- | See Note [Evaluate C-call arguments before placing in destination registers]
 evalArgs :: BlockId -> [CmmActual] -> NatM (InstrBlock, [CmmActual])
 evalArgs bid actuals
-  | any mightContainMachOp actuals = do
+  | any loadIntoRegMightClobberOtherReg actuals = do
       regs_blks <- mapM evalArg actuals
       return (concatOL $ map fst regs_blks, map snd regs_blks)
   | otherwise = return (nilOL, actuals)
   where
-    mightContainMachOp (CmmReg _)      = False
-    mightContainMachOp (CmmRegOff _ _) = False
-    mightContainMachOp (CmmLit _)      = False
-    mightContainMachOp _               = True
 
     evalArg :: CmmActual -> NatM (InstrBlock, CmmExpr)
     evalArg actual = do
@@ -3166,6 +3161,16 @@ evalArgs bid actuals
     newLocalReg :: CmmType -> NatM LocalReg
     newLocalReg ty = LocalReg <$> getUniqueM <*> pure ty
 
+-- | Might the code to put this expression into a register
+-- clobber any other registers?
+loadIntoRegMightClobberOtherReg :: CmmExpr -> Bool
+loadIntoRegMightClobberOtherReg (CmmReg _)      = False
+loadIntoRegMightClobberOtherReg (CmmRegOff _ _) = False
+loadIntoRegMightClobberOtherReg (CmmLit _)      = False
+  -- NB: this last 'False' is slightly risky, because the code for loading
+  -- a literal into a register is not entirely trivial.
+loadIntoRegMightClobberOtherReg _               = True
+
 -- Note [DIV/IDIV for bytes]
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~
 -- IDIV reminder:
@@ -3246,6 +3251,15 @@ genCCall bid addr conv dest_regs args = do
     else genCCall64 addr conv dest_regs args'
   return (instrs0 `appOL` instrs1)
 
+maybePromoteCArg :: Platform -> Width -> (CmmExpr, ForeignHint) -> CmmExpr
+maybePromoteCArg platform wto (arg, hint)
+ | wfrom < wto = case hint of
+     SignedHint -> CmmMachOp (MO_SS_Conv wfrom wto) [arg]
+     _          -> CmmMachOp (MO_UU_Conv wfrom wto) [arg]
+ | otherwise   = arg
+ where
+   wfrom = cmmExprWidth platform arg
+
 genCCall32 :: CmmExpr           -- ^ address of the function to call
            -> ForeignConvention -- ^ calling convention
            -> [CmmFormal]       -- ^ where to put the result
@@ -3278,7 +3292,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
                                      DELTA (delta-8)]
                     )
 
-              | isFloatType arg_ty = do
+              | isFloatType arg_ty || isVecType arg_ty = do
                 (reg, code) <- getSomeReg arg
                 delta <- getDeltaNat
                 setDeltaNat (delta-size)
@@ -3288,11 +3302,10 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
                                       let addr = AddrBaseIndex (EABaseReg esp)
                                                                 EAIndexNone
                                                                 (ImmInt 0)
-                                          format = floatFormat (typeWidth arg_ty)
+                                          format = cmmTypeFormat arg_ty
                                       in
 
-                                      -- assume SSE2
-                                       MOV format (OpReg reg) (OpAddr addr)
+                                       movInstr config format (OpReg reg) (OpAddr addr)
 
                                      ]
                                )
@@ -3316,7 +3329,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
         let
             -- Align stack to 16n for calls, assuming a starting stack
             -- alignment of 16n - word_size on procedure entry. Which we
-            -- maintiain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
+            -- maintain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
             sizes               = map (arg_size_bytes . cmmExprType platform) (reverse args)
             raw_arg_size        = sum sizes + platformWordSizeInBytes platform
             arg_pad_size        = (roundTo 16 $ raw_arg_size) - raw_arg_size
@@ -3361,6 +3374,8 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
             -- assign the results, if necessary
             assign_code []     = nilOL
             assign_code [dest]
+              | isVecType ty
+              = unitOL (mkRegRegMoveInstr config (cmmTypeFormat ty) xmm0 r_dest)
               | isFloatType ty =
                   -- we assume SSE2
                   let tmp_amode = AddrBaseIndex (EABaseReg esp)
@@ -3402,189 +3417,70 @@ genCCall64 :: CmmExpr           -- ^ address of function to call
 genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
     config <- getConfig
     let platform = ncgPlatform config
-    -- load up the register arguments
-    let args_hints = zip args (argHints ++ repeat NoHint)
-    let prom_args = map (maybePromoteCArg platform W32) args_hints
-
-    let load_args :: [CmmExpr]
-                  -> [RegWithFormat]         -- int regs avail for args
-                  -> [RegWithFormat]         -- FP regs avail for args
-                  -> InstrBlock    -- code computing args
-                  -> InstrBlock    -- code assigning args to ABI regs
-                  -> NatM ([CmmExpr],[RegWithFormat],[RegWithFormat],InstrBlock,InstrBlock)
-        -- no more regs to use
-        load_args args [] [] code acode     =
-            return (args, [], [], code, acode)
-
-        -- no more args to push
-        load_args [] aregs fregs code acode =
-            return ([], aregs, fregs, code, acode)
-
-        load_args (arg : rest) aregs fregs code acode
-            | isFloatType arg_rep = case fregs of
-                 []     -> push_this_arg
-                 (RegWithFormat r _fmt:rs) -> do
-                    (code',acode') <- reg_this_arg r
-                    load_args rest aregs rs code' acode'
-            | otherwise           = case aregs of
-                 []     -> push_this_arg
-                 (RegWithFormat r _fmt:rs) -> do
-                    (code',acode') <- reg_this_arg r
-                    load_args rest rs fregs code' acode'
-            where
+        args_hints = zip args (argHints ++ repeat NoHint)
+        prom_args = map (maybePromoteCArg platform W32) args_hints
+        word_size = platformWordSizeInBytes platform
+        wordFmt = archWordFormat (target32Bit platform)
 
-              -- put arg into the list of stack pushed args
-              push_this_arg = do
-                 (args',ars,frs,code',acode')
-                     <- load_args rest aregs fregs code acode
-                 return (arg:args', ars, frs, code', acode')
-
-              -- pass the arg into the given register
-              reg_this_arg r
-                -- "operand" args can be directly assigned into r
-                | isOperand platform arg = do
-                    arg_code <- getAnyReg arg
-                    return (code, (acode `appOL` arg_code r))
-                -- The last non-operand arg can be directly assigned after its
-                -- computation without going into a temporary register
-                | all (isOperand platform) rest = do
-                    arg_code   <- getAnyReg arg
-                    return (code `appOL` arg_code r,acode)
-
-                -- other args need to be computed beforehand to avoid clobbering
-                -- previously assigned registers used to pass parameters (see
-                -- #11792, #12614). They are assigned into temporary registers
-                -- and get assigned to proper call ABI registers after they all
-                -- have been computed.
-                | otherwise     = do
-                    arg_code <- getAnyReg arg
-                    tmp      <- getNewRegNat arg_fmt
-                    let
-                      code'  = code `appOL` arg_code tmp
-                      acode' = acode `snocOL` mkRegRegMoveInstr config arg_fmt tmp r
-                    return (code',acode')
-
-              arg_rep = cmmExprType platform arg
-              arg_fmt = cmmTypeFormat arg_rep
-
-        load_args_win :: [CmmExpr]
-                      -> [RegWithFormat]        -- used int regs
-                      -> [RegWithFormat]        -- used FP regs
-                      -> [(Reg, Reg)] -- (int, FP) regs avail for args
-                      -> InstrBlock
-                      -> NatM ([CmmExpr],[RegWithFormat],[RegWithFormat],InstrBlock,InstrBlock)
-        load_args_win args usedInt usedFP [] code
-            = return (args, usedInt, usedFP, code, nilOL)
-            -- no more regs to use
-        load_args_win [] usedInt usedFP _ code
-            = return ([], usedInt, usedFP, code, nilOL)
-            -- no more args to push
-        load_args_win (arg : rest) usedInt usedFP
-                      ((ireg, freg) : regs) code
-            | isFloatType arg_rep = do
-                 arg_code <- getAnyReg arg
-                 load_args_win rest (RegWithFormat ireg II64: usedInt) (RegWithFormat freg FF64 : usedFP) regs
-                               (code `appOL`
-                                arg_code freg `snocOL`
-                                -- If we are calling a varargs function
-                                -- then we need to define ireg as well
-                                -- as freg
-                                MOVD FF64 (OpReg freg) (OpReg ireg))
-            | otherwise = do
-                 arg_code <- getAnyReg arg
-                 load_args_win rest (RegWithFormat ireg II64: usedInt) usedFP regs
-                               (code `appOL` arg_code ireg)
-            where
-              arg_rep = cmmExprType platform arg
-
-        arg_size = 8 -- always, at the mo
-
-        push_args [] code = return code
-        push_args (arg:rest) code
-           | isFloatType arg_rep = do
-             (arg_reg, arg_code) <- getSomeReg arg
-             delta <- getDeltaNat
-             setDeltaNat (delta-arg_size)
-             let fmt = floatFormat width
-                 code' = code `appOL` arg_code `appOL` toOL [
-                            SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
-                            DELTA (delta-arg_size),
-                            MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
-             push_args rest code'
-
-           | otherwise = do
-             -- Arguments can be smaller than 64-bit, but we still use @PUSH
-             -- II64@ - the usual calling conventions expect integers to be
-             -- 8-byte aligned.
-             massert (width <= W64)
-             (arg_op, arg_code) <- getOperand arg
-             delta <- getDeltaNat
-             setDeltaNat (delta-arg_size)
-             let code' = code `appOL` arg_code `appOL` toOL [
-                                    PUSH II64 arg_op,
-                                    DELTA (delta-arg_size)]
-             push_args rest code'
-            where
-              arg_rep = cmmExprType platform arg
-              width = typeWidth arg_rep
-
-        leaveStackSpace n = do
-             delta <- getDeltaNat
-             setDeltaNat (delta - n * arg_size)
-             return $ toOL [
-                         SUB II64 (OpImm (ImmInt (n * platformWordSizeInBytes platform))) (OpReg rsp),
-                         DELTA (delta - n * arg_size)]
-
-    (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
-         <-
-        if platformOS platform == OSMinGW32
-        then load_args_win prom_args [] [] (allArgRegs platform) nilOL
-        else do
-           let intArgRegs = map (\r -> RegWithFormat r II64) $ allIntArgRegs platform
-               fpArgRegs = map (\r -> RegWithFormat r FF64) $ allFPArgRegs platform
-           (stack_args, aregs, fregs, load_args_code, assign_args_code)
-               <- load_args prom_args intArgRegs fpArgRegs nilOL nilOL
-           let used_regs rs as = dropTail (length rs) as
-               fregs_used      = used_regs fregs fpArgRegs
-               aregs_used      = used_regs aregs intArgRegs
-           return (stack_args, aregs_used, fregs_used, load_args_code
-                                                      , assign_args_code)
+    -- Compute the code for loading arguments into registers,
+    -- returning the leftover arguments that will need to be passed on the stack.
+    --
+    -- NB: the code for loading references to data into registers is computed
+    -- later (in 'pushArgs'), because we don't yet know where the data will be
+    -- placed (due to alignment requirements).
+    LoadArgs
+      { stackArgs       = proper_stack_args
+      , stackDataArgs   = stack_data_args
+      , usedRegs        = arg_regs_used
+      , assignArgsCode  = assign_args_code
+      }
+      <- loadArgs config prom_args
 
     let
-        wordFmt = archWordFormat (target32Bit platform)
-        arg_regs_used = int_regs_used ++ fp_regs_used
-        arg_regs = [RegWithFormat eax wordFmt] ++ arg_regs_used
-                -- for annotating the call instruction with
-        sse_regs = length fp_regs_used
-        arg_stack_slots = if platformOS platform == OSMinGW32
-                          then length stack_args + length (allArgRegs platform)
-                          else length stack_args
-        tot_arg_size = arg_size * arg_stack_slots
 
+    -- Pad all arguments and data passed on stack to align them properly.
+        (stk_args_with_padding, args_aligned_16) =
+          padStackArgs platform (proper_stack_args, stack_data_args)
 
     -- Align stack to 16n for calls, assuming a starting stack
     -- alignment of 16n - word_size on procedure entry. Which we
     -- maintain. See Note [Stack Alignment on X86] in rts/StgCRun.c
-    let word_size = platformWordSizeInBytes platform
-    (real_size, adjust_rsp) <-
-        if (tot_arg_size + word_size) `rem` 16 == 0
-            then return (tot_arg_size, nilOL)
-            else do -- we need to adjust...
-                delta <- getDeltaNat
-                setDeltaNat (delta - word_size)
-                return (tot_arg_size + word_size, toOL [
-                                SUB II64 (OpImm (ImmInt word_size)) (OpReg rsp),
-                                DELTA (delta - word_size) ])
-
-    -- push the stack args, right to left
-    push_code <- push_args (reverse stack_args) nilOL
-    -- On Win64, we also have to leave stack space for the arguments
-    -- that we are passing in registers
-    lss_code <- if platformOS platform == OSMinGW32
-                then leaveStackSpace (length (allArgRegs platform))
-                else return nilOL
+        need_realign_call = args_aligned_16
+    align_call_code <-
+      if need_realign_call
+      then addStackPadding word_size
+      else return nilOL
+
+    -- Compute the code that pushes data to the stack, and also
+    -- the code that loads references to that data into registers,
+    -- when the data is passed by reference in a register.
+    (load_data_refs, push_code) <-
+      pushArgs config proper_stack_args stk_args_with_padding
+
+    -- On Windows, leave stack space for the arguments that we are passing
+    -- in registers (the so-called shadow space).
+    let shadow_space =
+          if platformOS platform == OSMinGW32
+          then 8 * length (allArgRegs platform)
+            -- NB: the shadow store is always 8 * 4 = 32 bytes large,
+            -- i.e. the cumulative size of rcx, rdx, r8, r9 (see 'allArgRegs').
+          else 0
+    shadow_space_code <- addStackPadding shadow_space
+
+    let total_args_size
+          = shadow_space
+          + sum (map (stackArgSpace platform) stk_args_with_padding)
+        real_size =
+          total_args_size + if need_realign_call then word_size else 0
+
+    -- End of argument passing.
+    --
+    -- Next step: emit the appropriate call instruction.
     delta <- getDeltaNat
 
+    let arg_regs = [RegWithFormat eax wordFmt] ++ arg_regs_used
+          -- for annotating the call instruction with
+
     -- deal with static vs dynamic call targets
     (callinsns,_cconv) <- case addr of
       CmmLit (CmmLabel lbl) ->
@@ -3594,7 +3490,7 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
 
     let
-        -- The x86_64 ABI requires us to set %al to the number of SSE2
+        -- The System V AMD64 ABI requires us to set %al to the number of SSE2
         -- registers that contain arguments, if the called routine
         -- is a varargs function.  We don't know whether it's a
         -- varargs function or not, so we have to assume it is.
@@ -3602,7 +3498,12 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         -- It's not safe to omit this assignment, even if the number
         -- of SSE2 regs in use is zero.  If %al is larger than 8
         -- on entry to a varargs function, seg faults ensue.
-        assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
+        nb_sse_regs_used = count (isFloatFormat . regWithFormat_format) arg_regs_used
+        assign_eax_sse_regs
+          | platformOS platform == OSMinGW32
+          = nilOL
+          | otherwise
+          = unitOL (MOV II32 (OpImm (ImmInt nb_sse_regs_used)) (OpReg eax))
 
     let call = callinsns `appOL`
                toOL (
@@ -3619,7 +3520,7 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         assign_code []     = nilOL
         assign_code [dest] =
           unitOL $
-            movInstr config fmt (OpReg reg) (OpReg r_dest)
+            mkRegRegMoveInstr config fmt reg r_dest
           where
             reg = if isIntFormat fmt then rax else xmm0
             fmt = cmmTypeFormat rep
@@ -3627,24 +3528,581 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
             r_dest = getRegisterReg platform (CmmLocal dest)
         assign_code _many = panic "genForeignCall.assign_code many"
 
-    return (adjust_rsp          `appOL`
+    return (align_call_code     `appOL`
             push_code           `appOL`
-            load_args_code      `appOL`
             assign_args_code    `appOL`
-            lss_code            `appOL`
-            assign_eax sse_regs `appOL`
+            load_data_refs      `appOL`
+            shadow_space_code   `appOL`
+            assign_eax_sse_regs `appOL`
             call                `appOL`
             assign_code dest_regs)
 
+-- -----------------------------------------------------------------------------
+-- Loading arguments into registers for 64-bit C calls.
+
+-- | Information needed to know how to pass arguments in a C call,
+-- and in particular how to load them into registers.
+data LoadArgs
+  = LoadArgs
+  -- | Arguments that should be passed on the stack
+  { stackArgs     :: [RawStackArg]
+  -- | Additional values to store onto the stack.
+  , stackDataArgs :: [CmmExpr]
+  -- | Which registers are we using for argument passing?
+  , usedRegs      :: [RegWithFormat]
+  -- | The code to assign arguments to registers used for argument passing.
+  , assignArgsCode :: InstrBlock
+  }
+-- | Combining 'LoadArgs' concatenates each field pairwise, preserving
+-- left-to-right argument order (lists are appended, code is sequenced).
+instance Semigroup LoadArgs where
+  LoadArgs a1 d1 r1 j1 <> LoadArgs a2 d2 r2 j2
+    = LoadArgs (a1 ++ a2) (d1 ++ d2) (r1 ++ r2) (j1 S.<> j2)
+instance Monoid LoadArgs where
+  mempty = LoadArgs [] [] [] nilOL
+
+-- | An argument passed on the stack, either directly or by reference.
+--
+-- The padding information hasn't yet been computed (see 'StackArg').
+data RawStackArg
+  -- | Pass the argument on the stack directly.
+  = RawStackArg { stackArgExpr :: CmmExpr }
+  -- | Pass the argument by reference.
+  | RawStackArgRef
+    { stackRef :: StackRef
+       -- ^ is the reference passed in a register, or on the stack?
+    , stackRefArgSize :: Int
+        -- ^ the size of the data pointed to
+    }
+  deriving ( Show )
+
+-- | An argument passed on the stack, either directly or by reference,
+-- with additional padding information.
+data StackArg
+  -- | Pass the argument on the stack directly.
+  = StackArg
+      { stackArgExpr :: CmmExpr
+      , stackArgPadding :: Int
+        -- ^ padding required (in bytes)
+      }
+  -- | Pass the argument by reference.
+  | StackArgRef
+     { stackRef :: StackRef
+        -- ^ where the reference is passed
+     , stackRefArgSize :: Int
+        -- ^ the size of the data pointed to
+     , stackRefArgPadding :: Int
+       -- ^ padding of the data pointed to
+       -- (the reference itself never requires padding)
+     }
+  deriving ( Show )
+
+-- | Where is a reference to data on the stack passed?
+data StackRef
+  -- | In a register.
+  = InReg Reg
+  -- | On the stack.
+  | OnStack
+  deriving ( Eq, Ord, Show )
+
+-- | An amount of stack padding, in bytes.
+newtype Padding = Padding { paddingBytes :: Int }
+  deriving ( Show, Eq, Ord )
+
+-- | How much space does this 'StackArg' take up on the stack?
+--
+-- Only counts the "reference" part for references, not the data it points to.
+stackArgSpace :: Platform -> StackArg -> Int
+stackArgSpace platform = \case
+  StackArg arg padding ->
+    argSize platform arg + padding
+  StackArgRef { stackRef = ref } ->
+    case ref of
+      InReg   {} -> 0 -- the reference lives in a register: no stack space used
+      OnStack {} -> 8 -- one 8-byte slot for the reference itself
+
+-- | Pad arguments, assuming we start aligned to a 16-byte boundary.
+--
+-- Returns padded arguments, together with whether we end up aligned
+-- to a 16-byte boundary.
+padStackArgs :: Platform
+             -> ([RawStackArg], [CmmExpr])
+             -> ([StackArg], Bool)
+padStackArgs platform (args0, data_args0) =
+  let
+    -- Pad the direct args
+    (args, align_16_mid) = pad_args True args0
+
+    -- Pad the data section
+    (data_args, align_16_end) = pad_args align_16_mid (map RawStackArg data_args0)
+
+    -- Now figure out where the data is placed relative to the direct arguments,
+    -- in order to resolve references.
+    resolve_args :: Int -> [((RawStackArg, Padding), Padding)] -> [StackArg]
+    resolve_args _ [] = []
+    resolve_args i (((stk_arg, Padding pad), Padding arg_pad) : rest) =
+      let (this_arg, i') =
+            case stk_arg of
+              RawStackArg arg -> (StackArg arg pad, i)
+              RawStackArgRef ref size ->
+                let arg =
+                      StackArgRef
+                        { stackRef = ref
+                        , stackRefArgSize = size
+                        , stackRefArgPadding = arg_pad }
+                in (arg, i+1)
+      in this_arg : resolve_args i' rest
+
+    args_data_pads :: [((RawStackArg, Padding), Padding)]
+    args_data_pads = zipWith (\ a ( _, p ) -> ( a, p )) args data_args
+
+  in
+    ( resolve_args 0 args_data_pads ++
+        [ case data_arg of
+            RawStackArg arg -> StackArg arg pad
+            RawStackArgRef {} -> panic "padStackArgs: reference in data section"
+        | (data_arg, Padding pad) <- data_args
+        ]
+    , align_16_end )
+
+  where
+    pad_args :: Bool -> [RawStackArg] -> ([(RawStackArg, Padding)], Bool)
+    pad_args aligned_16 [] = ([], aligned_16)
+    pad_args aligned_16 (arg:args)
+      | needed_alignment > 16
+      -- We don't know if the stack is aligned to 8 (mod 32) or 24 (mod 32).
+      -- This makes aligning the stack to a 32 or 64 byte boundary more
+      -- complicated, in particular with DELTA.
+      = sorry $ unlines
+        [ "X86_64 C call: unsupported argument."
+        , "  Alignment requirement: " ++ show needed_alignment ++ " bytes."
+        , if platformOS platform == OSMinGW32
+          then "  The X86_64 NCG does not (yet) support Windows C calls with 256/512 bit vectors."
+          else "  The X86_64 NCG cannot (yet) pass 256/512 bit vectors on the stack for C calls."
+        , "  Please use the LLVM backend (-fllvm)." ]
+      | otherwise
+      = let ( rest, final_align_16 ) = pad_args next_aligned_16 args
+        in  ( (arg, Padding padding) : rest, final_align_16 )
 
-maybePromoteCArg :: Platform -> Width -> (CmmExpr, ForeignHint) -> CmmExpr
-maybePromoteCArg platform wto (arg, hint)
- | wfrom < wto = case hint of
-     SignedHint -> CmmMachOp (MO_SS_Conv wfrom wto) [arg]
-     _          -> CmmMachOp (MO_UU_Conv wfrom wto) [arg]
- | otherwise   = arg
- where
-   wfrom = cmmExprWidth platform arg
+
+      where
+        needed_alignment = case arg of
+          RawStackArg arg   -> argSize platform arg
+          RawStackArgRef {} -> platformWordSizeInBytes platform
+        -- A 16-byte argument needs 8 bytes of padding when the stack
+        -- is not currently 16-byte aligned.
+        padding
+          | needed_alignment < 16 || aligned_16
+          = 0
+          | otherwise
+          = 8
+        -- Alignment of the stack after pushing this argument
+        -- (and its padding).
+        next_aligned_16 = not ( aligned_16 && needed_alignment < 16 )
+
+-- | Load arguments into available registers.
+--
+-- Dispatches on the calling convention: Windows X64 uses its own register
+-- assignment (see Note [The Windows X64 C calling convention]); everything
+-- else goes through the System V path.
+loadArgs :: NCGConfig -> [CmmExpr] -> NatM LoadArgs
+loadArgs config args
+  | platformOS platform == OSMinGW32
+  = evalStateT (loadArgsWin config args) (allArgRegs platform)
+  | otherwise
+    -- The threaded state is the pool of still-available
+    -- (integer, floating-point) argument registers.
+  = evalStateT (loadArgsSysV config args) (allIntArgRegs platform
+                                          ,allFPArgRegs  platform)
+  where
+    platform = ncgPlatform config
+
+-- | Load arguments into available registers (System V AMD64 ABI).
+--
+-- The state is the pool of remaining (integer, floating-point) argument
+-- registers; each argument consumes at most one register from one pool.
+loadArgsSysV :: NCGConfig
+             -> [CmmExpr]
+             -> StateT ([Reg], [Reg]) NatM LoadArgs
+loadArgsSysV _ [] = return mempty
+loadArgsSysV config (arg:rest) = do
+  (iregs, fregs) <- get
+  -- No available registers: pass everything on the stack (shortcut).
+  if null iregs && null fregs
+  then return $
+          LoadArgs
+            { stackArgs       = map RawStackArg (arg:rest)
+            , stackDataArgs   = []
+            , assignArgsCode  = nilOL
+            , usedRegs        = []
+            }
+  else do
+    mbReg <-
+      if
+        | isIntFormat arg_fmt
+        , ireg:iregs' <- iregs
+        -> do put (iregs', fregs)
+              return $ Just ireg
+        -- Vector arguments use the floating-point argument registers too.
+        | isFloatFormat arg_fmt || isVecFormat arg_fmt
+        , freg:fregs' <- fregs
+        -> do put (iregs, fregs')
+              return $ Just freg
+        | otherwise
+        -> return Nothing
+    this_arg <-
+      case mbReg of
+        Just reg -> do
+          assign_code <- lift $ loadArgIntoReg arg reg
+          return $
+            LoadArgs
+                { stackArgs       = [] -- passed in register
+                , stackDataArgs   = []
+                , assignArgsCode  = assign_code
+                , usedRegs        = [RegWithFormat reg arg_fmt]
+                }
+        Nothing -> do
+          return $
+            -- No available register for this argument: pass it on the stack.
+            LoadArgs
+                { stackArgs       = [RawStackArg arg]
+                , stackDataArgs   = []
+                , assignArgsCode  = nilOL
+                , usedRegs        = []
+                }
+    others <- loadArgsSysV config rest
+    return $ this_arg S.<> others
+
+  where
+    platform = ncgPlatform config
+    arg_fmt = cmmTypeFormat (cmmExprType platform arg)
+
+-- | Compute all things that will need to be pushed to the stack.
+--
+-- On Windows, an argument passed by reference will require two pieces of data:
+--
+--  - the reference (returned in the first position)
+--  - the actual data (returned in the second position)
+computeWinPushArgs :: Platform -> [CmmExpr] -> ([RawStackArg], [CmmExpr])
+computeWinPushArgs platform = go
+  where
+    go :: [CmmExpr] -> ([RawStackArg], [CmmExpr])
+    go [] = ([], [])
+    go (arg:args) =
+      let
+        arg_size = argSize platform arg
+        (this_arg, add_this_arg)
+          -- Arguments larger than 8 bytes are passed by reference;
+          -- see Note [The Windows X64 C calling convention].
+          | arg_size > 8
+          = ( RawStackArgRef OnStack arg_size, (arg :) )
+          | otherwise
+          = ( RawStackArg arg, id )
+        (stk_args, stk_data) = go args
+      in
+        (this_arg:stk_args, add_this_arg stk_data)
+
+-- | Load arguments into available registers (Windows C X64 calling convention).
+--
+-- The state is the pool of remaining argument register pairs: each argument
+-- position consumes one (integer, floating-point) pair (see 'allArgRegs').
+loadArgsWin :: NCGConfig -> [CmmExpr] -> StateT [(Reg,Reg)] NatM LoadArgs
+loadArgsWin _ [] = return mempty
+loadArgsWin config (arg:rest) = do
+  regs <- get
+  case regs of
+    reg:regs' -> do
+      put regs'
+      this_arg <- lift $ load_arg_win reg
+      rest <- loadArgsWin config rest
+      return $ this_arg S.<> rest
+    [] -> do
+      -- No more registers available: pass all (remaining) arguments on the stack.
+      let (stk_args, data_args) = computeWinPushArgs platform (arg:rest)
+      return $
+        LoadArgs
+          { stackArgs       = stk_args
+          , stackDataArgs   = data_args
+          , assignArgsCode  = nilOL
+          , usedRegs        = []
+          }
+  where
+    platform = ncgPlatform config
+    arg_fmt = cmmTypeFormat $ cmmExprType platform arg
+    load_arg_win (ireg, freg)
+      | isVecFormat arg_fmt
+       -- Vectors are passed by reference.
+       -- See Note [The Windows X64 C calling convention].
+      = do return $
+             LoadArgs
+                -- Pass the reference in a register,
+                -- and the argument data on the stack.
+                { stackArgs       = [RawStackArgRef (InReg ireg) (argSize platform arg)]
+                , stackDataArgs   = [arg] -- we don't yet know where the data will reside,
+                , assignArgsCode  = nilOL -- so we defer computing the reference and storing it
+                                          -- in the register until later
+                , usedRegs        = [RegWithFormat ireg II64]
+                }
+      | otherwise
+      = do let arg_reg
+                  | isFloatFormat arg_fmt
+                  = freg
+                  | otherwise
+                  = ireg
+           assign_code <- loadArgIntoReg arg arg_reg
+           -- Recall that, for varargs, we must pass floating-point
+           -- arguments in both fp and integer registers.
+           let (assign_code', regs')
+                | isFloatFormat arg_fmt =
+                    ( assign_code `snocOL` MOVD FF64 (OpReg freg) (OpReg ireg),
+                      [ RegWithFormat freg FF64
+                      , RegWithFormat ireg II64 ])
+                | otherwise = (assign_code, [RegWithFormat ireg II64])
+           return $
+             LoadArgs
+               { stackArgs       = [] -- passed in register
+               , stackDataArgs   = []
+               , assignArgsCode = assign_code'
+               , usedRegs = regs'
+               }
+
+-- | Load an argument into a register.
+--
+-- Assumes that the expression does not contain any MachOps,
+-- as per Note [Evaluate C-call arguments before placing in destination registers].
+loadArgIntoReg :: CmmExpr -> Reg -> NatM InstrBlock
+loadArgIntoReg arg reg = do
+  -- Debug-only sanity check: computing 'arg' must not risk clobbering
+  -- registers holding previously-assigned arguments.
+  when (debugIsOn && loadIntoRegMightClobberOtherReg arg) $ do
+    platform <- getPlatform
+    massertPpr False $
+      vcat [ text "loadArgIntoReg: arg might contain MachOp"
+           , text "arg:" <+> pdoc platform arg ]
+  arg_code <- getAnyReg arg
+  return $ arg_code reg
+
+-- -----------------------------------------------------------------------------
+-- Pushing arguments onto the stack for 64-bit C calls.
+
+-- | The size of an argument (in bytes).
+--
+-- Never smaller than the platform word width.
+argSize :: Platform -> CmmExpr -> Int
+argSize platform arg =
+  -- Sub-word arguments still occupy (at least) a full word.
+  max (platformWordSizeInBytes platform) $
+    widthInBytes (typeWidth $ cmmExprType platform arg)
+
+-- | Add the given amount of padding on the stack.
+addStackPadding :: Int -- ^ padding (in bytes)
+                -> NatM InstrBlock
+addStackPadding pad_bytes
+  | pad_bytes == 0
+  = return nilOL
+  | otherwise
+  = do delta <- getDeltaNat
+       -- Keep the stack-delta bookkeeping in sync with the %rsp adjustment.
+       setDeltaNat (delta - pad_bytes)
+       return $
+         toOL [ SUB II64 (OpImm (ImmInt pad_bytes)) (OpReg rsp)
+              , DELTA (delta - pad_bytes)
+              ]
+
+-- | Push one argument directly to the stack (by value).
+--
+-- Assumes the current stack pointer fulfills any necessary alignment requirements.
+pushArgByValue :: NCGConfig -> CmmExpr -> NatM InstrBlock
+pushArgByValue config arg = do
+  (arg_reg, arg_code) <- getSomeReg arg
+  delta <- getDeltaNat
+  setDeltaNat (delta-arg_size)
+  let fmt = cmmTypeFormat arg_rep
+  return $ arg_code `appOL` toOL
+     -- Make room on the stack, then store the value at the new stack top.
+     [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp)
+     , DELTA (delta-arg_size)
+     -- NB: it's important to only move the actual size,
+     -- e.g. for W8 width we must only move 8 bits and not 64.
+     -- Suppose for example the argument is (CmmLoad addr W8 aln);
+     -- then we must make sure not to try to read more than 8 bits from 'addr'.
+     , movInstr config fmt (OpReg arg_reg) (OpAddr (spRel platform 0)) ]
+    where
+      platform = ncgPlatform config
+      arg_size = argSize platform arg
+      arg_rep = cmmExprType platform arg
+
+-- | Load an argument into a register or push it to the stack.
+--
+-- Returns the code that loads a reference into a register (first component)
+-- and the code that pushes data to the stack (second component).
+-- See Note [The Windows X64 C calling convention].
+loadOrPushArg :: NCGConfig -> (StackArg, Maybe Int) -> NatM (InstrBlock, InstrBlock)
+loadOrPushArg config (stk_arg, mb_off) =
+  case stk_arg of
+    StackArg arg pad -> do
+      push_code <- pushArgByValue config arg
+      pad_code  <- addStackPadding pad
+      return (nilOL, push_code `appOL` pad_code)
+    StackArgRef { stackRef = ref } ->
+      case ref of
+        -- Pass the reference in a register
+        InReg ireg ->
+          return (unitOL $ LEA II64 (OpAddr (spRel platform off)) (OpReg ireg), nilOL)
+        -- Pass the reference on the stack
+        OnStack {} -> do
+          tmp <- getNewRegNat II64
+          delta <- getDeltaNat
+          setDeltaNat (delta-arg_ref_size)
+          -- Compute the address of the data and push it via a temporary.
+          let push_code = toOL
+                [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_ref_size)) (OpReg rsp)
+                , DELTA (delta-arg_ref_size)
+                , LEA II64 (OpAddr (spRel platform off)) (OpReg tmp)
+                , MOV II64 (OpReg tmp) (OpAddr (spRel platform 0)) ]
+          return (nilOL, push_code)
+      where off = expectJust "loadOrPushArg offset" mb_off
+    where
+      arg_ref_size = 8 -- passing a reference to the argument
+      platform = ncgPlatform config
+
+-- | Push arguments to the stack, right to left.
+--
+-- On Windows, some arguments may need to be passed by reference,
+-- which requires separately passing the data and the reference.
+-- See Note [The Windows X64 C calling convention].
+--
+-- Returns the code loading references into registers (first component)
+-- and the code pushing arguments to the stack (second component).
+pushArgs :: NCGConfig
+         -> [RawStackArg]
+            -- ^ arguments proper (i.e. don't include the data for arguments passed by reference)
+         -> [StackArg]
+            -- ^ arguments we are passing on the stack
+         -> NatM (InstrBlock, InstrBlock)
+pushArgs config proper_args all_stk_args
+  = do { let
+            vec_offs :: [Maybe Int]
+            vec_offs
+              | platformOS platform == OSMinGW32
+              = go stack_arg_size all_stk_args
+              | otherwise
+                -- Only Windows passes arguments by reference.
+              = repeat Nothing
+
+    ---------------------
+    -- Windows-only code
+
+            -- Size of the arguments we are passing on the stack, counting only
+            -- the reference part for arguments passed by reference.
+            stack_arg_size = 8 * count not_in_reg proper_args
+            not_in_reg (RawStackArg {}) = True
+            not_in_reg (RawStackArgRef { stackRef = ref }) =
+              case ref of
+                InReg {} -> False
+                OnStack {} -> True
+
+            -- Check an offset is valid (8-byte aligned), for assertions.
+            ok off = off `rem` 8 == 0
+
+            -- Tricky code: compute the stack offset to the vector data
+            -- for this argument.
+            --
+            -- If you're confused, Note [The Windows X64 C calling convention]
+            -- contains a helpful diagram.
+            go :: Int -> [StackArg] -> [Maybe Int]
+            go _ [] = []
+            go off (stk_arg:args) =
+              assertPpr (ok off) (text "unaligned offset:" <+> ppr off) $
+              case stk_arg of
+                StackArg {} ->
+                  -- Only account for the stack pointer movement.
+                  let off' = off - stackArgSpace platform stk_arg
+                  in Nothing : go off' args
+                StackArgRef
+                  { stackRefArgSize    = data_size
+                  , stackRefArgPadding = data_pad } ->
+                  assertPpr (ok data_size) (text "unaligned data size:" <+> ppr data_size) $
+                  assertPpr (ok data_pad) (text "unaligned data padding:" <+> ppr data_pad) $
+                  let off' = off
+                        -- Next piece of data is after the data for this reference
+                           + data_size + data_pad
+                        -- ... and account for the stack pointer movement.
+                           - stackArgSpace platform stk_arg
+                  in Just (data_pad + off) : go off' args
+
+    -- end of Windows-only code
+    ----------------------------
+
+         -- Push the stack arguments (right to left),
+         -- including both the reference and the data for arguments passed by reference.
+       ; (load_regs, push_args) <- foldMapM (loadOrPushArg config) (reverse $ zip all_stk_args vec_offs)
+       ; return (load_regs, push_args) }
+  where
+    platform = ncgPlatform config
+
+{- Note [The Windows X64 C calling convention]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here are a few facts about the Windows X64 C calling convention that
+are important:
+
+  - any argument larger than 8 bytes must be passed by reference,
+    and arguments smaller than 8 bytes are padded to 8 bytes.
+
+  - the first four arguments are passed in registers:
+      - floating-point scalar arguments are passed in %xmm0, %xmm1, %xmm2, %xmm3
+      - other arguments are passed in %rcx, %rdx, %r8, %r9
+        (this includes vector arguments, passed by reference)
+
+    For variadic functions, it is additionally expected that floating point
+    scalar arguments are copied to the corresponding integer register, e.g.
+    the data in xmm2 should also be copied to r8.
+
+    There is no requirement about setting %al like there is for the
+    System V AMD64 ABI.
+
+  - subsequent arguments are passed on the stack.
+
+There are also alignment requirements:
+
+  - the data for vectors must be aligned to the size of the vector,
+    e.g. a 32 byte vector must be aligned on a 32 byte boundary,
+
+  - the call instruction must be aligned to 16 bytes.
+  (This differs from the System V AMD64 ABI, which mandates that the call
+  instruction must be aligned to 32 bytes if there are any 32 byte vectors
+  passed on the stack.)
+
+This motivates our handling of vector values. Suppose we have a function call
+with many arguments, several of them being vectors. We proceed as follows:
+
+  - Add some padding, if necessary, to ensure the stack, when executing the call
+    instruction, is 16-byte aligned. Whether this padding is necessary depends
+    on what happens next. (Recall also that we start off at 8 (mod 16) alignment,
+    as per Note [Stack Alignment on X86] in rts/StgCRun.c)
+  - Push all the vectors to the stack first, adding padding after each one
+    if necessary.
+  - Then push the arguments:
+      - for non-vectors, proceed as usual,
+      - for vectors, push the address of the vector data we pushed above.
+  - Then assign the registers:
+      - for non-vectors, proceed as usual,
+      - for vectors, store the address in a general-purpose register, as opposed
+        to storing the data in an xmm register.
+
+For a concrete example, suppose we have a call of the form:
+
+  f x1 x2 x3 x4 x5 x6 x7
+
+in which:
+
+  - x2, x3, x5 and x7 are 16 byte vectors
+  - the other arguments are all 8 byte wide
+
+Now, x1, x2, x3, x4 will get passed in registers, except that we pass
+x2 and x3 by reference, because they are vectors. We proceed as follows:
+
+  - push the vectors to the stack: x7, x5, x3, x2 (in that order)
+  - push the stack arguments in order: addr(x7), x6, addr(x5)
+  - load the remaining arguments into registers: x4, addr(x3), addr(x2), x1
+
+The tricky part is to get the right offsets for the addresses of the vector
+data. The following visualisation will hopefully clear things up:
+
+                                  ┌──┐
+                                  │▓▓│ ─── padding to align the call instruction
+                      ╭─╴         ╞══╡     (ensures Sp, below, is 16-byte aligned)
+                      │           │  │
+                      │  x7  ───╴ │  │
+                      │           ├──┤
+                      │           │  │
+                      │  x5  ───╴ │  │
+                      │           ├──┤
+     vector data  ────┤           │  │
+(individually padded) │  x3  ───╴ │  │
+                      │           ├──┤
+                      │           │  │
+                      │  x2  ───╴ │  │
+                      │           ├┄┄┤
+                      │           │▓▓│ ─── padding to align x2 to 16 bytes
+               ╭─╴    ╰─╴         ╞══╡
+               │    addr(x7) ───╴ │  │    ╭─ from here: x7 is +64
+               │                  ├──┤ ╾──╯    = 64 (position of x5)
+     stack  ───┤         x6  ───╴ │  │         + 16 (size of x5) + 0 (padding of x7)
+   arguments   │                  ├──┤         - 2 * 8 (x7 is 2 arguments higher than x5)
+               │    addr(x5) ───╴ │  │
+               ╰─╴            ╭─╴ ╞══╡ ╾─── from here:
+                              │   │  │       - x2 is +32 = 24 (stack_arg_size) + 8 (padding of x2)
+                   shadow  ───┤   │  │       - x3 is +48 = 32 (position of x2) + 16 (size of x2) + 0 (padding of x3)
+                    space     │   │  │       - x5 is +64 = 48 (position of x3) + 16 (size of x3) + 0 (padding of x5)
+                              │   │  │
+                              ╰─╴ └──┘ ╾─── Sp
+
+This is all tested in the simd013 test.
+-}
 
 -- -----------------------------------------------------------------------------
 -- Generating a table-branch


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -306,6 +306,7 @@ data Instr
         | VMOVDQU     Format Operand Operand
 
         -- logic operations
+        | PXOR        Format Operand Reg
         | VPXOR       Format Reg Reg Reg
 
         -- Arithmetic
@@ -492,6 +493,12 @@ regUsageOfInstr platform instr
     MOVDQU       fmt src dst   -> mkRU (use_R fmt src []) (use_R fmt dst [])
     VMOVDQU      fmt src dst   -> mkRU (use_R fmt src []) (use_R fmt dst [])
 
+    PXOR fmt (OpReg src) dst
+      | src == dst
+      -> mkRU [] [mk fmt dst]
+      | otherwise
+      -> mkRU [mk fmt src, mk fmt dst] [mk fmt dst]
+
     VPXOR        fmt s1 s2 dst
       | s1 == s2, s1 == dst
       -> mkRU [] [mk fmt dst]
@@ -733,6 +740,7 @@ patchRegsOfInstr platform instr env
     MOVDQU     fmt src dst   -> MOVDQU  fmt (patchOp src) (patchOp dst)
     VMOVDQU    fmt src dst   -> VMOVDQU fmt (patchOp src) (patchOp dst)
 
+    PXOR       fmt src dst   -> PXOR fmt (patchOp src) (env dst)
     VPXOR      fmt s1 s2 dst -> VPXOR fmt (env s1) (env s2) (env dst)
 
     VADD       fmt s1 s2 dst -> VADD fmt (patchOp s1) (env s2) (env dst)
@@ -886,7 +894,7 @@ mkLoadInstr config (RegWithFormat reg fmt) delta slot =
 
 -- | A move instruction for moving the entire contents of an operand
 -- at the given 'Format'.
-movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
+movInstr :: HasDebugCallStack => NCGConfig -> Format -> (Operand -> Operand -> Instr)
 movInstr config fmt =
   case fmt of
     VecFormat _ sFmt ->
@@ -908,16 +916,37 @@ movInstr config fmt =
         w -> sorry $ "Unhandled SIMD vector width: " ++ show w ++ " bits"
     _ -> MOV fmt
   where
+    plat    = ncgPlatform config
     avx     = ncgAvxEnabled config
     avx2    = ncgAvx2Enabled config
     avx512f = ncgAvx512fEnabled config
     avx_move sFmt =
       if isFloatScalarFormat sFmt
-      then VMOVU   fmt
+      then \ op1 op2 ->
+              if
+                | OpReg r1 <- op1
+                , OpReg r2 <- op2
+                , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+                -> pprPanic "movInstr: VMOVU between incompatible registers"
+                     ( vcat [ text "fmt:" <+> ppr fmt
+                            , text "r1:" <+> ppr r1
+                            , text "r2:" <+> ppr r2 ] )
+                | otherwise
+                -> VMOVU   fmt op1 op2
       else VMOVDQU fmt
     sse_move sFmt =
       if isFloatScalarFormat sFmt
-      then MOVU   fmt
+      then \ op1 op2 ->
+              if
+                | OpReg r1 <- op1
+                , OpReg r2 <- op2
+                , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+                -> pprPanic "movInstr: MOVU between incompatible registers"
+                     ( vcat [ text "fmt:" <+> ppr fmt
+                            , text "r1:" <+> ppr r1
+                            , text "r2:" <+> ppr r2 ] )
+                | otherwise
+                -> MOVU   fmt op1 op2
       else MOVDQU fmt
     -- NB: we are using {V}MOVU and not {V}MOVA, because we have no guarantees
     -- about the stack being sufficiently aligned (even for even numbered stack slots).
@@ -982,12 +1011,7 @@ mkRegRegMoveInstr
     -> Reg
     -> Instr
 mkRegRegMoveInstr config fmt src dst =
-  assertPpr (targetClassOfReg platform src == targetClassOfReg platform dst)
-    (vcat [ text "mkRegRegMoveInstr: incompatible register classes"
-          , text "fmt:" <+> ppr fmt
-          , text "src:" <+> ppr src
-          , text "dst:" <+> ppr dst ]) $
-    movInstr config fmt' (OpReg src) (OpReg dst)
+  movInstr config fmt' (OpReg src) (OpReg dst)
       -- Move the platform word size, at a minimum
   where
     platform = ncgPlatform config


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -1016,6 +1016,8 @@ pprInstr platform i = case i of
         VecFormat 64 FmtInt8  -> text "vmovdqu32" -- require the additional AVX512BW extension
         _ -> text "vmovdqu"
 
+   PXOR format src dst
+     -> pprPXor (text "pxor") format src dst
    VPXOR format s1 s2 dst
      -> pprXor (text "vpxor") format s1 s2 dst
    VEXTRACT format offset from to
@@ -1320,6 +1322,15 @@ pprInstr platform i = case i of
            pprReg platform format reg3
        ]
 
+   pprPXor :: Line doc -> Format -> Operand -> Reg -> doc
+   pprPXor name format src dst
+     = line $ hcat [
+           pprGenMnemonic name format,
+           pprOperand platform format src,
+           comma,
+           pprReg platform format dst
+       ]
+
    pprVxor :: Format -> Operand -> Reg -> Reg -> doc
    pprVxor fmt src1 src2 dst
      = line $ hcat [


=====================================
compiler/GHC/CmmToLlvm.hs
=====================================
@@ -139,7 +139,7 @@ llvmGroupLlvmGens cmm = do
                          Nothing                   -> l
                          Just (CmmStaticsRaw info_lbl _) -> info_lbl
               lml <- strCLabel_llvm l'
-              funInsert lml =<< llvmFunTy (map globalRegUse_reg live)
+              funInsert lml =<< llvmFunTy live
               return Nothing
         cdata <- fmap catMaybes $ mapM split cmm
 


=====================================
compiler/GHC/CmmToLlvm/Base.hs
=====================================
@@ -12,7 +12,7 @@
 module GHC.CmmToLlvm.Base (
 
         LlvmCmmDecl, LlvmBasicBlock,
-        LiveGlobalRegs,
+        LiveGlobalRegs, LiveGlobalRegUses,
         LlvmUnresData, LlvmData, UnresLabel, UnresStatic,
 
         LlvmM,
@@ -29,6 +29,8 @@ module GHC.CmmToLlvm.Base (
         llvmFunSig, llvmFunArgs, llvmStdFunAttrs, llvmFunAlign, llvmInfAlign,
         llvmPtrBits, tysToParams, llvmFunSection, padLiveArgs, isFPR,
 
+        lookupRegUse,
+
         strCLabel_llvm,
         getGlobalPtr, generateExternDecls,
 
@@ -58,9 +60,11 @@ import GHC.Types.Unique.Set
 import GHC.Types.Unique.Supply
 import GHC.Utils.Logger
 
-import Data.Maybe (fromJust)
 import Control.Monad.Trans.State (StateT (..))
-import Data.List (isPrefixOf)
+import Control.Applicative (Alternative((<|>)))
+import Data.Maybe (fromJust, mapMaybe)
+
+import Data.List (find, isPrefixOf)
 import qualified Data.List.NonEmpty as NE
 import Data.Ord (comparing)
 
@@ -73,6 +77,7 @@ type LlvmBasicBlock = GenBasicBlock LlvmStatement
 
 -- | Global registers live on proc entry
 type LiveGlobalRegs = [GlobalReg]
+type LiveGlobalRegUses = [GlobalRegUse]
 
 -- | Unresolved code.
 -- Of the form: (data label, data type, unresolved data)
@@ -116,16 +121,16 @@ llvmGhcCC platform
  | otherwise                       = CC_Ghc
 
 -- | Llvm Function type for Cmm function
-llvmFunTy :: LiveGlobalRegs -> LlvmM LlvmType
+llvmFunTy :: LiveGlobalRegUses -> LlvmM LlvmType
 llvmFunTy live = return . LMFunction =<< llvmFunSig' live (fsLit "a") ExternallyVisible
 
 -- | Llvm Function signature
-llvmFunSig :: LiveGlobalRegs ->  CLabel -> LlvmLinkageType -> LlvmM LlvmFunctionDecl
+llvmFunSig :: LiveGlobalRegUses ->  CLabel -> LlvmLinkageType -> LlvmM LlvmFunctionDecl
 llvmFunSig live lbl link = do
   lbl' <- strCLabel_llvm lbl
   llvmFunSig' live lbl' link
 
-llvmFunSig' :: LiveGlobalRegs -> LMString -> LlvmLinkageType -> LlvmM LlvmFunctionDecl
+llvmFunSig' :: LiveGlobalRegUses -> LMString -> LlvmLinkageType -> LlvmM LlvmFunctionDecl
 llvmFunSig' live lbl link
   = do let toParams x | isPointer x = (x, [NoAlias, NoCapture])
                       | otherwise   = (x, [])
@@ -149,16 +154,25 @@ llvmFunSection opts lbl
     | otherwise               = Nothing
 
 -- | A Function's arguments
-llvmFunArgs :: Platform -> LiveGlobalRegs -> [LlvmVar]
+llvmFunArgs :: Platform -> LiveGlobalRegUses -> [LlvmVar]
 llvmFunArgs platform live =
-    map (lmGlobalRegArg platform) (filter isPassed allRegs)
+    map (lmGlobalRegArg platform) (mapMaybe isPassed allRegs)
     where allRegs = activeStgRegs platform
           paddingRegs = padLiveArgs platform live
-          isLive r = r `elem` alwaysLive
-                     || r `elem` live
-                     || r `elem` paddingRegs
-          isPassed r = not (isFPR r) || isLive r
-
+          isLive :: GlobalReg -> Maybe GlobalRegUse
+          isLive r =
+            lookupRegUse r (alwaysLive platform)
+              <|>
+            lookupRegUse r live
+              <|>
+            lookupRegUse r paddingRegs
+          isPassed r =
+            if not (isFPR r)
+            then Just $ GlobalRegUse r (globalRegSpillType platform r)
+            else isLive r
+
+lookupRegUse :: GlobalReg -> [GlobalRegUse] -> Maybe GlobalRegUse
+lookupRegUse r = find ((== r) . globalRegUse_reg)
 
 isFPR :: GlobalReg -> Bool
 isFPR (FloatReg _)  = True
@@ -179,7 +193,7 @@ isFPR _             = False
 -- Invariant: Cmm FPR regs with number "n" maps to real registers with number
 -- "n" If the calling convention uses registers in a different order or if the
 -- invariant doesn't hold, this code probably won't be correct.
-padLiveArgs :: Platform -> LiveGlobalRegs -> LiveGlobalRegs
+padLiveArgs :: Platform -> LiveGlobalRegUses -> LiveGlobalRegUses
 padLiveArgs platform live =
       if platformUnregisterised platform
         then [] -- not using GHC's register convention for platform.
@@ -188,7 +202,7 @@ padLiveArgs platform live =
     ----------------------------------
     -- handle floating-point registers (FPR)
 
-    fprLive = filter isFPR live  -- real live FPR registers
+    fprLive = filter (isFPR . globalRegUse_reg) live  -- real live FPR registers
 
     -- we group live registers sharing the same classes, i.e. that use the same
     -- set of real registers to be passed. E.g. FloatReg, DoubleReg and XmmReg
@@ -196,39 +210,44 @@ padLiveArgs platform live =
     --
     classes         = NE.groupBy sharesClass fprLive
     sharesClass a b = globalRegsOverlap platform (norm a) (norm b) -- check if mapped to overlapping registers
-    norm x          = fpr_ctor x 1                                 -- get the first register of the family
+    norm x = globalRegUse_reg (fpr_ctor x 1)                  -- get the first register of the family
 
     -- For each class, we just have to fill missing registers numbers. We use
     -- the constructor of the greatest register to build padding registers.
     --
     -- E.g. sortedRs = [   F2,   XMM4, D5]
     --      output   = [D1,   D3]
+    padded :: [GlobalRegUse]
     padded      = concatMap padClass classes
+
+    padClass :: NE.NonEmpty GlobalRegUse -> [GlobalRegUse]
     padClass rs = go (NE.toList sortedRs) 1
       where
-         sortedRs = NE.sortBy (comparing fpr_num) rs
+         sortedRs = NE.sortBy (comparing (fpr_num . globalRegUse_reg)) rs
          maxr     = NE.last sortedRs
          ctor     = fpr_ctor maxr
 
          go [] _ = []
-         go (c1:c2:_) _   -- detect bogus case (see #17920)
+         go (GlobalRegUse c1 _: GlobalRegUse c2 _:_) _   -- detect bogus case (see #17920)
             | fpr_num c1 == fpr_num c2
             , Just real <- globalRegMaybe platform c1
             = sorryDoc "LLVM code generator" $
                text "Found two different Cmm registers (" <> ppr c1 <> text "," <> ppr c2 <>
                text ") both alive AND mapped to the same real register: " <> ppr real <>
                text ". This isn't currently supported by the LLVM backend."
-         go (c:cs) f
-            | fpr_num c == f = go cs f                    -- already covered by a real register
-            | otherwise      = ctor f : go (c:cs) (f + 1) -- add padding register
-
-    fpr_ctor :: GlobalReg -> Int -> GlobalReg
-    fpr_ctor (FloatReg _)  = FloatReg
-    fpr_ctor (DoubleReg _) = DoubleReg
-    fpr_ctor (XmmReg _)    = XmmReg
-    fpr_ctor (YmmReg _)    = YmmReg
-    fpr_ctor (ZmmReg _)    = ZmmReg
-    fpr_ctor _ = error "fpr_ctor expected only FPR regs"
+         go (cu@(GlobalRegUse c _):cs) f
+            | fpr_num c == f = go cs f                     -- already covered by a real register
+            | otherwise      = ctor f : go (cu:cs) (f + 1) -- add padding register
+
+    fpr_ctor :: GlobalRegUse -> Int -> GlobalRegUse
+    fpr_ctor (GlobalRegUse r fmt) i =
+      case r of
+        FloatReg _  -> GlobalRegUse (FloatReg  i) fmt
+        DoubleReg _ -> GlobalRegUse (DoubleReg i) fmt
+        XmmReg _    -> GlobalRegUse (XmmReg    i) fmt
+        YmmReg _    -> GlobalRegUse (YmmReg    i) fmt
+        ZmmReg _    -> GlobalRegUse (ZmmReg    i) fmt
+        _           -> error "fpr_ctor expected only FPR regs"
 
     fpr_num :: GlobalReg -> Int
     fpr_num (FloatReg i)  = i


=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -37,13 +37,14 @@ import GHC.Utils.Outputable
 import qualified GHC.Utils.Panic as Panic
 import GHC.Utils.Misc
 
+import Control.Applicative (Alternative((<|>)))
 import Control.Monad.Trans.Class
 import Control.Monad.Trans.Writer
 import Control.Monad
 
 import qualified Data.Semigroup as Semigroup
 import Data.List ( nub )
-import Data.Maybe ( catMaybes )
+import Data.Maybe ( catMaybes, isJust )
 
 type Atomic = Maybe MemoryOrdering
 type LlvmStatements = OrdList LlvmStatement
@@ -57,7 +58,7 @@ genLlvmProc :: RawCmmDecl -> LlvmM [LlvmCmmDecl]
 genLlvmProc (CmmProc infos lbl live graph) = do
     let blocks = toBlockListEntryFirstFalseFallthrough graph
 
-    (lmblocks, lmdata) <- basicBlocksCodeGen (map globalRegUse_reg live) blocks
+    (lmblocks, lmdata) <- basicBlocksCodeGen live blocks
     let info = mapLookup (g_entry graph) infos
         proc = CmmProc info lbl live (ListGraph lmblocks)
     return (proc:lmdata)
@@ -76,7 +77,7 @@ newtype UnreachableBlockId = UnreachableBlockId BlockId
 -- | Generate code for a list of blocks that make up a complete
 -- procedure. The first block in the list is expected to be the entry
 -- point.
-basicBlocksCodeGen :: LiveGlobalRegs -> [CmmBlock]
+basicBlocksCodeGen :: LiveGlobalRegUses -> [CmmBlock]
                       -> LlvmM ([LlvmBasicBlock], [LlvmCmmDecl])
 basicBlocksCodeGen _    []                     = panic "no entry block!"
 basicBlocksCodeGen live cmmBlocks
@@ -152,7 +153,7 @@ stmtToInstrs ubid stmt = case stmt of
 
     -- Tail call
     CmmCall { cml_target = arg,
-              cml_args_regs = live } -> genJump arg $ map globalRegUse_reg live
+              cml_args_regs = live } -> genJump arg live
 
     _ -> panic "Llvm.CodeGen.stmtToInstrs"
 
@@ -1050,7 +1051,7 @@ cmmPrimOpFunctions mop = do
 
 
 -- | Tail function calls
-genJump :: CmmExpr -> [GlobalReg] -> LlvmM StmtData
+genJump :: CmmExpr -> LiveGlobalRegUses -> LlvmM StmtData
 
 -- Call to known function
 genJump (CmmLit (CmmLabel lbl)) live = do
@@ -2056,14 +2057,13 @@ getCmmReg (CmmLocal (LocalReg un _))
            -- have been assigned a value at some point, triggering
            -- "funPrologue" to allocate it on the stack.
 
-getCmmReg (CmmGlobal g)
-  = do let r = globalRegUse_reg g
-       onStack  <- checkStackReg r
+getCmmReg (CmmGlobal ru@(GlobalRegUse r _))
+  = do onStack  <- checkStackReg r
        platform <- getPlatform
        if onStack
-         then return (lmGlobalRegVar platform r)
+         then return (lmGlobalRegVar platform ru)
          else pprPanic "getCmmReg: Cmm register " $
-                ppr g <> text " not stack-allocated!"
+                ppr r <> text " not stack-allocated!"
 
 -- | Return the value of a given register, as well as its type. Might
 -- need to be load from stack.
@@ -2074,7 +2074,7 @@ getCmmRegVal reg =
       onStack <- checkStackReg (globalRegUse_reg g)
       platform <- getPlatform
       if onStack then loadFromStack else do
-        let r = lmGlobalRegArg platform (globalRegUse_reg g)
+        let r = lmGlobalRegArg platform g
         return (r, getVarType r, nilOL)
     _ -> loadFromStack
  where loadFromStack = do
@@ -2187,8 +2187,9 @@ convertMemoryOrdering MemOrderSeqCst  = SyncSeqCst
 -- question is never written. Therefore we skip it where we can to
 -- save a few lines in the output and hopefully speed compilation up a
 -- bit.
-funPrologue :: LiveGlobalRegs -> [CmmBlock] -> LlvmM StmtData
+funPrologue :: LiveGlobalRegUses -> [CmmBlock] -> LlvmM StmtData
 funPrologue live cmmBlocks = do
+  platform <- getPlatform
 
   let getAssignedRegs :: CmmNode O O -> [CmmReg]
       getAssignedRegs (CmmAssign reg _)  = [reg]
@@ -2196,7 +2197,8 @@ funPrologue live cmmBlocks = do
       getAssignedRegs _                  = []
       getRegsBlock (_, body, _)          = concatMap getAssignedRegs $ blockToList body
       assignedRegs = nub $ concatMap (getRegsBlock . blockSplit) cmmBlocks
-      isLive r     = r `elem` alwaysLive || r `elem` live
+      mbLive r     =
+        lookupRegUse r (alwaysLive platform) <|> lookupRegUse r live
 
   platform <- getPlatform
   stmtss <- forM assignedRegs $ \reg ->
@@ -2205,12 +2207,12 @@ funPrologue live cmmBlocks = do
         let (newv, stmts) = allocReg reg
         varInsert un (pLower $ getVarType newv)
         return stmts
-      CmmGlobal (GlobalRegUse r _) -> do
-        let reg   = lmGlobalRegVar platform r
-            arg   = lmGlobalRegArg platform r
+      CmmGlobal ru@(GlobalRegUse r _) -> do
+        let reg   = lmGlobalRegVar platform ru
+            arg   = lmGlobalRegArg platform ru
             ty    = (pLower . getVarType) reg
             trash = LMLitVar $ LMUndefLit ty
-            rval  = if isLive r then arg else trash
+            rval  = if isJust (mbLive r) then arg else trash
             alloc = Assignment reg $ Alloca (pLower $ getVarType reg) 1
         markStackReg r
         return $ toOL [alloc, Store rval reg Nothing []]
@@ -2222,7 +2224,7 @@ funPrologue live cmmBlocks = do
 
 -- | Function epilogue. Load STG variables to use as argument for call.
 -- STG Liveness optimisation done here.
-funEpilogue :: LiveGlobalRegs -> LlvmM ([LlvmVar], LlvmStatements)
+funEpilogue :: LiveGlobalRegUses -> LlvmM ([LlvmVar], LlvmStatements)
 funEpilogue live = do
     platform <- getPlatform
 
@@ -2248,12 +2250,16 @@ funEpilogue live = do
     let allRegs = activeStgRegs platform
     loads <- forM allRegs $ \r -> if
       -- load live registers
-      | r `elem` alwaysLive  -> loadExpr (GlobalRegUse r (globalRegSpillType platform r))
-      | r `elem` live        -> loadExpr (GlobalRegUse r (globalRegSpillType platform r))
+      | Just ru <- lookupRegUse r (alwaysLive platform)
+      -> loadExpr ru
+      | Just ru <- lookupRegUse r live
+      -> loadExpr ru
       -- load all non Floating-Point Registers
-      | not (isFPR r)        -> loadUndef r
+      | not (isFPR r)
+      -> loadUndef (GlobalRegUse r (globalRegSpillType platform r))
       -- load padding Floating-Point Registers
-      | r `elem` paddingRegs -> loadUndef r
+      | Just ru <- lookupRegUse r paddingRegs
+      -> loadUndef ru
       | otherwise            -> return (Nothing, nilOL)
 
     let (vars, stmts) = unzip loads
@@ -2263,7 +2269,7 @@ funEpilogue live = do
 --
 -- This is for Haskell functions, function type is assumed, so doesn't work
 -- with foreign functions.
-getHsFunc :: LiveGlobalRegs -> CLabel -> LlvmM ExprData
+getHsFunc :: LiveGlobalRegUses -> CLabel -> LlvmM ExprData
 getHsFunc live lbl
   = do fty <- llvmFunTy live
        name <- strCLabel_llvm lbl


=====================================
compiler/GHC/CmmToLlvm/Ppr.hs
=====================================
@@ -49,9 +49,8 @@ pprLlvmCmmDecl (CmmData _ lmdata) = do
   return ( vcat $ map (pprLlvmData opts) lmdata
          , vcat $ map (pprLlvmData opts) lmdata)
 
-pprLlvmCmmDecl (CmmProc mb_info entry_lbl liveWithUses (ListGraph blks))
-  = do let live = map globalRegUse_reg liveWithUses
-           lbl = case mb_info of
+pprLlvmCmmDecl (CmmProc mb_info entry_lbl live (ListGraph blks))
+  = do let lbl = case mb_info of
                      Nothing -> entry_lbl
                      Just (CmmStaticsRaw info_lbl _) -> info_lbl
            link = if externallyVisibleCLabel lbl


=====================================
compiler/GHC/CmmToLlvm/Regs.hs
=====================================
@@ -14,25 +14,27 @@ import GHC.Prelude
 import GHC.Llvm
 
 import GHC.Cmm.Expr
+import GHC.CmmToAsm.Format
 import GHC.Platform
 import GHC.Data.FastString
 import GHC.Utils.Panic ( panic )
 import GHC.Types.Unique
 
+
 -- | Get the LlvmVar function variable storing the real register
-lmGlobalRegVar :: Platform -> GlobalReg -> LlvmVar
+lmGlobalRegVar :: Platform -> GlobalRegUse -> LlvmVar
 lmGlobalRegVar platform = pVarLift . lmGlobalReg platform "_Var"
 
 -- | Get the LlvmVar function argument storing the real register
-lmGlobalRegArg :: Platform -> GlobalReg -> LlvmVar
+lmGlobalRegArg :: Platform -> GlobalRegUse -> LlvmVar
 lmGlobalRegArg platform = lmGlobalReg platform "_Arg"
 
 {- Need to make sure the names here can't conflict with the unique generated
    names. Uniques generated names containing only base62 chars. So using say
    the '_' char guarantees this.
 -}
-lmGlobalReg :: Platform -> String -> GlobalReg -> LlvmVar
-lmGlobalReg platform suf reg
+lmGlobalReg :: Platform -> String -> GlobalRegUse -> LlvmVar
+lmGlobalReg platform suf (GlobalRegUse reg ty)
   = case reg of
         BaseReg        -> ptrGlobal $ "Base" ++ suf
         Sp             -> ptrGlobal $ "Sp" ++ suf
@@ -88,13 +90,26 @@ lmGlobalReg platform suf reg
         ptrGlobal    name = LMNLocalVar (fsLit name) (llvmWordPtr platform)
         floatGlobal  name = LMNLocalVar (fsLit name) LMFloat
         doubleGlobal name = LMNLocalVar (fsLit name) LMDouble
-        xmmGlobal    name = LMNLocalVar (fsLit name) (LMVector 4 (LMInt 32))
-        ymmGlobal    name = LMNLocalVar (fsLit name) (LMVector 8 (LMInt 32))
-        zmmGlobal    name = LMNLocalVar (fsLit name) (LMVector 16 (LMInt 32))
+        fmt = cmmTypeFormat ty
+        xmmGlobal    name = LMNLocalVar (fsLit name) (formatLlvmType fmt)
+        ymmGlobal    name = LMNLocalVar (fsLit name) (formatLlvmType fmt)
+        zmmGlobal    name = LMNLocalVar (fsLit name) (formatLlvmType fmt)
+
+formatLlvmType :: Format -> LlvmType
+formatLlvmType II8 = LMInt 8
+formatLlvmType II16 = LMInt 16
+formatLlvmType II32 = LMInt 32
+formatLlvmType II64 = LMInt 64
+formatLlvmType FF32 = LMFloat
+formatLlvmType FF64 = LMDouble
+formatLlvmType (VecFormat l sFmt) = LMVector l (formatLlvmType $ scalarFormatFormat sFmt)
 
 -- | A list of STG Registers that should always be considered alive
-alwaysLive :: [GlobalReg]
-alwaysLive = [BaseReg, Sp, Hp, SpLim, HpLim, node]
+alwaysLive :: Platform -> [GlobalRegUse]
+alwaysLive platform =
+  [ GlobalRegUse r (globalRegSpillType platform r)
+  | r <- [BaseReg, Sp, Hp, SpLim, HpLim, node]
+  ]
 
 -- | STG Type Based Alias Analysis hierarchy
 stgTBAA :: [(Unique, LMString, Maybe Unique)]


=====================================
testsuite/tests/simd/should_run/simd013.hs
=====================================
@@ -8,25 +8,26 @@ module Main where
 import GHC.Exts
 import GHC.Prim
 
-foreign import ccall "sub"
+foreign import ccall unsafe "sub"
   sub :: DoubleX2# -> DoubleX2# -> DoubleX2#
 
-foreign import ccall "add6"
-  add6 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+foreign import ccall unsafe "add7"
+  add7 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
 
 main :: IO ()
 main = do
   let x1, x2 :: DoubleX2#
       x1 = packDoubleX2# (# 9.9##, 99.99## #)
       x2 = packDoubleX2# (# 1.1##, 11.11## #)
-      y1, y2, y3, y4, y5, y6 :: DoubleX2#
-      !y1 = packDoubleX2# (#      1.5##,           2.5## #)
-      !y2 = packDoubleX2# (#     10.25##,         20.25## #)
-      !y3 = packDoubleX2# (#    100.125##,       200.125## #)
-      !y4 = packDoubleX2# (#   1000.0625##,     2000.0625## #)
-      !y5 = packDoubleX2# (#  10000.03125##,   20000.03125## #)
-      !y6 = packDoubleX2# (# 100000.015625##, 200000.015625## #)
+      y1, y2, y3, y4, y5, y6, y7 :: DoubleX2#
+      !y1 = packDoubleX2# (#       1.5##,             2.5## #)
+      !y2 = packDoubleX2# (#      10.25##,           20.25## #)
+      !y3 = packDoubleX2# (#     100.125##,         200.125## #)
+      !y4 = packDoubleX2# (#    1000.0625##,       2000.0625## #)
+      !y5 = packDoubleX2# (#   10000.03125##,     20000.03125## #)
+      !y6 = packDoubleX2# (#  100000.015625##,   200000.015625## #)
+      !y7 = packDoubleX2# (# 1000000.0078125##, 2000000.0078125## #)
       !(# a, b #) = unpackDoubleX2# ( sub x1 x2 )
-      !(# c, d #) = unpackDoubleX2# ( add6 y1 y2 y3 y4 y5 y6 )
+      !(# c, d #) = unpackDoubleX2# ( add7 y1 y2 y3 y4 y5 y6 y7 )
   print ( D# a, D# b )
   print ( D# c, D# d )


=====================================
testsuite/tests/simd/should_run/simd013.stdout
=====================================
@@ -1,2 +1,2 @@
 (8.8,88.88)
-(111111.984375,222222.984375)
+(1111111.9921875,2222222.9921875)


=====================================
testsuite/tests/simd/should_run/simd013C.c
=====================================
@@ -1,12 +1,12 @@
 
-#include <xmmintrin.h>
+#include <immintrin.h>
 
 __m128d sub(__m128d x, __m128d y)
 {
   return _mm_sub_pd(x,y);
 }
 
-__m128d add6(__m128d x1, __m128d x2, __m128d x3, __m128d x4, __m128d x5, __m128d x6)
+__m128d add7(__m128d x1, __m128d x2, __m128d x3, __m128d x4, __m128d x5, __m128d x6, __m128d x7)
 {
-  return _mm_add_pd(x1,_mm_add_pd(x2,_mm_add_pd(x3,_mm_add_pd(x4,_mm_add_pd(x5,x6)))));
+  return _mm_add_pd(x1,_mm_add_pd(x2,_mm_add_pd(x3,_mm_add_pd(x4,_mm_add_pd(x5,_mm_add_pd(x6, x7))))));
 }



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/c27747f25c7209555ae96573f4525839297b7703...ad8c5c6410e818a4677894e0efcb14d7aa45a5e6

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/c27747f25c7209555ae96573f4525839297b7703...ad8c5c6410e818a4677894e0efcb14d7aa45a5e6
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240920/628929f7/attachment-0001.html>


More information about the ghc-commits mailing list