[Git][ghc/ghc][wip/ncg-simd] Fix C calls with SIMD vectors

Fri Aug 30 19:58:02 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
c0e72083 by sheaf at 2024-08-30T21:57:42+02:00
Fix C calls with SIMD vectors

This commit fixes the code generation for C calls, to take into account
the calling convention.

This is particularly tricky on Windows, where all vectors are expected
to be passed by reference. See Note [The Windows X64 C calling convention]
in GHC.CmmToAsm.X86.CodeGen.

- - - - -


5 changed files:

- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- testsuite/tests/simd/should_run/simd013.hs
- testsuite/tests/simd/should_run/simd013.stdout
- testsuite/tests/simd/should_run/simd013C.c


Changes:

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1,6 +1,9 @@
+{-# LANGUAGE DuplicateRecordFields #-}
+{-# LANGUAGE GADTs #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiWayIf #-}
-{-# LANGUAGE GADTs #-}
+{-# LANGUAGE ParallelListComp #-}
+{-# LANGUAGE TupleSections #-}
 {-# LANGUAGE NondecreasingIndentation #-}
 
 {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -73,22 +76,29 @@ import GHC.Types.Tickish ( GenTickish(..) )
 import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
 
 -- The rest:
+import GHC.Data.Maybe ( expectJust )
 import GHC.Types.ForeignCall ( CCallConv(..) )
 import GHC.Data.OrdList
 import GHC.Utils.Outputable
 import GHC.Utils.Constants (debugIsOn)
+import GHC.Utils.Monad ( foldMapM )
 import GHC.Utils.Panic
 import GHC.Data.FastString
 import GHC.Utils.Misc
 import GHC.Types.Unique.Supply ( getUniqueM )
 
+import qualified Data.Semigroup as S
+
 import Control.Monad
+import Control.Monad.Trans.State.Strict
+  ( StateT, evalStateT, get, put )
+import Control.Monad.Trans.Class (lift)
 import Data.Foldable (fold)
 import Data.Int
 import Data.Maybe
 import Data.Word
 
-import qualified Data.Map as M
+import qualified Data.Map as Map
 
 is32BitPlatform :: NatM Bool
 is32BitPlatform = do
@@ -227,7 +237,7 @@ addSpUnwindings instr@(DELTA d) = do
     let platform = ncgPlatform config
     if ncgDwarfUnwindings config
         then do lbl <- mkAsmTempLabel <$> getUniqueM
-                let unwind = M.singleton MachSp (Just $ UwReg (GlobalRegUse MachSp (bWord platform)) $ negate d)
+                let unwind = Map.singleton MachSp (Just $ UwReg (GlobalRegUse MachSp (bWord platform)) $ negate d)
                 return $ toOL [ instr, UNWIND lbl unwind ]
         else return (unitOL instr)
 addSpUnwindings instr = return $ unitOL instr
@@ -328,10 +338,10 @@ stmtToInstrs bid stmt = do
 
       CmmUnwind regs -> do
         let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
-            to_unwind_entry (reg, expr) = M.singleton reg (fmap (toUnwindExpr platform) expr)
+            to_unwind_entry (reg, expr) = Map.singleton reg (fmap (toUnwindExpr platform) expr)
         case foldMap to_unwind_entry regs of
-          tbl | M.null tbl -> return nilOL
-              | otherwise  -> do
+          tbl | Map.null tbl -> return nilOL
+              | otherwise    -> do
                   lbl <- mkAsmTempLabel <$> getUniqueM
                   return $ unitOL $ UNWIND lbl tbl
 
@@ -844,12 +854,12 @@ iselExpr64ParallelBin op e1 e2 = do
 -- targetted for any particular type like Int8, Int32 etc
 data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | VA_Min | VA_Max
 
-getRegister :: CmmExpr -> NatM Register
+getRegister :: HasDebugCallStack => CmmExpr -> NatM Register
 getRegister e = do platform <- getPlatform
                    is32Bit <- is32BitPlatform
                    getRegister' platform is32Bit e
 
-getRegister' :: Platform -> Bool -> CmmExpr -> NatM Register
+getRegister' :: HasDebugCallStack => Platform -> Bool -> CmmExpr -> NatM Register
 
 getRegister' platform is32Bit (CmmReg reg)
   = case reg of
@@ -2312,7 +2322,7 @@ getNonClobberedOperand (CmmLit lit) =
     return (OpAddr addr, code)
   else do
     platform <- getPlatform
-    if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+    if is32BitLit platform lit && isIntFormat (cmmTypeFormat (cmmLitType platform lit))
     then return (OpImm (litToImm lit), nilOL)
     else getNonClobberedOperand_generic (CmmLit lit)
 
@@ -2369,13 +2379,13 @@ getOperand (CmmLit lit) = do
     else do
 
   platform <- getPlatform
-  if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+  if is32BitLit platform lit && (isIntFormat $ cmmTypeFormat (cmmLitType platform lit))
     then return (OpImm (litToImm lit), nilOL)
     else getOperand_generic (CmmLit lit)
 
 getOperand (CmmLoad mem ty _) = do
   is32Bit <- is32BitPlatform
-  if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+  if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
      then do
        Amode src mem_code <- getAmode mem
        return (OpAddr src, mem_code)
@@ -2406,7 +2416,7 @@ addAlignmentCheck align reg =
   where
     check :: Format -> Reg -> InstrBlock
     check fmt reg =
-        assert (not $ isFloatFormat fmt) $
+        assert (isIntFormat fmt) $
         toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
              , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
              ]
@@ -2451,7 +2461,7 @@ isSuitableFloatingPointLit _ = False
 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
 getRegOrMem e@(CmmLoad mem ty _) = do
   is32Bit <- is32BitPlatform
-  if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+  if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
      then do
        Amode src mem_code <- getAmode mem
        return (OpAddr src, mem_code)
@@ -3293,6 +3303,15 @@ genCCall bid addr conv dest_regs args = do
     else genCCall64 addr conv dest_regs args'
   return (instrs0 `appOL` instrs1)
 
+maybePromoteCArg :: Platform -> Width -> (CmmExpr, ForeignHint) -> CmmExpr
+maybePromoteCArg platform wto (arg, hint)
+ | wfrom < wto = case hint of
+     SignedHint -> CmmMachOp (MO_SS_Conv wfrom wto) [arg]
+     _          -> CmmMachOp (MO_UU_Conv wfrom wto) [arg]
+ | otherwise   = arg
+ where
+   wfrom = cmmExprWidth platform arg
+
 genCCall32 :: CmmExpr           -- ^ address of the function to call
            -> ForeignConvention -- ^ calling convention
            -> [CmmFormal]       -- ^ where to put the result
@@ -3325,7 +3344,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
                                      DELTA (delta-8)]
                     )
 
-              | isFloatType arg_ty = do
+              | isFloatType arg_ty || isVecType arg_ty = do
                 (reg, code) <- getSomeReg arg
                 delta <- getDeltaNat
                 setDeltaNat (delta-size)
@@ -3335,11 +3354,10 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
                                       let addr = AddrBaseIndex (EABaseReg esp)
                                                                 EAIndexNone
                                                                 (ImmInt 0)
-                                          format = floatFormat (typeWidth arg_ty)
+                                          format = cmmTypeFormat arg_ty
                                       in
 
-                                      -- assume SSE2
-                                       MOV format (OpReg reg) (OpAddr addr)
+                                       movInstr config format (OpReg reg) (OpAddr addr)
 
                                      ]
                                )
@@ -3363,7 +3381,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
         let
             -- Align stack to 16n for calls, assuming a starting stack
             -- alignment of 16n - word_size on procedure entry. Which we
-            -- maintiain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
+            -- maintain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
             sizes               = map (arg_size_bytes . cmmExprType platform) (reverse args)
             raw_arg_size        = sum sizes + platformWordSizeInBytes platform
             arg_pad_size        = (roundTo 16 $ raw_arg_size) - raw_arg_size
@@ -3408,6 +3426,8 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
             -- assign the results, if necessary
             assign_code []     = nilOL
             assign_code [dest]
+              | isVecType ty
+              = unitOL (movInstr config (cmmTypeFormat ty) (OpReg xmm0) (OpReg r_dest))
               | isFloatType ty =
                   -- we assume SSE2
                   let tmp_amode = AddrBaseIndex (EABaseReg esp)
@@ -3449,189 +3469,74 @@ genCCall64 :: CmmExpr           -- ^ address of function to call
 genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
     config <- getConfig
     let platform = ncgPlatform config
-    -- load up the register arguments
-    let args_hints = zip args (argHints ++ repeat NoHint)
-    let prom_args = map (maybePromoteCArg platform W32) args_hints
-
-    let load_args :: [CmmExpr]
-                  -> [RegFormat]         -- int regs avail for args
-                  -> [RegFormat]         -- FP regs avail for args
-                  -> InstrBlock    -- code computing args
-                  -> InstrBlock    -- code assigning args to ABI regs
-                  -> NatM ([CmmExpr],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
-        -- no more regs to use
-        load_args args [] [] code acode     =
-            return (args, [], [], code, acode)
-
-        -- no more args to push
-        load_args [] aregs fregs code acode =
-            return ([], aregs, fregs, code, acode)
-
-        load_args (arg : rest) aregs fregs code acode
-            | isFloatType arg_rep = case fregs of
-                 []     -> push_this_arg
-                 (RegFormat r _fmt:rs) -> do
-                    (code',acode') <- reg_this_arg r
-                    load_args rest aregs rs code' acode'
-            | otherwise           = case aregs of
-                 []     -> push_this_arg
-                 (RegFormat r _fmt:rs) -> do
-                    (code',acode') <- reg_this_arg r
-                    load_args rest rs fregs code' acode'
-            where
+        args_hints = zip args (argHints ++ repeat NoHint)
+        prom_args = map (maybePromoteCArg platform W32) args_hints
+        word_size = platformWordSizeInBytes platform
+        wordFmt = archWordFormat (target32Bit platform)
 
-              -- put arg into the list of stack pushed args
-              push_this_arg = do
-                 (args',ars,frs,code',acode')
-                     <- load_args rest aregs fregs code acode
-                 return (arg:args', ars, frs, code', acode')
-
-              -- pass the arg into the given register
-              reg_this_arg r
-                -- "operand" args can be directly assigned into r
-                | isOperand platform arg = do
-                    arg_code <- getAnyReg arg
-                    return (code, (acode `appOL` arg_code r))
-                -- The last non-operand arg can be directly assigned after its
-                -- computation without going into a temporary register
-                | all (isOperand platform) rest = do
-                    arg_code   <- getAnyReg arg
-                    return (code `appOL` arg_code r,acode)
-
-                -- other args need to be computed beforehand to avoid clobbering
-                -- previously assigned registers used to pass parameters (see
-                -- #11792, #12614). They are assigned into temporary registers
-                -- and get assigned to proper call ABI registers after they all
-                -- have been computed.
-                | otherwise     = do
-                    arg_code <- getAnyReg arg
-                    tmp      <- getNewRegNat arg_fmt
-                    let
-                      code'  = code `appOL` arg_code tmp
-                      acode' = acode `snocOL` mkRegRegMoveInstr config arg_fmt tmp r
-                    return (code',acode')
-
-              arg_rep = cmmExprType platform arg
-              arg_fmt = cmmTypeFormat arg_rep
-
-        load_args_win :: [CmmExpr]
-                      -> [RegFormat]        -- used int regs
-                      -> [RegFormat]        -- used FP regs
-                      -> [(Reg, Reg)] -- (int, FP) regs avail for args
-                      -> InstrBlock
-                      -> NatM ([CmmExpr],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
-        load_args_win args usedInt usedFP [] code
-            = return (args, usedInt, usedFP, code, nilOL)
-            -- no more regs to use
-        load_args_win [] usedInt usedFP _ code
-            = return ([], usedInt, usedFP, code, nilOL)
-            -- no more args to push
-        load_args_win (arg : rest) usedInt usedFP
-                      ((ireg, freg) : regs) code
-            | isFloatType arg_rep = do
-                 arg_code <- getAnyReg arg
-                 load_args_win rest (RegFormat ireg II64: usedInt) (RegFormat freg FF64 : usedFP) regs
-                               (code `appOL`
-                                arg_code freg `snocOL`
-                                -- If we are calling a varargs function
-                                -- then we need to define ireg as well
-                                -- as freg
-                                MOVD FF64 (OpReg freg) (OpReg ireg))
-            | otherwise = do
-                 arg_code <- getAnyReg arg
-                 load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
-                               (code `appOL` arg_code ireg)
-            where
-              arg_rep = cmmExprType platform arg
-
-        arg_size = 8 -- always, at the mo
-
-        push_args [] code = return code
-        push_args (arg:rest) code
-           | isFloatType arg_rep = do
-             (arg_reg, arg_code) <- getSomeReg arg
-             delta <- getDeltaNat
-             setDeltaNat (delta-arg_size)
-             let fmt = floatFormat width
-                 code' = code `appOL` arg_code `appOL` toOL [
-                            SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
-                            DELTA (delta-arg_size),
-                            MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
-             push_args rest code'
-
-           | otherwise = do
-             -- Arguments can be smaller than 64-bit, but we still use @PUSH
-             -- II64@ - the usual calling conventions expect integers to be
-             -- 8-byte aligned.
-             massert (width <= W64)
-             (arg_op, arg_code) <- getOperand arg
-             delta <- getDeltaNat
-             setDeltaNat (delta-arg_size)
-             let code' = code `appOL` arg_code `appOL` toOL [
-                                    PUSH II64 arg_op,
-                                    DELTA (delta-arg_size)]
-             push_args rest code'
-            where
-              arg_rep = cmmExprType platform arg
-              width = typeWidth arg_rep
-
-        leaveStackSpace n = do
-             delta <- getDeltaNat
-             setDeltaNat (delta - n * arg_size)
-             return $ toOL [
-                         SUB II64 (OpImm (ImmInt (n * platformWordSizeInBytes platform))) (OpReg rsp),
-                         DELTA (delta - n * arg_size)]
-
-    (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
-         <-
-        if platformOS platform == OSMinGW32
-        then load_args_win prom_args [] [] (allArgRegs platform) nilOL
-        else do
-           let intArgRegs = map (\r -> RegFormat r II64) $ allIntArgRegs platform
-               fpArgRegs = map (\r -> RegFormat r FF64) $ allFPArgRegs platform
-           (stack_args, aregs, fregs, load_args_code, assign_args_code)
-               <- load_args prom_args intArgRegs fpArgRegs nilOL nilOL
-           let used_regs rs as = dropTail (length rs) as
-               fregs_used      = used_regs fregs fpArgRegs
-               aregs_used      = used_regs aregs intArgRegs
-           return (stack_args, aregs_used, fregs_used, load_args_code
-                                                      , assign_args_code)
+    -- Compute the code for loading arguments into registers,
+    -- returning the leftover arguments that will need to be passed on the stack.
+    --
+    -- NB: the code for loading references to data into registers is computed
+    -- later (in 'pushArgs'), because we don't yet know where the data will be
+    -- placed (due to alignment requirements).
+    LoadArgs
+      { stackArgs       = proper_stack_args
+      , stackDataArgs   = stack_data_args
+      , usedRegs        = arg_regs_used
+      , computeArgsCode = compute_args_code
+      , assignArgsCode  = assign_args_code
+      }
+      <- if platformOS platform == OSMinGW32
+         then evalStateT (loadArgsWin config prom_args) (allArgRegs platform)
+         else evalStateT (loadArgs config prom_args) (allIntArgRegs platform
+                                                     ,allFPArgRegs  platform)
 
     let
-        wordFmt = archWordFormat (target32Bit platform)
-        arg_regs_used = int_regs_used ++ fp_regs_used
-        arg_regs = [RegFormat eax wordFmt] ++ arg_regs_used
-                -- for annotating the call instruction with
-        sse_regs = length fp_regs_used
-        arg_stack_slots = if platformOS platform == OSMinGW32
-                          then length stack_args + length (allArgRegs platform)
-                          else length stack_args
-        tot_arg_size = arg_size * arg_stack_slots
 
+    -- Pad all arguments and data passed on stack to align them properly.
+        (stk_args_with_padding, args_aligned_16) =
+          padStackArgs platform (proper_stack_args, stack_data_args)
 
     -- Align stack to 16n for calls, assuming a starting stack
     -- alignment of 16n - word_size on procedure entry. Which we
     -- maintain. See Note [Stack Alignment on X86] in rts/StgCRun.c
-    let word_size = platformWordSizeInBytes platform
-    (real_size, adjust_rsp) <-
-        if (tot_arg_size + word_size) `rem` 16 == 0
-            then return (tot_arg_size, nilOL)
-            else do -- we need to adjust...
-                delta <- getDeltaNat
-                setDeltaNat (delta - word_size)
-                return (tot_arg_size + word_size, toOL [
-                                SUB II64 (OpImm (ImmInt word_size)) (OpReg rsp),
-                                DELTA (delta - word_size) ])
-
-    -- push the stack args, right to left
-    push_code <- push_args (reverse stack_args) nilOL
-    -- On Win64, we also have to leave stack space for the arguments
-    -- that we are passing in registers
-    lss_code <- if platformOS platform == OSMinGW32
-                then leaveStackSpace (length (allArgRegs platform))
-                else return nilOL
+        need_realign_call = args_aligned_16
+    align_call_code <-
+      if need_realign_call
+      then addStackPadding word_size
+      else return nilOL
+
+    -- Compute the code that pushes data to the stack, and also
+    -- the code that loads references to that data into registers,
+    -- when the data is passed by reference in a register.
+    (load_data_refs, push_code) <-
+      pushArgs config proper_stack_args stk_args_with_padding
+
+    -- On Windows, leave stack space for the arguments that we are passing
+    -- in registers (the so-called shadow space).
+    let shadow_space =
+          if platformOS platform == OSMinGW32
+          then 8 * length (allArgRegs platform)
+            -- NB: the shadow store is always 8 * 4 = 32 bytes large,
+            -- i.e. the cumulative size of rcx, rdx, r8, r9 (see 'allArgRegs').
+          else 0
+    shadow_space_code <- addStackPadding shadow_space
+
+    let total_args_size
+          = shadow_space
+          + sum (map (stackArgSize platform) stk_args_with_padding)
+        real_size =
+          total_args_size + if need_realign_call then word_size else 0
+
+    -- End of argument passing.
+    --
+    -- Next step: emit the appropriate call instruction.
     delta <- getDeltaNat
 
+    let arg_regs = [RegFormat eax wordFmt] ++ arg_regs_used
+          -- for annotating the call instruction with
+
     -- deal with static vs dynamic call targets
     (callinsns,_cconv) <- case addr of
       CmmLit (CmmLabel lbl) ->
@@ -3641,7 +3546,7 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
 
     let
-        -- The x86_64 ABI requires us to set %al to the number of SSE2
+        -- The System V AMD64 ABI requires us to set %al to the number of SSE2
         -- registers that contain arguments, if the called routine
         -- is a varargs function.  We don't know whether it's a
         -- varargs function or not, so we have to assume it is.
@@ -3649,7 +3554,12 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         -- It's not safe to omit this assignment, even if the number
         -- of SSE2 regs in use is zero.  If %al is larger than 8
         -- on entry to a varargs function, seg faults ensue.
-        assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
+        nb_sse_regs_used = count (isFloatFormat . regFormatFormat) arg_regs_used
+        assign_eax_sse_regs
+          | platformOS platform == OSMinGW32
+          = nilOL
+          | otherwise
+          = unitOL (MOV II32 (OpImm (ImmInt nb_sse_regs_used)) (OpReg eax))
 
     let call = callinsns `appOL`
                toOL (
@@ -3674,24 +3584,601 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
             r_dest = getRegisterReg platform (CmmLocal dest)
         assign_code _many = panic "genForeignCall.assign_code many"
 
-    return (adjust_rsp          `appOL`
+    return (align_call_code     `appOL`
             push_code           `appOL`
-            load_args_code      `appOL`
+            compute_args_code   `appOL`
             assign_args_code    `appOL`
-            lss_code            `appOL`
-            assign_eax sse_regs `appOL`
+            load_data_refs      `appOL`
+            shadow_space_code   `appOL`
+            assign_eax_sse_regs `appOL`
             call                `appOL`
             assign_code dest_regs)
 
+-- -----------------------------------------------------------------------------
+-- Loading arguments into registers for 64-bit C calls.
+
+-- | Information needed to know how to pass arguments in a C call,
+-- and in particular how to load them into registers.
+data LoadArgs
+  = LoadArgs
+  -- | Arguments that should be passed on the stack
+  { stackArgs     :: [RawStackArg]
+  -- | Additional values to store onto the stack.
+  , stackDataArgs :: [CmmExpr]
+  -- | Which registers are we using for argument passing?
+  , usedRegs      :: [RegFormat]
+  -- | The code to compute arguments into (possibly temporary) registers.
+  , computeArgsCode :: InstrBlock
+  -- | The code to assign arguments to registers used for argument passing.
+  , assignArgsCode :: InstrBlock
+  }
+instance Semigroup LoadArgs where
+  LoadArgs a1 d1 r1 i1 j1 <> LoadArgs a2 d2 r2 i2 j2
+    = LoadArgs (a1 ++ a2) (d1 ++ d2) (r1 ++ r2) (i1 S.<> i2) (j1 S.<> j2)
+instance Monoid LoadArgs where
+  mempty = LoadArgs [] [] [] nilOL nilOL
+
+-- | An argument passed on the stack, either directly or by reference.
+--
+-- The padding information hasn't yet been computed (see 'StackArg').
+data RawStackArg
+  -- | Pass the argument on the stack directly.
+  = RawStackArg { stackArgExpr :: CmmExpr }
+  -- | Pass the argument by reference.
+  | RawStackArgRef
+    { stackRef :: StackRef
+       -- ^ is the reference passed in a register, or on the stack?
+    , stackRefArgSize :: Int
+        -- ^ the size of the data pointed to
+    }
+  deriving ( Show )
+
+-- | An argument passed on the stack, either directly or by reference,
+-- with additional padding information.
+data StackArg
+  -- | Pass the argument on the stack directly.
+  = StackArg
+      { stackArgExpr :: CmmExpr
+      , stackArgPadding :: Int
+        -- ^ padding required (in bytes)
+      }
+  -- | Pass the argument by reference.
+  | StackArgRef
+     { stackRef :: StackRef
+        -- ^ where the reference is passed
+     , stackRefArgSize :: Int
+        -- ^ the size of the data pointed to
+     , stackRefArgPadding :: Int
+       -- ^ padding of the data pointed to
+       -- (the reference itself never requires padding)
+     }
+  deriving ( Show )
+
+-- | Where is a reference to data on the stack passed?
+data StackRef
+  -- | In a register.
+  = InReg Reg
+  -- | On the stack.
+  | OnStack
+  deriving ( Eq, Ord, Show )
+
+newtype Padding = Padding { paddingBytes :: Int }
+  deriving ( Show, Eq, Ord )
+
+-- | How much space does this 'StackArg' take up on the stack?
+--
+-- Only counts the "reference" part for references, not the data it points to.
+stackArgSize :: Platform -> StackArg -> Int
+stackArgSize platform = \case
+  StackArg arg padding ->
+    argSize platform arg + padding
+  StackArgRef { stackRef = ref } ->
+    case ref of
+      InReg   {} -> 0
+      OnStack {} -> 8
+
+-- | Pad arguments, assuming we start aligned to a 16-byte boundary.
+padStackArgs :: Platform
+             -> ([RawStackArg], [CmmExpr])
+             -> ([StackArg], Bool)
+padStackArgs platform (args0, data_args0) =
+  let
+    -- Pad the direct args
+    (args, align_16_mid) = pad_args True args0
+
+    -- Pad the data section
+    (data_args, align_16_end) = pad_args align_16_mid (map RawStackArg data_args0)
+
+    -- Now figure out where the data is placed relative to the direct arguments,
+    -- in order to resolve references.
+    resolve_args :: Int -> [(RawStackArg, Padding)] -> [StackArg]
+    resolve_args _ [] = []
+    resolve_args i ((stk_arg, Padding pad) : rest) =
+      let (this_arg, i') =
+            case stk_arg of
+              RawStackArg arg -> (StackArg arg pad, i)
+              RawStackArgRef ref size ->
+                let Padding arg_pad = snd (data_args !! i)
+                    arg =
+                      StackArgRef
+                        { stackRef = ref
+                        , stackRefArgSize = size
+                        , stackRefArgPadding = arg_pad }
+                in (arg, i+1)
+      in this_arg : resolve_args i' rest
+
+  in
+    ( resolve_args 0 args ++
+        [ case data_arg of
+            RawStackArg arg -> StackArg arg pad
+            RawStackArgRef {} -> panic "padStackArgs: reference in data section"
+        | (data_arg, Padding pad) <- data_args
+        ]
+    , align_16_end )
 
-maybePromoteCArg :: Platform -> Width -> (CmmExpr, ForeignHint) -> CmmExpr
-maybePromoteCArg platform wto (arg, hint)
- | wfrom < wto = case hint of
-     SignedHint -> CmmMachOp (MO_SS_Conv wfrom wto) [arg]
-     _          -> CmmMachOp (MO_UU_Conv wfrom wto) [arg]
- | otherwise   = arg
- where
-   wfrom = cmmExprWidth platform arg
+  where
+    pad_args :: Bool -> [RawStackArg] -> ([(RawStackArg, Padding)], Bool)
+    pad_args aligned_16 [] = ([], aligned_16)
+    pad_args aligned_16 (arg:args)
+      | needed_alignment > 16
+      -- We don't know if the stack is aligned to 8 (mod 32) or 24 (mod 32).
+      -- This makes aligning the stack to a 32 or 64 byte boundary more
+      -- complicated, in particular with DELTA.
+      = sorry $ unlines
+        [ "X86_86 C call: unsupported argument."
+        , "  Alignment requirement: " ++ show needed_alignment ++ " bytes."
+        , if platformOS platform == OSMinGW32
+          then "  The X86_64 NCG does not (yet) support Windows C calls with 256/512 bit vectors."
+          else "  The X86_64 NCG cannot (yet) pass 256/512 bit vectors on the stack for C calls."
+        , "  Please use the LLVM backend (-fllvm)." ]
+      | otherwise
+      = let ( rest, final_align_16 ) = pad_args next_aligned_16 args
+        in  ( (arg, Padding padding) : rest, final_align_16 )
+
+      where
+        needed_alignment = case arg of
+          RawStackArg arg   -> argSize platform arg
+          RawStackArgRef {} -> platformWordSizeInBytes platform
+        padding
+          | needed_alignment < 16 || aligned_16
+          = 0
+          | otherwise
+          = 8
+        next_aligned_16 = not ( aligned_16 && needed_alignment < 16 )
+
+-- | Load arguments into available registers (System V AMD64 ABI).
+loadArgs :: NCGConfig
+         -> [CmmExpr]
+         -> StateT ([Reg], [Reg]) NatM LoadArgs
+loadArgs _ [] = return mempty
+loadArgs config (arg:rest) = do
+  (iregs, fregs) <- get
+  -- No available registers: pass everything on the stack (shortcut).
+  if null iregs && null fregs
+  then return $
+          LoadArgs
+            { stackArgs       = map RawStackArg (arg:rest)
+            , stackDataArgs   = []
+            , computeArgsCode = nilOL
+            , assignArgsCode  = nilOL
+            , usedRegs        = []
+            }
+  else do
+    mbReg <-
+      if
+        | isIntFormat arg_fmt
+        , ireg:iregs' <- iregs
+        -> do put (iregs', fregs)
+              return $ Just ireg
+        | isFloatFormat arg_fmt || isVecFormat arg_fmt
+        , freg:fregs' <- fregs
+        -> do put (iregs, fregs')
+              return $ Just freg
+        | otherwise
+        -> return Nothing
+    this_arg <-
+      case mbReg of
+        Just reg -> do
+          (compute_code, assign_code) <- lift $ loadArgIntoReg config arg rest reg
+          return $
+            LoadArgs
+                { stackArgs       = [] -- passed in register
+                , stackDataArgs   = []
+                , computeArgsCode = compute_code
+                , assignArgsCode  = assign_code
+                , usedRegs        = [RegFormat reg arg_fmt]
+                }
+        Nothing -> do
+          return $
+            -- No available register for this argument: pass it on the stack.
+            LoadArgs
+                { stackArgs       = [RawStackArg arg]
+                , stackDataArgs   = []
+                , computeArgsCode = nilOL
+                , assignArgsCode  = nilOL
+                , usedRegs        = []
+                }
+    others <- loadArgs config rest
+    return $ this_arg S.<> others
+
+  where
+    platform = ncgPlatform config
+    arg_fmt = cmmTypeFormat (cmmExprType platform arg)
+
+-- | Compute all things that will need to be pushed to the stack.
+--
+-- On Windows, an argument passed by reference will require two pieces of data:
+--
+--  - the reference (returned in the first position)
+--  - the actual data (returned in the second position)
+computeWinPushArgs :: Platform -> [CmmExpr] -> ([RawStackArg], [CmmExpr])
+computeWinPushArgs platform = go
+  where
+    go :: [CmmExpr] -> ([RawStackArg], [CmmExpr])
+    go [] = ([], [])
+    go (arg:args) =
+      let
+        arg_size = argSize platform arg
+        (this_arg, add_this_arg)
+          | arg_size > 8
+          = ( RawStackArgRef OnStack arg_size, (arg :) )
+          | otherwise
+          = ( RawStackArg arg, id )
+        (stk_args, stk_data) = go args
+      in
+        (this_arg:stk_args, add_this_arg stk_data)
+
+-- | Load arguments into available registers (Windows C X64 calling convention).
+loadArgsWin :: NCGConfig -> [CmmExpr] -> StateT [(Reg,Reg)] NatM LoadArgs
+loadArgsWin _ [] = return mempty
+loadArgsWin config (arg:rest) = do
+  regs <- get
+  case regs of
+    reg:regs' -> do
+      put regs'
+      this_arg <- lift $ load_arg_win reg
+      rest <- loadArgsWin config rest
+      return $ this_arg S.<> rest
+    [] -> do
+      -- No more registers available: pass all (remaining) arguments on the stack.
+      let (stk_args, data_args) = computeWinPushArgs platform (arg:rest)
+      return $
+        LoadArgs
+          { stackArgs       = stk_args
+          , stackDataArgs   = data_args
+          , computeArgsCode = nilOL
+          , assignArgsCode  = nilOL
+          , usedRegs        = []
+          }
+  where
+    platform = ncgPlatform config
+    arg_fmt = cmmTypeFormat $ cmmExprType platform arg
+    load_arg_win (ireg, freg)
+      | isFloatFormat arg_fmt
+      = do (compute_code, assign_code) <- loadArgIntoReg config arg rest freg
+           return $
+             LoadArgs
+               { stackArgs       = [] -- passed in register
+               , stackDataArgs   = []
+               , computeArgsCode = compute_code
+                  -- Recall that, for varargs, we must pass floating-point
+                  -- arguments in both fp and integer registers.
+               , assignArgsCode = assign_code `snocOL` MOVD FF64 (OpReg freg) (OpReg ireg)
+               , usedRegs = [ RegFormat freg FF64
+                            , RegFormat ireg II64 ]
+               }
+      | isVecFormat arg_fmt
+       -- Vectors are passed by reference.
+       -- See Note [The Windows X64 C calling convention].
+      = do return $
+             LoadArgs
+                -- Pass the reference in a register,
+                -- and the argument data on the stack.
+                { stackArgs       = [RawStackArgRef (InReg ireg) (argSize platform arg)]
+                , stackDataArgs   = [arg]
+                , computeArgsCode = nilOL -- we don't yet know where the data will reside,
+                , assignArgsCode  = nilOL -- so we defer computing the reference and storing it
+                                          -- in the register until later
+                , usedRegs        = [RegFormat ireg II64]
+                }
+       | otherwise
+       = do (compute_code, assign_code) <- loadArgIntoReg config arg rest ireg
+            return $
+               LoadArgs
+                 { stackArgs       = [] -- passed in register
+                 , stackDataArgs   = []
+                 , computeArgsCode = compute_code
+                 , assignArgsCode  = assign_code
+                 , usedRegs        = [RegFormat ireg II64]
+                 }
+
+-- | Return two pieces of code:
+--
+--  - code to compute a the given 'CmmExpr' into some (possibly temporary) register
+--  - code to assign the resulting value to the specified register
+--
+-- Using two separate pieces of code handles clobbering issues reported
+-- in e.g. #11792, #12614.
+loadArgIntoReg :: NCGConfig -> CmmExpr -> [CmmExpr] -> Reg -> NatM (InstrBlock, InstrBlock)
+loadArgIntoReg config arg rest reg
+  -- "operand" args can be directly assigned into the register
+  | isOperand platform arg
+  = do arg_code <- getAnyReg arg
+       return (nilOL, arg_code reg)
+  -- The last non-operand arg can be directly assigned after its
+  -- computation without going into a temporary register
+  | all (isOperand platform) rest
+  = do arg_code <- getAnyReg arg
+       return (arg_code reg, nilOL)
+  -- Other args need to be computed beforehand to avoid clobbering
+  -- previously assigned registers used to pass parameters (see
+  -- #11792, #12614). They are assigned into temporary registers
+  -- and get assigned to proper call ABI registers after they all
+  -- have been computed.
+  | otherwise
+  = do arg_code <- getAnyReg arg
+       tmp      <- getNewRegNat arg_fmt
+       return (arg_code tmp, unitOL $ mkRegRegMoveInstr config arg_fmt tmp reg)
+  where
+    platform = ncgPlatform config
+    arg_fmt = cmmTypeFormat $ cmmExprType platform arg
+
+-- -----------------------------------------------------------------------------
+-- Pushing arguments onto the stack for 64-bit C calls.
+
+-- | The size of an argument (in bytes).
+--
+-- Never smaller than the platform word width.
+argSize :: Platform -> CmmExpr -> Int
+argSize platform arg =
+  max (widthInBytes (wordWidth platform)) $
+    widthInBytes (typeWidth $ cmmExprType platform arg)
+
+-- | Add the given amount of padding on the stack.
+addStackPadding :: Int -- ^ padding (in bytes)
+                -> NatM InstrBlock
+addStackPadding pad_bytes
+  | pad_bytes == 0
+  = return nilOL
+  | otherwise
+  = do delta <- getDeltaNat
+       setDeltaNat (delta - pad_bytes)
+       return $
+         toOL [ SUB II64 (OpImm (ImmInt pad_bytes)) (OpReg rsp)
+              , DELTA (delta - pad_bytes)
+              ]
+
+-- | Push one argument directly to the stack (by value).
+--
+-- Assumes the current stack pointer fulfills any necessary alignment requirements.
+pushArgByValue :: NCGConfig -> CmmExpr -> NatM InstrBlock
+pushArgByValue config arg
+   | isFloatType arg_rep || isVecType arg_rep
+   = do
+     (arg_reg, arg_code) <- getSomeReg arg
+     delta <- getDeltaNat
+     setDeltaNat (delta-arg_size)
+     let fmt = cmmTypeFormat arg_rep
+     return $ arg_code `appOL` toOL
+        [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp)
+        , DELTA (delta-arg_size)
+        , movInstr config fmt (OpReg arg_reg) (OpAddr (spRel platform 0)) ]
+
+   | otherwise
+   = do
+     -- Arguments can be smaller than 64-bit, but we still use @PUSH II64 at .
+     -- The usual calling conventions expect integers to be 8-byte aligned.
+     (arg_op, arg_code) <- getOperand arg
+     delta <- getDeltaNat
+     setDeltaNat (delta-arg_size)
+     return $
+       arg_code `appOL` toOL
+       [ PUSH II64 arg_op
+       , DELTA (delta-arg_size) ]
+    where
+      platform = ncgPlatform config
+      arg_size = argSize platform arg
+      arg_rep = cmmExprType platform arg
+
+-- | Load an argument into a register or push it to the stack.
+loadOrPushArg :: NCGConfig -> (StackArg, Maybe Int) -> NatM (InstrBlock, InstrBlock)
+loadOrPushArg config (stk_arg, mb_off) =
+  case stk_arg of
+    StackArg arg pad -> do
+      push_code <- pushArgByValue config arg
+      pad_code  <- addStackPadding pad
+      return (nilOL, push_code `appOL` pad_code)
+    StackArgRef { stackRef = ref } ->
+      case ref of
+        -- Pass the reference in a register
+        InReg ireg ->
+          return (unitOL $ LEA II64 (OpAddr (spRel platform off)) (OpReg ireg), nilOL)
+        -- Pass the reference on the stack
+        OnStack {} -> do
+          tmp <- getNewRegNat II64
+          delta <- getDeltaNat
+          setDeltaNat (delta-arg_ref_size)
+          let push_code = toOL
+                [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_ref_size)) (OpReg rsp)
+                , DELTA (delta-arg_ref_size)
+                , LEA II64 (OpAddr (spRel platform off)) (OpReg tmp)
+                , MOV II64 (OpReg tmp) (OpAddr (spRel platform 0)) ]
+          return (nilOL, push_code)
+      where off = expectJust "push_arg_win offset" mb_off
+    where
+      arg_ref_size = 8 -- passing a reference to the argument
+      platform = ncgPlatform config
+
+-- | Push arguments to the stack, right to left.
+--
+-- On Windows, some arguments may need to be passed by reference,
+-- which requires separately passing the data and the reference.
+-- See Note [The Windows X64 C calling convention].
+pushArgs :: NCGConfig
+         -> [RawStackArg]
+            -- ^ arguments proper (i.e. don't include the data for arguments passed by reference)
+         -> [StackArg]
+            -- ^ arguments we are passing on the stack
+         -> NatM (InstrBlock, InstrBlock)
+pushArgs config proper_args all_stk_args
+  = do { let
+            vec_offs :: [Maybe Int]
+            vec_offs
+              | platformOS platform == OSMinGW32
+              = go stack_arg_size all_stk_args
+              | otherwise
+              = repeat Nothing
+
+    ---------------------
+    -- Windows-only code
+
+            -- Size of the arguments we are passing on the stack, counting only
+            -- the reference part for arguments passed by reference.
+            stack_arg_size = 8 * count not_in_reg proper_args
+            not_in_reg (RawStackArg {}) = True
+            not_in_reg (RawStackArgRef { stackRef = ref }) =
+              case ref of
+                InReg {} -> False
+                OnStack {} -> True
+
+            -- Check an offset is valid (8-byte aligned), for assertions.
+            ok off = off `rem` 8 == 0
+
+            -- Tricky code: compute the stack offset to the vector data
+            -- for this argument.
+            --
+            -- If you're confused, Note [The Windows X64 C calling convention]
+            -- contains a helpful diagram.
+            go :: Int -> [StackArg] -> [Maybe Int]
+            go _ [] = []
+            go off (stk_arg:args) =
+              assertPpr (ok off) (text "unaligned offset:" <+> ppr off) $
+              case stk_arg of
+                StackArg {} ->
+                  -- Only account for the stack pointer movement.
+                  let off' = off - stackArgSize platform stk_arg
+                  in Nothing : go off' args
+                StackArgRef
+                  { stackRefArgSize    = data_size
+                  , stackRefArgPadding = data_pad } ->
+                  assertPpr (ok data_size) (text "unaligned data size:" <+> ppr data_size) $
+                  assertPpr (ok data_pad) (text "unaligned data padding:" <+> ppr data_pad) $
+                  let off' = off
+                        -- Next piece of data is after the data for this reference
+                           + data_size + data_pad
+                        -- ... and account for the stack pointer movement.
+                           - stackArgSize platform stk_arg
+                  in Just (data_pad + off) : go off' args
+
+    -- end of Windows-only code
+    ----------------------------
+
+         -- Push the stack arguments (right to left),
+         -- including both the reference and the data for arguments passed by reference.
+       ; (load_regs, push_args) <- foldMapM (loadOrPushArg config) (reverse $ zip all_stk_args vec_offs)
+       ; return (load_regs, push_args) }
+  where
+    platform = ncgPlatform config
+
+{- Note [The Windows X64 C calling convention]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here are a few facts about the Windows X64 C calling convention that
+are important:
+
+  - any argument larger than 8 bytes must be passed by reference,
+    and arguments smaller than 8 bytes are padded to 8 bytes.
+
+  - the first four arguments are passed in registers:
+      - floating-point scalar arguments are passed in %xmm0, %xmm1, %xmm2, %xmm3
+      - other arguments are passed in %rcx, %rdx, %r8, %r9
+        (this includes vector arguments, passed by reference)
+
+    For variadic functions, it is additionally expected that floating point
+    scalar arguments are copied to the corresponding integer register, e.g.
+    the data in xmm2 should also be copied to r8.
+
+    There is no requirement about setting %al like there is for the
+    System V AMD64 ABI.
+
+  - subsequent arguments are passed on the stack.
+
+There are also alignment requirements:
+
+  - the data for vectors must be aligned to the size of the vector,
+    e.g. a 32 byte vector must be aligned on a 32 byte boundary,
+
+  - the call instruction must be aligned to 16 bytes.
+  (This differs from the System V AMD64 ABI, which mandates that the call
+  instruction must be aligned to 32 bytes if there are any 32 byte vectors
+  passed on the stack.)
+
+This motivates our handling of vector values. Suppose we have a function call
+with many arguments, several of them being vectors. We proceed as follows:
+
+  - Add some padding, if necessary, to ensure the call instruction is
+    16-byte aligned. Whether this padding is necessary depends on what happens
+    next. (Recall also that we start off at 8 (mod 16) alignment, as per
+    Note [Stack Alignment on X86] in rts/StgCRun.c)
+  - Push all the vectors to the stack first, adding padding after each one
+    if necessary.
+  - Then push the arguments:
+      - for non-vectors, proceed as usual,
+      - for vectors, push the address of the vector data we pushed above.
+  - Then assign the registers:
+      - for non-vectors, proceed as usual,
+      - for vectors, store the address in a general-purpose register, as opposed
+        to storing the data in an xmm register.
+
+For a concrete example, suppose we have a call of the form:
+
+  f x1 x2 x3 x4 x5 x6 x7
+
+in which:
+
+  - x2, x3, x5 and x7 are 16 byte vectors
+  - the other arguments are all 8 byte wide
+
+Now, x1, x2, x3, x4 will get passed in registers, except that we pass
+x2 and x3 by reference, because they are vectors. We proceed as follows:
+
+  - push the vectors to the stack: x7, x5, x3, x2 (in that order)
+  - push the stack arguments in order: addr(x7), x6, addr(x5)
+  - load the remaining arguments into registers: x4, addr(x3), addr(x2), x1
+
+The tricky part is to get the right offsets for the addresses of the vector
+data. The following visualisation will hopefully clear things up:
+
+                                  ┌──┐
+                                  │▓▓│ ─── padding to align the call instruction
+                      ╭─╴         ╞══╡     (ensures Sp, below, is 16-byte aligned)
+                      │           │  │
+                      │  x7  ───╴ │  │
+                      │           ├──┤
+                      │           │  │
+                      │  x5  ───╴ │  │
+                      │           ├──┤
+     vector data  ────┤           │  │
+(individually padded) │  x3  ───╴ │  │
+                      │           ├──┤
+                      │           │  │
+                      │  x2  ───╴ │  │
+                      │           ├┄┄┤
+                      │           │▓▓│ ─── padding to align x2 to 16 bytes
+               ╭─╴    ╰─╴         ╞══╡
+               │    addr(x7) ───╴ │  │    ╭─ from here: x7 is +64
+               │                  ├──┤ ╾──╯    = 64 (position of x5)
+     stack  ───┤         x6  ───╴ │  │         + 16 (size of x5) + 0 (padding of x7)
+   arguments   │                  ├──┤         - 2 * 8 (x7 is 2 arguments higher than x5)
+               │    addr(x5) ───╴ │  │
+               ╰─╴            ╭─╴ ╞══╡ ╾─── from here:
+                              │   │  │       - x2 is +32 = 24 (stack_arg_size) + 8 (padding of x2)
+                   shadow  ───┤   │  │       - x3 is +48 = 32 (position of x2) + 16 (size of x2) + 0 (padding of x3)
+                    space     │   │  │       - x5 is +64 = 48 (position of x3) + 16 (size of x3) + 0 (padding of x5)
+                              │   │  │
+                              ╰─╴ └──┘ ╾─── Sp
+
+This is all tested in the simd013 test.
+-}
 
 -- -----------------------------------------------------------------------------
 -- Generating a table-branch


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -892,7 +892,7 @@ mkLoadInstr config (RegFormat reg fmt) delta slot =
 
 -- | A move instruction for moving the entire contents of an operand
 -- at the given 'Format'.
-movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
+movInstr :: HasDebugCallStack => NCGConfig -> Format -> (Operand -> Operand -> Instr)
 movInstr config fmt =
   case fmt of
     VecFormat _ sFmt ->
@@ -914,17 +914,38 @@ movInstr config fmt =
         _ -> sorry $ "Unhandled SIMD vector width: " ++ show (8 * bytes) ++ " bits"
     _ -> MOV fmt
   where
+    plat    = ncgPlatform config
     bytes   = formatInBytes fmt
     avx     = ncgAvxEnabled config
     avx2    = ncgAvx2Enabled config
     avx512f = ncgAvx512fEnabled config
     avx_move sFmt =
       if isFloatScalarFormat sFmt
-      then VMOVU   fmt
+      then \ op1 op2 ->
+              if
+                | OpReg r1 <- op1
+                , OpReg r2 <- op2
+                , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+                -> pprPanic "movInstr: VMOVU between incompatible registers"
+                     ( vcat [ text "fmt:" <+> ppr fmt
+                            , text "r1:" <+> ppr r1
+                            , text "r2:" <+> ppr r2 ] )
+                | otherwise
+                -> VMOVU   fmt op1 op2
       else VMOVDQU fmt
     sse_move sFmt =
       if isFloatScalarFormat sFmt
-      then MOVU   fmt
+      then \ op1 op2 ->
+              if
+                | OpReg r1 <- op1
+                , OpReg r2 <- op2
+                , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+                -> pprPanic "movInstr: MOVU between incompatible registers"
+                     ( vcat [ text "fmt:" <+> ppr fmt
+                            , text "r1:" <+> ppr r1
+                            , text "r2:" <+> ppr r2 ] )
+                | otherwise
+                -> MOVU   fmt op1 op2
       else MOVDQU fmt
     -- NB: we are using {V}MOVU and not {V}MOVA, because we have no guarantees
     -- about the stack being sufficiently aligned (even for even numbered stack slots).
@@ -989,12 +1010,7 @@ mkRegRegMoveInstr
     -> Reg
     -> Instr
 mkRegRegMoveInstr config fmt src dst =
-  assertPpr (targetClassOfReg platform src == targetClassOfReg platform dst)
-    (vcat [ text "mkRegRegMoveInstr: incompatible register classes"
-          , text "fmt:" <+> ppr fmt
-          , text "src:" <+> ppr src
-          , text "dst:" <+> ppr dst ]) $
-    movInstr config fmt' (OpReg src) (OpReg dst)
+  movInstr config fmt' (OpReg src) (OpReg dst)
       -- Move the platform word size, at a minimum
   where
     platform = ncgPlatform config


=====================================
testsuite/tests/simd/should_run/simd013.hs
=====================================
@@ -8,25 +8,26 @@ module Main where
 import GHC.Exts
 import GHC.Prim
 
-foreign import ccall "sub"
+foreign import ccall unsafe "sub"
   sub :: DoubleX2# -> DoubleX2# -> DoubleX2#
 
-foreign import ccall "add6"
-  add6 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+foreign import ccall unsafe "add7"
+  add7 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
 
 main :: IO ()
 main = do
   let x1, x2 :: DoubleX2#
       x1 = packDoubleX2# (# 9.9##, 99.99## #)
       x2 = packDoubleX2# (# 1.1##, 11.11## #)
-      y1, y2, y3, y4, y5, y6 :: DoubleX2#
-      !y1 = packDoubleX2# (#      1.5##,           2.5## #)
-      !y2 = packDoubleX2# (#     10.25##,         20.25## #)
-      !y3 = packDoubleX2# (#    100.125##,       200.125## #)
-      !y4 = packDoubleX2# (#   1000.0625##,     2000.0625## #)
-      !y5 = packDoubleX2# (#  10000.03125##,   20000.03125## #)
-      !y6 = packDoubleX2# (# 100000.015625##, 200000.015625## #)
+      y1, y2, y3, y4, y5, y6, y7 :: DoubleX2#
+      !y1 = packDoubleX2# (#       1.5##,             2.5## #)
+      !y2 = packDoubleX2# (#      10.25##,           20.25## #)
+      !y3 = packDoubleX2# (#     100.125##,         200.125## #)
+      !y4 = packDoubleX2# (#    1000.0625##,       2000.0625## #)
+      !y5 = packDoubleX2# (#   10000.03125##,     20000.03125## #)
+      !y6 = packDoubleX2# (#  100000.015625##,   200000.015625## #)
+      !y7 = packDoubleX2# (# 1000000.0078125##, 2000000.0078125## #)
       !(# a, b #) = unpackDoubleX2# ( sub x1 x2 )
-      !(# c, d #) = unpackDoubleX2# ( add6 y1 y2 y3 y4 y5 y6 )
+      !(# c, d #) = unpackDoubleX2# ( add7 y1 y2 y3 y4 y5 y6 y7 )
   print ( D# a, D# b )
   print ( D# c, D# d )


=====================================
testsuite/tests/simd/should_run/simd013.stdout
=====================================
@@ -1,2 +1,2 @@
 (8.8,88.88)
-(111111.984375,222222.984375)
+(1111111.9921875,2222222.9921875)


=====================================
testsuite/tests/simd/should_run/simd013C.c
=====================================
@@ -1,12 +1,12 @@
 
-#include <xmmintrin.h>
+#include <immintrin.h>
 
 __m128d sub(__m128d x, __m128d y)
 {
   return _mm_sub_pd(x,y);
 }
 
-__m128d add6(__m128d x1, __m128d x2, __m128d x3, __m128d x4, __m128d x5, __m128d x6)
+__m128d add7(__m128d x1, __m128d x2, __m128d x3, __m128d x4, __m128d x5, __m128d x6, __m128d x7)
 {
-  return _mm_add_pd(x1,_mm_add_pd(x2,_mm_add_pd(x3,_mm_add_pd(x4,_mm_add_pd(x5,x6)))));
+  return _mm_add_pd(x1,_mm_add_pd(x2,_mm_add_pd(x3,_mm_add_pd(x4,_mm_add_pd(x5,_mm_add_pd(x6, x7))))));
 }



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/c0e72083097e1a6cd5648087aab0a3818fc01beb

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/c0e72083097e1a6cd5648087aab0a3818fc01beb
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240830/a1234eb5/attachment-0001.html>