[Git][ghc/ghc][wip/ncg-simd] 2 commits: Fix C calls with SIMD vectors

sheaf (@sheaf) gitlab at gitlab.haskell.org
Tue Aug 27 16:06:10 UTC 2024



sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
f884fc29 by sheaf at 2024-08-27T18:05:50+02:00
Fix C calls with SIMD vectors

This commit fixes the code generation for C calls to take the calling
convention into account.

This is particularly tricky on Windows, where all vectors are expected
to be passed by reference. See Note [The Windows X64 C calling convention]
in GHC.CmmToAsm.X86.CodeGen.

- - - - -
73c8ef14 by sheaf at 2024-08-27T18:05:50+02:00
Add test for #25169

- - - - -
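
For context, the kind of call being fixed can be sketched as follows (a
hypothetical example, not part of this patch; `vec_add` stands in for any
C function taking SIMD vector arguments, supplied by a linked C object):

    {-# LANGUAGE MagicHash #-}
    {-# LANGUAGE UnliftedFFITypes #-}
    import GHC.Exts (DoubleX2#)

    -- Hypothetical import: under the Windows X64 calling convention each
    -- DoubleX2# argument must be spilled to the stack and passed by
    -- reference, whereas the System V AMD64 ABI passes it directly in an
    -- xmm register.
    foreign import ccall unsafe "vec_add"
      vecAdd :: DoubleX2# -> DoubleX2# -> DoubleX2#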


8 changed files:

- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- + testsuite/tests/simd/should_run/T25169.hs
- + testsuite/tests/simd/should_run/T25169.stdout
- testsuite/tests/simd/should_run/all.T
- + testsuite/tests/simd/should_run/simd014.hs
- + testsuite/tests/simd/should_run/simd014.stdout
- + testsuite/tests/simd/should_run/simd014Cmm.cmm


Changes:

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1,6 +1,7 @@
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE MultiWayIf #-}
 {-# LANGUAGE GADTs #-}
+{-# LANGUAGE TupleSections #-}
 {-# LANGUAGE NondecreasingIndentation #-}
 
 {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -73,10 +74,12 @@ import GHC.Types.Tickish ( GenTickish(..) )
 import GHC.Types.SrcLoc  ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
 
 -- The rest:
+import GHC.Data.Maybe ( expectJust )
 import GHC.Types.ForeignCall ( CCallConv(..) )
 import GHC.Data.OrdList
 import GHC.Utils.Outputable
 import GHC.Utils.Constants (debugIsOn)
+import GHC.Utils.Monad ( foldMapM )
 import GHC.Utils.Panic
 import GHC.Data.FastString
 import GHC.Utils.Misc
@@ -844,12 +847,12 @@ iselExpr64ParallelBin op e1 e2 = do
 -- targetted for any particular type like Int8, Int32 etc
 data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | VA_Min | VA_Max
 
-getRegister :: CmmExpr -> NatM Register
+getRegister :: HasDebugCallStack => CmmExpr -> NatM Register
 getRegister e = do platform <- getPlatform
                    is32Bit <- is32BitPlatform
                    getRegister' platform is32Bit e
 
-getRegister' :: Platform -> Bool -> CmmExpr -> NatM Register
+getRegister' :: HasDebugCallStack => Platform -> Bool -> CmmExpr -> NatM Register
 
 getRegister' platform is32Bit (CmmReg reg)
   = case reg of
@@ -2312,7 +2315,7 @@ getNonClobberedOperand (CmmLit lit) =
     return (OpAddr addr, code)
   else do
     platform <- getPlatform
-    if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+    if is32BitLit platform lit && isIntFormat (cmmTypeFormat (cmmLitType platform lit))
     then return (OpImm (litToImm lit), nilOL)
     else getNonClobberedOperand_generic (CmmLit lit)
 
@@ -2369,13 +2372,13 @@ getOperand (CmmLit lit) = do
     else do
 
   platform <- getPlatform
-  if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+  if is32BitLit platform lit && (isIntFormat $ cmmTypeFormat (cmmLitType platform lit))
     then return (OpImm (litToImm lit), nilOL)
     else getOperand_generic (CmmLit lit)
 
 getOperand (CmmLoad mem ty _) = do
   is32Bit <- is32BitPlatform
-  if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+  if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
      then do
        Amode src mem_code <- getAmode mem
        return (OpAddr src, mem_code)
@@ -2406,7 +2409,7 @@ addAlignmentCheck align reg =
   where
     check :: Format -> Reg -> InstrBlock
     check fmt reg =
-        assert (not $ isFloatFormat fmt) $
+        assert (isIntFormat fmt) $
         toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
              , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
              ]
@@ -2451,7 +2454,7 @@ isSuitableFloatingPointLit _ = False
 getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
 getRegOrMem e@(CmmLoad mem ty _) = do
   is32Bit <- is32BitPlatform
-  if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+  if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
      then do
        Amode src mem_code <- getAmode mem
        return (OpAddr src, mem_code)
@@ -3325,7 +3328,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
                                      DELTA (delta-8)]
                     )
 
-              | isFloatType arg_ty = do
+              | isFloatType arg_ty || isVecType arg_ty = do
                 (reg, code) <- getSomeReg arg
                 delta <- getDeltaNat
                 setDeltaNat (delta-size)
@@ -3335,11 +3338,10 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
                                       let addr = AddrBaseIndex (EABaseReg esp)
                                                                 EAIndexNone
                                                                 (ImmInt 0)
-                                          format = floatFormat (typeWidth arg_ty)
+                                          format = cmmTypeFormat arg_ty
                                       in
 
-                                      -- assume SSE2
-                                       MOV format (OpReg reg) (OpAddr addr)
+                                       movInstr config format (OpReg reg) (OpAddr addr)
 
                                      ]
                                )
@@ -3363,7 +3365,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
         let
             -- Align stack to 16n for calls, assuming a starting stack
             -- alignment of 16n - word_size on procedure entry. Which we
-            -- maintiain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
+            -- maintain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
             sizes               = map (arg_size_bytes . cmmExprType platform) (reverse args)
             raw_arg_size        = sum sizes + platformWordSizeInBytes platform
             arg_pad_size        = (roundTo 16 $ raw_arg_size) - raw_arg_size
@@ -3408,6 +3410,8 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
             -- assign the results, if necessary
             assign_code []     = nilOL
             assign_code [dest]
+              | isVecType ty
+              = unitOL (movInstr config (cmmTypeFormat ty) (OpReg xmm0) (OpReg r_dest))
               | isFloatType ty =
                   -- we assume SSE2
                   let tmp_amode = AddrBaseIndex (EABaseReg esp)
@@ -3454,36 +3458,41 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
     let prom_args = map (maybePromoteCArg platform W32) args_hints
 
     let load_args :: [CmmExpr]
-                  -> [RegFormat]         -- int regs avail for args
-                  -> [RegFormat]         -- FP regs avail for args
+                  -> [Reg]         -- int regs avail for args
+                  -> [Reg]         -- FP regs avail for args
+                  -> [RegFormat]   -- used int regs
+                  -> [RegFormat]   -- used FP regs
                   -> InstrBlock    -- code computing args
                   -> InstrBlock    -- code assigning args to ABI regs
                   -> NatM ([CmmExpr],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
         -- no more regs to use
-        load_args args [] [] code acode     =
-            return (args, [], [], code, acode)
+        load_args args [] [] used_aregs used_fregs code acode     =
+            return (args, used_aregs, used_fregs, code, acode)
 
         -- no more args to push
-        load_args [] aregs fregs code acode =
-            return ([], aregs, fregs, code, acode)
-
-        load_args (arg : rest) aregs fregs code acode
-            | isFloatType arg_rep = case fregs of
-                 []     -> push_this_arg
-                 (RegFormat r _fmt:rs) -> do
-                    (code',acode') <- reg_this_arg r
-                    load_args rest aregs rs code' acode'
-            | otherwise           = case aregs of
-                 []     -> push_this_arg
-                 (RegFormat r _fmt:rs) -> do
-                    (code',acode') <- reg_this_arg r
-                    load_args rest rs fregs code' acode'
+        load_args [] _aregs _fregs used_aregs used_fregs code acode =
+            return ([], used_aregs, used_fregs, code, acode)
+
+        load_args (arg : rest) aregs fregs used_aregs used_fregs code acode
+            | isFloatType arg_rep || isVecType arg_rep
+            = case fregs of
+                []     -> push_this_arg
+                (r:rs) -> do
+                   (code',acode') <- reg_this_arg r
+                   load_args rest aregs rs used_aregs (RegFormat r fmt:used_fregs) code' acode'
+            | otherwise
+            = case aregs of
+                []     -> push_this_arg
+                (r:rs) -> do
+                   (code',acode') <- reg_this_arg r
+                   load_args rest rs fregs (RegFormat r fmt:used_aregs) used_fregs code' acode'
             where
+              fmt = cmmTypeFormat arg_rep
 
               -- put arg into the list of stack pushed args
               push_this_arg = do
                  (args',ars,frs,code',acode')
-                     <- load_args rest aregs fregs code acode
+                     <- load_args rest aregs fregs used_aregs used_fregs code acode
                  return (arg:args', ars, frs, code', acode')
 
               -- pass the arg into the given register
@@ -3515,86 +3524,65 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
               arg_fmt = cmmTypeFormat arg_rep
 
         load_args_win :: [CmmExpr]
-                      -> [RegFormat]        -- used int regs
-                      -> [RegFormat]        -- used FP regs
+                      -> [RegFormat] -- used int regs
+                      -> [RegFormat] -- used FP regs
                       -> [(Reg, Reg)] -- (int, FP) regs avail for args
                       -> InstrBlock
-                      -> NatM ([CmmExpr],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
+                      -> NatM ([(CmmExpr, Maybe Reg)],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
         load_args_win args usedInt usedFP [] code
-            = return (args, usedInt, usedFP, code, nilOL)
+            = return (map (, Nothing) args, usedInt, usedFP, code, nilOL)
             -- no more regs to use
         load_args_win [] usedInt usedFP _ code
             = return ([], usedInt, usedFP, code, nilOL)
             -- no more args to push
         load_args_win (arg : rest) usedInt usedFP
                       ((ireg, freg) : regs) code
-            | isFloatType arg_rep = do
-                 arg_code <- getAnyReg arg
-                 load_args_win rest (RegFormat ireg II64: usedInt) (RegFormat freg FF64 : usedFP) regs
-                               (code `appOL`
-                                arg_code freg `snocOL`
-                                -- If we are calling a varargs function
-                                -- then we need to define ireg as well
-                                -- as freg
-                                MOVD FF64 (OpReg freg) (OpReg ireg))
-            | otherwise = do
-                 arg_code <- getAnyReg arg
+            | isFloatType arg_rep
+            = do arg_code <- getAnyReg arg
+                 load_args_win rest
+                   (RegFormat ireg II64: usedInt) (RegFormat freg FF64 : usedFP) regs
+                   (code `appOL`
+                    arg_code freg `snocOL`
+                    -- If we are calling a varargs function
+                    -- then we need to define ireg as well
+                    -- as freg
+                    MOVD FF64 (OpReg freg) (OpReg ireg))
+            | isVecType arg_rep
+            -- Vectors are passed by reference.
+            -- See Note [The Windows X64 C calling convention].
+            = do (args', usedInt', usedFP', code', acode') <-
+                   load_args_win rest (RegFormat ireg II64 : usedInt) usedFP regs code
+                 return $
+                    -- return the argument so that we put it on the stack
+                    ((arg, Just ireg):args', usedInt', usedFP', code', acode')
+            | otherwise
+            = do arg_code <- getAnyReg arg
                  load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
-                               (code `appOL` arg_code ireg)
+                    (code `appOL` arg_code ireg)
             where
               arg_rep = cmmExprType platform arg
 
-        arg_size = 8 -- always, at the mo
-
-        push_args [] code = return code
-        push_args (arg:rest) code
-           | isFloatType arg_rep = do
-             (arg_reg, arg_code) <- getSomeReg arg
-             delta <- getDeltaNat
-             setDeltaNat (delta-arg_size)
-             let fmt = floatFormat width
-                 code' = code `appOL` arg_code `appOL` toOL [
-                            SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
-                            DELTA (delta-arg_size),
-                            MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
-             push_args rest code'
-
-           | otherwise = do
-             -- Arguments can be smaller than 64-bit, but we still use @PUSH
-             -- II64@ - the usual calling conventions expect integers to be
-             -- 8-byte aligned.
-             massert (width <= W64)
-             (arg_op, arg_code) <- getOperand arg
-             delta <- getDeltaNat
-             setDeltaNat (delta-arg_size)
-             let code' = code `appOL` arg_code `appOL` toOL [
-                                    PUSH II64 arg_op,
-                                    DELTA (delta-arg_size)]
-             push_args rest code'
-            where
-              arg_rep = cmmExprType platform arg
-              width = typeWidth arg_rep
+        expr_size arg = max (widthInBytes (wordWidth platform)) $ widthInBytes (typeWidth $ cmmExprType platform arg)
 
         leaveStackSpace n = do
              delta <- getDeltaNat
-             setDeltaNat (delta - n * arg_size)
+             setDeltaNat (delta - n * 8)
              return $ toOL [
                          SUB II64 (OpImm (ImmInt (n * platformWordSizeInBytes platform))) (OpReg rsp),
-                         DELTA (delta - n * arg_size)]
+                         DELTA (delta - n * 8)]
+              -- NB: the shadow store is always 8 * 4 = 32 bytes large,
+              -- i.e. the cumulative size of rcx, rdx, r8, r9 (see 'allArgRegs').
 
     (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
          <-
         if platformOS platform == OSMinGW32
         then load_args_win prom_args [] [] (allArgRegs platform) nilOL
         else do
-           let intArgRegs = map (\r -> RegFormat r II64) $ allIntArgRegs platform
-               fpArgRegs = map (\r -> RegFormat r FF64) $ allFPArgRegs platform
-           (stack_args, aregs, fregs, load_args_code, assign_args_code)
-               <- load_args prom_args intArgRegs fpArgRegs nilOL nilOL
-           let used_regs rs as = dropTail (length rs) as
-               fregs_used      = used_regs fregs fpArgRegs
-               aregs_used      = used_regs aregs intArgRegs
-           return (stack_args, aregs_used, fregs_used, load_args_code
+           let intArgRegs = allIntArgRegs platform
+               fpArgRegs = allFPArgRegs platform
+           (stack_args, aregs_used, fregs_used, load_args_code, assign_args_code)
+               <- load_args prom_args intArgRegs fpArgRegs [] [] nilOL nilOL
+           return (map (, Nothing) stack_args, aregs_used, fregs_used, load_args_code
                                                       , assign_args_code)
 
     let
@@ -3603,11 +3591,29 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         arg_regs = [RegFormat eax wordFmt] ++ arg_regs_used
                 -- for annotating the call instruction with
         sse_regs = length fp_regs_used
-        arg_stack_slots = if platformOS platform == OSMinGW32
-                          then length stack_args + length (allArgRegs platform)
-                          else length stack_args
-        tot_arg_size = arg_size * arg_stack_slots
 
+        stack_arg_size
+          | platformOS platform == OSMinGW32
+          = sum
+              [ 8 -- each argument takes up 8 bytes,
+                  -- because vector arguments are passed by reference
+              | (_arg, mb_irep) <- stack_args
+              , isNothing mb_irep
+                  -- don't count an argument passed in a register
+              ]
+          | otherwise
+          = sum (map (expr_size . fst) stack_args)
+
+        tot_arg_size
+          | platformOS platform == OSMinGW32
+          = 8 * length (allArgRegs platform) -- shadow store
+          + stack_arg_size
+          + sum [ expr_size vec_arg
+                | ( vec_arg, _) <- stack_args
+                , isVecType (cmmExprType platform vec_arg)
+                ]
+          | otherwise
+          = stack_arg_size
 
     -- Align stack to 16n for calls, assuming a starting stack
     -- alignment of 16n - word_size on procedure entry. Which we
@@ -3622,13 +3628,104 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
                 return (tot_arg_size + word_size, toOL [
                                 SUB II64 (OpImm (ImmInt word_size)) (OpReg rsp),
                                 DELTA (delta - word_size) ])
+    let
+        -- push one argument to the stack
+        push_arg :: CmmExpr -> NatM (OrdList Instr)
+        push_arg arg
+           | isFloatType arg_rep || isVecType arg_rep = do
+             (arg_reg, arg_code) <- getSomeReg arg
+             delta <- getDeltaNat
+             setDeltaNat (delta-arg_size)
+             let fmt = cmmTypeFormat arg_rep
+             return $ arg_code `appOL` toOL
+                   [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp)
+                   , DELTA (delta-arg_size)
+                   , movInstr config fmt (OpReg arg_reg) (OpAddr (spRel platform 0)) ]
+
+           | otherwise = do
+             -- Arguments can be smaller than 64-bit, but we still use @PUSH
+             -- II64@ - the usual calling conventions expect integers to be
+             -- 8-byte aligned.
+             massert (width <= W64)
+             (arg_op, arg_code) <- getOperand arg
+             delta <- getDeltaNat
+             setDeltaNat (delta-arg_size)
+             return $
+               arg_code `appOL` toOL [
+                        PUSH II64 arg_op,
+                        DELTA (delta-arg_size)]
+            where
+              arg_size = expr_size arg
+              arg_rep = cmmExprType platform arg
+              width = typeWidth arg_rep
+
+        push_arg_win :: ((CmmExpr, Maybe Reg), Maybe Int) -> NatM (OrdList Instr, OrdList Instr)
+        push_arg_win ((arg, mb_ireg), mb_off)
+          | isVecType arg_rep
+          , let off = expectJust "push_arg_win vector offset" mb_off
+          -- Pass references for each of the vector arguments.
+          -- See Note [The Windows X64 C calling convention].
+          = case mb_ireg of
+              -- Pass the reference in a register
+              Just ireg -> return (unitOL $ LEA II64 (OpAddr (spRel platform off)) (OpReg ireg), nilOL)
+              -- Pass the reference on the stack
+              Nothing ->
+                do tmp <- getNewRegNat II64
+                   delta <- getDeltaNat
+                   setDeltaNat (delta-arg_ref_size)
+                   let push_code = toOL
+                         [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_ref_size)) (OpReg rsp)
+                         , DELTA (delta-arg_ref_size)
+                         , LEA II64 (OpAddr (spRel platform off)) (OpReg tmp)
+                         , MOV II64 (OpReg tmp) (OpAddr (spRel platform 0)) ]
+                   return (nilOL, push_code)
+          | otherwise
+          = do { push_code <- push_arg arg; return (nilOL, push_code) }
+            where
+              arg_ref_size = 8 -- passing a reference to the argument
+              arg_rep = cmmExprType platform arg
+
+        push_args_direct :: [CmmExpr] -> NatM (OrdList Instr)
+        push_args_direct = foldMapM push_arg
+
+        push_args :: [(CmmExpr, Maybe Reg)] -> NatM (OrdList Instr, OrdList Instr)
+        push_args rev_args
+          | platformOS platform == OSMinGW32
+          = do { let is_vec = isVecFormat . cmmTypeFormat . cmmExprType platform
+                     vecs = map fst $ filter (is_vec . fst) rev_args
+
+                  -- Slightly tricky code: compute the stack offset to the
+                  -- vector data for this argument.
+                  --
+                  -- If you're confused, Note [The Windows X64 C calling convention]
+                  -- contains a helpful diagram.
+               ; let vec_offs = reverse $ go stack_arg_size $ reverse rev_args
+                       where go _ [] = []
+                             go off ((arg, mb_reg):args)
+                               | is_vec arg
+                               = Just off : go (expr_size arg + off') args
+                               | otherwise
+                               = Nothing : go off' args
+                               where
+                                -- this little computation below accounts for
+                                -- registers in the shadow space
+                                off' = case mb_reg of
+                                  Just {} -> off
+                                  _       -> off - 8
+
+               ; push_vectors <- push_args_direct vecs
+               ; (load_regs, push_args) <- foldMapM push_arg_win (zip rev_args vec_offs)
+               ; return (load_regs, push_vectors `appOL` push_args) }
+          | otherwise
+          = do { push_code <- push_args_direct (map fst rev_args)
+               ; return (nilOL, push_code) }
 
     -- push the stack args, right to left
-    push_code <- push_args (reverse stack_args) nilOL
+    (load_vecrefs_win, push_code) <- push_args (reverse stack_args)
     -- On Win64, we also have to leave stack space for the arguments
     -- that we are passing in registers
     lss_code <- if platformOS platform == OSMinGW32
-                then leaveStackSpace (length (allArgRegs platform))
+                then leaveStackSpace $ length (allArgRegs platform)
                 else return nilOL
     delta <- getDeltaNat
 
@@ -3649,7 +3746,11 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
         -- It's not safe to omit this assignment, even if the number
         -- of SSE2 regs in use is zero.  If %al is larger than 8
         -- on entry to a varargs function, seg faults ensue.
-        assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
+        assign_eax n
+          | platformOS platform == OSMinGW32
+          = nilOL
+          | otherwise
+          = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
 
     let call = callinsns `appOL`
                toOL (
@@ -3677,12 +3778,96 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
     return (adjust_rsp          `appOL`
             push_code           `appOL`
             load_args_code      `appOL`
+            load_vecrefs_win    `appOL`
             assign_args_code    `appOL`
             lss_code            `appOL`
             assign_eax sse_regs `appOL`
             call                `appOL`
             assign_code dest_regs)
 
+{- Note [The Windows X64 C calling convention]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here are a few facts about the Windows X64 C calling convention that
+are important:
+
+  - any argument larger than 8 bytes must be passed by reference,
+    and arguments smaller than 8 bytes are padded to 8 bytes.
+
+  - the first four arguments are passed in registers:
+      - floating-point scalar arguments are passed in %xmm0, %xmm1, %xmm2, %xmm3
+      - other arguments are passed in %rcx, %rdx, %r8, %r9
+        (this includes vector arguments, passed by reference)
+
+    For variadic functions, it is additionally expected that floating point
+    scalar arguments are copied to the corresponding integer register, e.g.
+    the data in xmm2 should also be copied to r8.
+
+    There is no requirement about setting %al like there is for the
+    System V AMD64 ABI.
+
+  - subsequent arguments are passed on the stack.
+
+This motivates our handling of vector values. Suppose we have a function call
+with many arguments, several of them being vectors. We proceed as follows:
+
+  - push all the vectors to the stack first,
+  - then push the arguments:
+      - for non-vectors, proceed as usual
+      - for vectors, push the address of the vector data we pushed above,
+  - then assign the registers:
+      - for non-vectors, proceed as usual,
+      - for vectors, store the address in a general-purpose register, as opposed
+        to storing the data in an xmm register.
+
+For a concrete example, suppose we have a call of the form:
+
+  f x1 x2 x3 x4 x5 x6 x7
+
+in which:
+
+  x2, x3 and x7 are 16 byte vectors
+  x5 is a 32 byte vector
+  the other arguments are all 8 byte wide
+
+Now, x1, x2, x3, x4 will get passed in registers, except that we pass
+x2 and x3 by reference, because they are vectors. We proceed as follows:
+
+  - push the vectors to the stack: x7, x5, x3, x2 (in that order)
+  - push the stack arguments in order: addr(x7), x6, addr(x5)
+  - load the remaining arguments into registers: x4, addr(x3), addr(x2), x1
+
+The tricky part is to get the right offsets for the addresses of the vector
+data. The following visualisation will hopefully clear things up:
+
+                  ╭─╴            ┌──┐
+                  │              │  │
+                  │     x7  ───╴ │  │
+                  │              ├──┤
+                  │              │  │
+                  │              │  │
+                  │              │  │
+       vector ────┤     x5  ───╴ │  │
+        data      │              ├──┤
+                  │              │  │
+                  │     x3  ───╴ │  │
+                  │              ├──┤
+                  │              │  │
+                  │     x2  ───╴ │  │    ╭─ from here: x7 is +72
+              ╭─╴ ╰─╴            ╞══╡    │    note:
+              │    addr(x7) ───╴ │  │    │      a) 72 = 56 + 32 - 2 * 8
+              │                  ├──┤ ╾──╯      b) 56 = offset of x5 (relative to two arguments down)
+    stack  ───┤         x6  ───╴ │  │           c) remove 8 twice because we are two arguments further up
+  arguments   │                  ├──┤
+              │    addr(x5) ───╴ │  │
+              ╰─╴            ╭─╴ ╞══╡ ╾─── from here: x2 is +24, x3 is +40, x5 is +56
+                             │   │  │      note: 24 is stack_arg_size
+                  shadow  ───┤   │  │          = 8 * length [ x5, x6, x7 ]
+                   space     │   │  │
+                             │   │  │
+                             ╰─╴ └──┘ ╾─── Sp
+
+This is all tested in the simd013 test.
+-}
 
 maybePromoteCArg :: Platform -> Width -> (CmmExpr, ForeignHint) -> CmmExpr
 maybePromoteCArg platform wto (arg, hint)
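
The offset bookkeeping in push_args is compact; the following standalone
model (a sketch, not compiler code) reproduces it for the worked example of
the Note, with each triple giving an argument's name, payload size in
bytes, and whether its reference travels in a register:

    -- Walk the remaining arguments left to right, starting at
    -- stack_arg_size. A vector is given the current offset as the address
    -- of its payload, and the next offset skips over that payload; every
    -- argument occupying a real stack slot (i.e. not handed a register)
    -- lowers the base by 8.
    vecOffsets :: Int -> [(String, Int, Bool)] -> [(String, Maybe Int)]
    vecOffsets _   [] = []
    vecOffsets off ((name, size, inReg) : args)
      | size > 8  = (name, Just off) : vecOffsets (size + off') args
      | otherwise = (name, Nothing)  : vecOffsets off'          args
      where off' = if inReg then off else off - 8

    -- The Note's example: x1, x4 are consumed by integer registers; x2, x3
    -- are vectors whose references go in registers; x5, x6, x7 go on the
    -- stack, so stack_arg_size = 8 * 3 = 24.
    main :: IO ()
    main = print $ vecOffsets 24
      [ ("x2", 16, True), ("x3", 16, True)
      , ("x5", 32, False), ("x6", 8, False), ("x7", 16, False) ]
      -- [("x2",Just 24),("x3",Just 40),("x5",Just 56),("x6",Nothing),("x7",Just 72)]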


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -892,7 +892,7 @@ mkLoadInstr config (RegFormat reg fmt) delta slot =
 
 -- | A move instruction for moving the entire contents of an operand
 -- at the given 'Format'.
-movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
+movInstr :: HasDebugCallStack => NCGConfig -> Format -> (Operand -> Operand -> Instr)
 movInstr config fmt =
   case fmt of
     VecFormat _ sFmt ->
@@ -914,17 +914,38 @@ movInstr config fmt =
         _ -> sorry $ "Unhandled SIMD vector width: " ++ show (8 * bytes) ++ " bits"
     _ -> MOV fmt
   where
+    plat    = ncgPlatform config
     bytes   = formatInBytes fmt
     avx     = ncgAvxEnabled config
     avx2    = ncgAvx2Enabled config
     avx512f = ncgAvx512fEnabled config
     avx_move sFmt =
       if isFloatScalarFormat sFmt
-      then VMOVU   fmt
+      then \ op1 op2 ->
+              if
+                | OpReg r1 <- op1
+                , OpReg r2 <- op2
+                , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+                -> pprPanic "movInstr: VMOVU between incompatible registers"
+                     ( vcat [ text "fmt:" <+> ppr fmt
+                            , text "r1:" <+> ppr r1
+                            , text "r2:" <+> ppr r2 ] )
+                | otherwise
+                -> VMOVU   fmt op1 op2
       else VMOVDQU fmt
     sse_move sFmt =
       if isFloatScalarFormat sFmt
-      then MOVU   fmt
+      then \ op1 op2 ->
+              if
+                | OpReg r1 <- op1
+                , OpReg r2 <- op2
+                , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+                -> pprPanic "movInstr: MOVU between incompatible registers"
+                     ( vcat [ text "fmt:" <+> ppr fmt
+                            , text "r1:" <+> ppr r1
+                            , text "r2:" <+> ppr r2 ] )
+                | otherwise
+                -> MOVU   fmt op1 op2
       else MOVDQU fmt
     -- NB: we are using {V}MOVU and not {V}MOVA, because we have no guarantees
     -- about the stack being sufficiently aligned (even for even numbered stack slots).
@@ -989,12 +1010,7 @@ mkRegRegMoveInstr
     -> Reg
     -> Instr
 mkRegRegMoveInstr config fmt src dst =
-  assertPpr (targetClassOfReg platform src == targetClassOfReg platform dst)
-    (vcat [ text "mkRegRegMoveInstr: incompatible register classes"
-          , text "fmt:" <+> ppr fmt
-          , text "src:" <+> ppr src
-          , text "dst:" <+> ppr dst ]) $
-    movInstr config fmt' (OpReg src) (OpReg dst)
+  movInstr config fmt' (OpReg src) (OpReg dst)
       -- Move the platform word size, at a minimum
   where
     platform = ncgPlatform config
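
The dispatch performed by movInstr can be summarised with a small sketch
(simplified: it ignores the width checks, AVX-512, and the register-class
panic above):

    -- Scalars move with plain MOV; vectors use the *unaligned* move
    -- variants, since stack slots are not guaranteed to be 16-byte
    -- aligned (hence {V}MOVU/{V}MOVDQU rather than {V}MOVA/{V}MOVDQA).
    data MoveKind = MOV' | MOVU' | MOVDQU' | VMOVU' | VMOVDQU'
      deriving Show

    pickMove :: Bool  -- AVX available?
             -> Bool  -- vector format?
             -> Bool  -- floating-point elements?
             -> MoveKind
    pickMove _   False _    = MOV'
    pickMove avx True  fltE
      | fltE      = if avx then VMOVU'   else MOVU'
      | otherwise = if avx then VMOVDQU' else MOVDQU'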


=====================================
testsuite/tests/simd/should_run/T25169.hs
=====================================
@@ -0,0 +1,202 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+module Main where
+
+-- base
+import GHC.Exts
+
+--------------------------------------------------------------------------------
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+  show ( DX2# d ) = case unpackDoubleX2# d of
+    (# a, b #) -> show ( D# a, D# b )
+
+type T = (# DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+          , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2# #)
+
+type F =  DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+       -> T
+
+f :: T -> T
+f (# x00, x01, x02, x03, x04, x05, x06, x07, x08, x09
+   , x10, x11, x12, x13, x14, x15, x16, x17, x18, x19
+   , x20, x21, x22, x23, x24, x25, x26, x27, x28, x29
+   , x30, x31, x32, x33, x34, x35, x36, x37, x38, x39
+   , x40, x41, x42, x43, x44, x45, x46, x47, x48, x49 #)
+ = (# x01, x02, x03, x04, x05, x06, x07, x08, x09
+    , x10, x11, x12, x13, x14, x15, x16, x17, x18, x19
+    , x20, x21, x22, x23, x24, x25, x26, x27, x28, x29
+    , x30, x31, x32, x33, x34, x35, x36, x37, x38, x39
+    , x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x00 #)
+{-# OPAQUE f #-}
+
+f2 :: T -> T -> T
+f2 a b = f a
+{-# OPAQUE f2 #-}
+
+w :: Int -> T -> T
+w 0 t = t
+w i t =
+  -- This recursion is designed to trigger stack overflow,
+  -- so that we run into the register save/restore logic in the RTS
+  -- when we reach the corresponding stack underflow frame.
+  case w ( i - 1 ) t of
+    r -> f2 r t
+{-# OPAQUE w #-}
+
+main :: IO ()
+main = do
+  let !( DX2# x00 ) = z00
+      !( DX2# x01 ) = z01
+      !( DX2# x02 ) = z02
+      !( DX2# x03 ) = z03
+      !( DX2# x04 ) = z04
+      !( DX2# x05 ) = z05
+      !( DX2# x06 ) = z06
+      !( DX2# x07 ) = z07
+      !( DX2# x08 ) = z08
+      !( DX2# x09 ) = z09
+      !( DX2# x10 ) = z10
+      !( DX2# x11 ) = z11
+      !( DX2# x12 ) = z12
+      !( DX2# x13 ) = z13
+      !( DX2# x14 ) = z14
+      !( DX2# x15 ) = z15
+      !( DX2# x16 ) = z16
+      !( DX2# x17 ) = z17
+      !( DX2# x18 ) = z18
+      !( DX2# x19 ) = z19
+      !( DX2# x20 ) = z20
+      !( DX2# x21 ) = z21
+      !( DX2# x22 ) = z22
+      !( DX2# x23 ) = z23
+      !( DX2# x24 ) = z24
+      !( DX2# x25 ) = z25
+      !( DX2# x26 ) = z26
+      !( DX2# x27 ) = z27
+      !( DX2# x28 ) = z28
+      !( DX2# x29 ) = z29
+      !( DX2# x30 ) = z30
+      !( DX2# x31 ) = z31
+      !( DX2# x32 ) = z32
+      !( DX2# x33 ) = z33
+      !( DX2# x34 ) = z34
+      !( DX2# x35 ) = z35
+      !( DX2# x36 ) = z36
+      !( DX2# x37 ) = z37
+      !( DX2# x38 ) = z38
+      !( DX2# x39 ) = z39
+      !( DX2# x40 ) = z40
+      !( DX2# x41 ) = z41
+      !( DX2# x42 ) = z42
+      !( DX2# x43 ) = z43
+      !( DX2# x44 ) = z44
+      !( DX2# x45 ) = z45
+      !( DX2# x46 ) = z46
+      !( DX2# x47 ) = z47
+      !( DX2# x48 ) = z48
+      !( DX2# x49 ) = z49
+      !t = (# x00, x01, x02, x03, x04, x05, x06, x07, x08, x09
+            , x10, x11, x12, x13, x14, x15, x16, x17, x18, x19
+            , x20, x21, x22, x23, x24, x25, x26, x27, x28, x29
+            , x30, x31, x32, x33, x34, x35, x36, x37, x38, x39
+            , x40, x41, x42, x43, x44, x45, x46, x47, x48, x49 #)
+  let !u = w ( 2 * 50 + 16 ) t
+  case u of
+    (# r00, r01, r02, r03, r04, r05, r06, r07, r08, r09
+     , r10, r11, r12, r13, r14, r15, r16, r17, r18, r19
+     , r20, r21, r22, r23, r24, r25, r26, r27, r28, r29
+     , r30, r31, r32, r33, r34, r35, r36, r37, r38, r39
+     , r40, r41, r42, r43, r44, r45, r46, r47, r48, r49 #)
+       -> do putStrLn "Should start listing pairs from 16:\n"
+             putStrLn $ unlines $ map show $
+               [ DX2# r00, DX2# r01, DX2# r02, DX2# r03, DX2# r04, DX2# r05, DX2# r06, DX2# r07, DX2# r08, DX2# r09
+               , DX2# r10, DX2# r11, DX2# r12, DX2# r13, DX2# r14, DX2# r15, DX2# r16, DX2# r17, DX2# r18, DX2# r19
+               , DX2# r20, DX2# r21, DX2# r22, DX2# r23, DX2# r24, DX2# r25, DX2# r26, DX2# r27, DX2# r28, DX2# r29
+               , DX2# r30, DX2# r31, DX2# r32, DX2# r33, DX2# r34, DX2# r35, DX2# r36, DX2# r37, DX2# r38, DX2# r39
+               , DX2# r40, DX2# r41, DX2# r42, DX2# r43, DX2# r44, DX2# r45, DX2# r46, DX2# r47, DX2# r48, DX2# r49 ]
+  let !v = w ( 2 * 50 + 23 - 16 ) $ u
+  case v of
+    (# r00, r01, r02, r03, r04, r05, r06, r07, r08, r09
+     , r10, r11, r12, r13, r14, r15, r16, r17, r18, r19
+     , r20, r21, r22, r23, r24, r25, r26, r27, r28, r29
+     , r30, r31, r32, r33, r34, r35, r36, r37, r38, r39
+     , r40, r41, r42, r43, r44, r45, r46, r47, r48, r49 #)
+       -> do putStrLn "\nShould start listing pairs from 23:\n"
+             putStrLn $ unlines $ map show $
+               [ DX2# r00, DX2# r01, DX2# r02, DX2# r03, DX2# r04, DX2# r05, DX2# r06, DX2# r07, DX2# r08, DX2# r09
+               , DX2# r10, DX2# r11, DX2# r12, DX2# r13, DX2# r14, DX2# r15, DX2# r16, DX2# r17, DX2# r18, DX2# r19
+               , DX2# r20, DX2# r21, DX2# r22, DX2# r23, DX2# r24, DX2# r25, DX2# r26, DX2# r27, DX2# r28, DX2# r29
+               , DX2# r30, DX2# r31, DX2# r32, DX2# r33, DX2# r34, DX2# r35, DX2# r36, DX2# r37, DX2# r38, DX2# r39
+               , DX2# r40, DX2# r41, DX2# r42, DX2# r43, DX2# r44, DX2# r45, DX2# r46, DX2# r47, DX2# r48, DX2# r49 ]
+
+z00 :: DoubleX2; z00 = DX2# ( packDoubleX2# (#  0.1##,  0.2## #) ); {-# OPAQUE z00 #-}
+z01 :: DoubleX2; z01 = DX2# ( packDoubleX2# (#  1.1##,  1.2## #) ); {-# OPAQUE z01 #-}
+z02 :: DoubleX2; z02 = DX2# ( packDoubleX2# (#  2.1##,  2.2## #) ); {-# OPAQUE z02 #-}
+z03 :: DoubleX2; z03 = DX2# ( packDoubleX2# (#  3.1##,  3.2## #) ); {-# OPAQUE z03 #-}
+z04 :: DoubleX2; z04 = DX2# ( packDoubleX2# (#  4.1##,  4.2## #) ); {-# OPAQUE z04 #-}
+z05 :: DoubleX2; z05 = DX2# ( packDoubleX2# (#  5.1##,  5.2## #) ); {-# OPAQUE z05 #-}
+z06 :: DoubleX2; z06 = DX2# ( packDoubleX2# (#  6.1##,  6.2## #) ); {-# OPAQUE z06 #-}
+z07 :: DoubleX2; z07 = DX2# ( packDoubleX2# (#  7.1##,  7.2## #) ); {-# OPAQUE z07 #-}
+z08 :: DoubleX2; z08 = DX2# ( packDoubleX2# (#  8.1##,  8.2## #) ); {-# OPAQUE z08 #-}
+z09 :: DoubleX2; z09 = DX2# ( packDoubleX2# (#  9.1##,  9.2## #) ); {-# OPAQUE z09 #-}
+z10 :: DoubleX2; z10 = DX2# ( packDoubleX2# (# 10.1##, 10.2## #) ); {-# OPAQUE z10 #-}
+z11 :: DoubleX2; z11 = DX2# ( packDoubleX2# (# 11.1##, 11.2## #) ); {-# OPAQUE z11 #-}
+z12 :: DoubleX2; z12 = DX2# ( packDoubleX2# (# 12.1##, 12.2## #) ); {-# OPAQUE z12 #-}
+z13 :: DoubleX2; z13 = DX2# ( packDoubleX2# (# 13.1##, 13.2## #) ); {-# OPAQUE z13 #-}
+z14 :: DoubleX2; z14 = DX2# ( packDoubleX2# (# 14.1##, 14.2## #) ); {-# OPAQUE z14 #-}
+z15 :: DoubleX2; z15 = DX2# ( packDoubleX2# (# 15.1##, 15.2## #) ); {-# OPAQUE z15 #-}
+z16 :: DoubleX2; z16 = DX2# ( packDoubleX2# (# 16.1##, 16.2## #) ); {-# OPAQUE z16 #-}
+z17 :: DoubleX2; z17 = DX2# ( packDoubleX2# (# 17.1##, 17.2## #) ); {-# OPAQUE z17 #-}
+z18 :: DoubleX2; z18 = DX2# ( packDoubleX2# (# 18.1##, 18.2## #) ); {-# OPAQUE z18 #-}
+z19 :: DoubleX2; z19 = DX2# ( packDoubleX2# (# 19.1##, 19.2## #) ); {-# OPAQUE z19 #-}
+z20 :: DoubleX2; z20 = DX2# ( packDoubleX2# (# 20.1##, 20.2## #) ); {-# OPAQUE z20 #-}
+z21 :: DoubleX2; z21 = DX2# ( packDoubleX2# (# 21.1##, 21.2## #) ); {-# OPAQUE z21 #-}
+z22 :: DoubleX2; z22 = DX2# ( packDoubleX2# (# 22.1##, 22.2## #) ); {-# OPAQUE z22 #-}
+z23 :: DoubleX2; z23 = DX2# ( packDoubleX2# (# 23.1##, 23.2## #) ); {-# OPAQUE z23 #-}
+z24 :: DoubleX2; z24 = DX2# ( packDoubleX2# (# 24.1##, 24.2## #) ); {-# OPAQUE z24 #-}
+z25 :: DoubleX2; z25 = DX2# ( packDoubleX2# (# 25.1##, 25.2## #) ); {-# OPAQUE z25 #-}
+z26 :: DoubleX2; z26 = DX2# ( packDoubleX2# (# 26.1##, 26.2## #) ); {-# OPAQUE z26 #-}
+z27 :: DoubleX2; z27 = DX2# ( packDoubleX2# (# 27.1##, 27.2## #) ); {-# OPAQUE z27 #-}
+z28 :: DoubleX2; z28 = DX2# ( packDoubleX2# (# 28.1##, 28.2## #) ); {-# OPAQUE z28 #-}
+z29 :: DoubleX2; z29 = DX2# ( packDoubleX2# (# 29.1##, 29.2## #) ); {-# OPAQUE z29 #-}
+z30 :: DoubleX2; z30 = DX2# ( packDoubleX2# (# 30.1##, 30.2## #) ); {-# OPAQUE z30 #-}
+z31 :: DoubleX2; z31 = DX2# ( packDoubleX2# (# 31.1##, 31.2## #) ); {-# OPAQUE z31 #-}
+z32 :: DoubleX2; z32 = DX2# ( packDoubleX2# (# 32.1##, 32.2## #) ); {-# OPAQUE z32 #-}
+z33 :: DoubleX2; z33 = DX2# ( packDoubleX2# (# 33.1##, 33.2## #) ); {-# OPAQUE z33 #-}
+z34 :: DoubleX2; z34 = DX2# ( packDoubleX2# (# 34.1##, 34.2## #) ); {-# OPAQUE z34 #-}
+z35 :: DoubleX2; z35 = DX2# ( packDoubleX2# (# 35.1##, 35.2## #) ); {-# OPAQUE z35 #-}
+z36 :: DoubleX2; z36 = DX2# ( packDoubleX2# (# 36.1##, 36.2## #) ); {-# OPAQUE z36 #-}
+z37 :: DoubleX2; z37 = DX2# ( packDoubleX2# (# 37.1##, 37.2## #) ); {-# OPAQUE z37 #-}
+z38 :: DoubleX2; z38 = DX2# ( packDoubleX2# (# 38.1##, 38.2## #) ); {-# OPAQUE z38 #-}
+z39 :: DoubleX2; z39 = DX2# ( packDoubleX2# (# 39.1##, 39.2## #) ); {-# OPAQUE z39 #-}
+z40 :: DoubleX2; z40 = DX2# ( packDoubleX2# (# 40.1##, 40.2## #) ); {-# OPAQUE z40 #-}
+z41 :: DoubleX2; z41 = DX2# ( packDoubleX2# (# 41.1##, 41.2## #) ); {-# OPAQUE z41 #-}
+z42 :: DoubleX2; z42 = DX2# ( packDoubleX2# (# 42.1##, 42.2## #) ); {-# OPAQUE z42 #-}
+z43 :: DoubleX2; z43 = DX2# ( packDoubleX2# (# 43.1##, 43.2## #) ); {-# OPAQUE z43 #-}
+z44 :: DoubleX2; z44 = DX2# ( packDoubleX2# (# 44.1##, 44.2## #) ); {-# OPAQUE z44 #-}
+z45 :: DoubleX2; z45 = DX2# ( packDoubleX2# (# 45.1##, 45.2## #) ); {-# OPAQUE z45 #-}
+z46 :: DoubleX2; z46 = DX2# ( packDoubleX2# (# 46.1##, 46.2## #) ); {-# OPAQUE z46 #-}
+z47 :: DoubleX2; z47 = DX2# ( packDoubleX2# (# 47.1##, 47.2## #) ); {-# OPAQUE z47 #-}
+z48 :: DoubleX2; z48 = DX2# ( packDoubleX2# (# 48.1##, 48.2## #) ); {-# OPAQUE z48 #-}
+z49 :: DoubleX2; z49 = DX2# ( packDoubleX2# (# 49.1##, 49.2## #) ); {-# OPAQUE z49 #-}


=====================================
testsuite/tests/simd/should_run/T25169.stdout
=====================================
@@ -0,0 +1,107 @@
+Should start listing pairs from 16:
+
+(16.1,16.2)
+(17.1,17.2)
+(18.1,18.2)
+(19.1,19.2)
+(20.1,20.2)
+(21.1,21.2)
+(22.1,22.2)
+(23.1,23.2)
+(24.1,24.2)
+(25.1,25.2)
+(26.1,26.2)
+(27.1,27.2)
+(28.1,28.2)
+(29.1,29.2)
+(30.1,30.2)
+(31.1,31.2)
+(32.1,32.2)
+(33.1,33.2)
+(34.1,34.2)
+(35.1,35.2)
+(36.1,36.2)
+(37.1,37.2)
+(38.1,38.2)
+(39.1,39.2)
+(40.1,40.2)
+(41.1,41.2)
+(42.1,42.2)
+(43.1,43.2)
+(44.1,44.2)
+(45.1,45.2)
+(46.1,46.2)
+(47.1,47.2)
+(48.1,48.2)
+(49.1,49.2)
+(0.1,0.2)
+(1.1,1.2)
+(2.1,2.2)
+(3.1,3.2)
+(4.1,4.2)
+(5.1,5.2)
+(6.1,6.2)
+(7.1,7.2)
+(8.1,8.2)
+(9.1,9.2)
+(10.1,10.2)
+(11.1,11.2)
+(12.1,12.2)
+(13.1,13.2)
+(14.1,14.2)
+(15.1,15.2)
+
+
+Should start listing pairs from 23:
+
+(23.1,23.2)
+(24.1,24.2)
+(25.1,25.2)
+(26.1,26.2)
+(27.1,27.2)
+(28.1,28.2)
+(29.1,29.2)
+(30.1,30.2)
+(31.1,31.2)
+(32.1,32.2)
+(33.1,33.2)
+(34.1,34.2)
+(35.1,35.2)
+(36.1,36.2)
+(37.1,37.2)
+(38.1,38.2)
+(39.1,39.2)
+(40.1,40.2)
+(41.1,41.2)
+(42.1,42.2)
+(43.1,43.2)
+(44.1,44.2)
+(45.1,45.2)
+(46.1,46.2)
+(47.1,47.2)
+(48.1,48.2)
+(49.1,49.2)
+(0.1,0.2)
+(1.1,1.2)
+(2.1,2.2)
+(3.1,3.2)
+(4.1,4.2)
+(5.1,5.2)
+(6.1,6.2)
+(7.1,7.2)
+(8.1,8.2)
+(9.1,9.2)
+(10.1,10.2)
+(11.1,11.2)
+(12.1,12.2)
+(13.1,13.2)
+(14.1,14.2)
+(15.1,15.2)
+(16.1,16.2)
+(17.1,17.2)
+(18.1,18.2)
+(19.1,19.2)
+(20.1,20.2)
+(21.1,21.2)
+(22.1,22.2)
+


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -49,6 +49,7 @@ test('simd013',
      , unless(arch('x86_64'), skip) # because the C file uses Intel intrinsics
      ],
      compile_and_run, ['simd013C.c'])
+test('simd014', [], compile_and_run, ['simd014Cmm.cmm'])
 
 
 
@@ -63,3 +64,5 @@ test('T25062_V64', [ unless(have_cpu_feature('avx512f'), skip)
                    , only_ways(llvm_ways) # SIMD NCG TODO: support 512 bit wide vectors
                    ]
                  , compile_and_run, [''])
+
+test('T25169', [], compile_and_run, [''])


=====================================
testsuite/tests/simd/should_run/simd014.hs
=====================================
@@ -0,0 +1,35 @@
+{-# LANGUAGE GHCForeignImportPrim #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+module Main where
+
+-- base
+import GHC.Exts
+  ( Double(..), DoubleX2#
+  , packDoubleX2#, unpackDoubleX2#
+  )
+
+-- Test for handwritten Cmm code and realArgsRegCover (relates to #25169).
+
+--------------------------------------------------------------------------------
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+  show ( DX2# d ) = case unpackDoubleX2# d of
+    (# a, b #) -> show ( D# a, D# b )
+
+foreign import prim "f1"
+  f1 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+     -> (# DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2# #)
+
+main :: IO ()
+main = do
+  let !x1 = packDoubleX2# (# 1.1##, 1.2## #)
+      !x2 = packDoubleX2# (# 2.1##, 2.2## #)
+      !x3 = packDoubleX2# (# 3.1##, 3.2## #)
+      !x4 = packDoubleX2# (# 4.1##, 4.2## #)
+      !(# y1, y2, y3, y4 #) = f1 x1 x2 x3 x4
+  print [ DX2# y1, DX2# y2, DX2# y3, DX2# y4 ]


=====================================
testsuite/tests/simd/should_run/simd014.stdout
=====================================
@@ -0,0 +1 @@
+[(3.1,3.2),(2.1,2.2),(1.1,1.2),(4.1,4.2)]


=====================================
testsuite/tests/simd/should_run/simd014Cmm.cmm
=====================================
@@ -0,0 +1,28 @@
+#include "Cmm.h"
+
+f1
+{
+  // Switch XMM1 and XMM2
+  XMM6 = XMM1 ;
+  XMM1 = XMM2 ;
+  XMM2 = XMM6 ;
+  jump f2 [ XMM1, XMM2, XMM3, XMM4 ];
+}
+
+f2
+{
+  // Switch XMM2 and XMM3
+  XMM6 = XMM2 ;
+  XMM2 = XMM3 ;
+  XMM3 = XMM6 ;
+  jump f3 [ XMM1, XMM2, XMM3, XMM4 ];
+}
+
+f3
+{
+  // Switch XMM1 and XMM2
+  XMM6 = XMM1 ;
+  XMM1 = XMM2 ;
+  XMM2 = XMM6 ;
+  jump %ENTRY_CODE(Sp(0)) [ XMM1, XMM2, XMM3, XMM4 ];
+}
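
The expected output in simd014.stdout follows from composing the three
swaps, as this plain-Haskell model (a sketch) shows:

    -- f1 and f3 swap the first two arguments, f2 swaps the middle two, so
    -- the pipeline maps (x1,x2,x3,x4) to (x3,x2,x1,x4), matching
    -- [(3.1,3.2),(2.1,2.2),(1.1,1.2),(4.1,4.2)].
    swap12, swap23 :: (a, a, a, a) -> (a, a, a, a)
    swap12 (a, b, c, d) = (b, a, c, d)
    swap23 (a, b, c, d) = (a, c, b, d)

    main :: IO ()
    main = print (swap12 (swap23 (swap12 (1 :: Int, 2, 3, 4))))
      -- (3,2,1,4)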



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/f538ca1cbc7757305a5e635c3226418728a88d80...73c8ef14c329ae29467f1e267239bc19d564f1ef
