[Git][ghc/ghc][wip/ncg-simd] 2 commits: Fix C calls with SIMD vectors
sheaf (@sheaf)
gitlab at gitlab.haskell.org
Tue Aug 27 16:06:10 UTC 2024
sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC
Commits:
f884fc29 by sheaf at 2024-08-27T18:05:50+02:00
Fix C calls with SIMD vectors
This commit fixes the code generation for C calls, to take into account
the calling convention.
This is particularly tricky on Windows, where all vectors are expected
to be passed by reference. See Note [The Windows X64 C calling convention]
in GHC.CmmToAsm.X86.CodeGen.
- - - - -
73c8ef14 by sheaf at 2024-08-27T18:05:50+02:00
Add test for #25169
- - - - -
8 changed files:
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- + testsuite/tests/simd/should_run/T25169.hs
- + testsuite/tests/simd/should_run/T25169.stdout
- testsuite/tests/simd/should_run/all.T
- + testsuite/tests/simd/should_run/simd014.hs
- + testsuite/tests/simd/should_run/simd014.stdout
- + testsuite/tests/simd/should_run/simd014Cmm.cmm
Changes:
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1,6 +1,7 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE GADTs #-}
+{-# LANGUAGE TupleSections #-}
{-# LANGUAGE NondecreasingIndentation #-}
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
@@ -73,10 +74,12 @@ import GHC.Types.Tickish ( GenTickish(..) )
import GHC.Types.SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
-- The rest:
+import GHC.Data.Maybe ( expectJust )
import GHC.Types.ForeignCall ( CCallConv(..) )
import GHC.Data.OrdList
import GHC.Utils.Outputable
import GHC.Utils.Constants (debugIsOn)
+import GHC.Utils.Monad ( foldMapM )
import GHC.Utils.Panic
import GHC.Data.FastString
import GHC.Utils.Misc
@@ -844,12 +847,12 @@ iselExpr64ParallelBin op e1 e2 = do
-- targeted for any particular type like Int8, Int32 etc
data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | VA_Min | VA_Max
-getRegister :: CmmExpr -> NatM Register
+getRegister :: HasDebugCallStack => CmmExpr -> NatM Register
getRegister e = do platform <- getPlatform
is32Bit <- is32BitPlatform
getRegister' platform is32Bit e
-getRegister' :: Platform -> Bool -> CmmExpr -> NatM Register
+getRegister' :: HasDebugCallStack => Platform -> Bool -> CmmExpr -> NatM Register
getRegister' platform is32Bit (CmmReg reg)
= case reg of
@@ -2312,7 +2315,7 @@ getNonClobberedOperand (CmmLit lit) =
return (OpAddr addr, code)
else do
platform <- getPlatform
- if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+ if is32BitLit platform lit && isIntFormat (cmmTypeFormat (cmmLitType platform lit))
then return (OpImm (litToImm lit), nilOL)
else getNonClobberedOperand_generic (CmmLit lit)
@@ -2369,13 +2372,13 @@ getOperand (CmmLit lit) = do
else do
platform <- getPlatform
- if is32BitLit platform lit && not (isFloatType (cmmLitType platform lit))
+ if is32BitLit platform lit && (isIntFormat $ cmmTypeFormat (cmmLitType platform lit))
then return (OpImm (litToImm lit), nilOL)
else getOperand_generic (CmmLit lit)
getOperand (CmmLoad mem ty _) = do
is32Bit <- is32BitPlatform
- if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+ if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
then do
Amode src mem_code <- getAmode mem
return (OpAddr src, mem_code)
@@ -2406,7 +2409,7 @@ addAlignmentCheck align reg =
where
check :: Format -> Reg -> InstrBlock
check fmt reg =
- assert (not $ isFloatFormat fmt) $
+ assert (isIntFormat fmt) $
toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
, JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
]
@@ -2451,7 +2454,7 @@ isSuitableFloatingPointLit _ = False
getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
getRegOrMem e@(CmmLoad mem ty _) = do
is32Bit <- is32BitPlatform
- if not (isFloatType ty) && (if is32Bit then not (isWord64 ty) else True)
+ if isIntFormat (cmmTypeFormat ty) && (if is32Bit then not (isWord64 ty) else True)
then do
Amode src mem_code <- getAmode mem
return (OpAddr src, mem_code)
@@ -3325,7 +3328,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
DELTA (delta-8)]
)
- | isFloatType arg_ty = do
+ | isFloatType arg_ty || isVecType arg_ty = do
(reg, code) <- getSomeReg arg
delta <- getDeltaNat
setDeltaNat (delta-size)
@@ -3335,11 +3338,10 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
let addr = AddrBaseIndex (EABaseReg esp)
EAIndexNone
(ImmInt 0)
- format = floatFormat (typeWidth arg_ty)
+ format = cmmTypeFormat arg_ty
in
- -- assume SSE2
- MOV format (OpReg reg) (OpAddr addr)
+ movInstr config format (OpReg reg) (OpAddr addr)
]
)
@@ -3363,7 +3365,7 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
let
-- Align stack to 16n for calls, assuming a starting stack
-- alignment of 16n - word_size on procedure entry. Which we
- -- maintiain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
+ -- maintain. See Note [Stack Alignment on X86] in rts/StgCRun.c.
sizes = map (arg_size_bytes . cmmExprType platform) (reverse args)
raw_arg_size = sum sizes + platformWordSizeInBytes platform
arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
@@ -3408,6 +3410,8 @@ genCCall32 addr (ForeignConvention _ argHints _ _) dest_regs args = do
-- assign the results, if necessary
assign_code [] = nilOL
assign_code [dest]
+ | isVecType ty
+ = unitOL (movInstr config (cmmTypeFormat ty) (OpReg xmm0) (OpReg r_dest))
| isFloatType ty =
-- we assume SSE2
let tmp_amode = AddrBaseIndex (EABaseReg esp)
@@ -3454,36 +3458,41 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
let prom_args = map (maybePromoteCArg platform W32) args_hints
let load_args :: [CmmExpr]
- -> [RegFormat] -- int regs avail for args
- -> [RegFormat] -- FP regs avail for args
+ -> [Reg] -- int regs avail for args
+ -> [Reg] -- FP regs avail for args
+ -> [RegFormat] -- used int regs
+ -> [RegFormat] -- used FP regs
-> InstrBlock -- code computing args
-> InstrBlock -- code assigning args to ABI regs
-> NatM ([CmmExpr],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
-- no more regs to use
- load_args args [] [] code acode =
- return (args, [], [], code, acode)
+ load_args args [] [] used_aregs used_fregs code acode =
+ return (args, used_aregs, used_fregs, code, acode)
-- no more args to push
- load_args [] aregs fregs code acode =
- return ([], aregs, fregs, code, acode)
-
- load_args (arg : rest) aregs fregs code acode
- | isFloatType arg_rep = case fregs of
- [] -> push_this_arg
- (RegFormat r _fmt:rs) -> do
- (code',acode') <- reg_this_arg r
- load_args rest aregs rs code' acode'
- | otherwise = case aregs of
- [] -> push_this_arg
- (RegFormat r _fmt:rs) -> do
- (code',acode') <- reg_this_arg r
- load_args rest rs fregs code' acode'
+ load_args [] _aregs _fregs used_aregs used_fregs code acode =
+ return ([], used_aregs, used_fregs, code, acode)
+
+ load_args (arg : rest) aregs fregs used_aregs used_fregs code acode
+ | isFloatType arg_rep || isVecType arg_rep
+ = case fregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest aregs rs used_aregs (RegFormat r fmt:used_fregs) code' acode'
+ | otherwise
+ = case aregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest rs fregs (RegFormat r fmt:used_aregs) used_fregs code' acode'
where
+ fmt = cmmTypeFormat arg_rep
-- put arg into the list of stack pushed args
push_this_arg = do
(args',ars,frs,code',acode')
- <- load_args rest aregs fregs code acode
+ <- load_args rest aregs fregs used_aregs used_fregs code acode
return (arg:args', ars, frs, code', acode')
-- pass the arg into the given register
@@ -3515,86 +3524,65 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
arg_fmt = cmmTypeFormat arg_rep
load_args_win :: [CmmExpr]
- -> [RegFormat] -- used int regs
- -> [RegFormat] -- used FP regs
+ -> [RegFormat] -- used int regs
+ -> [RegFormat] -- used FP regs
-> [(Reg, Reg)] -- (int, FP) regs avail for args
-> InstrBlock
- -> NatM ([CmmExpr],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
+ -> NatM ([(CmmExpr, Maybe Reg)],[RegFormat],[RegFormat],InstrBlock,InstrBlock)
load_args_win args usedInt usedFP [] code
- = return (args, usedInt, usedFP, code, nilOL)
+ = return (map (, Nothing) args, usedInt, usedFP, code, nilOL)
-- no more regs to use
load_args_win [] usedInt usedFP _ code
= return ([], usedInt, usedFP, code, nilOL)
-- no more args to push
load_args_win (arg : rest) usedInt usedFP
((ireg, freg) : regs) code
- | isFloatType arg_rep = do
- arg_code <- getAnyReg arg
- load_args_win rest (RegFormat ireg II64: usedInt) (RegFormat freg FF64 : usedFP) regs
- (code `appOL`
- arg_code freg `snocOL`
- -- If we are calling a varargs function
- -- then we need to define ireg as well
- -- as freg
- MOVD FF64 (OpReg freg) (OpReg ireg))
- | otherwise = do
- arg_code <- getAnyReg arg
+ | isFloatType arg_rep
+ = do arg_code <- getAnyReg arg
+ load_args_win rest
+ (RegFormat ireg II64: usedInt) (RegFormat freg FF64 : usedFP) regs
+ (code `appOL`
+ arg_code freg `snocOL`
+ -- If we are calling a varargs function
+ -- then we need to define ireg as well
+ -- as freg
+ MOVD FF64 (OpReg freg) (OpReg ireg))
+ | isVecType arg_rep
+ -- Vectors are passed by reference.
+ -- See Note [The Windows X64 C calling convention].
+ = do (args', usedInt', usedFP', code', acode') <-
+ load_args_win rest (RegFormat ireg II64 : usedInt) usedFP regs code
+ return $
+ -- return the argument so that we put it on the stack
+ ((arg, Just ireg):args', usedInt', usedFP', code', acode')
+ | otherwise
+ = do arg_code <- getAnyReg arg
load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
- (code `appOL` arg_code ireg)
+ (code `appOL` arg_code ireg)
where
arg_rep = cmmExprType platform arg
- arg_size = 8 -- always, at the mo
-
- push_args [] code = return code
- push_args (arg:rest) code
- | isFloatType arg_rep = do
- (arg_reg, arg_code) <- getSomeReg arg
- delta <- getDeltaNat
- setDeltaNat (delta-arg_size)
- let fmt = floatFormat width
- code' = code `appOL` arg_code `appOL` toOL [
- SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
- DELTA (delta-arg_size),
- MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
- push_args rest code'
-
- | otherwise = do
- -- Arguments can be smaller than 64-bit, but we still use @PUSH
- -- II64@ - the usual calling conventions expect integers to be
- -- 8-byte aligned.
- massert (width <= W64)
- (arg_op, arg_code) <- getOperand arg
- delta <- getDeltaNat
- setDeltaNat (delta-arg_size)
- let code' = code `appOL` arg_code `appOL` toOL [
- PUSH II64 arg_op,
- DELTA (delta-arg_size)]
- push_args rest code'
- where
- arg_rep = cmmExprType platform arg
- width = typeWidth arg_rep
+ expr_size arg = max (widthInBytes (wordWidth platform)) $ widthInBytes (typeWidth $ cmmExprType platform arg)
leaveStackSpace n = do
delta <- getDeltaNat
- setDeltaNat (delta - n * arg_size)
+ setDeltaNat (delta - n * 8)
return $ toOL [
SUB II64 (OpImm (ImmInt (n * platformWordSizeInBytes platform))) (OpReg rsp),
- DELTA (delta - n * arg_size)]
+ DELTA (delta - n * 8)]
+ -- NB: the shadow store is always 8 * 4 = 32 bytes in size,
+ -- i.e. the cumulative size of rcx, rdx, r8, r9 (see 'allArgRegs').
(stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
<-
if platformOS platform == OSMinGW32
then load_args_win prom_args [] [] (allArgRegs platform) nilOL
else do
- let intArgRegs = map (\r -> RegFormat r II64) $ allIntArgRegs platform
- fpArgRegs = map (\r -> RegFormat r FF64) $ allFPArgRegs platform
- (stack_args, aregs, fregs, load_args_code, assign_args_code)
- <- load_args prom_args intArgRegs fpArgRegs nilOL nilOL
- let used_regs rs as = dropTail (length rs) as
- fregs_used = used_regs fregs fpArgRegs
- aregs_used = used_regs aregs intArgRegs
- return (stack_args, aregs_used, fregs_used, load_args_code
+ let intArgRegs = allIntArgRegs platform
+ fpArgRegs = allFPArgRegs platform
+ (stack_args, aregs_used, fregs_used, load_args_code, assign_args_code)
+ <- load_args prom_args intArgRegs fpArgRegs [] [] nilOL nilOL
+ return (map (, Nothing) stack_args, aregs_used, fregs_used, load_args_code
, assign_args_code)
let
@@ -3603,11 +3591,29 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
arg_regs = [RegFormat eax wordFmt] ++ arg_regs_used
-- for annotating the call instruction with
sse_regs = length fp_regs_used
- arg_stack_slots = if platformOS platform == OSMinGW32
- then length stack_args + length (allArgRegs platform)
- else length stack_args
- tot_arg_size = arg_size * arg_stack_slots
+ stack_arg_size
+ | platformOS platform == OSMinGW32
+ = sum
+ [ 8 -- each argument takes up 8 bytes,
+ -- because vector arguments are passed by reference
+ | (_arg, mb_irep) <- stack_args
+ , isNothing mb_irep
+ -- don't count an argument passed in a register
+ ]
+ | otherwise
+ = sum (map (expr_size . fst) stack_args)
+
+ tot_arg_size
+ | platformOS platform == OSMinGW32
+ = 8 * length (allArgRegs platform) -- shadow store
+ + stack_arg_size
+ + sum [ expr_size vec_arg
+ | ( vec_arg, _) <- stack_args
+ , isVecType (cmmExprType platform vec_arg)
+ ]
+ | otherwise
+ = stack_arg_size
-- Align stack to 16n for calls, assuming a starting stack
-- alignment of 16n - word_size on procedure entry. Which we
@@ -3622,13 +3628,104 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
return (tot_arg_size + word_size, toOL [
SUB II64 (OpImm (ImmInt word_size)) (OpReg rsp),
DELTA (delta - word_size) ])
+ let
+ -- push one argument to the stack
+ push_arg :: CmmExpr -> NatM (OrdList Instr)
+ push_arg arg
+ | isFloatType arg_rep || isVecType arg_rep = do
+ (arg_reg, arg_code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let fmt = cmmTypeFormat arg_rep
+ return $ arg_code `appOL` toOL
+ [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp)
+ , DELTA (delta-arg_size)
+ , movInstr config fmt (OpReg arg_reg) (OpAddr (spRel platform 0)) ]
+
+ | otherwise = do
+ -- Arguments can be smaller than 64-bit, but we still use @PUSH
+ -- II64@ - the usual calling conventions expect integers to be
+ -- 8-byte aligned.
+ massert (width <= W64)
+ (arg_op, arg_code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ return $
+ arg_code `appOL` toOL [
+ PUSH II64 arg_op,
+ DELTA (delta-arg_size)]
+ where
+ arg_size = expr_size arg
+ arg_rep = cmmExprType platform arg
+ width = typeWidth arg_rep
+
+ push_arg_win :: ((CmmExpr, Maybe Reg), Maybe Int) -> NatM (OrdList Instr, OrdList Instr)
+ push_arg_win ((arg, mb_ireg), mb_off)
+ | isVecType arg_rep
+ , let off = expectJust "push_arg_win vector offset" mb_off
+ -- Pass references for each of the vector arguments.
+ -- See Note [The Windows X64 C calling convention].
+ = case mb_ireg of
+ -- Pass the reference in a register
+ Just ireg -> return (unitOL $ LEA II64 (OpAddr (spRel platform off)) (OpReg ireg), nilOL)
+ -- Pass the reference on the stack
+ Nothing ->
+ do tmp <- getNewRegNat II64
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_ref_size)
+ let push_code = toOL
+ [ SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_ref_size)) (OpReg rsp)
+ , DELTA (delta-arg_ref_size)
+ , LEA II64 (OpAddr (spRel platform off)) (OpReg tmp)
+ , MOV II64 (OpReg tmp) (OpAddr (spRel platform 0)) ]
+ return (nilOL, push_code)
+ | otherwise
+ = do { push_code <- push_arg arg; return (nilOL, push_code) }
+ where
+ arg_ref_size = 8 -- passing a reference to the argument
+ arg_rep = cmmExprType platform arg
+
+ push_args_direct :: [CmmExpr] -> NatM (OrdList Instr)
+ push_args_direct = foldMapM push_arg
+
+ push_args :: [(CmmExpr, Maybe Reg)] -> NatM (OrdList Instr, OrdList Instr)
+ push_args rev_args
+ | platformOS platform == OSMinGW32
+ = do { let is_vec = isVecFormat . cmmTypeFormat . cmmExprType platform
+ vecs = map fst $ filter (is_vec . fst) rev_args
+
+ -- Slightly tricky code: compute the stack offset to the
+ -- vector data for this argument.
+ --
+ -- If you're confused, Note [The Windows X64 C calling convention]
+ -- contains a helpful diagram.
+ ; let vec_offs = reverse $ go stack_arg_size $ reverse rev_args
+ where go _ [] = []
+ go off ((arg, mb_reg):args)
+ | is_vec arg
+ = Just off : go (expr_size arg + off') args
+ | otherwise
+ = Nothing : go off' args
+ where
+ -- this little computation below accounts for
+ -- registers in the shadow space
+ off' = case mb_reg of
+ Just {} -> off
+ _ -> off - 8
+
+ ; push_vectors <- push_args_direct vecs
+ ; (load_regs, push_args) <- foldMapM push_arg_win (zip rev_args vec_offs)
+ ; return (load_regs, push_vectors `appOL` push_args) }
+ | otherwise
+ = do { push_code <- push_args_direct (map fst rev_args)
+ ; return (nilOL, push_code) }
-- push the stack args, right to left
- push_code <- push_args (reverse stack_args) nilOL
+ (load_vecrefs_win, push_code) <- push_args (reverse stack_args)
-- On Win64, we also have to leave stack space for the arguments
-- that we are passing in registers
lss_code <- if platformOS platform == OSMinGW32
- then leaveStackSpace (length (allArgRegs platform))
+ then leaveStackSpace $ length (allArgRegs platform)
else return nilOL
delta <- getDeltaNat
@@ -3649,7 +3746,11 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
-- It's not safe to omit this assignment, even if the number
-- of SSE2 regs in use is zero. If %al is larger than 8
-- on entry to a varargs function, seg faults ensue.
- assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
+ assign_eax n
+ | platformOS platform == OSMinGW32
+ = nilOL
+ | otherwise
+ = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
let call = callinsns `appOL`
toOL (
@@ -3677,12 +3778,96 @@ genCCall64 addr conv@(ForeignConvention _ argHints _ _) dest_regs args = do
return (adjust_rsp `appOL`
push_code `appOL`
load_args_code `appOL`
+ load_vecrefs_win `appOL`
assign_args_code `appOL`
lss_code `appOL`
assign_eax sse_regs `appOL`
call `appOL`
assign_code dest_regs)
+{- Note [The Windows X64 C calling convention]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here are a few facts about the Windows X64 C calling convention that
+are important:
+
+ - any argument larger than 8 bytes must be passed by reference,
+ and arguments smaller than 8 bytes are padded to 8 bytes.
+
+ - the first four arguments are passed in registers:
+ - floating-point scalar arguments are passed in %xmm0, %xmm1, %xmm2, %xmm3
+ - other arguments are passed in %rcx, %rdx, %r8, %r9
+ (this includes vector arguments, passed by reference)
+
+ For variadic functions, it is additionally expected that floating point
+ scalar arguments are copied to the corresponding integer register, e.g.
+ the data in xmm2 should also be copied to r8.
+
+ There is no requirement about setting %al like there is for the
+ System V AMD64 ABI.
+
+ - subsequent arguments are passed on the stack.
+
+This motivates our handling of vector values. Suppose we have a function call
+with many arguments, several of them being vectors. We proceed as follows:
+
+ - push all the vectors to the stack first,
+ - then push the arguments:
+ - for non-vectors, proceed as usual
+ - for vectors, push the address of the vector data we pushed above,
+ - then assign the registers:
+ - for non-vectors, proceed as usual,
+ - for vectors, store the address in a general-purpose register, as opposed
+ to storing the data in an xmm register.
+
+For a concrete example, suppose we have a call of the form:
+
+ f x1 x2 x3 x4 x5 x6 x7
+
+in which:
+
+ x2, x3 and x7 are 16 byte vectors
+ x5 is a 32 byte vector
+ the other arguments are all 8 byte wide
+
+Now, x1, x2, x3, x4 will get passed in registers, except that we pass
+x2 and x3 by reference, because they are vectors. We proceed as follows:
+
+ - push the vectors to the stack: x7, x5, x3, x2 (in that order)
+ - push the stack arguments in order: addr(x7), x6, addr(x5)
+ - load the remaining arguments into registers: x4, addr(x3), addr(x2), x1
+
+The tricky part is to get the right offsets for the addresses of the vector
+data. The following visualisation will hopefully clear things up:
+
+ ╭─╴ ┌──┐
+ │ │ │
+ │ x7 ───╴ │ │
+ │ ├──┤
+ │ │ │
+ │ │ │
+ │ │ │
+ vector ────┤ x5 ───╴ │ │
+ data │ ├──┤
+ │ │ │
+ │ x3 ───╴ │ │
+ │ ├──┤
+ │ │ │
+ │ x2 ───╴ │ │ ╭─ from here: x7 is +72
+ ╭─╴ ╰─╴ ╞══╡ │ note:
+ │ addr(x7) ───╴ │ │ │ a) 72 = 56 + 32 - 2 * 8
+ │ ├──┤ ╾──╯ b) 56 = offset of x5 (relative to two arguments down)
+ stack ───┤ x6 ───╴ │ │ c) remove 8 twice because we are two arguments further up
+ arguments │ ├──┤
+ │ addr(x5) ───╴ │ │
+ ╰─╴ ╭─╴ ╞══╡ ╾─── from here: x2 is +24, x3 is +40, x5 is +56
+ │ │ │ note: 24 is stack_arg_size
+ shadow ───┤ │ │ = 8 * length [ x5, x6, x7 ]
+ space │ │ │
+ │ │ │
+ ╰─╴ └──┘ ╾─── Sp
+
+This is all tested in the simd013 test.
+-}
maybePromoteCArg :: Platform -> Width -> (CmmExpr, ForeignHint) -> CmmExpr
maybePromoteCArg platform wto (arg, hint)
=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -892,7 +892,7 @@ mkLoadInstr config (RegFormat reg fmt) delta slot =
-- | A move instruction for moving the entire contents of an operand
-- at the given 'Format'.
-movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
+movInstr :: HasDebugCallStack => NCGConfig -> Format -> (Operand -> Operand -> Instr)
movInstr config fmt =
case fmt of
VecFormat _ sFmt ->
@@ -914,17 +914,38 @@ movInstr config fmt =
_ -> sorry $ "Unhandled SIMD vector width: " ++ show (8 * bytes) ++ " bits"
_ -> MOV fmt
where
+ plat = ncgPlatform config
bytes = formatInBytes fmt
avx = ncgAvxEnabled config
avx2 = ncgAvx2Enabled config
avx512f = ncgAvx512fEnabled config
avx_move sFmt =
if isFloatScalarFormat sFmt
- then VMOVU fmt
+ then \ op1 op2 ->
+ if
+ | OpReg r1 <- op1
+ , OpReg r2 <- op2
+ , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+ -> pprPanic "movInstr: VMOVU between incompatible registers"
+ ( vcat [ text "fmt:" <+> ppr fmt
+ , text "r1:" <+> ppr r1
+ , text "r2:" <+> ppr r2 ] )
+ | otherwise
+ -> VMOVU fmt op1 op2
else VMOVDQU fmt
sse_move sFmt =
if isFloatScalarFormat sFmt
- then MOVU fmt
+ then \ op1 op2 ->
+ if
+ | OpReg r1 <- op1
+ , OpReg r2 <- op2
+ , targetClassOfReg plat r1 /= targetClassOfReg plat r2
+ -> pprPanic "movInstr: MOVU between incompatible registers"
+ ( vcat [ text "fmt:" <+> ppr fmt
+ , text "r1:" <+> ppr r1
+ , text "r2:" <+> ppr r2 ] )
+ | otherwise
+ -> MOVU fmt op1 op2
else MOVDQU fmt
-- NB: we are using {V}MOVU and not {V}MOVA, because we have no guarantees
-- about the stack being sufficiently aligned (even for even numbered stack slots).
@@ -989,12 +1010,7 @@ mkRegRegMoveInstr
-> Reg
-> Instr
mkRegRegMoveInstr config fmt src dst =
- assertPpr (targetClassOfReg platform src == targetClassOfReg platform dst)
- (vcat [ text "mkRegRegMoveInstr: incompatible register classes"
- , text "fmt:" <+> ppr fmt
- , text "src:" <+> ppr src
- , text "dst:" <+> ppr dst ]) $
- movInstr config fmt' (OpReg src) (OpReg dst)
+ movInstr config fmt' (OpReg src) (OpReg dst)
-- Move the platform word size, at a minimum
where
platform = ncgPlatform config
=====================================
testsuite/tests/simd/should_run/T25169.hs
=====================================
@@ -0,0 +1,202 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+module Main where
+
+-- base
+import GHC.Exts
+
+--------------------------------------------------------------------------------
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show ( DX2# d ) = case unpackDoubleX2# d of
+ (# a, b #) -> show ( D# a, D# b )
+
+type T = (# DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#
+ , DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2# #)
+
+type F = DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> T
+
+f :: T -> T
+f (# x00, x01, x02, x03, x04, x05, x06, x07, x08, x09
+ , x10, x11, x12, x13, x14, x15, x16, x17, x18, x19
+ , x20, x21, x22, x23, x24, x25, x26, x27, x28, x29
+ , x30, x31, x32, x33, x34, x35, x36, x37, x38, x39
+ , x40, x41, x42, x43, x44, x45, x46, x47, x48, x49 #)
+ = (# x01, x02, x03, x04, x05, x06, x07, x08, x09
+ , x10, x11, x12, x13, x14, x15, x16, x17, x18, x19
+ , x20, x21, x22, x23, x24, x25, x26, x27, x28, x29
+ , x30, x31, x32, x33, x34, x35, x36, x37, x38, x39
+ , x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x00 #)
+{-# OPAQUE f #-}
+
+f2 :: T -> T -> T
+f2 a b = f a
+{-# OPAQUE f2 #-}
+
+w :: Int -> T -> T
+w 0 t = t
+w i t =
+ -- This recursion is designed to trigger stack overflow,
+ -- so that we run into the register save/restore logic in the RTS
+ -- when we reach the corresponding stack underflow frame.
+ case w ( i - 1 ) t of
+ r -> f2 r t
+{-# OPAQUE w #-}
+
+main :: IO ()
+main = do
+ let !( DX2# x00 ) = z00
+ !( DX2# x01 ) = z01
+ !( DX2# x02 ) = z02
+ !( DX2# x03 ) = z03
+ !( DX2# x04 ) = z04
+ !( DX2# x05 ) = z05
+ !( DX2# x06 ) = z06
+ !( DX2# x07 ) = z07
+ !( DX2# x08 ) = z08
+ !( DX2# x09 ) = z09
+ !( DX2# x10 ) = z10
+ !( DX2# x11 ) = z11
+ !( DX2# x12 ) = z12
+ !( DX2# x13 ) = z13
+ !( DX2# x14 ) = z14
+ !( DX2# x15 ) = z15
+ !( DX2# x16 ) = z16
+ !( DX2# x17 ) = z17
+ !( DX2# x18 ) = z18
+ !( DX2# x19 ) = z19
+ !( DX2# x20 ) = z20
+ !( DX2# x21 ) = z21
+ !( DX2# x22 ) = z22
+ !( DX2# x23 ) = z23
+ !( DX2# x24 ) = z24
+ !( DX2# x25 ) = z25
+ !( DX2# x26 ) = z26
+ !( DX2# x27 ) = z27
+ !( DX2# x28 ) = z28
+ !( DX2# x29 ) = z29
+ !( DX2# x30 ) = z30
+ !( DX2# x31 ) = z31
+ !( DX2# x32 ) = z32
+ !( DX2# x33 ) = z33
+ !( DX2# x34 ) = z34
+ !( DX2# x35 ) = z35
+ !( DX2# x36 ) = z36
+ !( DX2# x37 ) = z37
+ !( DX2# x38 ) = z38
+ !( DX2# x39 ) = z39
+ !( DX2# x40 ) = z40
+ !( DX2# x41 ) = z41
+ !( DX2# x42 ) = z42
+ !( DX2# x43 ) = z43
+ !( DX2# x44 ) = z44
+ !( DX2# x45 ) = z45
+ !( DX2# x46 ) = z46
+ !( DX2# x47 ) = z47
+ !( DX2# x48 ) = z48
+ !( DX2# x49 ) = z49
+ !t = (# x00, x01, x02, x03, x04, x05, x06, x07, x08, x09
+ , x10, x11, x12, x13, x14, x15, x16, x17, x18, x19
+ , x20, x21, x22, x23, x24, x25, x26, x27, x28, x29
+ , x30, x31, x32, x33, x34, x35, x36, x37, x38, x39
+ , x40, x41, x42, x43, x44, x45, x46, x47, x48, x49 #)
+ let !u = w ( 2 * 50 + 16 ) t
+ case u of
+ (# r00, r01, r02, r03, r04, r05, r06, r07, r08, r09
+ , r10, r11, r12, r13, r14, r15, r16, r17, r18, r19
+ , r20, r21, r22, r23, r24, r25, r26, r27, r28, r29
+ , r30, r31, r32, r33, r34, r35, r36, r37, r38, r39
+ , r40, r41, r42, r43, r44, r45, r46, r47, r48, r49 #)
+ -> do putStrLn "Should start listing pairs from 16:\n"
+ putStrLn $ unlines $ map show $
+ [ DX2# r00, DX2# r01, DX2# r02, DX2# r03, DX2# r04, DX2# r05, DX2# r06, DX2# r07, DX2# r08, DX2# r09
+ , DX2# r10, DX2# r11, DX2# r12, DX2# r13, DX2# r14, DX2# r15, DX2# r16, DX2# r17, DX2# r18, DX2# r19
+ , DX2# r20, DX2# r21, DX2# r22, DX2# r23, DX2# r24, DX2# r25, DX2# r26, DX2# r27, DX2# r28, DX2# r29
+ , DX2# r30, DX2# r31, DX2# r32, DX2# r33, DX2# r34, DX2# r35, DX2# r36, DX2# r37, DX2# r38, DX2# r39
+ , DX2# r40, DX2# r41, DX2# r42, DX2# r43, DX2# r44, DX2# r45, DX2# r46, DX2# r47, DX2# r48, DX2# r49 ]
+ let !v = w ( 2 * 50 + 23 - 16 ) $ u
+ case v of
+ (# r00, r01, r02, r03, r04, r05, r06, r07, r08, r09
+ , r10, r11, r12, r13, r14, r15, r16, r17, r18, r19
+ , r20, r21, r22, r23, r24, r25, r26, r27, r28, r29
+ , r30, r31, r32, r33, r34, r35, r36, r37, r38, r39
+ , r40, r41, r42, r43, r44, r45, r46, r47, r48, r49 #)
+ -> do putStrLn "\nShould start listing pairs from 23:\n"
+ putStrLn $ unlines $ map show $
+ [ DX2# r00, DX2# r01, DX2# r02, DX2# r03, DX2# r04, DX2# r05, DX2# r06, DX2# r07, DX2# r08, DX2# r09
+ , DX2# r10, DX2# r11, DX2# r12, DX2# r13, DX2# r14, DX2# r15, DX2# r16, DX2# r17, DX2# r18, DX2# r19
+ , DX2# r20, DX2# r21, DX2# r22, DX2# r23, DX2# r24, DX2# r25, DX2# r26, DX2# r27, DX2# r28, DX2# r29
+ , DX2# r30, DX2# r31, DX2# r32, DX2# r33, DX2# r34, DX2# r35, DX2# r36, DX2# r37, DX2# r38, DX2# r39
+ , DX2# r40, DX2# r41, DX2# r42, DX2# r43, DX2# r44, DX2# r45, DX2# r46, DX2# r47, DX2# r48, DX2# r49 ]
+
+z00 :: DoubleX2; z00 = DX2# ( packDoubleX2# (# 0.1##, 0.2## #) ); {-# OPAQUE z00 #-}
+z01 :: DoubleX2; z01 = DX2# ( packDoubleX2# (# 1.1##, 1.2## #) ); {-# OPAQUE z01 #-}
+z02 :: DoubleX2; z02 = DX2# ( packDoubleX2# (# 2.1##, 2.2## #) ); {-# OPAQUE z02 #-}
+z03 :: DoubleX2; z03 = DX2# ( packDoubleX2# (# 3.1##, 3.2## #) ); {-# OPAQUE z03 #-}
+z04 :: DoubleX2; z04 = DX2# ( packDoubleX2# (# 4.1##, 4.2## #) ); {-# OPAQUE z04 #-}
+z05 :: DoubleX2; z05 = DX2# ( packDoubleX2# (# 5.1##, 5.2## #) ); {-# OPAQUE z05 #-}
+z06 :: DoubleX2; z06 = DX2# ( packDoubleX2# (# 6.1##, 6.2## #) ); {-# OPAQUE z06 #-}
+z07 :: DoubleX2; z07 = DX2# ( packDoubleX2# (# 7.1##, 7.2## #) ); {-# OPAQUE z07 #-}
+z08 :: DoubleX2; z08 = DX2# ( packDoubleX2# (# 8.1##, 8.2## #) ); {-# OPAQUE z08 #-}
+z09 :: DoubleX2; z09 = DX2# ( packDoubleX2# (# 9.1##, 9.2## #) ); {-# OPAQUE z09 #-}
+z10 :: DoubleX2; z10 = DX2# ( packDoubleX2# (# 10.1##, 10.2## #) ); {-# OPAQUE z10 #-}
+z11 :: DoubleX2; z11 = DX2# ( packDoubleX2# (# 11.1##, 11.2## #) ); {-# OPAQUE z11 #-}
+z12 :: DoubleX2; z12 = DX2# ( packDoubleX2# (# 12.1##, 12.2## #) ); {-# OPAQUE z12 #-}
+z13 :: DoubleX2; z13 = DX2# ( packDoubleX2# (# 13.1##, 13.2## #) ); {-# OPAQUE z13 #-}
+z14 :: DoubleX2; z14 = DX2# ( packDoubleX2# (# 14.1##, 14.2## #) ); {-# OPAQUE z14 #-}
+z15 :: DoubleX2; z15 = DX2# ( packDoubleX2# (# 15.1##, 15.2## #) ); {-# OPAQUE z15 #-}
+z16 :: DoubleX2; z16 = DX2# ( packDoubleX2# (# 16.1##, 16.2## #) ); {-# OPAQUE z16 #-}
+z17 :: DoubleX2; z17 = DX2# ( packDoubleX2# (# 17.1##, 17.2## #) ); {-# OPAQUE z17 #-}
+z18 :: DoubleX2; z18 = DX2# ( packDoubleX2# (# 18.1##, 18.2## #) ); {-# OPAQUE z18 #-}
+z19 :: DoubleX2; z19 = DX2# ( packDoubleX2# (# 19.1##, 19.2## #) ); {-# OPAQUE z19 #-}
+z20 :: DoubleX2; z20 = DX2# ( packDoubleX2# (# 20.1##, 20.2## #) ); {-# OPAQUE z20 #-}
+z21 :: DoubleX2; z21 = DX2# ( packDoubleX2# (# 21.1##, 21.2## #) ); {-# OPAQUE z21 #-}
+z22 :: DoubleX2; z22 = DX2# ( packDoubleX2# (# 22.1##, 22.2## #) ); {-# OPAQUE z22 #-}
+z23 :: DoubleX2; z23 = DX2# ( packDoubleX2# (# 23.1##, 23.2## #) ); {-# OPAQUE z23 #-}
+z24 :: DoubleX2; z24 = DX2# ( packDoubleX2# (# 24.1##, 24.2## #) ); {-# OPAQUE z24 #-}
+z25 :: DoubleX2; z25 = DX2# ( packDoubleX2# (# 25.1##, 25.2## #) ); {-# OPAQUE z25 #-}
+z26 :: DoubleX2; z26 = DX2# ( packDoubleX2# (# 26.1##, 26.2## #) ); {-# OPAQUE z26 #-}
+z27 :: DoubleX2; z27 = DX2# ( packDoubleX2# (# 27.1##, 27.2## #) ); {-# OPAQUE z27 #-}
+z28 :: DoubleX2; z28 = DX2# ( packDoubleX2# (# 28.1##, 28.2## #) ); {-# OPAQUE z28 #-}
+z29 :: DoubleX2; z29 = DX2# ( packDoubleX2# (# 29.1##, 29.2## #) ); {-# OPAQUE z29 #-}
+z30 :: DoubleX2; z30 = DX2# ( packDoubleX2# (# 30.1##, 30.2## #) ); {-# OPAQUE z30 #-}
+z31 :: DoubleX2; z31 = DX2# ( packDoubleX2# (# 31.1##, 31.2## #) ); {-# OPAQUE z31 #-}
+z32 :: DoubleX2; z32 = DX2# ( packDoubleX2# (# 32.1##, 32.2## #) ); {-# OPAQUE z32 #-}
+z33 :: DoubleX2; z33 = DX2# ( packDoubleX2# (# 33.1##, 33.2## #) ); {-# OPAQUE z33 #-}
+z34 :: DoubleX2; z34 = DX2# ( packDoubleX2# (# 34.1##, 34.2## #) ); {-# OPAQUE z34 #-}
+z35 :: DoubleX2; z35 = DX2# ( packDoubleX2# (# 35.1##, 35.2## #) ); {-# OPAQUE z35 #-}
+z36 :: DoubleX2; z36 = DX2# ( packDoubleX2# (# 36.1##, 36.2## #) ); {-# OPAQUE z36 #-}
+z37 :: DoubleX2; z37 = DX2# ( packDoubleX2# (# 37.1##, 37.2## #) ); {-# OPAQUE z37 #-}
+z38 :: DoubleX2; z38 = DX2# ( packDoubleX2# (# 38.1##, 38.2## #) ); {-# OPAQUE z38 #-}
+z39 :: DoubleX2; z39 = DX2# ( packDoubleX2# (# 39.1##, 39.2## #) ); {-# OPAQUE z39 #-}
+z40 :: DoubleX2; z40 = DX2# ( packDoubleX2# (# 40.1##, 40.2## #) ); {-# OPAQUE z40 #-}
+z41 :: DoubleX2; z41 = DX2# ( packDoubleX2# (# 41.1##, 41.2## #) ); {-# OPAQUE z41 #-}
+z42 :: DoubleX2; z42 = DX2# ( packDoubleX2# (# 42.1##, 42.2## #) ); {-# OPAQUE z42 #-}
+z43 :: DoubleX2; z43 = DX2# ( packDoubleX2# (# 43.1##, 43.2## #) ); {-# OPAQUE z43 #-}
+z44 :: DoubleX2; z44 = DX2# ( packDoubleX2# (# 44.1##, 44.2## #) ); {-# OPAQUE z44 #-}
+z45 :: DoubleX2; z45 = DX2# ( packDoubleX2# (# 45.1##, 45.2## #) ); {-# OPAQUE z45 #-}
+z46 :: DoubleX2; z46 = DX2# ( packDoubleX2# (# 46.1##, 46.2## #) ); {-# OPAQUE z46 #-}
+z47 :: DoubleX2; z47 = DX2# ( packDoubleX2# (# 47.1##, 47.2## #) ); {-# OPAQUE z47 #-}
+z48 :: DoubleX2; z48 = DX2# ( packDoubleX2# (# 48.1##, 48.2## #) ); {-# OPAQUE z48 #-}
+z49 :: DoubleX2; z49 = DX2# ( packDoubleX2# (# 49.1##, 49.2## #) ); {-# OPAQUE z49 #-}
=====================================
testsuite/tests/simd/should_run/T25169.stdout
=====================================
@@ -0,0 +1,107 @@
+Should start listing pairs from 16:
+
+(16.1,16.2)
+(17.1,17.2)
+(18.1,18.2)
+(19.1,19.2)
+(20.1,20.2)
+(21.1,21.2)
+(22.1,22.2)
+(23.1,23.2)
+(24.1,24.2)
+(25.1,25.2)
+(26.1,26.2)
+(27.1,27.2)
+(28.1,28.2)
+(29.1,29.2)
+(30.1,30.2)
+(31.1,31.2)
+(32.1,32.2)
+(33.1,33.2)
+(34.1,34.2)
+(35.1,35.2)
+(36.1,36.2)
+(37.1,37.2)
+(38.1,38.2)
+(39.1,39.2)
+(40.1,40.2)
+(41.1,41.2)
+(42.1,42.2)
+(43.1,43.2)
+(44.1,44.2)
+(45.1,45.2)
+(46.1,46.2)
+(47.1,47.2)
+(48.1,48.2)
+(49.1,49.2)
+(0.1,0.2)
+(1.1,1.2)
+(2.1,2.2)
+(3.1,3.2)
+(4.1,4.2)
+(5.1,5.2)
+(6.1,6.2)
+(7.1,7.2)
+(8.1,8.2)
+(9.1,9.2)
+(10.1,10.2)
+(11.1,11.2)
+(12.1,12.2)
+(13.1,13.2)
+(14.1,14.2)
+(15.1,15.2)
+
+
+Should start listing pairs from 23:
+
+(23.1,23.2)
+(24.1,24.2)
+(25.1,25.2)
+(26.1,26.2)
+(27.1,27.2)
+(28.1,28.2)
+(29.1,29.2)
+(30.1,30.2)
+(31.1,31.2)
+(32.1,32.2)
+(33.1,33.2)
+(34.1,34.2)
+(35.1,35.2)
+(36.1,36.2)
+(37.1,37.2)
+(38.1,38.2)
+(39.1,39.2)
+(40.1,40.2)
+(41.1,41.2)
+(42.1,42.2)
+(43.1,43.2)
+(44.1,44.2)
+(45.1,45.2)
+(46.1,46.2)
+(47.1,47.2)
+(48.1,48.2)
+(49.1,49.2)
+(0.1,0.2)
+(1.1,1.2)
+(2.1,2.2)
+(3.1,3.2)
+(4.1,4.2)
+(5.1,5.2)
+(6.1,6.2)
+(7.1,7.2)
+(8.1,8.2)
+(9.1,9.2)
+(10.1,10.2)
+(11.1,11.2)
+(12.1,12.2)
+(13.1,13.2)
+(14.1,14.2)
+(15.1,15.2)
+(16.1,16.2)
+(17.1,17.2)
+(18.1,18.2)
+(19.1,19.2)
+(20.1,20.2)
+(21.1,21.2)
+(22.1,22.2)
+
=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -49,6 +49,7 @@ test('simd013',
, unless(arch('x86_64'), skip) # because the C file uses Intel intrinsics
],
compile_and_run, ['simd013C.c'])
+test('simd014', [], compile_and_run, ['simd014Cmm.cmm'])
@@ -63,3 +64,5 @@ test('T25062_V64', [ unless(have_cpu_feature('avx512f'), skip)
, only_ways(llvm_ways) # SIMD NCG TODO: support 512 bit wide vectors
]
, compile_and_run, [''])
+
+test('T25169', [], compile_and_run, [''])
=====================================
testsuite/tests/simd/should_run/simd014.hs
=====================================
@@ -0,0 +1,35 @@
+{-# LANGUAGE GHCForeignImportPrim #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+module Main where
+
+-- base
+import GHC.Exts
+ ( Double(..), DoubleX2#
+ , packDoubleX2#, unpackDoubleX2#
+ )
+
+-- Test for handwritten Cmm code and realArgsRegCover (relates to #25169).
+
+--------------------------------------------------------------------------------
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+ show ( DX2# d ) = case unpackDoubleX2# d of
+ (# a, b #) -> show ( D# a, D# b )
+
+foreign import prim "f1"
+ f1 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+ -> (# DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2# #)
+
+main :: IO ()
+main = do
+ let !x1 = packDoubleX2# (# 1.1##, 1.2## #)
+ !x2 = packDoubleX2# (# 2.1##, 2.2## #)
+ !x3 = packDoubleX2# (# 3.1##, 3.2## #)
+ !x4 = packDoubleX2# (# 4.1##, 4.2## #)
+ !(# y1, y2, y3, y4 #) = f1 x1 x2 x3 x4
+ print [ DX2# y1, DX2# y2, DX2# y3, DX2# y4 ]
=====================================
testsuite/tests/simd/should_run/simd014.stdout
=====================================
@@ -0,0 +1 @@
+[(3.1,3.2),(2.1,2.2),(1.1,1.2),(4.1,4.2)]
=====================================
testsuite/tests/simd/should_run/simd014Cmm.cmm
=====================================
@@ -0,0 +1,28 @@
+#include "Cmm.h"
+
+f1
+{
+ // Switch XMM1 and XMM2
+ XMM6 = XMM1 ;
+ XMM1 = XMM2 ;
+ XMM2 = XMM6 ;
+ jump f2 [ XMM1, XMM2, XMM3, XMM4 ];
+}
+
+f2
+{
+ // Switch XMM2 and XMM3
+ XMM6 = XMM2 ;
+ XMM2 = XMM3 ;
+ XMM3 = XMM6 ;
+ jump f3 [ XMM1, XMM2, XMM3, XMM4 ];
+}
+
+f3
+{
+ // Switch XMM1 and XMM2
+ XMM6 = XMM1 ;
+ XMM1 = XMM2 ;
+ XMM2 = XMM6 ;
+ jump %ENTRY_CODE(Sp(0)) [ XMM1, XMM2, XMM3, XMM4 ];
+}
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/f538ca1cbc7757305a5e635c3226418728a88d80...73c8ef14c329ae29467f1e267239bc19d564f1ef
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/f538ca1cbc7757305a5e635c3226418728a88d80...73c8ef14c329ae29467f1e267239bc19d564f1ef
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240827/02308f87/attachment-0001.html>
More information about the ghc-commits
mailing list