[Git][ghc/ghc][master] x86 NCG: Make MOVD's output format explicit
Marge Bot (@marge-bot)
gitlab at gitlab.haskell.org
Mon Jan 27 21:23:41 UTC 2025
Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
62760367 by ARATA Mizuki at 2025-01-27T16:23:06-05:00
x86 NCG: Make MOVD's output format explicit
The old design led to inference of a wrong format,
losing upper bits of a vector register.
Fixes #25659
Co-authored-by: sheaf <sam.derbyshire at gmail.com>
- - - - -
6 changed files:
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- + testsuite/tests/simd/should_run/T25659.hs
- + testsuite/tests/simd/should_run/T25659.stdout
- testsuite/tests/simd/should_run/all.T
Changes:
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1171,7 +1171,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
bitcast :: Format -> Format -> CmmExpr -> NatM Register
bitcast fmt rfmt expr =
do (src, e_code) <- getSomeReg expr
- let code = \dst -> e_code `snocOL` (MOVD fmt (OpReg src) (OpReg dst))
+ let code = \dst -> e_code `snocOL` (MOVD fmt rfmt (OpReg src) (OpReg dst))
return (Any rfmt code)
toI8Reg :: Width -> CmmExpr -> NatM Register
@@ -1262,7 +1262,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
code dst = exp `snocOL`
-- VPBROADCAST from GPR requires AVX-512,
-- so we use an additional MOVD.
- (MOVD movFormat (OpReg reg) (OpReg dst)) `snocOL`
+ (MOVD movFormat fmt (OpReg reg) (OpReg dst)) `snocOL`
(VPBROADCAST fmt fmt (OpReg dst) dst)
return $ Any fmt code
@@ -1272,7 +1272,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
(reg, exp) <- getNonClobberedReg expr
let fmt = VecFormat 16 FmtInt8
return $ Any fmt (\dst -> exp `snocOL`
- (MOVD II32 (OpReg reg) (OpReg dst)) `snocOL`
+ (MOVD II32 fmt (OpReg reg) (OpReg dst)) `snocOL`
(PUNPCKLBW fmt (OpReg dst) dst) `snocOL`
(PUNPCKLWD (VecFormat 8 FmtInt16) (OpReg dst) dst) `snocOL`
(PSHUFD fmt (ImmInt 0x00) (OpReg dst) dst)
@@ -1284,7 +1284,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
(reg, exp) <- getNonClobberedReg expr
let fmt = VecFormat 8 FmtInt16
return $ Any fmt (\dst -> exp `snocOL`
- (MOVD II32 (OpReg reg) (OpReg dst)) `snocOL`
+ (MOVD II32 fmt (OpReg reg) (OpReg dst)) `snocOL`
(PUNPCKLWD fmt (OpReg dst) dst) `snocOL`
(PSHUFD fmt (ImmInt 0x00) (OpReg dst) dst)
)
@@ -1295,7 +1295,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
(reg, exp) <- getNonClobberedReg expr
let fmt = VecFormat 4 FmtInt32
return $ Any fmt (\dst -> exp `snocOL`
- (MOVD II32 (OpReg reg) (OpReg dst)) `snocOL`
+ (MOVD II32 fmt (OpReg reg) (OpReg dst)) `snocOL`
(PSHUFD fmt (ImmInt 0x00) (OpReg dst) dst)
)
@@ -1305,7 +1305,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
(reg, exp) <- getNonClobberedReg expr
let fmt = VecFormat 2 FmtInt64
return $ Any fmt (\dst -> exp `snocOL`
- (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
+ (MOVD II64 fmt (OpReg reg) (OpReg dst)) `snocOL`
(PUNPCKLQDQ fmt (OpReg dst) dst)
)
@@ -1793,16 +1793,16 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
let code dst =
case i of
0 -> exp `snocOL`
- (MOVD FF32 (OpReg r) (OpReg dst))
+ (MOVD fmt II32 (OpReg r) (OpReg dst))
1 -> exp `snocOL`
(PSHUFD fmt (ImmInt 0b01_01_01_01) (OpReg r) tmp) `snocOL` -- tmp <- (r[1],r[1],r[1],r[1])
- (MOVD FF32 (OpReg tmp) (OpReg dst))
+ (MOVD fmt II32 (OpReg tmp) (OpReg dst))
2 -> exp `snocOL`
(PSHUFD fmt (ImmInt 0b11_10_11_10) (OpReg r) tmp) `snocOL` -- tmp <- (r[2],r[3],r[2],r[3])
- (MOVD FF32 (OpReg tmp) (OpReg dst))
+ (MOVD fmt II32 (OpReg tmp) (OpReg dst))
_ -> exp `snocOL`
(PSHUFD fmt (ImmInt 0b11_11_11_11) (OpReg r) tmp) `snocOL` -- tmp <- (r[3],r[3],r[3],r[3])
- (MOVD FF32 (OpReg tmp) (OpReg dst))
+ (MOVD fmt II32 (OpReg tmp) (OpReg dst))
return (Any II32 code)
vector_int32x4_extract_sse2 _ offset
= pprPanic "Unsupported offset" (pdoc platform offset)
@@ -1818,10 +1818,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
let code dst =
case lit of
CmmInt 0 _ -> exp `snocOL`
- (MOVD FF64 (OpReg r) (OpReg dst))
+ (MOVD fmt II64 (OpReg r) (OpReg dst))
CmmInt 1 _ -> exp `snocOL`
(MOVHLPS fmt r tmp) `snocOL`
- (MOVD FF64 (OpReg tmp) (OpReg dst))
+ (MOVD fmt II64 (OpReg tmp) (OpReg dst))
_ -> panic "Error in offset while unpacking"
return (Any II64 code)
vector_int64x2_extract_sse2 _ offset
@@ -2103,22 +2103,22 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
= case offset of
0 -> valExp `appOL`
(vecCode dst) `snocOL`
- (MOVD II32 (OpReg valReg) (OpReg tmp1)) `snocOL`
+ (MOVD II32 vectorFormat (OpReg valReg) (OpReg tmp1)) `snocOL`
(MOV floatVectorFormat (OpReg tmp1) (OpReg dst)) -- MOVSS; dst <- (tmp1[0],dst[1],dst[2],dst[3])
1 -> valExp `appOL`
(vecCode tmp1) `snocOL`
- (MOVD II32 (OpReg valReg) (OpReg dst)) `snocOL` -- dst <- (val,0,0,0)
+ (MOVD II32 vectorFormat (OpReg valReg) (OpReg dst)) `snocOL` -- dst <- (val,0,0,0)
(PUNPCKLQDQ vectorFormat (OpReg tmp1) dst) `snocOL` -- dst <- (dst[0],dst[1],tmp1[0],tmp1[1])
(SHUF floatVectorFormat (ImmInt 0b11_10_00_10) (OpReg tmp1) dst) -- SHUFPS; dst <- (dst[2],dst[0],tmp1[2],tmp1[3])
2 -> valExp `appOL`
(vecCode dst) `snocOL`
- (MOVD II32 (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
+ (MOVD II32 vectorFormat (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
(MOVU floatVectorFormat (OpReg dst) (OpReg tmp2)) `snocOL` -- MOVUPS; tmp2 <- dst
(SHUF floatVectorFormat (ImmInt 0b01_00_01_11) (OpReg tmp1) tmp2) `snocOL` -- SHUFPS; tmp2 <- (tmp2[3],tmp2[1],tmp1[0],tmp1[1])
(SHUF floatVectorFormat (ImmInt 0b00_10_01_00) (OpReg tmp2) dst) -- SHUFPS; dst <- (dst[0],dst[1],tmp2[2],tmp2[0])
_ -> valExp `appOL`
(vecCode dst) `snocOL`
- (MOVD II32 (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
+ (MOVD II32 vectorFormat (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
(SHUF floatVectorFormat (ImmInt 0b11_10_01_00) (OpReg dst) tmp1) `snocOL` -- SHUFPS; tmp1 <- (tmp1[0],tmp1[1],dst[2],dst[3])
(SHUF floatVectorFormat (ImmInt 0b00_10_01_00) (OpReg tmp1) dst) -- SHUFPS; dst <- (dst[0],dst[1],tmp1[2],tmp1[0])
return $ Any vectorFormat code
@@ -2139,12 +2139,12 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
CmmInt 0 _ -> valExp `appOL`
vecExp `snocOL`
(MOVHLPS fmt vecReg tmp) `snocOL`
- (MOVD II64 (OpReg valReg) (OpReg dst)) `snocOL`
+ (MOVD II64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
(PUNPCKLQDQ fmt (OpReg tmp) dst)
CmmInt 1 _ -> valExp `appOL`
vecExp `snocOL`
- (MOV II64 (OpReg vecReg) (OpReg dst)) `snocOL`
- (MOVD II64 (OpReg valReg) (OpReg tmp)) `snocOL`
+ (MOVDQU fmt (OpReg vecReg) (OpReg dst)) `snocOL`
+ (MOVD II64 fmt (OpReg valReg) (OpReg tmp)) `snocOL`
(PUNPCKLQDQ fmt (OpReg tmp) dst)
_ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
in return $ Any fmt code
@@ -4083,7 +4083,7 @@ loadArgsWin config (arg:rest) = do
-- arguments in both fp and integer registers.
let (assign_code', regs')
| isFloatFormat arg_fmt =
- ( assign_code `snocOL` MOVD FF64 (OpReg freg) (OpReg ireg),
+ ( assign_code `snocOL` MOVD FF64 II64 (OpReg freg) (OpReg ireg),
[ RegWithFormat freg FF64
, RegWithFormat ireg II64 ])
| otherwise = (assign_code, [RegWithFormat ireg II64])
=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -39,7 +39,6 @@ module GHC.CmmToAsm.X86.Instr
, patchJumpInstr
, isMetaInstr
, isJumpishInstr
- , movdOutFormat
, MinOrMax(..), MinMaxType(..)
)
where
@@ -127,11 +126,16 @@ data Instr
-- with @MOVABS@; we currently do not use this instruction in GHC.
-- See https://stackoverflow.com/questions/52434073/whats-the-difference-between-the-x86-64-att-instructions-movq-and-movabsq.
- | MOVD Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
- -- (bitcast between a general purpose
- -- register and a float register).
- -- Format is input format, output format is
- -- calculated in the 'movdOutFormat' function.
+ -- | MOVD/MOVQ SSE2 instructions
+ -- (bitcast between a general purpose register and a float register).
+ | MOVD
+ Format -- ^ input format
+ Format -- ^ output format
+ Operand Operand
+ -- NB: MOVD stores both the input and output formats. This is because
+ -- neither format fully determines the other, as either might be
+ -- a vector format, and we need to know the exact format in order to
+ -- correctly spill/unspill. See #25659.
| CMOV Cond Format Operand Reg
| MOVZxL Format Operand Operand
-- ^ The format argument is the size of operand 1 (the number of bits we keep)
@@ -377,10 +381,10 @@ regUsageOfInstr platform instr
-- (largely to avoid partial register stalls)
| otherwise
-> usageRW fmt src dst
- MOVD fmt src dst ->
+ MOVD fmt1 fmt2 src dst ->
-- NB: MOVD and MOVQ always zero any remaining upper part of destination,
-- so the destination is "written" not "modified".
- usageRW' fmt (movdOutFormat fmt) src dst
+ usageRW' fmt1 fmt2 src dst
CMOV _ fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
MOVZxL fmt src dst -> usageRW fmt src dst
MOVSxL fmt src dst -> usageRW fmt src dst
@@ -650,14 +654,6 @@ interesting :: Platform -> Reg -> Bool
interesting _ (RegVirtual _) = True
interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
-movdOutFormat :: Format -> Format
-movdOutFormat format = case format of
- II32 -> FF32
- II64 -> FF64
- FF32 -> II32
- FF64 -> II64
- _ -> pprPanic "X86: improper format for movd/movq" (ppr format)
-
-- | Applies the supplied function to all registers in instructions.
-- Typically used to change virtual registers to real registers.
@@ -665,7 +661,7 @@ patchRegsOfInstr :: HasDebugCallStack => Platform -> Instr -> (Reg -> Reg) -> In
patchRegsOfInstr platform instr env
= case instr of
MOV fmt src dst -> MOV fmt (patchOp src) (patchOp dst)
- MOVD fmt src dst -> patch2 (MOVD fmt) src dst
+ MOVD fmt1 fmt2 src dst -> patch2 (MOVD fmt1 fmt2) src dst
CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst)
MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst
MOVSxL fmt src dst -> patch2 (MOVSxL fmt) src dst
=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -657,8 +657,8 @@ pprInstr platform i = case i of
CMOV cc format src dst
-> pprCondOpReg (text "cmov") format cc src dst
- MOVD format src dst
- -> pprMovdOpOp (text "mov") format src dst
+ MOVD format1 format2 src dst
+ -> pprMovdOpOp (text "mov") format1 format2 src dst
MOVZxL II32 src dst
-> pprFormatOpOp (text "mov") II32 src dst
@@ -1151,21 +1151,21 @@ pprInstr platform i = case i of
pprOperand platform format op2
]
- pprMovdOpOp :: Line doc -> Format -> Operand -> Operand -> doc
- pprMovdOpOp name format op1 op2
- = let instr = case format of
+ pprMovdOpOp :: Line doc -> Format -> Format -> Operand -> Operand -> doc
+ pprMovdOpOp name format1 format2 op1 op2
+ = let instr = case (format1, format2) of
-- bitcasts to/from a general purpose register to a floating point
-- register require II32 or II64.
- II32 -> text "d"
- II64 -> text "q"
- FF32 -> text "d"
- FF64 -> text "q"
- _ -> panic "X86.Ppr.pprMovdOpOp: improper format for movd/movq."
+ (II32, _) -> text "d"
+ (II64, _) -> text "q"
+ (_, II32) -> text "d"
+ (_, II64) -> text "q"
+ _ -> panic "X86.Ppr.pprMovdOpOp: improper format for movd/movq."
in line $ hcat [
char '\t' <> name <> instr <> space,
- pprOperand platform format op1,
+ pprOperand platform format1 op1,
comma,
- pprOperand platform (movdOutFormat format) op2
+ pprOperand platform format2 op2
]
pprFormatImmRegOp :: Line doc -> Format -> Imm -> Reg -> Operand -> doc
=====================================
testsuite/tests/simd/should_run/T25659.hs
=====================================
@@ -0,0 +1,19 @@
+{-# LANGUAGE MagicHash, UnboxedTuples, ExtendedLiterals #-}
+import GHC.Int
+import GHC.Prim
+
+test :: (Int64X2# -> Int64X2# -> Int64X2#) -> IO ()
+test f = do
+ let a = packInt64X2# (# 0#Int64, 11#Int64 #)
+ b = packInt64X2# (# 22#Int64, 33#Int64 #)
+ c = f a b
+ (# x0, x1 #) = unpackInt64X2# a
+ (# y0, y1 #) = unpackInt64X2# b
+ (# z0, z1 #) = unpackInt64X2# c
+ putStrLn $ "a = " ++ show (I64# x0, I64# x1)
+ putStrLn $ "b = " ++ show (I64# y0, I64# y1)
+ putStrLn $ "c = " ++ show (I64# z0, I64# z1)
+{-# NOINLINE test #-}
+
+main :: IO ()
+main = test (\_ b -> b)
=====================================
testsuite/tests/simd/should_run/T25659.stdout
=====================================
@@ -0,0 +1,3 @@
+a = (0,11)
+b = (22,33)
+c = (22,33)
=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -26,6 +26,7 @@ test('word32x4_basic_baseline', [], compile_and_run, [''])
test('word64x2_basic_baseline', [], compile_and_run, [''])
test('T25658', [], compile_and_run, ['']) # #25658 is a bug with SSE2 code generation
+test('T25659', [], compile_and_run, [''])
# Ensure we set the CPU features we have available.
#
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/62760367dca36da5828875c5423109f9a5523337
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/62760367dca36da5828875c5423109f9a5523337
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20250127/7a65a087/attachment-0001.html>
More information about the ghc-commits
mailing list