[Git][ghc/ghc][wip/ncg-simd] 2 commits: rework X86 MOV instruction
sheaf (@sheaf)
gitlab@gitlab.haskell.org
Wed Jun 26 08:51:57 UTC 2024
sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC
Commits:
37d40f25 by sheaf at 2024-06-26T10:51:41+02:00
rework X86 MOV instruction
- - - - -
125fb19c by sheaf at 2024-06-26T10:51:42+02:00
SIMD cleanups
- - - - -
6 changed files:
- compiler/GHC/Cmm/Opt.hs
- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToC.hs
Changes:
=====================================
compiler/GHC/Cmm/Opt.hs
=====================================
@@ -80,10 +80,10 @@ cmmMachOpFoldM
-> [CmmExpr]
-> Maybe CmmExpr
cmmMachOpFoldM _ (MO_V_Broadcast {}) _ = Nothing
- -- SIMD NCG TODO: constant folding doesn't work correctly for Broadcast instructions,
- -- perhaps due to the fact that the argument is a scalar but the result is a vector.
cmmMachOpFoldM _ (MO_VF_Broadcast {}) _ = Nothing
-
+ -- SIMD NCG TODO: supporting constant folding for vector operations
+ -- would require augmenting getRegister' to handle them.
+ -- See the code for "getRegister' platform _ (CmmLit lit)".
cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
= Just $! case op of
MO_S_Neg _ -> CmmLit (CmmInt (-x) rep)
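The folding this TODO gestures at might look like the following sketch; it is hypothetical (not part of the patch) and only becomes viable once getRegister' can materialise CmmVec literals rather than merely zeroing a vector register:

    -- Hypothetical sketch: fold a broadcast of a literal into a vector literal.
    cmmMachOpFoldM _ (MO_VF_Broadcast len _w) [CmmLit l]
      = Just $! CmmLit (CmmVec (replicate len l))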
=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -25,6 +25,8 @@ module GHC.CmmToAsm.Format (
formatToWidth,
formatInBytes,
isIntScalarFormat,
+ isFloatScalarFormat,
+ scalarFormatFormat,
VirtualRegFormat(..),
RegFormat(..),
takeVirtualRegs,
@@ -101,12 +103,23 @@ data ScalarFormat
| FmtDouble
deriving (Show, Eq, Ord)
+scalarFormatFormat :: ScalarFormat -> Format
+scalarFormatFormat = \case
+ FmtInt8 -> II8
+ FmtInt16 -> II16
+ FmtInt32 -> II32
+ FmtInt64 -> II64
+ FmtFloat -> FF32
+ FmtDouble -> FF64
+
+isFloatScalarFormat :: ScalarFormat -> Bool
+isFloatScalarFormat = \case
+ FmtFloat -> True
+ FmtDouble -> True
+ _ -> False
+
isIntScalarFormat :: ScalarFormat -> Bool
-isIntScalarFormat FmtInt8 = True
-isIntScalarFormat FmtInt16 = True
-isIntScalarFormat FmtInt32 = True
-isIntScalarFormat FmtInt64 = True
-isIntScalarFormat _ = False
+isIntScalarFormat = not . isFloatScalarFormat
-- | Get the integer format of this width.
intFormat :: Width -> Format
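Since isIntScalarFormat is now the complement of isFloatScalarFormat, the two predicates partition ScalarFormat by construction. A quick sanity check (hypothetical, not part of the patch):

    -- Every scalar format is exactly one of integer or floating-point.
    scalarFormatsPartition :: Bool
    scalarFormatsPartition =
      and [ isIntScalarFormat f /= isFloatScalarFormat f
          | f <- [FmtInt8, FmtInt16, FmtInt32, FmtInt64, FmtFloat, FmtDouble] ]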
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -717,8 +717,8 @@ iselExpr64 (CmmMachOp (MO_S_MulMayOflo W64) _) = do
-- We always return a (usually false) positive.
Reg64 rhi rlo <- getNewReg64
let code = toOL [
- MOV II32 (OpImm (ImmInt 1)) (OpReg rhi),
- MOV II32 (OpImm (ImmInt 1)) (OpReg rlo)
+ MOV II32 (OpImm (ImmInt 1)) (OpReg rhi),
+ MOV II32 (OpImm (ImmInt 1)) (OpReg rlo)
]
return (RegCode64 code rhi rlo)
@@ -1067,11 +1067,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
-- has 8-bit version). So for 32-bit code, we'll just zero-extend.
MO_XX_Conv W8 W32
| is32Bit -> integerExtend W8 W32 MOVZxL x
- | otherwise -> integerExtend W8 W32 (MOV) x
+ | otherwise -> integerExtend W8 W32 MOV x
MO_XX_Conv W8 W16
| is32Bit -> integerExtend W8 W16 MOVZxL x
- | otherwise -> integerExtend W8 W16 (MOV) x
- MO_XX_Conv W16 W32 -> integerExtend W16 W32 (MOV) x
+ | otherwise -> integerExtend W8 W16 MOV x
+ MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x
MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
@@ -1085,9 +1085,9 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
-- the form of a movzl and print it as a movl later.
-- This doesn't apply to MO_XX_Conv since in this case we don't care about
-- the upper bits. So we can just use MOV.
- MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 (MOV) x
- MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 (MOV) x
- MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 (MOV) x
+ MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x
+ MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+ MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
@@ -1104,7 +1104,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
| sse && sse2 -> vector_float_negate_sse l w x
| otherwise
-> sorry "Please enable the -mavx or -msse, -msse2 flag"
- -- SIMD NCG TODO
+ -- SIMD NCG TODO: add integer negation
MO_VS_Neg {} -> needLlvm mop
MO_VF_Broadcast l W32 | avx -> vector_float_broadcast_avx l W32 x
@@ -1311,7 +1311,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
(reg, exp) <- getSomeReg expr
let fmt = VecFormat len FmtInt64
return $ Any fmt (\dst -> exp `snocOL`
- (MOV2 II64 fmt (OpReg reg) (OpReg dst)) `snocOL`
+ (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
(PUNPCKLQDQ fmt (OpReg dst) dst)
)
vector_int_broadcast _ _ c
@@ -1384,15 +1384,15 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
| otherwise
-> sorry "Please use -fllvm for wide shuffle instructions"
- MO_VF_Extract l W32 | avx -> vector_float_unpack l W32 x y
- | sse -> vector_float_unpack_sse l W32 x y
+ MO_VF_Extract l W32 | avx -> vector_float_extract l W32 x y
+ | sse -> vector_float_extract_sse l W32 x y
| otherwise
-> sorry "Please enable the -mavx or -msse flag"
- MO_VF_Extract l W64 | sse2 -> vector_float_unpack l W64 x y
+ MO_VF_Extract l W64 | sse2 -> vector_float_extract l W64 x y
| otherwise -> sorry "Please enable the -msse2 flag"
MO_VF_Extract {} -> incorrectOperands
- MO_V_Extract l W64 | sse2 -> vector_int_unpack_sse l W64 x y
+ MO_V_Extract l W64 | sse2 -> vector_int_extract_sse l W64 x y
| otherwise -> sorry "Please enable the -msse2 flag"
-- SIMD NCG TODO: W32, W16, W8
MO_V_Extract {} -> needLlvm mop
@@ -1653,7 +1653,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
let format = case w of
W32 -> VecFormat l FmtFloat
W64 -> VecFormat l FmtDouble
- _ -> pprPanic "Operation not supported for width " (ppr w)
+ _ -> pprPanic "Floating-point AVX vector operation not supported at this width"
+ (text "width:" <+> ppr w)
code dst = case op of
VA_Add -> arithInstr VADD
VA_Sub -> arithInstr VSUB
@@ -1677,7 +1678,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
let format = case w of
W32 -> VecFormat l FmtFloat
W64 -> VecFormat l FmtDouble
- _ -> pprPanic "Operation not supported for width " (ppr w)
+ _ -> pprPanic "Floating-point SSE vector operation not supported at this width"
+ (text "width:" <+> ppr w)
code dst = case op of
VA_Add -> arithInstr ADD
VA_Sub -> arithInstr SUB
@@ -1691,12 +1693,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
(instr format (OpReg reg2) (OpReg dst))
return (Any format code)
--------------------
- vector_float_unpack :: Length
- -> Width
- -> CmmExpr
- -> CmmExpr
- -> NatM Register
- vector_float_unpack l W32 expr (CmmLit lit)
+ vector_float_extract :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_extract l W32 expr (CmmLit lit)
= do
(r, exp) <- getSomeReg expr
let format = VecFormat l FmtFloat
@@ -1705,9 +1707,9 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
= case lit of
CmmInt 0 _ -> exp `snocOL` (MOV FF32 (OpReg r) (OpReg dst))
CmmInt _ _ -> exp `snocOL` (VPSHUFD format imm (OpReg r) dst)
- _ -> panic "Error in offset while unpacking"
+ _ -> pprPanic "Unsupported AVX floating-point vector extract offset" (ppr lit)
return (Any format code)
- vector_float_unpack l W64 expr (CmmLit lit)
+ vector_float_extract l W64 expr (CmmLit lit)
= do
(r, exp) <- getSomeReg expr
let format = VecFormat l FmtDouble
@@ -1716,19 +1718,19 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
CmmInt 0 _ -> exp `snocOL`
(MOV FF64 (OpReg r) (OpReg dst))
CmmInt 1 _ -> exp `snocOL`
- (MOVHLPS format (OpReg r) dst)
- _ -> panic "Error in offset while unpacking"
+ (MOVHLPS format r dst)
+ _ -> pprPanic "Unsupported AVX floating-point vector extract offset" (ppr lit)
return (Any format code)
- vector_float_unpack _ w c e
- = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
+ vector_float_extract _ w c e
+ = pprPanic "Unsupported AVX floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
-----------------------
- vector_float_unpack_sse :: Length
- -> Width
- -> CmmExpr
- -> CmmExpr
- -> NatM Register
- vector_float_unpack_sse l W32 expr (CmmLit lit)
+ vector_float_extract_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_float_extract_sse l W32 expr (CmmLit lit)
= do
(r,exp) <- getSomeReg expr
let format = VecFormat l FmtFloat
@@ -1737,18 +1739,18 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
= case lit of
CmmInt 0 _ -> exp `snocOL` (MOVU format (OpReg r) (OpReg dst))
CmmInt _ _ -> exp `snocOL` (PSHUFD format imm (OpReg r) dst)
- _ -> panic "Error in offset while unpacking"
+ _ -> pprPanic "Unsupported SSE floating-point vector extract offset" (ppr lit)
return (Any format code)
- vector_float_unpack_sse _ w c e
- = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
+ vector_float_extract_sse _ w c e
+ = pprPanic "Unsupported SSE floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
-----------------------
- vector_int_unpack_sse :: Length
- -> Width
- -> CmmExpr
- -> CmmExpr
- -> NatM Register
- vector_int_unpack_sse l@2 W64 expr (CmmLit lit)
+ vector_int_extract_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+ vector_int_extract_sse l@2 W64 expr (CmmLit lit)
= do
(r, exp) <- getSomeReg expr
let fmt = VecFormat l FmtInt64
@@ -1756,14 +1758,14 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
let code dst
= case lit of
CmmInt 0 _ -> exp `snocOL`
- (MOV2 fmt II64 (OpReg r) (OpReg dst))
+ (MOVD II64 (OpReg r) (OpReg dst))
CmmInt 1 _ -> exp `snocOL`
- (MOVHLPS fmt (OpReg r) tmp) `snocOL`
- (MOV2 fmt II64 (OpReg tmp) (OpReg dst))
+ (MOVHLPS fmt r tmp) `snocOL`
+ (MOVD II64 (OpReg tmp) (OpReg dst))
_ -> panic "Error in offset while unpacking"
return (Any fmt code)
- vector_int_unpack_sse _ w c e
- = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
+ vector_int_extract_sse _ w c e
+ = pprPanic "Unsupported SSE floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
vector_shuffle_float :: Length -> Width -> CmmExpr -> CmmExpr -> [Int] -> NatM Register
vector_shuffle_float l w v1 v2 is = do
@@ -1938,13 +1940,13 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
= case offset of
CmmInt 0 _ -> valExp `appOL`
vecExp `snocOL`
- (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
- (MOV2 II64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
+ (MOVHLPS fmt vecReg tmp) `snocOL`
+ (MOVD II64 (OpReg valReg) (OpReg dst)) `snocOL`
(PUNPCKLQDQ fmt (OpReg tmp) dst)
CmmInt 1 _ -> valExp `appOL`
vecExp `snocOL`
- (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
- (MOV2 II64 fmt (OpReg valReg) (OpReg tmp)) `snocOL`
+ (MOV II64 (OpReg vecReg) (OpReg dst)) `snocOL`
+ (MOVD II64 (OpReg valReg) (OpReg tmp)) `snocOL`
(PUNPCKLQDQ fmt (OpReg tmp) dst)
_ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
in return $ Any fmt code
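The MOVD/MOVHLPS/PUNPCKLQDQ sequences above are easiest to follow as lane movements. Here is a value-level model (hypothetical helper names; an xmm register is written as its (low, high) 64-bit lanes):

    import Data.Int (Int64)

    type Lanes = (Int64, Int64)

    movhlps :: Lanes -> Lanes -> Lanes             -- low(dst) := high(src)
    movhlps (_, srcHi) (_, dstHi) = (srcHi, dstHi)

    movd :: Int64 -> Lanes                         -- GP -> xmm, upper lane zeroed
    movd x = (x, 0)

    punpcklqdq :: Lanes -> Lanes -> Lanes          -- high(dst) := low(src)
    punpcklqdq (srcLo, _) (dstLo, _) = (dstLo, srcLo)

    -- Insert x at index 0 of (v0, v1), keeping v1, as in the offset-0 case:
    insert0 :: Lanes -> Int64 -> Lanes
    insert0 vec x = punpcklqdq (movhlps vec (0, 0)) (movd x)   -- == (x, v1)

    -- Broadcast, as in vector_int_broadcast earlier in this file:
    broadcast :: Int64 -> Lanes
    broadcast x = let d = movd x in punpcklqdq d d             -- == (x, x)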
@@ -1978,26 +1980,29 @@ getRegister' _ _ (CmmLoad mem pk _)
loadFloatAmode (typeWidth pk) addr mem_code
getRegister' _ is32Bit (CmmLoad mem pk _)
+ -- SIMD NCG TODO: what about vectors?
| is32Bit && not (isWord64 pk)
= do
let
instr = case width of
W8 -> MOVZxL II8
+ -- We always zero-extend 8-bit loads, if we
+ -- can't think of anything better. This is because
+ -- we can't guarantee access to an 8-bit variant of every register
+ -- (esi and edi don't have 8-bit variants), so to make things
+ -- simpler we do our 8-bit arithmetic with full 32-bit registers.
_other -> MOV format
code <- intLoadCode instr mem
return (Any format code)
where
width = typeWidth pk
format = intFormat width
- -- We always zero-extend 8-bit loads, if we
- -- can't think of anything better. This is because
- -- we can't guarantee access to an 8-bit variant of every register
- -- (esi and edi don't have 8-bit variants), so to make things
- -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
-- Simpler memory load code on x86_64
getRegister' _ is32Bit (CmmLoad mem pk _)
- | not is32Bit
+ -- SIMD NCG TODO: what about vectors?
+ | not is32Bit
= do
code <- intLoadCode (MOV format) mem
return (Any format code)
@@ -2058,13 +2063,17 @@ getRegister' platform _ (CmmLit lit)
where
cmmtype = cmmLitType platform lit
vectorRegister ctype
- = do
- --NOTE:
- -- This operation is only used to zero a register. For loading a
- -- vector literal there are pack and broadcast operations
- let format = cmmTypeFormat ctype
- code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
- return (Any format code)
+ | case lit of { CmmVec fs -> all (\case { CmmInt i _ -> i == 0; CmmFloat f _ -> f == 0; _ -> False }) fs; _ -> False }
+ = -- NOTE:
+ -- This operation is only used to zero a register. For loading a
+ -- vector literal there are pack and broadcast operations
+ let format = cmmTypeFormat ctype
+ code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+ in return (Any format code)
+ | otherwise
+ = pprPanic "getRegister': no support for (nonzero) vector literals" $
+ vcat [ text "lit:" <+> ppr lit ]
+ -- SIMD NCG TODO: can we do better here?
standardRegister ctype
= do
let format = cmmTypeFormat ctype
@@ -3519,10 +3528,7 @@ genCCall64 addr conv dest_regs args = do
-- If we are calling a varargs function
-- then we need to define ireg as well
-- as freg
- CVTTSD2SIQ II64 (OpReg freg) ireg)
- -- SLD TODO: I changed this from MOV FF64 (OpReg freg) (OpReg ireg)
- -- to CVTTSD2SIQ ...
- -- because it is going between two different types of register
+ MOVD FF64 (OpReg freg) (OpReg ireg))
| otherwise = do
arg_code <- getAnyReg arg
load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
@@ -3538,10 +3544,11 @@ genCCall64 addr conv dest_regs args = do
(arg_reg, arg_code) <- getSomeReg arg
delta <- getDeltaNat
setDeltaNat (delta-arg_size)
- let code' = code `appOL` arg_code `appOL` toOL [
+ let fmt = floatFormat width
+ code' = code `appOL` arg_code `appOL` toOL [
SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
DELTA (delta-arg_size),
- MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel platform 0))]
+ MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
push_args rest code'
| otherwise = do
@@ -3650,17 +3657,13 @@ genCCall64 addr conv dest_regs args = do
-- assign the results, if necessary
assign_code [] = nilOL
assign_code [dest] =
- case typeWidth rep of
- W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
- (OpReg xmm0)
- (OpReg r_dest))
- W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
- (OpReg xmm0)
- (OpReg r_dest))
- _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+ unitOL $
+ movInstr config fmt (OpReg reg) (OpReg r_dest)
where
- rep = localRegType dest
- r_dest = getRegisterReg platform (CmmLocal dest)
+ reg = if isIntFormat fmt then rax else xmm0
+ fmt = cmmTypeFormat rep
+ rep = localRegType dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
assign_code _many = panic "genForeignCall.assign_code many"
return (adjust_rsp `appOL`
@@ -3769,9 +3772,10 @@ genSwitch expr targets = do
tableReg <- getNewRegNat (intFormat (platformWordWidth platform))
targetReg <- getNewRegNat (intFormat (platformWordWidth platform))
let op = OpAddr (AddrBaseIndex (EABaseReg tableReg) (EAIndex reg (platformWordSizeInBytes platform)) (ImmInt 0))
+ fmt = archWordFormat is32bit
code = e_code `appOL` toOL
- [ LEA (archWordFormat is32bit) (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
- , MOV (archWordFormat is32bit) op (OpReg targetReg)
+ [ LEA fmt (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
+ , MOV fmt op (OpReg targetReg)
, JMP_TBL (OpReg targetReg) ids (Section ReadOnlyData lbl) lbl
]
return code
@@ -4432,10 +4436,10 @@ genCtzGeneric width dst src = do
src_r <- getNewRegNat format
tmp_r <- getNewRegNat format
let instrs = code_src src_r `appOL` toOL
- ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
- [ BSF format (OpReg src_r) tmp_r
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSF format (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
- , CMOV NE format (OpReg tmp_r) dst_r
+ , CMOV NE format (OpReg tmp_r) dst_r
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
@@ -4509,11 +4513,11 @@ genMemCpyInlineMaybe align dst src n = do
go dst src tmp (i - 4)
| i >= 2 =
unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
- unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
go dst src tmp (i - 2)
| i >= 1 =
unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
- unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
go dst src tmp (i - 1)
| otherwise = nilOL
where
@@ -4789,8 +4793,8 @@ genPext bid width dst src mask = do
toOL
[ MOVZxL format (OpReg src_r ) (OpReg src_r )
, MOVZxL format (OpReg mask_r) (OpReg mask_r)
- , PEXT II32 (OpReg mask_r) (OpReg src_r ) dst_r
- , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+ , PEXT II32 (OpReg mask_r) (OpReg src_r ) dst_r
+ , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
]
else
unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
@@ -4832,11 +4836,11 @@ genClz bid width dst src = do
src_r <- getNewRegNat format
tmp_r <- getNewRegNat format
return $ code_src src_r `appOL` toOL
- ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
- [ BSR format (OpReg src_r) tmp_r
- , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
- , CMOV NE format (OpReg tmp_r) dst_r
- , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSR format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
@@ -4848,7 +4852,8 @@ genWordToFloat bid width dst src =
genAtomicRead :: Width -> MemoryOrdering -> LocalReg -> CmmExpr -> NatM InstrBlock
genAtomicRead width _mord dst addr = do
- load_code <- intLoadCode (MOV (intFormat width)) addr
+ let fmt = intFormat width
+ load_code <- intLoadCode (MOV fmt) addr
return (load_code (getLocalRegReg dst))
genAtomicWrite :: Width -> MemoryOrdering -> CmmExpr -> CmmExpr -> NatM InstrBlock
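Ignoring the ordering for atomic reads is sound on x86: under TSO, an aligned MOV load already has acquire semantics, and sequentially consistent accesses pay their fencing cost on the store side. A sketch of the resulting mapping (hypothetical helper, assuming x86-TSO):

    atomicReadInstrFor :: Width -> MemoryOrdering -> (Operand -> Operand -> Instr)
    atomicReadInstrFor w _mord = MOV (intFormat w)   -- plain load for every read ordering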
=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -32,6 +32,7 @@ module GHC.CmmToAsm.X86.Instr
, mkStackDeallocInstr
, mkSpillInstr
, mkRegRegMoveInstr
+ , movInstr
, jumpDestsOfInstr
, canFallthroughTo
, patchRegsOfInstr
@@ -201,20 +202,18 @@ data Instr
-- This carries a BlockId so it can be used in unwinding information.
| DELTA Int
- -- Moves.
- | MOV Format Operand Operand
- -- ^ N.B. Due to AT&T assembler quirks, when used with 'II64'
+ -- | X86 scalar move instruction.
+ --
+ -- When used at a vector format, only moves the lower 64 bits of data;
+ -- the rest of the data in the destination may either be zeroed or
+ -- preserved, depending on the specific format.
+ | MOV Format Operand Operand
+ -- N.B. Due to AT&T assembler quirks, when used with 'II64'
-- 'Format' immediate source and memory target operand, the source
-- operand is interpreted to be a 32-bit sign-extended value.
-- True 64-bit operands need to be either first moved to a register or moved
-- with @MOVABS@; we currently do not use this instruction in GHC.
-- See https://stackoverflow.com/questions/52434073/whats-the-difference-between-the-x86-64-att-instructions-movq-and-movabsq.
- | MOV2 Format Format Operand Operand
- -- ^ Like MOV, but between two different kinds of registers
- -- (e.g. moving rax to xmm1)
- --
- -- SIMD NCG TODO: this is a bit of a hack, but the alternative would
- -- be to have MOV store two Formats to handle xmm -> rax and rax -> xmm.
| MOVD Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
-- (bitcast between a general purpose
@@ -378,12 +377,20 @@ data Instr
| INSERTPS Format Imm Operand Reg
-- move operations
- | VMOVU Format Operand Operand
+
+ -- | SSE2 unaligned move of floating-point vectors
| MOVU Format Operand Operand
+ -- | AVX unaligned move of floating-point vectors
+ | VMOVU Format Operand Operand
+ -- | SSE2 move between memory and low-part of an xmm register
| MOVL Format Operand Operand
+ -- | SSE move between memory and high-part of an xmm register
| MOVH Format Operand Operand
+ -- | SSE aligned move of floating-point vectors
| MOVA Format Operand Operand
+ -- | SSE2 unaligned move of integer vectors
| MOVDQU Format Operand Operand
+ -- | AVX unaligned move of integer vectors
| VMOVDQU Format Operand Operand
-- logic operations
@@ -403,7 +410,9 @@ data Instr
| SHUFPD Format Imm Operand Reg
| VSHUFPD Format Imm Operand Reg Reg
- | MOVHLPS Format Operand Reg
+ -- | Move two 32-bit floats from the high part of an xmm register
+ -- to the low part of another xmm register.
+ | MOVHLPS Format Reg Reg
| PUNPCKLQDQ Format Operand Reg
-- Shift
@@ -426,9 +435,21 @@ data FMAPermutation = FMA132 | FMA213 | FMA231
regUsageOfInstr :: Platform -> Instr -> RegUsage
regUsageOfInstr platform instr
= case instr of
- MOV fmt src dst -> usageRW fmt src dst
- MOV2 srcFmt dstFmt src dst -> mkRU (use_R srcFmt src []) (use_R dstFmt dst [])
- MOVD fmt src dst -> mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
+ MOV fmt src dst
+ -- MOVSS/MOVSD preserve the upper half of vector registers,
+ -- but only for reg-2-reg moves
+ | VecFormat _ sFmt <- fmt
+ , isFloatScalarFormat sFmt
+ , OpReg {} <- src
+ , OpReg {} <- dst
+ -> usageRM fmt src dst
+ -- other MOV instructions zero any remaining upper part of the destination
+ -- (largely to avoid partial register stalls)
+ | otherwise
+ -> usageRW fmt src dst
+ MOVD fmt src dst ->
+ -- NB: MOVD/MOVQ always zero any remaining upper part of destination
+ mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
CMOV _ fmt src dst -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
MOVZxL fmt src dst -> usageRW fmt src dst
MOVSxL fmt src dst -> usageRW fmt src dst
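The usageRM case exists because a reg-to-reg MOVSS/MOVSD writes only the low lane and keeps the rest of the destination, so the old destination value is genuinely read. A hypothetical lane model of reg-to-reg MOVSS:

    -- Lanes 1..3 of the result come from the old dst, so dst is a use too.
    movssRR :: (Float, Float, Float, Float)   -- src
            -> (Float, Float, Float, Float)   -- old dst (read!)
            -> (Float, Float, Float, Float)
    movssRR (s0, _, _, _) (_, d1, d2, d3) = (s0, d1, d2, d3)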
@@ -566,7 +587,7 @@ regUsageOfInstr platform instr
PSLLDQ fmt off dst -> mkRU (use_R fmt off []) [mk fmt dst]
MOVHLPS fmt src dst
- -> mkRU (use_R fmt src []) [mk fmt dst]
+ -> mkRU [mk fmt src] [mk fmt dst]
PUNPCKLQDQ fmt src dst
-> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
@@ -680,7 +701,6 @@ patchRegsOfInstr :: HasDebugCallStack => Platform -> Instr -> (Reg -> Reg) -> In
patchRegsOfInstr platform instr env
= case instr of
MOV fmt src dst -> MOV fmt (patchOp src) (patchOp dst)
- MOV2 srcFmt dstFmt src dst -> MOV2 srcFmt dstFmt (patchOp src) (patchOp dst)
MOVD fmt src dst -> patch2 (MOVD fmt) src dst
CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst)
MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst
@@ -805,7 +825,7 @@ patchRegsOfInstr platform instr env
-> PSRLDQ fmt (patchOp off) (env dst)
MOVHLPS fmt src dst
- -> MOVHLPS fmt (patchOp src) (env dst)
+ -> MOVHLPS fmt (env src) (env dst)
PUNPCKLQDQ fmt src dst
-> PUNPCKLQDQ fmt (patchOp src) (env dst)
@@ -899,8 +919,14 @@ mkSpillInstr
-> [Instr]
mkSpillInstr config (RegFormat reg fmt) delta slot =
- [ movInstr config fmt (OpReg reg) (OpAddr (spRel platform off)) ]
+ [ movInstr config fmt' (OpReg reg) (OpAddr (spRel platform off)) ]
where
+ fmt'
+ | isVecFormat fmt
+ = fmt
+ | otherwise
+ = scalarMoveFormat platform fmt
+ -- Spill the platform word size, at a minimum
platform = ncgPlatform config
off = spillSlotToOffset platform slot - delta
@@ -914,16 +940,27 @@ mkLoadInstr
-> [Instr]
mkLoadInstr config (RegFormat reg fmt) delta slot =
- [ movInstr config fmt (OpAddr (spRel platform off)) (OpReg reg) ]
+ [ movInstr config fmt' (OpAddr (spRel platform off)) (OpReg reg) ]
where
+ fmt'
+ | isVecFormat fmt
+ = fmt
+ | otherwise
+ = scalarMoveFormat platform fmt
+ -- Load the platform word size, at a minimum
platform = ncgPlatform config
off = spillSlotToOffset platform slot - delta
+-- | A move instruction for moving the entire contents of an operand
+-- at the given 'Format'.
movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
movInstr config fmt =
case fmt of
-- NB: we are using {V}MOVU and not {V}MOVA, because we have no guarantees
-- about the stack being sufficiently aligned (even for even numbered stack slots).
+ --
+ -- (Ben Gamari told me that using MOVA instead of MOVU does not make a
+ -- difference in practice when moving between registers.)
VecFormat {}
| formatInBytes fmt > 16
-> if avx
@@ -936,9 +973,8 @@ movInstr config fmt =
-> MOVU fmt
| otherwise
-> sorry "128-bit wide vectors require either -msse2 or -mavx"
- _ -> MOV (scalarMoveFormat platform fmt)
+ _ -> MOV fmt
where
- platform = ncgPlatform config
avx = ncgAvxEnabled config
sse = ncgSseVersion config
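With this change, movInstr dispatches purely on the requested format: vector formats get a whole-register SSE/AVX move, and anything else a plain MOV at exactly that format. Hypothetical examples (not part of the patch):

    moveExamples :: NCGConfig -> [Operand -> Operand -> Instr]
    moveExamples config =
      [ movInstr config (VecFormat 4 FmtFloat)   -- 128-bit: MOVU (SSE2) or VMOVU (AVX)
      , movInstr config (VecFormat 8 FmtFloat)   -- 256-bit: VMOVU, requires -mavx
      , movInstr config II64                     -- scalar: plain MOV II64
      ]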
@@ -1002,11 +1038,18 @@ mkRegRegMoveInstr
mkRegRegMoveInstr config fmt src dst =
assertPpr (targetClassOfReg platform src == targetClassOfReg platform dst)
(vcat [ text "mkRegRegMoveInstr: incompatible register classes"
+ , text "fmt:" <+> ppr fmt
, text "src:" <+> ppr src
, text "dst:" <+> ppr dst ]) $
- movInstr config fmt (OpReg src) (OpReg dst)
+ movInstr config fmt' (OpReg src) (OpReg dst)
+ -- Move the platform word size, at a minimum
where
platform = ncgPlatform config
+ fmt'
+ | isVecFormat fmt
+ = fmt
+ | otherwise
+ = scalarMoveFormat platform fmt
scalarMoveFormat :: Platform -> Format -> Format
scalarMoveFormat platform fmt
@@ -1028,32 +1071,40 @@ takeRegRegMoveInstr
takeRegRegMoveInstr platform = \case
MOV fmt (OpReg r1) (OpReg r2)
- -- MOV zeroes the upper part of vector registers,
- -- so it is not a real "move" in that case.
- | not (isVecFormat fmt)
- -> go r1 r2
- MOVD _ (OpReg r1) (OpReg r2)
- -> go r1 r2
+ -- When used with vector registers, MOV only deals with the lower part,
+ -- so it is not a real move. For example, MOVSS/MOVSD between xmm registers
+ -- preserves the upper half, and MOVQ between xmm registers zeroes the upper half.
+ | not $ isVecFormat fmt
+ -- Don't eliminate a move between e.g. RAX and XMM:
+ -- even though we might be using XMM to store a scalar integer value,
+ -- some instructions only support XMM registers.
+ , targetClassOfReg platform r1 == targetClassOfReg platform r2
+ -> Just (r1, r2)
+ MOVD {}
+ -- MOVD moves between xmm registers and general-purpose registers,
+ -- and we don't want to eliminate those moves (as noted for MOV).
+ -> Nothing
+
+ -- SSE2/AVX move instructions always move the full register.
MOVA _ (OpReg r1) (OpReg r2)
- -> go r1 r2
+ -> Just (r1, r2)
MOVU _ (OpReg r1) (OpReg r2)
- -> go r1 r2
+ -> Just (r1, r2)
VMOVU _ (OpReg r1) (OpReg r2)
- -> go r1 r2
+ -> Just (r1, r2)
MOVDQU _ (OpReg r1) (OpReg r2)
- -> go r1 r2
+ -> Just (r1, r2)
VMOVDQU _ (OpReg r1) (OpReg r2)
- -> go r1 r2
+ -> Just (r1, r2)
+
+ -- MOVL, MOVH and MOVHLPS preserve some part of the destination register,
+ -- so are not simple moves.
+ MOVL {} -> Nothing
+ MOVH {} -> Nothing
+ MOVHLPS {} -> Nothing
+
+ -- Other instructions are not moves.
_ -> Nothing
- where
- go r1 r2
- -- Don't eliminate a move between e.g. RAX and XMM:
- -- even though we might be using XMM to store a scalar integer value,
- -- some instructions only support XMM registers.
- | targetClassOfReg platform r1 == targetClassOfReg platform r2
- = Just (r1, r2)
- | otherwise
- = Nothing
-- | Make an unconditional branch instruction.
mkJumpInstr
=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -619,15 +619,12 @@ pprInstr platform i = case i of
II64 -> II32 -- 32-bit version is equivalent, and smaller
_ -> format
- MOV format src dst
- -> pprFormatOpOp (text "mov") format src dst
-
- MOV2 srcFmt dstFmt src dst
- -> pprFormatOpOp (text "mov") fmt src dst
- where
- fmt = if formatInBytes srcFmt <= formatInBytes dstFmt
- then srcFmt
- else dstFmt
+ MOV fmt src dst
+ -> pprFormatOpOp (text "mov") fmt' src dst
+ where
+ fmt' = case fmt of
+ VecFormat _l sFmt -> scalarFormatFormat sFmt
+ _ -> fmt
CMOV cc format src dst
-> pprCondOpReg (text "cmov") format cc src dst
@@ -1004,7 +1001,7 @@ pprInstr platform i = case i of
-> pprDoubleShift (text "psrldq") format offset dst
MOVHLPS format from to
- -> pprOpReg (text "movhlps") format from to
+ -> pprOpReg (text "movhlps") format (OpReg from) to
PUNPCKLQDQ format from to
-> pprOpReg (text "punpcklqdq") format from to
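With fmt' computed via scalarFormatFormat, a vector-format MOV now prints as the corresponding scalar mnemonic. Illustrative check (hypothetical, relying on the usual ss/sd/q format suffixes):

    movMnemonicFormats :: [Format]
    movMnemonicFormats = map scalarFormatFormat [FmtFloat, FmtDouble, FmtInt64]
    -- == [FF32, FF64, II64], printed as movss, movsd and movq respectively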
=====================================
compiler/GHC/CmmToC.hs
=====================================
@@ -820,9 +820,6 @@ pprMachOp_for_C platform mop = case mop of
MO_SF_Round _from to -> parens (machRep_F_CType to)
MO_FS_Truncate _from to -> parens (machRep_S_CType platform to)
- MO_V_Shuffle {} -> text "__builtin_shufflevector"
- MO_VF_Shuffle {} -> text "__builtin_shufflevector"
-
MO_RelaxedRead _ -> pprTrace "offending mop:"
(text "MO_RelaxedRead")
(panic $ "PprC.pprMachOp_for_C: MO_S_MulMayOflo"
@@ -833,88 +830,93 @@ pprMachOp_for_C platform mop = case mop of
(panic $ "PprC.pprMachOp_for_C: MO_S_MulMayOflo"
++ " should have been handled earlier!")
+ MO_AlignmentCheck {} -> panic "-falignment-sanitisation not supported by unregisterised backend"
+
+ -- SIMD vector instructions: currently unsupported
+ MO_V_Shuffle {} -> pprTrace "offending mop:"
+ (text "MO_V_Shuffle")
+ (panic $ "PprC.pprMachOp_for_C: MO_V_Shuffle"
+ ++ "unsupported by the unregisterised backend")
+ MO_VF_Shuffle {} -> pprTrace "offending mop:"
+ (text "MO_VF_Shuffle")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Shuffle"
+ ++ "unsupported by the unregisterised backend")
MO_V_Insert {} -> pprTrace "offending mop:"
(text "MO_V_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_V_Insert"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_V_Extract {} -> pprTrace "offending mop:"
(text "MO_V_Extract")
(panic $ "PprC.pprMachOp_for_C: MO_V_Extract"
- ++ " should have been handled earlier!")
-
+ ++ "unsupported by the unregisterised backend")
MO_V_Add {} -> pprTrace "offending mop:"
(text "MO_V_Add")
(panic $ "PprC.pprMachOp_for_C: MO_V_Add"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_V_Sub {} -> pprTrace "offending mop:"
(text "MO_V_Sub")
(panic $ "PprC.pprMachOp_for_C: MO_V_Sub"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_V_Mul {} -> pprTrace "offending mop:"
(text "MO_V_Mul")
(panic $ "PprC.pprMachOp_for_C: MO_V_Mul"
- ++ " should have been handled earlier!")
-
+ ++ "unsupported by the unregisterised backend")
MO_VS_Quot {} -> pprTrace "offending mop:"
(text "MO_VS_Quot")
(panic $ "PprC.pprMachOp_for_C: MO_VS_Quot"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VS_Rem {} -> pprTrace "offending mop:"
(text "MO_VS_Rem")
(panic $ "PprC.pprMachOp_for_C: MO_VS_Rem"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VS_Neg {} -> pprTrace "offending mop:"
(text "MO_VS_Neg")
(panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
- ++ " should have been handled earlier!")
-
+ ++ "unsupported by the unregisterised backend")
MO_VU_Quot {} -> pprTrace "offending mop:"
(text "MO_VU_Quot")
(panic $ "PprC.pprMachOp_for_C: MO_VU_Quot"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VU_Rem {} -> pprTrace "offending mop:"
(text "MO_VU_Rem")
(panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_V_Broadcast {} -> pprTrace "offending mop:"
(text "MO_V_Broadcast")
(panic $ "PprC.pprMachOp_for_C: MO_V_Broadcast"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Broadcast {} -> pprTrace "offending mop:"
(text "MO_VF_Broadcast")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Broadcast"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Insert {} -> pprTrace "offending mop:"
(text "MO_VF_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Extract {} -> pprTrace "offending mop:"
(text "MO_VF_Extract")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Extract"
- ++ " should have been handled earlier!")
-
+ ++ "unsupported by the unregisterised backend")
MO_VF_Add {} -> pprTrace "offending mop:"
(text "MO_VF_Add")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Add"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Sub {} -> pprTrace "offending mop:"
(text "MO_VF_Sub")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Sub"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Neg {} -> pprTrace "offending mop:"
(text "MO_VF_Neg")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Neg"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Mul {} -> pprTrace "offending mop:"
(text "MO_VF_Mul")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Mul"
- ++ " should have been handled earlier!")
+ ++ "unsupported by the unregisterised backend")
MO_VF_Quot {} -> pprTrace "offending mop:"
(text "MO_VF_Quot")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Quot"
- ++ " should have been handled earlier!")
-
- MO_AlignmentCheck {} -> panic "-falignment-sanitisation not supported by unregisterised backend"
+ ++ "unsupported by the unregisterised backend")
signedOp :: MachOp -> Bool -- Argument type(s) are signed ints
signedOp (MO_S_Quot _) = True
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/6fe8b90c2d137b60387af1d18169a7c8c1bddbcc...125fb19cec33bdd5bb0de9117222b6dbac665830