[Git][ghc/ghc][wip/ncg-simd] Add min/max primops
sheaf (@sheaf)
gitlab at gitlab.haskell.org
Tue Jul 30 10:06:07 UTC 2024
sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC
Commits:
63d2a5d4 by sheaf at 2024-07-30T12:05:37+02:00
Add min/max primops
This commit adds min/max primops, such as
minDouble# :: Double# -> Double# -> Double#
minFloatX4# :: FloatX4# -> FloatX4# -> FloatX4#
minWord16X8# :: Word16X8# -> Word16X8# -> Word16X8#
These are supported in:
- the X86, AArch64 and PowerPC NCGs,
- the LLVM backend,
- the WebAssembly and JavaScript backends.
Fixes #25120
- - - - -
26 changed files:
- compiler/GHC/Builtin/primops.txt.pp
- compiler/GHC/Cmm/MachOp.hs
- compiler/GHC/Cmm/Parser.y
- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/AArch64/Instr.hs
- compiler/GHC/CmmToAsm/AArch64/Ppr.hs
- compiler/GHC/CmmToAsm/PPC/CodeGen.hs
- compiler/GHC/CmmToAsm/PPC/Instr.hs
- compiler/GHC/CmmToAsm/PPC/Ppr.hs
- compiler/GHC/CmmToAsm/Wasm/Asm.hs
- compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
- compiler/GHC/CmmToAsm/Wasm/Types.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToC.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/JS/Make.hs
- compiler/GHC/Llvm/Types.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/StgToJS/Prim.hs
- docs/users_guide/9.12.1-notes.rst
- testsuite/tests/simd/should_run/all.T
- testsuite/tests/simd/should_run/simd006.hs
- + testsuite/tests/simd/should_run/simd012.hs
- + testsuite/tests/simd/should_run/simd012.stdout
Changes:
=====================================
compiler/GHC/Builtin/primops.txt.pp
=====================================
@@ -1093,6 +1093,14 @@ primop DoubleLtOp "<##" Compare Double# -> Double# -> Int#
primop DoubleLeOp "<=##" Compare Double# -> Double# -> Int#
with fixity = infix 4
+primop DoubleMinOp "minDouble#" GenPrimOp
+ Double# -> Double# -> Double#
+ with commutable = True
+
+primop DoubleMaxOp "maxDouble#" GenPrimOp
+ Double# -> Double# -> Double#
+ with commutable = True
+
primop DoubleAddOp "+##" GenPrimOp
Double# -> Double# -> Double#
with commutable = True
@@ -1259,6 +1267,14 @@ primop FloatNeOp "neFloat#" Compare
primop FloatLtOp "ltFloat#" Compare Float# -> Float# -> Int#
primop FloatLeOp "leFloat#" Compare Float# -> Float# -> Int#
+primop FloatMinOp "minFloat#" GenPrimOp
+ Float# -> Float# -> Float#
+ with commutable = True
+
+primop FloatMaxOp "maxFloat#" GenPrimOp
+ Float# -> Float# -> Float#
+ with commutable = True
+
primop FloatAddOp "plusFloat#" GenPrimOp
Float# -> Float# -> Float#
with commutable = True
@@ -4202,6 +4218,18 @@ primop VecShuffleOp "shuffle#" GenPrimOp
into the result vector.}
with vector = ALL_VECTOR_TYPES
+primop VecMinOp "min#" GenPrimOp
+ VECTOR -> VECTOR -> VECTOR
+ {Component-wise minimum of two vectors.}
+ with
+ vector = ALL_VECTOR_TYPES
+
+primop VecMaxOp "max#" GenPrimOp
+ VECTOR -> VECTOR -> VECTOR
+ {Component-wise maximum of two vectors.}
+ with
+ vector = ALL_VECTOR_TYPES
+
------------------------------------------------------------------------
section "Prefetch"
=====================================
compiler/GHC/Cmm/MachOp.hs
=====================================
@@ -126,6 +126,9 @@ data MachOp
| MO_F_Gt Width
| MO_F_Lt Width
+ | MO_F_Min Width
+ | MO_F_Max Width
+
-- Bitwise operations. Not all of these may be supported
-- at all sizes, and only integral Widths are valid.
| MO_And Width
@@ -192,6 +195,14 @@ data MachOp
| MO_VF_Mul Length Width
| MO_VF_Quot Length Width
+ -- Min/max operations
+ | MO_VS_Min Length Width
+ | MO_VS_Max Length Width
+ | MO_VU_Min Length Width
+ | MO_VU_Max Length Width
+ | MO_VF_Min Length Width
+ | MO_VF_Max Length Width
+
-- | An atomic read with no memory ordering. Address msut
-- be naturally aligned.
| MO_RelaxedRead Width
@@ -322,6 +333,8 @@ isCommutableMachOp mop =
MO_Xor _ -> True
MO_F_Add _ -> True
MO_F_Mul _ -> True
+ MO_F_Min {} -> True
+ MO_F_Max {} -> True
_other -> False
-- ----------------------------------------------------------------------------
@@ -464,6 +477,8 @@ machOpResultType platform mop tys =
MO_F_Mul r -> cmmFloat r
MO_F_Quot r -> cmmFloat r
MO_F_Neg r -> cmmFloat r
+ MO_F_Min r -> cmmFloat r
+ MO_F_Max r -> cmmFloat r
MO_FMA _ l r -> if l == 1 then cmmFloat r else cmmVec l (cmmFloat r)
@@ -502,9 +517,13 @@ machOpResultType platform mop tys =
MO_VS_Quot l w -> cmmVec l (cmmBits w)
MO_VS_Rem l w -> cmmVec l (cmmBits w)
MO_VS_Neg l w -> cmmVec l (cmmBits w)
+ MO_VS_Min l w -> cmmVec l (cmmBits w)
+ MO_VS_Max l w -> cmmVec l (cmmBits w)
MO_VU_Quot l w -> cmmVec l (cmmBits w)
MO_VU_Rem l w -> cmmVec l (cmmBits w)
+ MO_VU_Min l w -> cmmVec l (cmmBits w)
+ MO_VU_Max l w -> cmmVec l (cmmBits w)
MO_V_Shuffle l w _ -> cmmVec l (cmmBits w)
MO_VF_Shuffle l w _ -> cmmVec l (cmmFloat w)
@@ -518,6 +537,8 @@ machOpResultType platform mop tys =
MO_VF_Mul l w -> cmmVec l (cmmFloat w)
MO_VF_Quot l w -> cmmVec l (cmmFloat w)
MO_VF_Neg l w -> cmmVec l (cmmFloat w)
+ MO_VF_Min l w -> cmmVec l (cmmFloat w)
+ MO_VF_Max l w -> cmmVec l (cmmFloat w)
MO_RelaxedRead r -> cmmBits r
MO_AlignmentCheck _ _ -> ty1
@@ -566,6 +587,8 @@ machOpArgReps platform op =
MO_F_Mul r -> [r,r]
MO_F_Quot r -> [r,r]
MO_F_Neg r -> [r]
+ MO_F_Min r -> [r,r]
+ MO_F_Max r -> [r,r]
MO_FMA _ l r -> [vecwidth l r, vecwidth l r, vecwidth l r]
@@ -611,9 +634,13 @@ machOpArgReps platform op =
MO_VS_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VS_Rem l w -> [vecwidth l w, vecwidth l w]
MO_VS_Neg l w -> [vecwidth l w]
+ MO_VS_Min l w -> [vecwidth l w, vecwidth l w]
+ MO_VS_Max l w -> [vecwidth l w, vecwidth l w]
MO_VU_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VU_Rem l w -> [vecwidth l w, vecwidth l w]
+ MO_VU_Min l w -> [vecwidth l w, vecwidth l w]
+ MO_VU_Max l w -> [vecwidth l w, vecwidth l w]
-- NOTE: The below is owing to the fact that floats use the SSE registers
MO_VF_Add l w -> [vecwidth l w, vecwidth l w]
@@ -621,6 +648,8 @@ machOpArgReps platform op =
MO_VF_Mul l w -> [vecwidth l w, vecwidth l w]
MO_VF_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VF_Neg l w -> [vecwidth l w]
+ MO_VF_Min l w -> [vecwidth l w, vecwidth l w]
+ MO_VF_Max l w -> [vecwidth l w, vecwidth l w]
MO_RelaxedRead _ -> [wordWidth platform]
MO_AlignmentCheck _ r -> [r]
=====================================
compiler/GHC/Cmm/Parser.y
=====================================
@@ -1050,6 +1050,8 @@ machOps = listToUFM $
( "fneg", MO_F_Neg ),
( "fmul", MO_F_Mul ),
( "fquot", MO_F_Quot ),
+ ( "fmin", MO_F_Min ),
+ ( "fmax", MO_F_Max ),
( "fmadd" , MO_FMA FMAdd 1 ),
( "fmsub" , MO_FMA FMSub 1 ),
=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -812,6 +812,15 @@ getRegister' config plat expr
MO_Add {} -> notUnary
MO_Sub {} -> notUnary
+ MO_F_Min {} -> notUnary
+ MO_F_Max {} -> notUnary
+ MO_VU_Min {} -> notUnary
+ MO_VU_Max {} -> notUnary
+ MO_VS_Min {} -> notUnary
+ MO_VS_Max {} -> notUnary
+ MO_VF_Min {} -> notUnary
+ MO_VF_Max {} -> notUnary
+
MO_AlignmentCheck {} ->
pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
@@ -1126,6 +1135,8 @@ getRegister' config plat expr
MO_F_Sub w -> floatOp w (\d x y -> unitOL $ SUB d x y)
MO_F_Mul w -> floatOp w (\d x y -> unitOL $ MUL d x y)
MO_F_Quot w -> floatOp w (\d x y -> unitOL $ SDIV d x y)
+ MO_F_Min w -> floatOp w (\d x y -> unitOL $ FMIN d x y)
+ MO_F_Max w -> floatOp w (\d x y -> unitOL $ FMAX d x y)
-- Floating point comparison
MO_F_Eq w -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
@@ -1187,6 +1198,12 @@ getRegister' config plat expr
MO_VF_Quot {} -> vectorsNeedLlvm
MO_V_Shuffle {} -> vectorsNeedLlvm
MO_VF_Shuffle {} -> vectorsNeedLlvm
+ MO_VU_Min {} -> vectorsNeedLlvm
+ MO_VU_Max {} -> vectorsNeedLlvm
+ MO_VS_Min {} -> vectorsNeedLlvm
+ MO_VS_Max {} -> vectorsNeedLlvm
+ MO_VF_Min {} -> vectorsNeedLlvm
+ MO_VF_Max {} -> vectorsNeedLlvm
where
notDyadic =
pprPanic "getRegister' (non-dyadic CmmMachOp with 2 arguments): " $
=====================================
compiler/GHC/CmmToAsm/AArch64/Instr.hs
=====================================
@@ -145,6 +145,8 @@ regUsageOfInstr platform instr = case instr of
FCVTZS dst src -> usage (regOp src, regOp dst)
FABS dst src -> usage (regOp src, regOp dst)
FSQRT dst src -> usage (regOp src, regOp dst)
+ FMIN dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
+ FMAX dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
FMA _ dst src1 src2 src3 ->
usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
@@ -295,6 +297,8 @@ patchRegsOfInstr instr env = case instr of
FCVTZS o1 o2 -> FCVTZS (patchOp o1) (patchOp o2)
FABS o1 o2 -> FABS (patchOp o1) (patchOp o2)
FSQRT o1 o2 -> FSQRT (patchOp o1) (patchOp o2)
+ FMIN o1 o2 o3 -> FMIN (patchOp o1) (patchOp o2) (patchOp o3)
+ FMAX o1 o2 o3 -> FMAX (patchOp o1) (patchOp o2) (patchOp o3)
FMA s o1 o2 o3 o4 ->
FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
@@ -667,6 +671,10 @@ data Instr
| FCVTZS Operand Operand
-- Float ABSolute value
| FABS Operand Operand
+ -- Float minimum
+ | FMIN Operand Operand Operand
+ -- Float maximum
+ | FMAX Operand Operand Operand
-- Float SQuare RooT
| FSQRT Operand Operand
@@ -743,6 +751,8 @@ instrCon i =
FCVTZS{} -> "FCVTZS"
FABS{} -> "FABS"
FSQRT{} -> "FSQRT"
+ FMIN {} -> "FMIN"
+ FMAX {} -> "FMAX"
FMA variant _ _ _ _ ->
case variant of
FMAdd -> "FMADD"
=====================================
compiler/GHC/CmmToAsm/AArch64/Ppr.hs
=====================================
@@ -534,6 +534,8 @@ pprInstr platform instr = case instr of
FCVTZS o1 o2 -> op2 (text "\tfcvtzs") o1 o2
FABS o1 o2 -> op2 (text "\tfabs") o1 o2
FSQRT o1 o2 -> op2 (text "\tfsqrt") o1 o2
+ FMIN o1 o2 o3 -> op3 (text "\tfmin") o1 o2 o3
+ FMAX o1 o2 o3 -> op3 (text "\tfmax") o1 o2 o3
FMA variant d r1 r2 r3 ->
let fma = case variant of
FMAdd -> text "\tfmadd"
=====================================
compiler/GHC/CmmToAsm/PPC/CodeGen.hs
=====================================
@@ -589,6 +589,8 @@ getRegister' _ _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
MO_F_Sub w -> triv_float w FSUB
MO_F_Mul w -> triv_float w FMUL
MO_F_Quot w -> triv_float w FDIV
+ MO_F_Min w -> triv_float w FMIN
+ MO_F_Max w -> triv_float w FMAX
-- optimize addition with 32-bit immediate
-- (needed for PIC)
@@ -671,6 +673,12 @@ getRegister' _ _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
MO_VF_Quot {} -> vectorsNeedLlvm
MO_V_Shuffle {} -> vectorsNeedLlvm
MO_VF_Shuffle {} -> vectorsNeedLlvm
+ MO_VU_Min {} -> vectorsNeedLlvm
+ MO_VU_Max {} -> vectorsNeedLlvm
+ MO_VS_Min {} -> vectorsNeedLlvm
+ MO_VS_Max {} -> vectorsNeedLlvm
+ MO_VF_Min {} -> vectorsNeedLlvm
+ MO_VF_Max {} -> vectorsNeedLlvm
_ -> panic "PPC.CodeGen.getRegister: no match"
=====================================
compiler/GHC/CmmToAsm/PPC/Instr.hs
=====================================
@@ -279,6 +279,8 @@ data Instr
| FDIV Format Reg Reg Reg
| FABS Reg Reg -- abs is the same for single and double
| FNEG Reg Reg -- negate is the same for single and double prec.
+ | FMIN Format Reg Reg Reg
+ | FMAX Format Reg Reg Reg
-- | Fused multiply-add instructions.
--
=====================================
compiler/GHC/CmmToAsm/PPC/Ppr.hs
=====================================
@@ -941,6 +941,12 @@ pprInstr platform instr = case instr of
FNEG reg1 reg2
-> pprUnary (text "fneg") reg1 reg2
+ FMIN fmt reg1 reg2 reg3
+ -> pprBinaryF (text "fmin") fmt reg1 reg2 reg3
+
+ FMAX fmt reg1 reg2 reg3
+ -> pprBinaryF (text "fmax") fmt reg1 reg2 reg3
+
FMADD signs fmt dst ra rc rb
-> pprTernaryF (pprFMASign signs) fmt dst ra rc rb
=====================================
compiler/GHC/CmmToAsm/Wasm/Asm.hs
=====================================
@@ -362,6 +362,8 @@ asmTellWasmInstr ty_word instr = case instr of
WasmF64PromoteF32 -> asmTellLine "f64.promote_f32"
WasmAbs ty -> asmTellLine $ asmFromWasmType ty <> ".abs"
WasmNeg ty -> asmTellLine $ asmFromWasmType ty <> ".neg"
+ WasmMin ty -> asmTellLine $ asmFromWasmType ty <> ".min"
+ WasmMax ty -> asmTellLine $ asmFromWasmType ty <> ".max"
WasmCond t -> do
asmTellLine "if"
asmWithTab $ asmTellWasmInstr ty_word t
=====================================
compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
=====================================
@@ -821,6 +821,18 @@ lower_CmmMachOp lbl (MO_F_Lt w0) xs =
lbl
(cmmFloat w0)
xs
+lower_CmmMachOp lbl (MO_F_Min w0) xs =
+ lower_MO_Bin_Homo
+ WasmMin
+ lbl
+ (cmmFloat w0)
+ xs
+lower_CmmMachOp lbl (MO_F_Max w0) xs =
+ lower_MO_Bin_Homo
+ WasmMax
+ lbl
+ (cmmFloat w0)
+ xs
lower_CmmMachOp lbl (MO_And w0) xs =
lower_MO_Bin_Homo
WasmAnd
=====================================
compiler/GHC/CmmToAsm/Wasm/Types.hs
=====================================
@@ -306,6 +306,8 @@ data WasmInstr :: WasmType -> [WasmType] -> [WasmType] -> Type where
WasmF64PromoteF32 :: WasmInstr w ('F32 : pre) ('F64 : pre)
WasmAbs :: WasmTypeTag t -> WasmInstr w (t : pre) (t : pre)
WasmNeg :: WasmTypeTag t -> WasmInstr w (t : pre) (t : pre)
+ WasmMin :: WasmTypeTag t -> WasmInstr w (t : t : pre) (t : pre)
+ WasmMax :: WasmTypeTag t -> WasmInstr w (t : t : pre) (t : pre)
WasmCond :: WasmInstr w pre pre -> WasmInstr w (w : pre) pre
newtype WasmExpr w t = WasmExpr (forall pre. WasmInstr w pre (t : pre))
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -842,7 +842,7 @@ iselExpr64ParallelBin op e1 e2 = do
-- This is a helper data type which helps reduce the code duplication for
-- the code generation of arithmetic operations. This is not specifically
-- targetted for any particular type like Int8, Int32 etc
-data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div
+data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | VA_Min | VA_Max
getRegister :: CmmExpr -> NatM Register
getRegister e = do platform <- getPlatform
@@ -1124,6 +1124,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_F_Le {} -> incorrectOperands
MO_F_Gt {} -> incorrectOperands
MO_F_Lt {} -> incorrectOperands
+ MO_F_Min {} -> incorrectOperands
+ MO_F_Max {} -> incorrectOperands
MO_And {} -> incorrectOperands
MO_Or {} -> incorrectOperands
MO_Xor {} -> incorrectOperands
@@ -1141,6 +1143,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_VU_Rem {} -> incorrectOperands
MO_V_Shuffle {} -> incorrectOperands
MO_VF_Shuffle {} -> incorrectOperands
+ MO_VU_Min {} -> incorrectOperands
+ MO_VU_Max {} -> incorrectOperands
+ MO_VS_Min {} -> incorrectOperands
+ MO_VS_Max {} -> incorrectOperands
+ MO_VF_Min {} -> incorrectOperands
+ MO_VF_Max {} -> incorrectOperands
MO_VF_Extract {} -> incorrectOperands
MO_VF_Add {} -> incorrectOperands
@@ -1338,6 +1346,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_F_Sub w -> trivialFCode_sse2 w SUB x y
MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
MO_F_Mul w -> trivialFCode_sse2 w MUL x y
+ MO_F_Min w -> trivialFCode_sse2 w (MINMAX Min FloatMinMax) x y
+ MO_F_Max w -> trivialFCode_sse2 w (MINMAX Max FloatMinMax) x y
MO_Add rep -> add_code rep x y
MO_Sub rep -> sub_code rep x y
@@ -1394,6 +1404,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_VF_Quot l w | avx -> vector_float_op_avx VA_Div l w x y
| otherwise -> vector_float_op_sse VA_Div l w x y
+ MO_VF_Min l w | avx -> vector_float_op_avx VA_Min l w x y
+ | otherwise -> vector_float_op_sse VA_Min l w x y
+
+ MO_VF_Max l w | avx -> vector_float_op_avx VA_Max l w x y
+ | otherwise -> vector_float_op_sse VA_Max l w x y
+
-- SIMD NCG TODO: integer vector operations
MO_V_Shuffle {} -> needLlvm mop
MO_V_Add {} -> needLlvm mop
@@ -1404,6 +1420,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_VU_Quot {} -> needLlvm mop
MO_VU_Rem {} -> needLlvm mop
+ MO_VU_Min {} -> needLlvm mop
+ MO_VU_Max {} -> needLlvm mop
+ MO_VS_Min {} -> needLlvm mop
+ MO_VS_Max {} -> needLlvm mop
+
-- Unary MachOps
MO_S_Neg {} -> incorrectOperands
MO_F_Neg {} -> incorrectOperands
@@ -1633,6 +1654,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
VA_Sub -> arithInstr VSUB
VA_Mul -> arithInstr VMUL
VA_Div -> arithInstr VDIV
+ VA_Min -> arithInstr (VMINMAX Min FloatMinMax)
+ VA_Max -> arithInstr (VMINMAX Max FloatMinMax)
where
-- opcode src2 src1 dst <==> dst = src1 `opcode` src2
arithInstr instr = exp1 `appOL` exp2 `snocOL`
@@ -1658,6 +1681,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
VA_Sub -> arithInstr SUB
VA_Mul -> arithInstr MUL
VA_Div -> arithInstr FDIV
+ VA_Min -> arithInstr (MINMAX Min FloatMinMax)
+ VA_Max -> arithInstr (MINMAX Max FloatMinMax)
where
-- opcode src2 src1 <==> src1 = src1 `opcode` src2
arithInstr instr
=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -40,6 +40,7 @@ module GHC.CmmToAsm.X86.Instr
, isMetaInstr
, isJumpishInstr
, movdOutFormat
+ , MinOrMax(..), MinMaxType(..)
)
where
@@ -330,8 +331,20 @@ data Instr
| PSLLDQ Format Operand Reg
| PSRLDQ Format Operand Reg
+ -- min/max
+ | MINMAX MinOrMax MinMaxType Format Operand Operand
+ | VMINMAX MinOrMax MinMaxType Format Operand Reg Reg
+
data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
+-- | 'MIN' or 'MAX'
+data MinOrMax = Min | Max
+ deriving ( Eq, Show )
+-- | What kind of min/max operation: signed or unsigned vector integer min/max,
+-- or (scalar or vector) floating point min/max?
+data MinMaxType =
+ IntVecMinMax { minMaxSigned :: Bool } | FloatMinMax
+ deriving ( Eq, Show )
data Operand
= OpReg Reg -- register
@@ -508,6 +521,10 @@ regUsageOfInstr platform instr
PUNPCKLQDQ fmt src dst
-> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
+ MINMAX _ _ fmt src dst
+ -> mkRU (use_R fmt src $ use_R fmt dst []) (use_R fmt dst [])
+ VMINMAX _ _ fmt src1 src2 dst
+ -> mkRU (use_R fmt src1 [mk fmt src2]) [mk fmt dst]
_other -> panic "regUsage: unrecognised instr"
where
-- # Definitions
@@ -748,6 +765,11 @@ patchRegsOfInstr platform instr env
PUNPCKLQDQ fmt src dst
-> PUNPCKLQDQ fmt (patchOp src) (env dst)
+ MINMAX minMax ty fmt src dst
+ -> MINMAX minMax ty fmt (patchOp src) (patchOp dst)
+ VMINMAX minMax ty fmt src1 src2 dst
+ -> VMINMAX minMax ty fmt (patchOp src1) (env src2) (env dst)
+
where
patch1 :: (Operand -> a) -> Operand -> a
patch1 insn op = insn $! patchOp op
=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -43,6 +43,7 @@ import GHC.Types.Unique ( pprUniqueAlways )
import GHC.Utils.Outputable
import GHC.Utils.Panic
+import Data.List ( intersperse )
import Data.Word
-- Note [Subsections Via Symbols]
@@ -1034,6 +1035,11 @@ pprInstr platform i = case i of
PUNPCKLQDQ format from to
-> pprOpReg (text "punpcklqdq") format from to
+ MINMAX minMax ty fmt src dst
+ -> pprMinMax False minMax ty fmt [src, dst]
+ VMINMAX minMax ty fmt src1 src2 dst
+ -> pprMinMax True minMax ty fmt [src1, OpReg src2, OpReg dst]
+
where
gtab :: Line doc
gtab = char '\t'
@@ -1365,3 +1371,14 @@ pprInstr platform i = case i of
comma,
pprReg platform format reg
]
+
+ pprMinMax :: Bool -> MinOrMax -> MinMaxType -> Format -> [Operand] -> doc -- Print one min/max instruction: mnemonic, then its operands.
+ pprMinMax wantV minOrMax mmTy fmt regs
+ = line $ hcat ( instr : intersperse comma ( map ( pprOperand platform fmt ) regs ) ) -- operands are printed in list order, separated by commas
+ where
+ instr = (if wantV then text "v" else empty) -- "v" prefix when wantV is set (the VMINMAX case passes True)
+ <> (case mmTy of { IntVecMinMax {} -> text "p"; FloatMinMax -> empty }) -- "p" prefix for integer vector operations only
+ <> (case minOrMax of { Min -> text "min"; Max -> text "max" })
+ <> (case mmTy of { IntVecMinMax wantSigned -> if wantSigned then text "s" else text "u"; FloatMinMax -> empty }) -- signedness letter for integer vector operations only
+ <> pprFormat fmt -- format suffix; presumably yields mnemonics such as minss or vmaxps, but pprFormat is not shown here (TODO confirm)
+ <> space
=====================================
compiler/GHC/CmmToC.hs
=====================================
@@ -725,6 +725,8 @@ pprMachOp_for_C platform mop = case mop of
MO_F_Neg _ -> char '-'
MO_F_Mul _ -> char '*'
MO_F_Quot _ -> char '/'
+ MO_F_Min _ -> text "fmin"
+ MO_F_Max _ -> text "fmax"
-- Floating-point fused multiply-add operations
MO_FMA FMAdd 1 w ->
@@ -917,6 +919,30 @@ pprMachOp_for_C platform mop = case mop of
(text "MO_VF_Quot")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Quot"
++ "unsupported by the unregisterised backend")
+ MO_VU_Min {} -> pprTrace "offending mop:"
+ (text "MO_VU_Min")
+ (panic $ "PprC.pprMachOp_for_C: MO_VU_Min"
+ ++ "unsupported by the unregisterised backend")
+ MO_VU_Max {} -> pprTrace "offending mop:"
+ (text "MO_VU_Max")
+ (panic $ "PprC.pprMachOp_for_C: MO_VU_Max"
+ ++ "unsupported by the unregisterised backend")
+ MO_VS_Min {} -> pprTrace "offending mop:"
+ (text "MO_VS_Min")
+ (panic $ "PprC.pprMachOp_for_C: MO_VS_Min"
+ ++ "unsupported by the unregisterised backend")
+ MO_VS_Max {} -> pprTrace "offending mop:"
+ (text "MO_VS_Max")
+ (panic $ "PprC.pprMachOp_for_C: MO_VS_Max"
+ ++ "unsupported by the unregisterised backend")
+ MO_VF_Min {} -> pprTrace "offending mop:"
+ (text "MO_VF_Min")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Min"
+ ++ "unsupported by the unregisterised backend")
+ MO_VF_Max {} -> pprTrace "offending mop:"
+ (text "MO_VF_Max")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Max"
+ ++ "unsupported by the unregisterised backend")
signedOp :: MachOp -> Bool -- Argument type(s) are signed ints
signedOp (MO_S_Quot _) = True
=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1493,6 +1493,8 @@ genMachOp _ op [x] = case op of
MO_F_Sub _ -> panicOp
MO_F_Mul _ -> panicOp
MO_F_Quot _ -> panicOp
+ MO_F_Min _ -> panicOp
+ MO_F_Max _ -> panicOp
MO_FMA _ _ _ -> panicOp
@@ -1519,9 +1521,13 @@ genMachOp _ op [x] = case op of
MO_VS_Quot _ _ -> panicOp
MO_VS_Rem _ _ -> panicOp
+ MO_VS_Min _ _ -> panicOp
+ MO_VS_Max _ _ -> panicOp
MO_VU_Quot _ _ -> panicOp
MO_VU_Rem _ _ -> panicOp
+ MO_VU_Min _ _ -> panicOp
+ MO_VU_Max _ _ -> panicOp
MO_VF_Insert _ _ -> panicOp
MO_VF_Extract _ _ -> panicOp
@@ -1533,6 +1539,8 @@ genMachOp _ op [x] = case op of
MO_VF_Sub _ _ -> panicOp
MO_VF_Mul _ _ -> panicOp
MO_VF_Quot _ _ -> panicOp
+ MO_VF_Min _ _ -> panicOp
+ MO_VF_Max _ _ -> panicOp
where
negate ty v2 negOp = do
@@ -1732,6 +1740,16 @@ genMachOp_slow opt op [x, y] = case op of
MO_VF_Neg {} -> panicOp
+ -- Min/max
+ MO_F_Min {} -> genMinMaxOp "minnum" x y
+ MO_F_Max {} -> genMinMaxOp "maxnum" x y
+ MO_VF_Min {} -> genMinMaxOp "minnum" x y
+ MO_VF_Max {} -> genMinMaxOp "maxnum" x y
+ MO_VU_Min {} -> genMinMaxOp "umin" x y
+ MO_VU_Max {} -> genMinMaxOp "umax" x y
+ MO_VS_Min {} -> genMinMaxOp "smin" x y
+ MO_VS_Max {} -> genMinMaxOp "smax" x y
+
MO_RelaxedRead {} -> panicOp
MO_AlignmentCheck {} -> panicOp
@@ -1786,6 +1804,19 @@ genMachOp_slow opt op [x, y] = case op of
genCastBinMach ty op = binCastLlvmOp ty (LlvmOp op)
+ genMinMaxOp intrin x y = runExprData $ do -- Emit a call to the intrinsic "llvm.<intrin>.<short type>" applied to x and y.
+ vx <- exprToVarW x
+ vy <- exprToVarW y
+ let tx = getVarType vx
+ ty = getVarType vy
+ fname = "llvm." ++ intrin ++ "." ++ ppLlvmTypeShort ty -- name is suffixed with the short operand type, e.g. llvm.minnum.v4f32
+ Panic.massertPpr
+ (tx == ty) -- both operands must have the same LLVM type
+ (vcat [ text (fname ++ ": mismatched arg types")
+ , ppLlvmType tx, ppLlvmType ty ])
+ fptr <- liftExprData $ getInstrinct (fsLit fname) ty [tx, ty] -- presumably resolves the intrinsic with result type ty and argument types [tx, ty] (TODO confirm against getInstrinct)
+ doExprW tx $ Call StdCall fptr [vx, vy] [ReadNone, NoUnwind] -- result has the operand type (tx == ty was asserted above)
+
-- Detect if overflow will occur in signed multiply of the two
-- CmmExpr's. This is the LLVM assembly equivalent of the NCG
-- implementation. Its much longer due to type information/safety.
=====================================
compiler/GHC/JS/Make.hs
=====================================
@@ -130,7 +130,8 @@ module GHC.JS.Make
-- $math
, math_log, math_sin, math_cos, math_tan, math_exp, math_acos, math_asin,
math_atan, math_abs, math_pow, math_sqrt, math_asinh, math_acosh, math_atanh,
- math_cosh, math_sinh, math_tanh, math_expm1, math_log1p, math_fround
+ math_cosh, math_sinh, math_tanh, math_expm1, math_log1p, math_fround,
+ math_min, math_max
-- * Statement helpers
, Solo(..)
, decl
@@ -672,7 +673,8 @@ math_ op args = ApplExpr (math .^ op) args
math_log, math_sin, math_cos, math_tan, math_exp, math_acos, math_asin, math_atan,
math_abs, math_pow, math_sqrt, math_asinh, math_acosh, math_atanh, math_sign,
- math_sinh, math_cosh, math_tanh, math_expm1, math_log1p, math_fround
+ math_sinh, math_cosh, math_tanh, math_expm1, math_log1p, math_fround,
+ math_min, math_max
:: [JStgExpr] -> JStgExpr
math_log = math_ "log"
math_sin = math_ "sin"
@@ -695,6 +697,8 @@ math_tanh = math_ "tanh"
math_expm1 = math_ "expm1"
math_log1p = math_ "log1p"
math_fround = math_ "fround"
+math_min = math_ "min"
+math_max = math_ "max"
instance Num JStgExpr where
x + y = InfixExpr AddOp x y
=====================================
compiler/GHC/Llvm/Types.hs
=====================================
@@ -91,6 +91,15 @@ ppLlvmType t = case t of
{-# SPECIALIZE ppLlvmType :: LlvmType -> SDoc #-}
{-# SPECIALIZE ppLlvmType :: LlvmType -> HLine #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
+-- | Render the compact name of a scalar or vector type, e.g. @"i16"@ or @"v4f32"@.
+-- Panics on anything other than an integer, float, double, or a vector of a supported type.
+ppLlvmTypeShort :: LlvmType -> String
+ppLlvmTypeShort (LMInt width) = 'i' : show width
+ppLlvmTypeShort LMFloat = "f32"
+ppLlvmTypeShort LMDouble = "f64"
+ppLlvmTypeShort (LMVector len elt) = "v" ++ show len ++ ppLlvmTypeShort elt
+ppLlvmTypeShort other = pprPanic "ppLlvmTypeShort" (ppLlvmType other)
+
ppParams :: IsLine doc => LlvmParameterListType -> [LlvmParameter] -> doc
ppParams varg p
= let varg' = case varg of
=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1481,6 +1481,9 @@ emitPrimOp cfg primop =
DoubleGtOp -> opTranslate (MO_F_Gt W64)
DoubleLtOp -> opTranslate (MO_F_Lt W64)
+ DoubleMinOp -> opTranslate (MO_F_Min W64)
+ DoubleMaxOp -> opTranslate (MO_F_Max W64)
+
DoubleAddOp -> opTranslate (MO_F_Add W64)
DoubleSubOp -> opTranslate (MO_F_Sub W64)
DoubleMulOp -> opTranslate (MO_F_Mul W64)
@@ -1512,6 +1515,9 @@ emitPrimOp cfg primop =
FloatFNMAdd -> fmaOp FNMAdd 1 W32
FloatFNMSub -> fmaOp FNMSub 1 W32
+ FloatMinOp -> opTranslate (MO_F_Min W32)
+ FloatMaxOp -> opTranslate (MO_F_Max W32)
+
-- Vector ops
(VecAddOp FloatVec n w) -> opTranslate (MO_VF_Add n w)
@@ -1521,6 +1527,8 @@ emitPrimOp cfg primop =
(VecQuotOp FloatVec _ _) -> \_ -> panic "unsupported primop"
(VecRemOp FloatVec _ _) -> \_ -> panic "unsupported primop"
(VecNegOp FloatVec n w) -> opTranslate (MO_VF_Neg n w)
+ (VecMinOp FloatVec n w) -> opTranslate (MO_VF_Min n w)
+ (VecMaxOp FloatVec n w) -> opTranslate (MO_VF_Max n w)
(VecAddOp IntVec n w) -> opTranslate (MO_V_Add n w)
(VecSubOp IntVec n w) -> opTranslate (MO_V_Sub n w)
@@ -1529,6 +1537,8 @@ emitPrimOp cfg primop =
(VecQuotOp IntVec n w) -> opTranslate (MO_VS_Quot n w)
(VecRemOp IntVec n w) -> opTranslate (MO_VS_Rem n w)
(VecNegOp IntVec n w) -> opTranslate (MO_VS_Neg n w)
+ (VecMinOp IntVec n w) -> opTranslate (MO_VS_Min n w)
+ (VecMaxOp IntVec n w) -> opTranslate (MO_VS_Max n w)
(VecAddOp WordVec n w) -> opTranslate (MO_V_Add n w)
(VecSubOp WordVec n w) -> opTranslate (MO_V_Sub n w)
@@ -1537,6 +1547,8 @@ emitPrimOp cfg primop =
(VecQuotOp WordVec n w) -> opTranslate (MO_VU_Quot n w)
(VecRemOp WordVec n w) -> opTranslate (MO_VU_Rem n w)
(VecNegOp WordVec _ _) -> \_ -> panic "unsupported primop"
+ (VecMinOp WordVec n w) -> opTranslate (MO_VU_Min n w)
+ (VecMaxOp WordVec n w) -> opTranslate (MO_VU_Max n w)
-- Vector FMA instructions
VecFMAdd _ n w -> fmaOp FMAdd n w
=====================================
compiler/GHC/StgToJS/Prim.hs
=====================================
@@ -477,6 +477,8 @@ genPrim prof bound ty op = case op of
DoubleDivOp -> \[r] [x,y] -> pure $ PrimInline $ r |= Div x y
DoubleNegOp -> \[r] [x] -> pure $ PrimInline $ r |= Negate x
DoubleFabsOp -> \[r] [x] -> pure $ PrimInline $ r |= math_abs [x]
+ DoubleMinOp -> \[r] [x,y] -> pure $ PrimInline $ r |= math_min [x,y]
+ DoubleMaxOp -> \[r] [x,y] -> pure $ PrimInline $ r |= math_max [x,y]
DoubleToIntOp -> \[r] [x] -> pure $ PrimInline $ r |= toI32 x
DoubleToFloatOp -> \[r] [x] -> pure $ PrimInline $ r |= math_fround [x]
DoubleExpOp -> \[r] [x] -> pure $ PrimInline $ r |= math_exp [x]
@@ -520,6 +522,8 @@ genPrim prof bound ty op = case op of
FloatMulOp -> \[r] [x,y] -> pure $ PrimInline $ r |= math_fround [Mul x y]
FloatDivOp -> \[r] [x,y] -> pure $ PrimInline $ r |= math_fround [Div x y]
FloatNegOp -> \[r] [x] -> pure $ PrimInline $ r |= Negate x
+ FloatMinOp -> \[r] [x,y] -> pure $ PrimInline $ r |= math_min [x,y]
+ FloatMaxOp -> \[r] [x,y] -> pure $ PrimInline $ r |= math_max [x,y]
FloatFabsOp -> \[r] [x] -> pure $ PrimInline $ r |= math_abs [x]
FloatToIntOp -> \[r] [x] -> pure $ PrimInline $ r |= toI32 x
FloatExpOp -> \[r] [x] -> pure $ PrimInline $ r |= math_fround [math_exp [x]]
@@ -1204,6 +1208,8 @@ genPrim prof bound ty op = case op of
VecReadScalarOffAddrOp _ _ _ -> unhandledPrimop op
VecWriteScalarOffAddrOp _ _ _ -> unhandledPrimop op
VecShuffleOp _ _ _ -> unhandledPrimop op
+ VecMinOp {} -> unhandledPrimop op
+ VecMaxOp {} -> unhandledPrimop op
PrefetchByteArrayOp3 -> noOp
PrefetchMutableByteArrayOp3 -> noOp
=====================================
docs/users_guide/9.12.1-notes.rst
=====================================
@@ -136,6 +136,12 @@ Runtime system
and indices ``(# 4#, 3#, 6#, 1# #)``, will return a vector with components
``(# 44.1#, 33.1#, 66.1#, 11.1# #)``.
+- New instructions for minimum/maximum, such as ``minDouble#`` and
+ ``minFloatX4#``. These instructions compute the minimum/maximum of their inputs,
+ working component-wise for SIMD vectors. Supported argument types are vector
+ integer values (e.g. ``Word16X8#``, ``Int32X4#`` etc) and both scalar and vector
+ floating point values (e.g. ``Float#``, ``DoubleX2#``, ``FloatX8#`` etc).
+
``ghc`` library
~~~~~~~~~~~~~~~
=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -39,6 +39,7 @@ test('simd010', [], compile_and_run, [''])
test('simd011', [ unless(have_cpu_feature('fma'), skip)
, extra_hc_opts('-mfma')
], compile_and_run, [''])
+test('simd012', [], compile_and_run, [''])
test('T25062_V16', [], compile_and_run, [''])
test('T25062_V32', [ unless(have_cpu_feature('avx2'), skip)
=====================================
testsuite/tests/simd/should_run/simd006.hs
=====================================
@@ -120,6 +120,15 @@ instance Arbitrary Word64 where
instance Arbitrary Word32 where
arbitrary = wordDownsize <$> arbitraryWord64
+class HasMinMax a where
+ mini, maxi :: a -> a -> a
+instance HasMinMax FloatNT where
+ mini (FloatNT (F# f1)) (FloatNT (F# f2)) = FloatNT (F# (minFloat# f1 f2))
+ maxi (FloatNT (F# f1)) (FloatNT (F# f2)) = FloatNT (F# (maxFloat# f1 f2))
+instance HasMinMax DoubleNT where
+ mini (DoubleNT (D# d1)) (DoubleNT (D# d2)) = DoubleNT (D# (minDouble# d1 d2))
+ maxi (DoubleNT (D# d1)) (DoubleNT (D# d2)) = DoubleNT (D# (maxDouble# d1 d2))
+
newtype FloatNT = FloatNT Float
deriving newtype (Show, Num)
instance Eq FloatNT where
@@ -167,6 +176,9 @@ instance Num FloatX4 where
abs = error "no"
signum = error "no"
fromInteger = error "no"
+instance HasMinMax FloatX4 where
+ mini (FX4# a) (FX4# b) = FX4# (minFloatX4# a b)
+ maxi (FX4# a) (FX4# b) = FX4# (maxFloatX4# a b)
data DoubleX2 = DX2# DoubleX2#
instance Show DoubleX2 where
@@ -195,6 +207,9 @@ instance Num DoubleX2 where
abs = error "no"
signum = error "no"
fromInteger = error "no"
+instance HasMinMax DoubleX2 where
+ mini (DX2# a) (DX2# b) = DX2# (minDoubleX2# a b)
+ maxi (DX2# a) (DX2# b) = DX2# (maxDoubleX2# a b)
data Expr a where
Lit :: a -> Expr a
@@ -202,6 +217,8 @@ data Expr a where
Sub :: Expr a -> Expr a -> Expr a
Neg :: Expr a -> Expr a
Mul :: Expr a -> Expr a -> Expr a
+ Min :: Expr a -> Expr a -> Expr a
+ Max :: Expr a -> Expr a -> Expr a
deriving (Show, Eq)
fmapExpr :: (a -> b) -> Expr a -> Expr b
fmapExpr f (Lit a) = Lit (f a)
@@ -209,6 +226,8 @@ fmapExpr f (Add a b) = Add (fmapExpr f a) (fmapExpr f b)
fmapExpr f (Sub a b) = Sub (fmapExpr f a) (fmapExpr f b)
fmapExpr f (Neg a) = Neg (fmapExpr f a)
fmapExpr f (Mul a b) = Mul (fmapExpr f a) (fmapExpr f b)
+fmapExpr f (Min a b) = Min (fmapExpr f a) (fmapExpr f b)
+fmapExpr f (Max a b) = Max (fmapExpr f a) (fmapExpr f b)
instance Arbitrary a => Arbitrary (Expr a) where
arbitrary = do
@@ -218,15 +237,18 @@ instance Arbitrary a => Arbitrary (Expr a) where
2 -> Sub <$> arbitrary <*> arbitrary
3 -> Neg <$> arbitrary
4 -> Mul <$> arbitrary <*> arbitrary
+ 5 -> Min <$> arbitrary <*> arbitrary
+ 6 -> Max <$> arbitrary <*> arbitrary
_ -> Lit <$> arbitrary
-eval :: Num a => Expr a -> a
+eval :: (Num a, HasMinMax a) => Expr a -> a
eval (Lit a) = a
eval (Add a b) = eval a + eval b
eval (Sub a b) = eval a - eval b
eval (Neg a) = negate (eval a)
eval (Mul a b) = eval a * eval b
-
+eval (Min a b) = mini (eval a) (eval b)
+eval (Max a b) = maxi (eval a) (eval b)
int64ToInt :: Int64 -> Int
#if WORD_SIZE_IN_BITS == 64
=====================================
testsuite/tests/simd/should_run/simd012.hs
=====================================
@@ -0,0 +1,30 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- simple test for vector min/max instructions
+
+import GHC.Exts
+import GHC.Prim
+
+
+main :: IO ()
+main = do
+
+ -- FloatX4#: the two inputs interleave small and large components, so min and max each pick from both vectors
+ let
+ !f1 = packFloatX4# (# 1.1#, 20.1#, 3.1#, 40.1# #)
+ !f2 = packFloatX4# (# 10.2#, 2.2#, 30.2#, 4.2# #)
+
+ case unpackFloatX4# (minFloatX4# f1 f2) of -- component-wise minimum; expected output (1.1,2.2,3.1,4.2)
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+ case unpackFloatX4# (maxFloatX4# f1 f2) of -- component-wise maximum; expected output (10.2,20.1,30.2,40.1)
+ (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+
+ -- DoubleX2#: same check with two double-precision lanes
+ let
+ !d1 = packDoubleX2# (# 1.1##, 20.1## #)
+ !d2 = packDoubleX2# (# 10.2##, 2.2## #)
+
+ case unpackDoubleX2# (minDoubleX2# d1 d2) of -- component-wise minimum; expected output (1.1,2.2)
+ (# a, b #) -> print (D# a, D# b)
+ case unpackDoubleX2# (maxDoubleX2# d1 d2) of -- component-wise maximum; expected output (10.2,20.1)
+ (# a, b #) -> print (D# a, D# b)
=====================================
testsuite/tests/simd/should_run/simd012.stdout
=====================================
@@ -0,0 +1,4 @@
+(1.1,2.2,3.1,4.2)
+(10.2,20.1,30.2,40.1)
+(1.1,2.2)
+(10.2,20.1)
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/63d2a5d461905d2d83a5fc12ded0b6018679b92a
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/63d2a5d461905d2d83a5fc12ded0b6018679b92a
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240730/125eec6b/attachment-0001.html>
More information about the ghc-commits
mailing list