[Git][ghc/ghc][wip/ncg-simd] Add min/max primops

Tue Jul 30 12:33:39 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
b7f6e70e by sheaf at 2024-07-30T14:33:18+02:00
Add min/max primops

This commit adds min/max primops, such as

  minDouble# :: Double# -> Double# -> Double#
  minFloatX4# :: FloatX4# -> FloatX4# -> FloatX4#
  minWord16X8# :: Word16X8# -> Word16X8# -> Word16X8#

These are supported in:
  - the X86, AArch64 and PowerPC NCGs,
  - the LLVM backend,
  - the WebAssembly and JavaScript backends.

Fixes #25120

- - - - -


28 changed files:

- compiler/GHC/Builtin/primops.txt.pp
- compiler/GHC/Cmm/MachOp.hs
- compiler/GHC/Cmm/Parser.y
- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/AArch64/Instr.hs
- compiler/GHC/CmmToAsm/AArch64/Ppr.hs
- compiler/GHC/CmmToAsm/PPC/CodeGen.hs
- compiler/GHC/CmmToAsm/PPC/Instr.hs
- compiler/GHC/CmmToAsm/PPC/Ppr.hs
- compiler/GHC/CmmToAsm/Wasm/Asm.hs
- compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
- compiler/GHC/CmmToAsm/Wasm/Types.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToC.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/JS/Make.hs
- compiler/GHC/Llvm/Types.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/StgToJS/Prim.hs
- docs/users_guide/9.12.1-notes.rst
- libraries/base/src/GHC/Base.hs
- libraries/base/src/GHC/Exts.hs
- testsuite/tests/simd/should_run/all.T
- testsuite/tests/simd/should_run/simd006.hs
- + testsuite/tests/simd/should_run/simd012.hs
- + testsuite/tests/simd/should_run/simd012.stdout


Changes:

=====================================
compiler/GHC/Builtin/primops.txt.pp
=====================================
@@ -1093,6 +1093,14 @@ primop   DoubleLtOp "<##"   Compare   Double# -> Double# -> Int#
 primop   DoubleLeOp "<=##"   Compare   Double# -> Double# -> Int#
    with fixity = infix 4
 
+primop   DoubleMinOp   "minDouble#"      GenPrimOp
+   Double# -> Double# -> Double#
+   with commutable = True
+
+primop   DoubleMaxOp   "maxDouble#"      GenPrimOp
+   Double# -> Double# -> Double#
+   with commutable = True
+
 primop   DoubleAddOp   "+##"   GenPrimOp
    Double# -> Double# -> Double#
    with commutable = True
@@ -1259,6 +1267,14 @@ primop   FloatNeOp  "neFloat#"   Compare
 primop   FloatLtOp  "ltFloat#"   Compare   Float# -> Float# -> Int#
 primop   FloatLeOp  "leFloat#"   Compare   Float# -> Float# -> Int#
 
+primop   FloatMinOp   "minFloat#"      GenPrimOp
+   Float# -> Float# -> Float#
+   with commutable = True
+
+primop   FloatMaxOp   "maxFloat#"      GenPrimOp
+   Float# -> Float# -> Float#
+   with commutable = True
+
 primop   FloatAddOp   "plusFloat#"      GenPrimOp
    Float# -> Float# -> Float#
    with commutable = True
@@ -4202,6 +4218,18 @@ primop VecShuffleOp "shuffle#" GenPrimOp
   into the result vector.}
    with vector = ALL_VECTOR_TYPES
 
+primop VecMinOp "min#" GenPrimOp
+   VECTOR -> VECTOR -> VECTOR
+   {Component-wise minimum of two vectors.}
+   with
+      vector = ALL_VECTOR_TYPES
+
+primop VecMaxOp "max#" GenPrimOp
+   VECTOR -> VECTOR -> VECTOR
+   {Component-wise maximum of two vectors.}
+   with
+      vector = ALL_VECTOR_TYPES
+
 ------------------------------------------------------------------------
 
 section "Prefetch"


=====================================
compiler/GHC/Cmm/MachOp.hs
=====================================
@@ -126,6 +126,9 @@ data MachOp
   | MO_F_Gt Width
   | MO_F_Lt Width
 
+  | MO_F_Min Width
+  | MO_F_Max Width
+
   -- Bitwise operations.  Not all of these may be supported
   -- at all sizes, and only integral Widths are valid.
   | MO_And   Width
@@ -192,6 +195,14 @@ data MachOp
   | MO_VF_Mul  Length Width
   | MO_VF_Quot Length Width
 
+  -- Min/max operations
+  | MO_VS_Min Length Width
+  | MO_VS_Max Length Width
+  | MO_VU_Min Length Width
+  | MO_VU_Max Length Width
+  | MO_VF_Min Length Width
+  | MO_VF_Max Length Width
+
   -- | An atomic read with no memory ordering. Address must
   -- be naturally aligned.
   | MO_RelaxedRead Width
@@ -322,6 +333,8 @@ isCommutableMachOp mop =
         MO_Xor _                -> True
         MO_F_Add _              -> True
         MO_F_Mul _              -> True
+        MO_F_Min {}             -> True
+        MO_F_Max {}             -> True
         _other                  -> False
 
 -- ----------------------------------------------------------------------------
@@ -464,6 +477,8 @@ machOpResultType platform mop tys =
     MO_F_Mul r          -> cmmFloat r
     MO_F_Quot r         -> cmmFloat r
     MO_F_Neg r          -> cmmFloat r
+    MO_F_Min r          -> cmmFloat r
+    MO_F_Max r          -> cmmFloat r
 
     MO_FMA _ l r        -> if l == 1 then cmmFloat r else cmmVec l (cmmFloat r)
 
@@ -502,9 +517,13 @@ machOpResultType platform mop tys =
     MO_VS_Quot l w      -> cmmVec l (cmmBits w)
     MO_VS_Rem  l w      -> cmmVec l (cmmBits w)
     MO_VS_Neg  l w      -> cmmVec l (cmmBits w)
+    MO_VS_Min  l w      -> cmmVec l (cmmBits w)
+    MO_VS_Max  l w      -> cmmVec l (cmmBits w)
 
     MO_VU_Quot l w      -> cmmVec l (cmmBits w)
     MO_VU_Rem  l w      -> cmmVec l (cmmBits w)
+    MO_VU_Min  l w      -> cmmVec l (cmmBits w)
+    MO_VU_Max  l w      -> cmmVec l (cmmBits w)
 
     MO_V_Shuffle  l w _ -> cmmVec l (cmmBits w)
     MO_VF_Shuffle l w _ -> cmmVec l (cmmFloat w)
@@ -518,6 +537,8 @@ machOpResultType platform mop tys =
     MO_VF_Mul  l w      -> cmmVec l (cmmFloat w)
     MO_VF_Quot l w      -> cmmVec l (cmmFloat w)
     MO_VF_Neg  l w      -> cmmVec l (cmmFloat w)
+    MO_VF_Min  l w      -> cmmVec l (cmmFloat w)
+    MO_VF_Max  l w      -> cmmVec l (cmmFloat w)
 
     MO_RelaxedRead r    -> cmmBits r
     MO_AlignmentCheck _ _ -> ty1
@@ -566,6 +587,8 @@ machOpArgReps platform op =
     MO_F_Mul r          -> [r,r]
     MO_F_Quot r         -> [r,r]
     MO_F_Neg r          -> [r]
+    MO_F_Min r          -> [r,r]
+    MO_F_Max r          -> [r,r]
 
     MO_FMA _ l r        -> [vecwidth l r, vecwidth l r, vecwidth l r]
 
@@ -611,9 +634,13 @@ machOpArgReps platform op =
     MO_VS_Quot l w      -> [vecwidth l w, vecwidth l w]
     MO_VS_Rem  l w      -> [vecwidth l w, vecwidth l w]
     MO_VS_Neg  l w      -> [vecwidth l w]
+    MO_VS_Min  l w      -> [vecwidth l w, vecwidth l w]
+    MO_VS_Max  l w      -> [vecwidth l w, vecwidth l w]
 
     MO_VU_Quot l w      -> [vecwidth l w, vecwidth l w]
     MO_VU_Rem  l w      -> [vecwidth l w, vecwidth l w]
+    MO_VU_Min  l w      -> [vecwidth l w, vecwidth l w]
+    MO_VU_Max  l w      -> [vecwidth l w, vecwidth l w]
 
     -- NOTE: The below is owing to the fact that floats use the SSE registers
     MO_VF_Add  l w      -> [vecwidth l w, vecwidth l w]
@@ -621,6 +648,8 @@ machOpArgReps platform op =
     MO_VF_Mul  l w      -> [vecwidth l w, vecwidth l w]
     MO_VF_Quot l w      -> [vecwidth l w, vecwidth l w]
     MO_VF_Neg  l w      -> [vecwidth l w]
+    MO_VF_Min  l w      -> [vecwidth l w, vecwidth l w]
+    MO_VF_Max  l w      -> [vecwidth l w, vecwidth l w]
 
     MO_RelaxedRead _    -> [wordWidth platform]
     MO_AlignmentCheck _ r -> [r]


=====================================
compiler/GHC/Cmm/Parser.y
=====================================
@@ -1050,6 +1050,8 @@ machOps = listToUFM $
         ( "fneg",       MO_F_Neg ),
         ( "fmul",       MO_F_Mul ),
         ( "fquot",      MO_F_Quot ),
+        ( "fmin",       MO_F_Min ),
+        ( "fmax",       MO_F_Max ),
 
         ( "fmadd" ,     MO_FMA FMAdd  1 ),
         ( "fmsub" ,     MO_FMA FMSub  1 ),


=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -812,6 +812,15 @@ getRegister' config plat expr
         MO_Add {} -> notUnary
         MO_Sub {} -> notUnary
 
+        MO_F_Min {} -> notUnary
+        MO_F_Max {} -> notUnary
+        MO_VU_Min {} -> notUnary
+        MO_VU_Max {} -> notUnary
+        MO_VS_Min {} -> notUnary
+        MO_VS_Max {} -> notUnary
+        MO_VF_Min {} -> notUnary
+        MO_VF_Max {} -> notUnary
+
         MO_AlignmentCheck {} ->
           pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
 
@@ -1126,6 +1135,8 @@ getRegister' config plat expr
         MO_F_Sub w   -> floatOp w (\d x y -> unitOL $ SUB d x y)
         MO_F_Mul w   -> floatOp w (\d x y -> unitOL $ MUL d x y)
         MO_F_Quot w  -> floatOp w (\d x y -> unitOL $ SDIV d x y)
+        MO_F_Min w   -> floatOp w (\d x y -> unitOL $ FMIN d x y)
+        MO_F_Max w   -> floatOp w (\d x y -> unitOL $ FMAX d x y)
 
         -- Floating point comparison
         MO_F_Eq w    -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
@@ -1187,6 +1198,12 @@ getRegister' config plat expr
         MO_VF_Quot {} -> vectorsNeedLlvm
         MO_V_Shuffle {} -> vectorsNeedLlvm
         MO_VF_Shuffle {} -> vectorsNeedLlvm
+        MO_VU_Min {} -> vectorsNeedLlvm
+        MO_VU_Max {} -> vectorsNeedLlvm
+        MO_VS_Min {} -> vectorsNeedLlvm
+        MO_VS_Max {} -> vectorsNeedLlvm
+        MO_VF_Min {} -> vectorsNeedLlvm
+        MO_VF_Max {} -> vectorsNeedLlvm
         where
           notDyadic =
             pprPanic "getRegister' (non-dyadic CmmMachOp with 2 arguments): " $


=====================================
compiler/GHC/CmmToAsm/AArch64/Instr.hs
=====================================
@@ -145,6 +145,8 @@ regUsageOfInstr platform instr = case instr of
   FCVTZS dst src           -> usage (regOp src, regOp dst)
   FABS dst src             -> usage (regOp src, regOp dst)
   FSQRT dst src            -> usage (regOp src, regOp dst)
+  FMIN dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
+  FMAX dst src1 src2       -> usage (regOp src1 ++ regOp src2, regOp dst)
   FMA _ dst src1 src2 src3 ->
     usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
 
@@ -295,6 +297,8 @@ patchRegsOfInstr instr env = case instr of
     FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
     FABS o1 o2     -> FABS (patchOp o1) (patchOp o2)
     FSQRT o1 o2    -> FSQRT (patchOp o1) (patchOp o2)
+    FMIN o1 o2 o3  -> FMIN (patchOp o1) (patchOp o2) (patchOp o3)
+    FMAX o1 o2 o3  -> FMAX (patchOp o1) (patchOp o2) (patchOp o3)
     FMA s o1 o2 o3 o4 ->
       FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
 
@@ -667,6 +671,10 @@ data Instr
     | FCVTZS Operand Operand
     -- Float ABSolute value
     | FABS Operand Operand
+    -- Float minimum
+    | FMIN Operand Operand Operand
+    -- Float maximum
+    | FMAX Operand Operand Operand
     -- Float SQuare RooT
     | FSQRT Operand Operand
 
@@ -743,6 +751,8 @@ instrCon i =
       FCVTZS{} -> "FCVTZS"
       FABS{} -> "FABS"
       FSQRT{} -> "FSQRT"
+      FMIN {} -> "FMIN"
+      FMAX {} -> "FMAX"
       FMA variant _ _ _ _ ->
         case variant of
           FMAdd  -> "FMADD"


=====================================
compiler/GHC/CmmToAsm/AArch64/Ppr.hs
=====================================
@@ -534,6 +534,8 @@ pprInstr platform instr = case instr of
   FCVTZS o1 o2 -> op2 (text "\tfcvtzs") o1 o2
   FABS o1 o2 -> op2 (text "\tfabs") o1 o2
   FSQRT o1 o2 -> op2 (text "\tfsqrt") o1 o2
+  FMIN o1 o2 o3 -> op3 (text "\tfmin") o1 o2 o3
+  FMAX o1 o2 o3 -> op3 (text "\tfmax") o1 o2 o3
   FMA variant d r1 r2 r3 ->
     let fma = case variant of
                 FMAdd  -> text "\tfmadd"


=====================================
compiler/GHC/CmmToAsm/PPC/CodeGen.hs
=====================================
@@ -589,6 +589,8 @@ getRegister' _ _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
       MO_F_Sub w  -> triv_float w FSUB
       MO_F_Mul w  -> triv_float w FMUL
       MO_F_Quot w -> triv_float w FDIV
+      MO_F_Min w  -> triv_float w FMIN
+      MO_F_Max w  -> triv_float w FMAX
 
          -- optimize addition with 32-bit immediate
          -- (needed for PIC)
@@ -671,6 +673,12 @@ getRegister' _ _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
       MO_VF_Quot {} -> vectorsNeedLlvm
       MO_V_Shuffle {} -> vectorsNeedLlvm
       MO_VF_Shuffle {} -> vectorsNeedLlvm
+      MO_VU_Min {} -> vectorsNeedLlvm
+      MO_VU_Max {} -> vectorsNeedLlvm
+      MO_VS_Min {} -> vectorsNeedLlvm
+      MO_VS_Max {} -> vectorsNeedLlvm
+      MO_VF_Min {} -> vectorsNeedLlvm
+      MO_VF_Max {} -> vectorsNeedLlvm
 
       _ -> panic "PPC.CodeGen.getRegister: no match"
 


=====================================
compiler/GHC/CmmToAsm/PPC/Instr.hs
=====================================
@@ -279,6 +279,8 @@ data Instr
     | FDIV    Format Reg Reg Reg
     | FABS    Reg Reg               -- abs is the same for single and double
     | FNEG    Reg Reg               -- negate is the same for single and double prec.
+    | FMIN    Format Reg Reg Reg
+    | FMAX    Format Reg Reg Reg
 
     -- | Fused multiply-add instructions.
     --


=====================================
compiler/GHC/CmmToAsm/PPC/Ppr.hs
=====================================
@@ -941,6 +941,12 @@ pprInstr platform instr = case instr of
    FNEG reg1 reg2
       -> pprUnary (text "fneg") reg1 reg2
 
+   FMIN fmt reg1 reg2 reg3
+      -> pprBinaryF (text "fmin") fmt reg1 reg2 reg3
+
+   FMAX fmt reg1 reg2 reg3
+      -> pprBinaryF (text "fmax") fmt reg1 reg2 reg3
+
    FMADD signs fmt dst ra rc rb
      -> pprTernaryF (pprFMASign signs) fmt dst ra rc rb
 


=====================================
compiler/GHC/CmmToAsm/Wasm/Asm.hs
=====================================
@@ -362,6 +362,8 @@ asmTellWasmInstr ty_word instr = case instr of
   WasmF64PromoteF32 -> asmTellLine "f64.promote_f32"
   WasmAbs ty -> asmTellLine $ asmFromWasmType ty <> ".abs"
   WasmNeg ty -> asmTellLine $ asmFromWasmType ty <> ".neg"
+  WasmMin ty -> asmTellLine $ asmFromWasmType ty <> ".min"
+  WasmMax ty -> asmTellLine $ asmFromWasmType ty <> ".max"
   WasmCond t -> do
     asmTellLine "if"
     asmWithTab $ asmTellWasmInstr ty_word t


=====================================
compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
=====================================
@@ -821,6 +821,18 @@ lower_CmmMachOp lbl (MO_F_Lt w0) xs =
     lbl
     (cmmFloat w0)
     xs
+lower_CmmMachOp lbl (MO_F_Min w0) xs =
+  lower_MO_Bin_Homo
+    WasmMin
+    lbl
+    (cmmFloat w0)
+    xs
+lower_CmmMachOp lbl (MO_F_Max w0) xs =
+  lower_MO_Bin_Homo
+    WasmMax
+    lbl
+    (cmmFloat w0)
+    xs
 lower_CmmMachOp lbl (MO_And w0) xs =
   lower_MO_Bin_Homo
     WasmAnd


=====================================
compiler/GHC/CmmToAsm/Wasm/Types.hs
=====================================
@@ -306,6 +306,8 @@ data WasmInstr :: WasmType -> [WasmType] -> [WasmType] -> Type where
   WasmF64PromoteF32 :: WasmInstr w ('F32 : pre) ('F64 : pre)
   WasmAbs :: WasmTypeTag t -> WasmInstr w (t : pre) (t : pre)
   WasmNeg :: WasmTypeTag t -> WasmInstr w (t : pre) (t : pre)
+  WasmMin :: WasmTypeTag t -> WasmInstr w (t : t : pre) (t : pre)
+  WasmMax :: WasmTypeTag t -> WasmInstr w (t : t : pre) (t : pre)
   WasmCond :: WasmInstr w pre pre -> WasmInstr w (w : pre) pre
 
 newtype WasmExpr w t = WasmExpr (forall pre. WasmInstr w pre (t : pre))


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -842,7 +842,7 @@ iselExpr64ParallelBin op e1 e2 = do
 -- This is a helper data type which helps reduce the code duplication for
 -- the code generation of arithmetic operations. This is not specifically
 -- targetted for any particular type like Int8, Int32 etc
-data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div
+data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | VA_Min | VA_Max
 
 getRegister :: CmmExpr -> NatM Register
 getRegister e = do platform <- getPlatform
@@ -1124,6 +1124,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_F_Le {}   -> incorrectOperands
       MO_F_Gt {}   -> incorrectOperands
       MO_F_Lt {}   -> incorrectOperands
+      MO_F_Min {}  -> incorrectOperands
+      MO_F_Max {}  -> incorrectOperands
       MO_And {}    -> incorrectOperands
       MO_Or {}     -> incorrectOperands
       MO_Xor {}    -> incorrectOperands
@@ -1141,6 +1143,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_VU_Rem {}        -> incorrectOperands
       MO_V_Shuffle {}     -> incorrectOperands
       MO_VF_Shuffle {}    -> incorrectOperands
+      MO_VU_Min {}  -> incorrectOperands
+      MO_VU_Max {}  -> incorrectOperands
+      MO_VS_Min {}  -> incorrectOperands
+      MO_VS_Max {}  -> incorrectOperands
+      MO_VF_Min {}  -> incorrectOperands
+      MO_VF_Max {}  -> incorrectOperands
 
       MO_VF_Extract {}    -> incorrectOperands
       MO_VF_Add {}        -> incorrectOperands
@@ -1338,6 +1346,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_F_Sub  w -> trivialFCode_sse2 w SUB  x y
       MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
       MO_F_Mul  w -> trivialFCode_sse2 w MUL  x y
+      MO_F_Min  w -> trivialFCode_sse2 w (MINMAX Min FloatMinMax) x y
+      MO_F_Max  w -> trivialFCode_sse2 w (MINMAX Max FloatMinMax) x y
 
       MO_Add rep -> add_code rep x y
       MO_Sub rep -> sub_code rep x y
@@ -1394,6 +1404,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_VF_Quot l w        | avx       -> vector_float_op_avx VA_Div l w x y
                             | otherwise -> vector_float_op_sse VA_Div l w x y
 
+      MO_VF_Min l w         | avx       -> vector_float_op_avx VA_Min l w x y
+                            | otherwise -> vector_float_op_sse VA_Min l w x y
+
+      MO_VF_Max l w         | avx       -> vector_float_op_avx VA_Max l w x y
+                            | otherwise -> vector_float_op_sse VA_Max l w x y
+
       -- SIMD NCG TODO: integer vector operations
       MO_V_Shuffle {} -> needLlvm mop
       MO_V_Add {} -> needLlvm mop
@@ -1404,6 +1420,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_VU_Quot {} -> needLlvm mop
       MO_VU_Rem {} -> needLlvm mop
 
+      MO_VU_Min {} -> needLlvm mop
+      MO_VU_Max {} -> needLlvm mop
+      MO_VS_Min {} -> needLlvm mop
+      MO_VS_Max {} -> needLlvm mop
+
       -- Unary MachOps
       MO_S_Neg {} -> incorrectOperands
       MO_F_Neg {} -> incorrectOperands
@@ -1633,6 +1654,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
             VA_Sub -> arithInstr VSUB
             VA_Mul -> arithInstr VMUL
             VA_Div -> arithInstr VDIV
+            VA_Min -> arithInstr (VMINMAX Min FloatMinMax)
+            VA_Max -> arithInstr (VMINMAX Max FloatMinMax)
             where
               -- opcode src2 src1 dst <==> dst = src1 `opcode` src2
               arithInstr instr = exp1 `appOL` exp2 `snocOL`
@@ -1658,6 +1681,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
             VA_Sub -> arithInstr SUB
             VA_Mul -> arithInstr MUL
             VA_Div -> arithInstr FDIV
+            VA_Min -> arithInstr (MINMAX Min FloatMinMax)
+            VA_Max -> arithInstr (MINMAX Max FloatMinMax)
             where
               -- opcode src2 src1 <==> src1 = src1 `opcode` src2
               arithInstr instr


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -40,6 +40,7 @@ module GHC.CmmToAsm.X86.Instr
    , isMetaInstr
    , isJumpishInstr
    , movdOutFormat
+   , MinOrMax(..), MinMaxType(..)
    )
 where
 
@@ -330,8 +331,20 @@ data Instr
         | PSLLDQ     Format Operand Reg
         | PSRLDQ     Format Operand Reg
 
+        -- min/max
+        | MINMAX  MinOrMax MinMaxType Format Operand Operand
+        | VMINMAX MinOrMax MinMaxType Format Operand Reg Reg
+
 data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
 
+-- | Whether a 'MINMAX' / 'VMINMAX' instruction computes a minimum or a maximum.
+data MinOrMax = Min | Max
+  deriving ( Eq, Show )
+-- | What kind of min/max operation: vector integer min/max (signed when
+-- 'minMaxSigned' is 'True', unsigned otherwise), or (scalar or vector) floating point min/max?
+data MinMaxType =
+  IntVecMinMax { minMaxSigned :: Bool } | FloatMinMax
+  deriving ( Eq, Show )
 
 data Operand
         = OpReg  Reg            -- register
@@ -508,6 +521,10 @@ regUsageOfInstr platform instr
     PUNPCKLQDQ fmt src dst
       -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
 
+    MINMAX _ _ fmt src dst
+      -> mkRU (use_R fmt src $ use_R fmt dst []) (use_R fmt dst [])
+    VMINMAX _ _ fmt src1 src2 dst
+      -> mkRU (use_R fmt src1 [mk fmt src2]) [mk fmt dst]
     _other              -> panic "regUsage: unrecognised instr"
  where
     -- # Definitions
@@ -748,6 +765,11 @@ patchRegsOfInstr platform instr env
     PUNPCKLQDQ fmt src dst
       -> PUNPCKLQDQ fmt (patchOp src) (env dst)
 
+    MINMAX minMax ty fmt src dst
+      -> MINMAX minMax ty fmt (patchOp src) (patchOp dst)
+    VMINMAX minMax ty fmt src1 src2 dst
+      -> VMINMAX minMax ty fmt (patchOp src1) (env src2) (env dst)
+
   where
     patch1 :: (Operand -> a) -> Operand -> a
     patch1 insn op      = insn $! patchOp op


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -43,6 +43,7 @@ import GHC.Types.Unique ( pprUniqueAlways )
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
 
+import Data.List ( intersperse )
 import Data.Word
 
 -- Note [Subsections Via Symbols]
@@ -1034,6 +1035,11 @@ pprInstr platform i = case i of
    PUNPCKLQDQ format from to
      -> pprOpReg (text "punpcklqdq") format from to
 
+   MINMAX minMax ty fmt src dst
+     -> pprMinMax False minMax ty fmt [src, dst]
+   VMINMAX minMax ty fmt src1 src2 dst
+     -> pprMinMax True minMax ty fmt [src1, OpReg src2, OpReg dst]
+
   where
    gtab :: Line doc
    gtab  = char '\t'
@@ -1365,3 +1371,14 @@ pprInstr platform i = case i of
            comma,
            pprReg platform format reg
        ]
+
+   pprMinMax :: Bool -> MinOrMax -> MinMaxType -> Format -> [Operand] -> doc
+   pprMinMax wantV minOrMax mmTy fmt regs
+     = line $ hcat ( instr : intersperse comma ( map ( pprOperand platform fmt ) regs ) )
+      where
+        instr =  (if wantV then text "v" else empty)
+              <> (case mmTy of { IntVecMinMax {} -> text "p"; FloatMinMax -> empty })
+              <> (case minOrMax of { Min -> text "min"; Max -> text "max" })
+              <> (case mmTy of { IntVecMinMax wantSigned -> if wantSigned then text "s" else text "u"; FloatMinMax -> empty })
+              <> pprFormat fmt
+              <> space


=====================================
compiler/GHC/CmmToC.hs
=====================================
@@ -725,6 +725,8 @@ pprMachOp_for_C platform mop = case mop of
         MO_F_Neg        _ -> char '-'
         MO_F_Mul        _ -> char '*'
         MO_F_Quot       _ -> char '/'
+        MO_F_Min        _ -> text "fmin"
+        MO_F_Max        _ -> text "fmax"
 
         -- Floating-point fused multiply-add operations
         MO_FMA FMAdd 1 w ->
@@ -917,6 +919,30 @@ pprMachOp_for_C platform mop = case mop of
                                 (text "MO_VF_Quot")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Quot"
                                       ++ "unsupported by the unregisterised backend")
+        MO_VU_Min {}      -> pprTrace "offending mop:"
+                                (text "MO_VU_Min")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VU_Min"
+                                      ++ "unsupported by the unregisterised backend")
+        MO_VU_Max {}      -> pprTrace "offending mop:"
+                                (text "MO_VU_Max")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VU_Max"
+                                      ++ "unsupported by the unregisterised backend")
+        MO_VS_Min {}      -> pprTrace "offending mop:"
+                                (text "MO_VS_Min")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VS_Min"
+                                      ++ "unsupported by the unregisterised backend")
+        MO_VS_Max {}      -> pprTrace "offending mop:"
+                                (text "MO_VS_Max")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VS_Max"
+                                      ++ "unsupported by the unregisterised backend")
+        MO_VF_Min {}      -> pprTrace "offending mop:"
+                                (text "MO_VF_Min")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Min"
+                                      ++ "unsupported by the unregisterised backend")
+        MO_VF_Max {}      -> pprTrace "offending mop:"
+                                (text "MO_VF_Max")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Max"
+                                      ++ "unsupported by the unregisterised backend")
 
 signedOp :: MachOp -> Bool      -- Argument type(s) are signed ints
 signedOp (MO_S_Quot _)    = True


=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1493,6 +1493,8 @@ genMachOp _ op [x] = case op of
     MO_F_Sub        _ -> panicOp
     MO_F_Mul        _ -> panicOp
     MO_F_Quot       _ -> panicOp
+    MO_F_Min        _ -> panicOp
+    MO_F_Max        _ -> panicOp
 
     MO_FMA _ _ _      -> panicOp
 
@@ -1519,9 +1521,13 @@ genMachOp _ op [x] = case op of
 
     MO_VS_Quot    _ _ -> panicOp
     MO_VS_Rem     _ _ -> panicOp
+    MO_VS_Min     _ _ -> panicOp
+    MO_VS_Max     _ _ -> panicOp
 
     MO_VU_Quot    _ _ -> panicOp
     MO_VU_Rem     _ _ -> panicOp
+    MO_VU_Min     _ _ -> panicOp
+    MO_VU_Max     _ _ -> panicOp
 
     MO_VF_Insert  _ _ -> panicOp
     MO_VF_Extract _ _ -> panicOp
@@ -1533,6 +1539,8 @@ genMachOp _ op [x] = case op of
     MO_VF_Sub     _ _ -> panicOp
     MO_VF_Mul     _ _ -> panicOp
     MO_VF_Quot    _ _ -> panicOp
+    MO_VF_Min     _ _ -> panicOp
+    MO_VF_Max     _ _ -> panicOp
 
     where
         negate ty v2 negOp = do
@@ -1732,6 +1740,16 @@ genMachOp_slow opt op [x, y] = case op of
 
     MO_VF_Neg {} -> panicOp
 
+    -- Min/max
+    MO_F_Min  {} -> genMinMaxOp "minnum" x y
+    MO_F_Max  {} -> genMinMaxOp "maxnum" x y
+    MO_VF_Min {} -> genMinMaxOp "minnum" x y
+    MO_VF_Max {} -> genMinMaxOp "maxnum" x y
+    MO_VU_Min {} -> genMinMaxOp "umin"   x y
+    MO_VU_Max {} -> genMinMaxOp "umax"   x y
+    MO_VS_Min {} -> genMinMaxOp "smin"   x y
+    MO_VS_Max {} -> genMinMaxOp "smax"   x y
+
     MO_RelaxedRead {} -> panicOp
 
     MO_AlignmentCheck {} -> panicOp
@@ -1786,6 +1804,19 @@ genMachOp_slow opt op [x, y] = case op of
 
         genCastBinMach ty op = binCastLlvmOp ty (LlvmOp op)
 
+        genMinMaxOp intrin x y = runExprData $ do
+            vx <- exprToVarW x
+            vy <- exprToVarW y
+            let tx = getVarType vx
+                ty = getVarType vy
+                fname = "llvm." ++ intrin ++ "." ++ ppLlvmTypeShort ty
+            Panic.massertPpr
+              (tx == ty)
+              (vcat [ text (fname ++ ": mismatched arg types")
+                    , ppLlvmType tx, ppLlvmType ty ])
+            fptr <- liftExprData $ getInstrinct (fsLit fname) ty [tx, ty]
+            doExprW tx $ Call StdCall fptr [vx, vy] [ReadNone, NoUnwind]
+
         -- Detect if overflow will occur in signed multiply of the two
         -- CmmExpr's. This is the LLVM assembly equivalent of the NCG
         -- implementation. Its much longer due to type information/safety.


=====================================
compiler/GHC/JS/Make.hs
=====================================
@@ -130,7 +130,8 @@ module GHC.JS.Make
   -- $math
   , math_log, math_sin, math_cos, math_tan, math_exp, math_acos, math_asin,
     math_atan, math_abs, math_pow, math_sqrt, math_asinh, math_acosh, math_atanh,
-    math_cosh, math_sinh, math_tanh, math_expm1, math_log1p, math_fround
+    math_cosh, math_sinh, math_tanh, math_expm1, math_log1p, math_fround,
+    math_min, math_max
   -- * Statement helpers
   , Solo(..)
   , decl
@@ -672,7 +673,8 @@ math_ op args = ApplExpr (math .^ op) args
 
 math_log, math_sin, math_cos, math_tan, math_exp, math_acos, math_asin, math_atan,
   math_abs, math_pow, math_sqrt, math_asinh, math_acosh, math_atanh, math_sign,
-  math_sinh, math_cosh, math_tanh, math_expm1, math_log1p, math_fround
+  math_sinh, math_cosh, math_tanh, math_expm1, math_log1p, math_fround,
+  math_min, math_max
   :: [JStgExpr] -> JStgExpr
 math_log   = math_ "log"
 math_sin   = math_ "sin"
@@ -695,6 +697,8 @@ math_tanh  = math_ "tanh"
 math_expm1 = math_ "expm1"
 math_log1p = math_ "log1p"
 math_fround = math_ "fround"
+math_min    = math_ "min"
+math_max    = math_ "max"
 
 instance Num JStgExpr where
     x + y = InfixExpr AddOp x y


=====================================
compiler/GHC/Llvm/Types.hs
=====================================
@@ -91,6 +91,15 @@ ppLlvmType t = case t of
 {-# SPECIALIZE ppLlvmType :: LlvmType -> SDoc #-}
 {-# SPECIALIZE ppLlvmType :: LlvmType -> HLine #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
 
+-- | Short name of a scalar or vector type, e.g. @"i16"@ or @"v4f32"@; panics on any other type.
+ppLlvmTypeShort :: LlvmType -> String
+ppLlvmTypeShort t = case t of
+  LMInt w  -> 'i' : show w
+  LMFloat  -> "f32"
+  LMDouble -> "f64"
+  LMVector len elt -> "v" ++ show len ++ ppLlvmTypeShort elt
+  _ -> pprPanic "ppLlvmTypeShort" (ppLlvmType t)
+
 ppParams :: IsLine doc => LlvmParameterListType -> [LlvmParameter] -> doc
 ppParams varg p
   = let varg' = case varg of


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1481,6 +1481,9 @@ emitPrimOp cfg primop =
   DoubleGtOp     -> opTranslate (MO_F_Gt W64)
   DoubleLtOp     -> opTranslate (MO_F_Lt W64)
 
+  DoubleMinOp     -> opTranslate (MO_F_Min W64)
+  DoubleMaxOp     -> opTranslate (MO_F_Max W64)
+
   DoubleAddOp    -> opTranslate (MO_F_Add W64)
   DoubleSubOp    -> opTranslate (MO_F_Sub W64)
   DoubleMulOp    -> opTranslate (MO_F_Mul W64)
@@ -1512,6 +1515,9 @@ emitPrimOp cfg primop =
   FloatFNMAdd   -> fmaOp FNMAdd 1 W32
   FloatFNMSub   -> fmaOp FNMSub 1 W32
 
+  FloatMinOp    -> opTranslate (MO_F_Min W32)
+  FloatMaxOp    -> opTranslate (MO_F_Max W32)
+
 -- Vector ops
 
   (VecAddOp  FloatVec n w) -> opTranslate (MO_VF_Add  n w)
@@ -1521,6 +1527,8 @@ emitPrimOp cfg primop =
   (VecQuotOp FloatVec _ _) -> \_ -> panic "unsupported primop"
   (VecRemOp  FloatVec _ _) -> \_ -> panic "unsupported primop"
   (VecNegOp  FloatVec n w) -> opTranslate (MO_VF_Neg  n w)
+  (VecMinOp  FloatVec n w) -> opTranslate (MO_VF_Min  n w)
+  (VecMaxOp  FloatVec n w) -> opTranslate (MO_VF_Max  n w)
 
   (VecAddOp  IntVec n w) -> opTranslate (MO_V_Add   n w)
   (VecSubOp  IntVec n w) -> opTranslate (MO_V_Sub   n w)
@@ -1529,6 +1537,8 @@ emitPrimOp cfg primop =
   (VecQuotOp IntVec n w) -> opTranslate (MO_VS_Quot n w)
   (VecRemOp  IntVec n w) -> opTranslate (MO_VS_Rem  n w)
   (VecNegOp  IntVec n w) -> opTranslate (MO_VS_Neg  n w)
+  (VecMinOp  IntVec n w) -> opTranslate (MO_VS_Min  n w)
+  (VecMaxOp  IntVec n w) -> opTranslate (MO_VS_Max  n w)
 
   (VecAddOp  WordVec n w) -> opTranslate (MO_V_Add   n w)
   (VecSubOp  WordVec n w) -> opTranslate (MO_V_Sub   n w)
@@ -1537,6 +1547,8 @@ emitPrimOp cfg primop =
   (VecQuotOp WordVec n w) -> opTranslate (MO_VU_Quot n w)
   (VecRemOp  WordVec n w) -> opTranslate (MO_VU_Rem  n w)
   (VecNegOp  WordVec _ _) -> \_ -> panic "unsupported primop"
+  (VecMinOp  WordVec n w) -> opTranslate (MO_VU_Min  n w)
+  (VecMaxOp  WordVec n w) -> opTranslate (MO_VU_Max  n w)
 
   -- Vector FMA instructions
   VecFMAdd  _ n w -> fmaOp FMAdd  n w


=====================================
compiler/GHC/StgToJS/Prim.hs
=====================================
@@ -477,6 +477,8 @@ genPrim prof bound ty op = case op of
   DoubleDivOp       -> \[r] [x,y] -> pure $ PrimInline $ r |= Div x y
   DoubleNegOp       -> \[r] [x]   -> pure $ PrimInline $ r |= Negate x
   DoubleFabsOp      -> \[r] [x]   -> pure $ PrimInline $ r |= math_abs [x]
+  DoubleMinOp       -> \[r] [x,y] -> pure $ PrimInline $ r |= math_min [x,y]
+  DoubleMaxOp       -> \[r] [x,y] -> pure $ PrimInline $ r |= math_max [x,y]
   DoubleToIntOp     -> \[r] [x]   -> pure $ PrimInline $ r |= toI32 x
   DoubleToFloatOp   -> \[r] [x]   -> pure $ PrimInline $ r |= math_fround [x]
   DoubleExpOp       -> \[r] [x]   -> pure $ PrimInline $ r |= math_exp  [x]
@@ -520,6 +522,8 @@ genPrim prof bound ty op = case op of
   FloatMulOp        -> \[r] [x,y] -> pure $ PrimInline $ r |= math_fround [Mul x y]
   FloatDivOp        -> \[r] [x,y] -> pure $ PrimInline $ r |= math_fround [Div x y]
   FloatNegOp        -> \[r] [x]   -> pure $ PrimInline $ r |= Negate x
+  FloatMinOp        -> \[r] [x,y] -> pure $ PrimInline $ r |= math_min [x,y]
+  FloatMaxOp        -> \[r] [x,y] -> pure $ PrimInline $ r |= math_max [x,y]
   FloatFabsOp       -> \[r] [x]   -> pure $ PrimInline $ r |= math_abs [x]
   FloatToIntOp      -> \[r] [x]   -> pure $ PrimInline $ r |= toI32 x
   FloatExpOp        -> \[r] [x]   -> pure $ PrimInline $ r |= math_fround [math_exp [x]]
@@ -1204,6 +1208,8 @@ genPrim prof bound ty op = case op of
   VecReadScalarOffAddrOp _ _ _      -> unhandledPrimop op
   VecWriteScalarOffAddrOp _ _ _     -> unhandledPrimop op
   VecShuffleOp _ _ _                -> unhandledPrimop op
+  VecMinOp {}                       -> unhandledPrimop op
+  VecMaxOp {}                       -> unhandledPrimop op
 
   PrefetchByteArrayOp3              -> noOp
   PrefetchMutableByteArrayOp3       -> noOp


=====================================
docs/users_guide/9.12.1-notes.rst
=====================================
@@ -136,6 +136,12 @@ Runtime system
   and indices ``(# 4#, 3#, 6#, 1# #)``, will return a vector with components
   ``(# 44.1#, 33.1#, 66.1#, 11.1# #)``.
 
+- New primops for minimum/maximum, such as ``minDouble#`` and
+  ``minFloatX4#``. These primops compute the minimum/maximum of their inputs,
+  working component-wise for SIMD vectors. Supported argument types are vector
+  integer values (e.g. ``Word16X8#``, ``Int32X4#`` etc.) and both scalar and vector
+  floating point values (e.g. ``Float#``, ``DoubleX2#``, ``FloatX8#`` etc.).
+
 ``ghc`` library
 ~~~~~~~~~~~~~~~
 


=====================================
libraries/base/src/GHC/Base.hs
=====================================
@@ -203,6 +203,71 @@ import GHC.Prim hiding
   , shuffleWord8X16#
   , shuffleWord8X32#
   , shuffleWord8X64#
+  -- Don't re-export min/max primops
+  , maxDouble#
+  , maxDoubleX2#
+  , maxDoubleX4#
+  , maxDoubleX8#
+  , maxFloat#
+  , maxFloatX16#
+  , maxFloatX4#
+  , maxFloatX8#
+  , maxInt16X16#
+  , maxInt16X32#
+  , maxInt16X8#
+  , maxInt32X16#
+  , maxInt32X4#
+  , maxInt32X8#
+  , maxInt64X2#
+  , maxInt64X4#
+  , maxInt64X8#
+  , maxInt8X16#
+  , maxInt8X32#
+  , maxInt8X64#
+  , maxWord16X16#
+  , maxWord16X32#
+  , maxWord16X8#
+  , maxWord32X16#
+  , maxWord32X4#
+  , maxWord32X8#
+  , maxWord64X2#
+  , maxWord64X4#
+  , maxWord64X8#
+  , maxWord8X16#
+  , maxWord8X32#
+  , maxWord8X64#
+  , minDouble#
+  , minDoubleX2#
+  , minDoubleX4#
+  , minDoubleX8#
+  , minFloat#
+  , minFloatX16#
+  , minFloatX4#
+  , minFloatX8#
+  , minInt16X16#
+  , minInt16X32#
+  , minInt16X8#
+  , minInt32X16#
+  , minInt32X4#
+  , minInt32X8#
+  , minInt64X2#
+  , minInt64X4#
+  , minInt64X8#
+  , minInt8X16#
+  , minInt8X32#
+  , minInt8X64#
+  , minWord16X16#
+  , minWord16X32#
+  , minWord16X8#
+  , minWord32X16#
+  , minWord32X4#
+  , minWord32X8#
+  , minWord64X2#
+  , minWord64X4#
+  , minWord64X8#
+  , minWord8X16#
+  , minWord8X32#
+  , minWord8X64#
   )
 
 import GHC.Prim.Ext


=====================================
libraries/base/src/GHC/Exts.hs
=====================================
@@ -176,6 +176,71 @@ import GHC.Prim hiding
   , shuffleWord8X16#
   , shuffleWord8X32#
   , shuffleWord8X64#
+  -- Don't re-export min/max primops
+  , maxDouble#
+  , maxDoubleX2#
+  , maxDoubleX4#
+  , maxDoubleX8#
+  , maxFloat#
+  , maxFloatX16#
+  , maxFloatX4#
+  , maxFloatX8#
+  , maxInt16X16#
+  , maxInt16X32#
+  , maxInt16X8#
+  , maxInt32X16#
+  , maxInt32X4#
+  , maxInt32X8#
+  , maxInt64X2#
+  , maxInt64X4#
+  , maxInt64X8#
+  , maxInt8X16#
+  , maxInt8X32#
+  , maxInt8X64#
+  , maxWord16X16#
+  , maxWord16X32#
+  , maxWord16X8#
+  , maxWord32X16#
+  , maxWord32X4#
+  , maxWord32X8#
+  , maxWord64X2#
+  , maxWord64X4#
+  , maxWord64X8#
+  , maxWord8X16#
+  , maxWord8X32#
+  , maxWord8X64#
+  , minDouble#
+  , minDoubleX2#
+  , minDoubleX4#
+  , minDoubleX8#
+  , minFloat#
+  , minFloatX16#
+  , minFloatX4#
+  , minFloatX8#
+  , minInt16X16#
+  , minInt16X32#
+  , minInt16X8#
+  , minInt32X16#
+  , minInt32X4#
+  , minInt32X8#
+  , minInt64X2#
+  , minInt64X4#
+  , minInt64X8#
+  , minInt8X16#
+  , minInt8X32#
+  , minInt8X64#
+  , minWord16X16#
+  , minWord16X32#
+  , minWord16X8#
+  , minWord32X16#
+  , minWord32X4#
+  , minWord32X8#
+  , minWord64X2#
+  , minWord64X4#
+  , minWord64X8#
+  , minWord8X16#
+  , minWord8X32#
+  , minWord8X64#
   )
 
 import GHC.Prim.Ext


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -39,6 +39,7 @@ test('simd010', [], compile_and_run, [''])
 test('simd011', [ unless(have_cpu_feature('fma'), skip)
                 , extra_hc_opts('-mfma')
                 ], compile_and_run, [''])
+test('simd012', [], compile_and_run, [''])
 
 test('T25062_V16', [], compile_and_run, [''])
 test('T25062_V32', [ unless(have_cpu_feature('avx2'), skip)


=====================================
testsuite/tests/simd/should_run/simd006.hs
=====================================
@@ -120,6 +120,15 @@ instance Arbitrary Word64 where
 instance Arbitrary Word32 where
     arbitrary = wordDownsize <$> arbitraryWord64
 
+class HasMinMax a where
+  mini, maxi :: a -> a -> a
+instance HasMinMax FloatNT where
+  mini (FloatNT (F# f1)) (FloatNT (F# f2)) = FloatNT (F# (minFloat# f1 f2))
+  maxi (FloatNT (F# f1)) (FloatNT (F# f2)) = FloatNT (F# (maxFloat# f1 f2))
+instance HasMinMax DoubleNT where
+  mini (DoubleNT (D# d1)) (DoubleNT (D# d2)) = DoubleNT (D# (minDouble# d1 d2))
+  maxi (DoubleNT (D# d1)) (DoubleNT (D# d2)) = DoubleNT (D# (maxDouble# d1 d2))
+
 newtype FloatNT = FloatNT Float
   deriving newtype (Show, Num)
 instance Eq FloatNT where
@@ -167,6 +176,9 @@ instance Num FloatX4 where
   abs = error "no"
   signum = error "no"
   fromInteger = error "no"
+instance HasMinMax FloatX4 where
+  mini (FX4# a) (FX4# b) = FX4# (minFloatX4# a b)
+  maxi (FX4# a) (FX4# b) = FX4# (maxFloatX4# a b)
 
 data DoubleX2 = DX2# DoubleX2#
 instance Show DoubleX2 where
@@ -195,6 +207,9 @@ instance Num DoubleX2 where
   abs = error "no"
   signum = error "no"
   fromInteger = error "no"
+instance HasMinMax DoubleX2 where
+  mini (DX2# a) (DX2# b) = DX2# (minDoubleX2# a b)
+  maxi (DX2# a) (DX2# b) = DX2# (maxDoubleX2# a b)
 
 data Expr a where
   Lit :: a -> Expr a
@@ -202,6 +217,8 @@ data Expr a where
   Sub :: Expr a -> Expr a -> Expr a
   Neg :: Expr a -> Expr a
   Mul :: Expr a -> Expr a -> Expr a
+  Min :: Expr a -> Expr a -> Expr a
+  Max :: Expr a -> Expr a -> Expr a
   deriving (Show, Eq)
 fmapExpr :: (a -> b) -> Expr a -> Expr b
 fmapExpr f (Lit a) = Lit (f a)
@@ -209,6 +226,8 @@ fmapExpr f (Add a b) = Add (fmapExpr f a) (fmapExpr f b)
 fmapExpr f (Sub a b) = Sub (fmapExpr f a) (fmapExpr f b)
 fmapExpr f (Neg a) = Neg (fmapExpr f a)
 fmapExpr f (Mul a b) = Mul (fmapExpr f a) (fmapExpr f b)
+fmapExpr f (Min a b) = Min (fmapExpr f a) (fmapExpr f b)
+fmapExpr f (Max a b) = Max (fmapExpr f a) (fmapExpr f b)
 
 instance Arbitrary a => Arbitrary (Expr a) where
   arbitrary = do
@@ -218,15 +237,18 @@ instance Arbitrary a => Arbitrary (Expr a) where
       2 -> Sub <$> arbitrary <*> arbitrary
       3 -> Neg <$> arbitrary
       4 -> Mul <$> arbitrary <*> arbitrary
+      5 -> Min <$> arbitrary <*> arbitrary
+      6 -> Max <$> arbitrary <*> arbitrary
       _ -> Lit <$> arbitrary
 
-eval :: Num a => Expr a -> a
+eval :: (Num a, HasMinMax a) => Expr a -> a
 eval (Lit a) = a
 eval (Add a b) = eval a + eval b
 eval (Sub a b) = eval a - eval b
 eval (Neg a) = negate (eval a)
 eval (Mul a b) = eval a * eval b
-
+eval (Min a b) = mini (eval a) (eval b)
+eval (Max a b) = maxi (eval a) (eval b)
 
 int64ToInt :: Int64 -> Int
 #if WORD_SIZE_IN_BITS == 64


=====================================
testsuite/tests/simd/should_run/simd012.hs
=====================================
@@ -0,0 +1,30 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- Simple test for the component-wise min/max primops on FloatX4# and DoubleX2# vectors.
+
+import GHC.Exts
+import GHC.Prim
+
+
+main :: IO ()
+main = do
+
+    -- FloatX4#: print the component-wise minimum, then the maximum, of f1 and f2.
+    let
+      !f1 = packFloatX4# (# 1.1#, 20.1#, 3.1#, 40.1# #)
+      !f2 = packFloatX4# (# 10.2#, 2.2#, 30.2#, 4.2# #)
+
+    case unpackFloatX4# (minFloatX4# f1 f2) of
+        (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+    case unpackFloatX4# (maxFloatX4# f1 f2) of
+        (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+
+    -- DoubleX2#: print the component-wise minimum, then the maximum, of d1 and d2.
+    let
+      !d1 = packDoubleX2# (# 1.1##, 20.1## #)
+      !d2 = packDoubleX2# (# 10.2##, 2.2## #)
+
+    case unpackDoubleX2# (minDoubleX2# d1 d2) of
+        (# a, b #) -> print (D# a, D# b)
+    case unpackDoubleX2# (maxDoubleX2# d1 d2) of
+        (# a, b #) -> print (D# a, D# b)


=====================================
testsuite/tests/simd/should_run/simd012.stdout
=====================================
@@ -0,0 +1,4 @@
+(1.1,2.2,3.1,4.2)
+(10.2,20.1,30.2,40.1)
+(1.1,2.2)
+(10.2,20.1)



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b7f6e70eb08c01daf3b41d5b53098073e8b75173

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/b7f6e70eb08c01daf3b41d5b53098073e8b75173
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240730/36e3b0c9/attachment-0001.html>