[Git][ghc/ghc][wip/ncg-simd] SIMD: add vector FMA primops

Fri Jun 14 11:54:22 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
cd3c0b64 by sheaf at 2024-06-14T13:54:06+02:00
SIMD: add vector FMA primops

- - - - -


16 changed files:

- compiler/GHC/Builtin/primops.txt.pp
- compiler/GHC/Cmm/MachOp.hs
- compiler/GHC/Cmm/Parser.y
- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/PPC/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToC.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/StgToJS/Prim.hs
- libraries/base/src/GHC/Exts.hs
- testsuite/tests/simd/should_run/all.T
- + testsuite/tests/simd/should_run/simd011.hs
- + testsuite/tests/simd/should_run/simd011.stdout


Changes:

=====================================
compiler/GHC/Builtin/primops.txt.pp
=====================================
@@ -4190,6 +4190,31 @@ primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
         llvm_only = True
         vector = ALL_VECTOR_TYPES
 
+primop   VecFMAdd   "fmadd#" GenPrimOp
+   VECTOR -> VECTOR -> VECTOR -> VECTOR
+   {Fused multiply-add operation @x*y+z@. See "GHC.Prim#fma".}
+   with
+      llvm_only = True
+      vector = FLOAT_VECTOR_TYPES
+primop   VecFMSub   "fmsub#" GenPrimOp
+   VECTOR -> VECTOR -> VECTOR -> VECTOR
+   {Fused multiply-subtract operation @x*y-z@. See "GHC.Prim#fma".}
+   with
+      llvm_only = True
+      vector = FLOAT_VECTOR_TYPES
+primop   VecFNMAdd   "fnmadd#" GenPrimOp
+   VECTOR -> VECTOR -> VECTOR -> VECTOR
+   {Fused negate-multiply-add operation @-x*y+z@. See "GHC.Prim#fma".}
+   with
+      llvm_only = True
+      vector = FLOAT_VECTOR_TYPES
+primop   VecFNMSub   "fnmsub#" GenPrimOp
+   VECTOR -> VECTOR -> VECTOR -> VECTOR
+   {Fused negate-multiply-subtract operation @-x*y-z@. See "GHC.Prim#fma".}
+   with
+      llvm_only = True
+      vector = FLOAT_VECTOR_TYPES
+
 primop VecShuffleOp "shuffle#" GenPrimOp
   VECTOR -> VECTOR -> INTVECTUPLE -> VECTOR
   { Shuffle elements of the concatenation of the input two vectors


=====================================
compiler/GHC/Cmm/MachOp.hs
=====================================
@@ -116,7 +116,7 @@ data MachOp
 
   -- Floating-point fused multiply-add operations
   -- | Fused multiply-add, see 'FMASign'.
-  | MO_FMA FMASign Width
+  | MO_FMA FMASign Length Width
 
   -- Floating point comparison
   | MO_F_Eq Width
@@ -465,7 +465,7 @@ machOpResultType platform mop tys =
     MO_F_Quot r         -> cmmFloat r
     MO_F_Neg r          -> cmmFloat r
 
-    MO_FMA _ r          -> cmmFloat r
+    MO_FMA _ l r        -> if l == 1 then cmmFloat r else cmmVec l (cmmFloat r)
 
     MO_F_Eq  {}         -> comparisonResultRep platform
     MO_F_Ne  {}         -> comparisonResultRep platform
@@ -567,7 +567,7 @@ machOpArgReps platform op =
     MO_F_Quot r         -> [r,r]
     MO_F_Neg r          -> [r]
 
-    MO_FMA _ r          -> [r,r,r]
+    MO_FMA _ l r        -> [vecwidth l r, vecwidth l r, vecwidth l r]
 
     MO_F_Eq  r          -> [r,r]
     MO_F_Ne  r          -> [r,r]


=====================================
compiler/GHC/Cmm/Parser.y
=====================================
@@ -1051,10 +1051,10 @@ machOps = listToUFM $
         ( "fmul",       MO_F_Mul ),
         ( "fquot",      MO_F_Quot ),
 
-        ( "fmadd" ,     MO_FMA FMAdd  ),
-        ( "fmsub" ,     MO_FMA FMSub  ),
-        ( "fnmadd",     MO_FMA FNMAdd ),
-        ( "fnmsub",     MO_FMA FNMSub ),
+        ( "fmadd" ,     MO_FMA FMAdd  1),
+        ( "fmsub" ,     MO_FMA FMSub  1),
+        ( "fnmadd",     MO_FMA FNMAdd 1),
+        ( "fnmsub",     MO_FMA FNMSub 1),
 
         ( "feq",        MO_F_Eq ),
         ( "fne",        MO_F_Ne ),


=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -1218,11 +1218,15 @@ getRegister' config plat expr
         -- x86 fnmadd - x * y + z <=> AArch64 fmsub : d = - r1 * r2 + r3
         -- x86 fnmsub - x * y - z <=> AArch64 fnmadd: d = - r1 * r2 - r3
 
-        MO_FMA var w -> case var of
-          FMAdd  -> float3Op w (\d n m a -> unitOL $ FMA FMAdd  d n m a)
-          FMSub  -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
-          FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub  d n m a)
-          FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
+        MO_FMA var l w
+          | l == 1
+          -> case var of
+            FMAdd  -> float3Op w (\d n m a -> unitOL $ FMA FMAdd  d n m a)
+            FMSub  -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
+            FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub  d n m a)
+            FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
+          | otherwise
+          -> vectorsNeedLlvm
 
         MO_V_Insert {} -> vectorsNeedLlvm
         MO_VF_Insert {} -> vectorsNeedLlvm


=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -19,6 +19,7 @@ module GHC.CmmToAsm.Format (
     floatFormat,
     isIntFormat,
     isFloatFormat,
+    vecFormat,
     isVecFormat,
     cmmTypeFormat,
     formatToWidth,


=====================================
compiler/GHC/CmmToAsm/PPC/CodeGen.hs
=====================================
@@ -687,12 +687,14 @@ getRegister' _ _ (CmmMachOp mop [x, y, z]) -- ternary PrimOps
       -- x86 fnmadd - x * y + z ~~ PPC fnmsub rt = -(ra * rc - rb)
       -- x86 fnmsub - x * y - z ~~ PPC fnmadd rt = -(ra * rc + rb)
 
-      MO_FMA variant w ->
+      MO_FMA variant l w | l == 1 ->
         case variant of
           FMAdd  -> fma_code w (FMADD FMAdd) x y z
           FMSub  -> fma_code w (FMADD FMSub) x y z
           FNMAdd -> fma_code w (FMADD FNMAdd) x y z
           FNMSub -> fma_code w (FMADD FNMSub) x y z
+        | otherwise
+        -> vectorsNeedLlvm
 
       MO_V_Insert {} -> vectorsNeedLlvm
       MO_VF_Insert {} -> vectorsNeedLlvm


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1851,18 +1851,22 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
   sse    <- sseEnabled
   case mop of
       -- Floating point fused multiply-add operations @ ± x*y ± z@
-      MO_FMA var w -> genFMA3Code w var x y z
+      MO_FMA var l w
+        | l * widthInBits w > 256
+        -> sorry "Please use -fllvm for wide vector FMA support"
+        | otherwise
+        -> genFMA3Code l w var x y z
 
       -- Ternary vector operations
       MO_VF_Insert l W32  | sse4_1 && sse -> vector_float_insert l W32 x y z
                           | otherwise
-                          -> sorry "Please enable the -msse4 and -msse flag"
+                          -> sorry "Please enable the -msse4 and -msse flags"
       MO_VF_Insert l W64  | sse2   && sse -> vector_float_insert l W64 x y z
                           | otherwise
-                          -> sorry "Please enable the -msse2 and -msse flag"
+                          -> sorry "Please enable the -msse2 and -msse flags"
       MO_V_Insert l W64   | sse2   && sse -> vector_int_insert_sse l W64 x y z
                           | otherwise
-                          -> sorry "Please enable the -msse2 and -msse flag"
+                          -> sorry "Please enable the -msse2 and -msse flags"
 
       _other -> pprPanic "getRegister(x86) - ternary CmmMachOp (1)"
                   (pprMachOp mop)
@@ -4029,10 +4033,12 @@ _   `regClashesWithOp` _            = False
 
 -- | Generate code for a fused multiply-add operation, of the form @± x * y ± z@,
 -- with 3 operands (FMA3 instruction set).
-genFMA3Code :: Width
+genFMA3Code :: Length
+            -> Width
             -> FMASign
             -> CmmExpr -> CmmExpr -> CmmExpr -> NatM Register
-genFMA3Code w signs x y z = do
+genFMA3Code l w signs x y z = do
+  platform <- getPlatform
   -- For the FMA instruction, we want to compute x * y + z
   --
   -- There are three possible instructions we could emit:
@@ -4059,7 +4065,11 @@ genFMA3Code w signs x y z = do
   -- only possible if the other arguments don't use the destination register.
   -- We check for this and if there is a conflict we move the result only after
   -- the computation. See #24496 how this went wrong in the past.
-  let rep = floatFormat w
+  let rep
+        | l == 1
+        = floatFormat w
+        | otherwise
+        = vecFormat (cmmVec l $ cmmFloat w)
   (y_reg, y_code) <- getNonClobberedReg y
   (z_op, z_code) <- getNonClobberedOperand z
   x_code <- getAnyReg x
@@ -4069,17 +4079,17 @@ genFMA3Code w signs x y z = do
 
      code, code_direct, code_mov :: Reg -> InstrBlock
      -- Ideal: Compute the result directly into dst
-     code_direct dst = x_code  dst   `snocOL`
+     code_direct dst = x_code dst `snocOL`
                        fma213 z_op y_reg dst
      -- Fallback: Compute the result into a tmp reg and then move it.
      code_mov dst    = x_code x_tmp `snocOL`
                        fma213 z_op y_reg x_tmp `snocOL`
-                       MOV rep (OpReg x_tmp) (OpReg dst)
+                       mkRegRegMoveInstr platform rep x_tmp dst
 
      code dst =
-         y_code `appOL`
-          z_code `appOL`
-          ( if arg_regs_conflict then code_mov dst else code_direct dst )
+        y_code `appOL`
+        z_code `appOL`
+        ( if arg_regs_conflict then code_mov dst else code_direct dst )
 
       where
 


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -285,7 +285,7 @@ data Instr
         -- | FMA3 fused multiply-add operations.
         | FMA3         Format FMASign FMAPermutation Operand Reg Reg
           -- src3 (r/m), src2 (r), dst/src1 (r)
-          -- The is exactly reversed from how intel lists the arguments.
+          -- This is exactly reversed from how intel lists the arguments.
 
         -- use ADD, SUB, and SQRT for arithmetic.  In both cases, operands
         -- are  Operand Reg.


=====================================
compiler/GHC/CmmToC.hs
=====================================
@@ -727,7 +727,7 @@ pprMachOp_for_C platform mop = case mop of
         MO_F_Quot       _ -> char '/'
 
         -- Floating-point fused multiply-add operations
-        MO_FMA FMAdd w ->
+        MO_FMA FMAdd 1 w ->
           case w of
             W32 -> text "fmaf"
             W64 -> text "fma"
@@ -736,10 +736,15 @@ pprMachOp_for_C platform mop = case mop of
                 (text "FMAdd")
                 (panic $ "PprC.pprMachOp_for_C: FMAdd unsupported"
                        ++ "at width " ++ show w)
-        MO_FMA var _width  ->
-          pprTrace "offending mop:"
-            (text $ "FMA " ++ show var)
-            (panic $ "PprC.pprMachOp_for_C: should have been handled earlier!")
+        MO_FMA var l width
+          | l == 1
+          -> pprTrace "offending mop:"
+              (text $ "FMA " ++ show var)
+              (panic $ "PprC.pprMachOp_for_C: should have been handled earlier!")
+          | otherwise
+          -> pprTrace "offending mop:"
+              (text $ "FMA " ++ show var ++ " " ++ show l ++ " " ++ show width)
+              (panic $ "PprC.pprMachOp_for_C: unsupported vector operation")
 
         -- Signed comparisons
         MO_S_Ge         _ -> text ">="


=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1491,7 +1491,7 @@ genMachOp _ op [x] = case op of
     MO_F_Mul        _ -> panicOp
     MO_F_Quot       _ -> panicOp
 
-    MO_FMA _ _        -> panicOp
+    MO_FMA _ _ _      -> panicOp
 
     MO_F_Eq         _ -> panicOp
     MO_F_Ne         _ -> panicOp
@@ -1681,7 +1681,7 @@ genMachOp_slow opt op [x, y] = case op of
     MO_F_Mul  _ -> genBinMach LM_MO_FMul
     MO_F_Quot _ -> genBinMach LM_MO_FDiv
 
-    MO_FMA _ _ -> panicOp
+    MO_FMA _ _ _ -> panicOp
 
     MO_And _   -> genBinMach LM_MO_And
     MO_Or  _   -> genBinMach LM_MO_Or
@@ -1822,13 +1822,11 @@ genMachOp_slow opt op [x, y] = case op of
                        ++ "with two arguments! (" ++ show op ++ ")"
 
 genMachOp_slow _opt op [x, y, z] = do
-  platform <- getPlatform
   let
-    neg x = CmmMachOp (MO_F_Neg (cmmExprWidth platform x)) [x]
     panicOp = panic $ "LLVM.CodeGen.genMachOp_slow: non-ternary op encountered"
                    ++ "with three arguments! (" ++ show op ++ ")"
   case op of
-    MO_FMA var _ ->
+    MO_FMA var lg width ->
       case var of
         -- LLVM only has the fmadd variant.
         FMAdd   -> genFmaOp x y z
@@ -1837,6 +1835,12 @@ genMachOp_slow _opt op [x, y, z] = do
         FMSub   -> genFmaOp x y (neg z)
         FNMAdd  -> genFmaOp (neg x) y z
         FNMSub  -> genFmaOp (neg x) y (neg z)
+      where
+        neg x
+          | lg == 1
+          = CmmMachOp (MO_F_Neg width) [x]
+          | otherwise
+          = CmmMachOp (MO_VF_Neg lg width) [x]
     _ -> panicOp
 
 -- More than three expressions, invalid!
@@ -1873,7 +1877,13 @@ genFmaOp x y z = runExprData $ do
   let fname = case tx of
         LMFloat  -> fsLit "llvm.fma.f32"
         LMDouble -> fsLit "llvm.fma.f64"
-        _ -> pprPanic "fma: type not LMFloat or LMDouble" (ppLlvmType tx)
+        LMVector 4 LMFloat -> fsLit "llvm.fma.v4f32"
+        LMVector 8 LMFloat -> fsLit "llvm.fma.v8f32"
+        LMVector 16 LMFloat -> fsLit "llvm.fma.v16f32"
+        LMVector 2 LMDouble -> fsLit "llvm.fma.v2f64"
+        LMVector 4 LMDouble -> fsLit "llvm.fma.v4f64"
+        LMVector 8 LMDouble -> fsLit "llvm.fma.v8f64"
+        _ -> pprPanic "CmmToLlvm.genFmaOp: unsupported type" (ppLlvmType tx)
   fptr <- liftExprData $ getInstrinct fname ty [tx, ty, tz]
   doExprW tx $ Call StdCall fptr [vx, vy, vz] [ReadNone, NoUnwind]
 


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1503,10 +1503,10 @@ emitPrimOp cfg primop =
   DoubleDivOp    -> opTranslate (MO_F_Quot W64)
   DoubleNegOp    -> opTranslate (MO_F_Neg W64)
 
-  DoubleFMAdd    -> fmaOp FMAdd  W64
-  DoubleFMSub    -> fmaOp FMSub  W64
-  DoubleFNMAdd   -> fmaOp FNMAdd W64
-  DoubleFNMSub   -> fmaOp FNMSub W64
+  DoubleFMAdd    -> fmaOp FMAdd  1 W64
+  DoubleFMSub    -> fmaOp FMSub  1 W64
+  DoubleFNMAdd   -> fmaOp FNMAdd 1 W64
+  DoubleFNMSub   -> fmaOp FNMSub 1 W64
 
 -- Float ops
 
@@ -1523,10 +1523,10 @@ emitPrimOp cfg primop =
   FloatDivOp    -> opTranslate (MO_F_Quot W32)
   FloatNegOp    -> opTranslate (MO_F_Neg  W32)
 
-  FloatFMAdd    -> fmaOp FMAdd  W32
-  FloatFMSub    -> fmaOp FMSub  W32
-  FloatFNMAdd   -> fmaOp FNMAdd W32
-  FloatFNMSub   -> fmaOp FNMSub W32
+  FloatFMAdd    -> fmaOp FMAdd  1 W32
+  FloatFMSub    -> fmaOp FMSub  1 W32
+  FloatFNMAdd   -> fmaOp FNMAdd 1 W32
+  FloatFNMSub   -> fmaOp FNMSub 1 W32
 
 -- Vector ops
 
@@ -1554,6 +1554,12 @@ emitPrimOp cfg primop =
   (VecRemOp  WordVec n w) -> opTranslate (MO_VU_Rem  n w)
   (VecNegOp  WordVec _ _) -> \_ -> panic "unsupported primop"
 
+  -- Vector FMA instructions
+  VecFMAdd  _ n w -> fmaOp FMAdd  n w
+  VecFMSub  _ n w -> fmaOp FMSub  n w
+  VecFNMAdd _ n w -> fmaOp FNMAdd n w
+  VecFNMSub _ n w -> fmaOp FNMSub n w
+
 -- Conversions
 
   IntToDoubleOp   -> opTranslate (MO_SF_Round (wordWidth platform) W64)
@@ -1851,10 +1857,11 @@ emitPrimOp cfg primop =
 
   allowFMA = stgToCmmAllowFMAInstr cfg
 
-  fmaOp :: FMASign -> Width -> [CmmActual] -> PrimopCmmEmit
-  fmaOp signs w args@[arg_x, arg_y, arg_z]
-    | allowFMA signs
-    = opTranslate (MO_FMA signs w) args
+  fmaOp :: FMASign -> Length -> Width -> [CmmActual] -> PrimopCmmEmit
+  fmaOp signs l w args@[arg_x, arg_y, arg_z]
+    |  allowFMA signs
+    || l > 1 -- (always use the MachOp for vector FMA)
+    = opTranslate (MO_FMA signs l w) args
     | otherwise
     = case signs of
 
@@ -1863,12 +1870,16 @@ emitPrimOp cfg primop =
 
         -- Other fused multiply-add operations are implemented in terms of fmadd
         -- This is sound: it does not lose any precision.
-        FMSub  -> fmaOp FMAdd w [arg_x, arg_y, neg arg_z]
-        FNMAdd -> fmaOp FMAdd w [neg arg_x, arg_y, arg_z]
-        FNMSub -> fmaOp FMAdd w [neg arg_x, arg_y, neg arg_z]
+        FMSub  -> fmaOp FMAdd l w [arg_x, arg_y, neg arg_z]
+        FNMAdd -> fmaOp FMAdd l w [neg arg_x, arg_y, arg_z]
+        FNMSub -> fmaOp FMAdd l w [neg arg_x, arg_y, neg arg_z]
     where
-      neg x = CmmMachOp (MO_F_Neg w) [x]
-  fmaOp _ _ _ = panic "fmaOp: wrong number of arguments (expected 3)"
+      neg x
+        | l == 1
+        = CmmMachOp (MO_F_Neg w) [x]
+        | otherwise
+        = CmmMachOp (MO_VF_Neg l w) [x]
+  fmaOp _ _ _ _ = panic "fmaOp: wrong number of arguments (expected 3)"
 
 data PrimopCmmEmit
   -- | Out of line fake primop that's actually just a foreign call to other


=====================================
compiler/GHC/StgToJS/Prim.hs
=====================================
@@ -1192,6 +1192,11 @@ genPrim prof bound ty op = case op of
   VecReadOffAddrOp _ _ _            -> unhandledPrimop op
   VecWriteOffAddrOp _ _ _           -> unhandledPrimop op
 
+  VecFMAdd  {} -> unhandledPrimop op
+  VecFMSub  {} -> unhandledPrimop op
+  VecFNMAdd {} -> unhandledPrimop op
+  VecFNMSub {} -> unhandledPrimop op
+
   VecIndexScalarByteArrayOp _ _ _   -> unhandledPrimop op
   VecReadScalarByteArrayOp _ _ _    -> unhandledPrimop op
   VecWriteScalarByteArrayOp _ _ _   -> unhandledPrimop op


=====================================
libraries/base/src/GHC/Exts.hs
=====================================
@@ -120,7 +120,32 @@ import GHC.Prim hiding
   , dataToTagSmall#, dataToTagLarge#
   -- whereFrom# is similarly internal.
   , whereFrom#
-  -- Don't re-export SIMD shuffle primops (to avoid changing GHC.Exts)
+  -- Don't re-export vector FMA instructions
+  , fmaddFloatX4#
+  , fmsubFloatX4#
+  , fnmaddFloatX4#
+  , fnmsubFloatX4#
+  , fmaddFloatX8#
+  , fmsubFloatX8#
+  , fnmaddFloatX8#
+  , fnmsubFloatX8#
+  , fmaddFloatX16#
+  , fmsubFloatX16#
+  , fnmaddFloatX16#
+  , fnmsubFloatX16#
+  , fmaddDoubleX2#
+  , fmsubDoubleX2#
+  , fnmaddDoubleX2#
+  , fnmsubDoubleX2#
+  , fmaddDoubleX4#
+  , fmsubDoubleX4#
+  , fnmaddDoubleX4#
+  , fnmsubDoubleX4#
+  , fmaddDoubleX8#
+  , fmsubDoubleX8#
+  , fnmaddDoubleX8#
+  , fnmsubDoubleX8#
+  -- Don't re-export SIMD shuffle primops
   , shuffleDoubleX2#
   , shuffleDoubleX4#
   , shuffleDoubleX8#


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -15,3 +15,4 @@ test('simd007', [], compile_and_run, [''])
 test('simd008', [], compile_and_run, [''])
 test('simd009', [req_th, extra_files(['Simd009b.hs', 'Simd009c.hs'])], multimod_compile_and_run, ['simd009', ''])
 test('simd010', [], compile_and_run, [''])
+test('simd011', [when(have_cpu_feature('fma'), extra_hc_opts('-mfma'))], compile_and_run, [''])


=====================================
testsuite/tests/simd/should_run/simd011.hs
=====================================
@@ -0,0 +1,43 @@
+{-# OPTIONS_GHC -O2 #-}
+{-# OPTIONS_GHC -msse2 #-}
+{-# OPTIONS_GHC -msse4 #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+-- tests for vector FMA instructions
+
+import GHC.Exts
+import GHC.Prim
+
+
+main :: IO ()
+main = do
+
+    -- FloatX4#
+    let
+      !f1 = packFloatX4# (# 1.1#, 2.2#, 3.3#, 4.4# #)
+      !f2 = packFloatX4# (# 10.1#, 20.2#, 30.3#, 40.4# #)
+      !f3 = packFloatX4# (# 1000.0#, 2000.0#, 3000.0#, 4000.0# #)
+
+    case unpackFloatX4# (fmaddFloatX4# f1 f2 f3) of
+        (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+    case unpackFloatX4# (fmsubFloatX4# f1 f2 f3) of
+        (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+    case unpackFloatX4# (fnmaddFloatX4# f1 f2 f3) of
+        (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+    case unpackFloatX4# (fnmsubFloatX4# f1 f2 f3) of
+        (# a, b, c, d #) -> print (F# a, F# b, F# c, F# d)
+
+    -- DoubleX2#
+    let
+      !d1 = packDoubleX2# (# 1.1##, 2.2## #)
+      !d2 = packDoubleX2# (# 10.1##, 20.2## #)
+      !d3 = packDoubleX2# (# 1000.0##, 2000.0## #)
+
+    case unpackDoubleX2# (fmaddDoubleX2# d1 d2 d3) of
+        (# a, b #) -> print (D# a, D# b)
+    case unpackDoubleX2# (fmsubDoubleX2# d1 d2 d3) of
+        (# a, b #) -> print (D# a, D# b)
+    case unpackDoubleX2# (fnmaddDoubleX2# d1 d2 d3) of
+        (# a, b #) -> print (D# a, D# b)
+    case unpackDoubleX2# (fnmsubDoubleX2# d1 d2 d3) of
+        (# a, b #) -> print (D# a, D# b)


=====================================
testsuite/tests/simd/should_run/simd011.stdout
=====================================
@@ -0,0 +1,8 @@
+(1011.11,2044.4401,3099.99,4177.7603)
+(-988.89,-1955.5599,-2900.01,-3822.24)
+(988.89,1955.5599,2900.01,3822.24)
+(-1011.11,-2044.4401,-3099.99,-4177.7603)
+(1011.11,2044.44)
+(-988.89,-1955.56)
+(988.89,1955.56)
+(-1011.11,-2044.44)



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/cd3c0b64c180bb6f50f2ed63d2565e00a1888ecd

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/cd3c0b64c180bb6f50f2ed63d2565e00a1888ecd
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240614/28328336/attachment-0001.html>