[Git][ghc/ghc][wip/ncg-simd] 4 commits: Add vector fused multiply-add operations
sheaf (@sheaf)
gitlab at gitlab.haskell.org
Wed Jul 3 11:59:49 UTC 2024
sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC
Commits:
ffc1e910 by sheaf at 2024-07-03T13:59:20+02:00
Add vector fused multiply-add operations
This commit adds fused multiply-add operations such as `fmaddDoubleX2#`.
These are handled in both the X86 NCG and LLVM backends.
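As a usage sketch (not part of this patch): the new primop computes a fused
x*y+z in each lane. The example below is hypothetical; it imports
`fmaddDoubleX2#` from GHC.Prim because this patch hides the vector FMA primops
from GHC.Exts, and it assumes a backend with SIMD support (e.g. -fllvm, or
this branch's X86 NCG with -mfma).
```haskell
{-# LANGUAGE MagicHash, UnboxedTuples #-}
import GHC.Exts (Double (D#), packDoubleX2#, unpackDoubleX2#)
import GHC.Prim (fmaddDoubleX2#)  -- new primop; not re-exported from GHC.Exts

-- Per-lane fused multiply-add: (1*3+5, 2*4+6) = (8.0, 14.0).
fmaExample :: (Double, Double)
fmaExample =
  case unpackDoubleX2#
         (fmaddDoubleX2# (packDoubleX2# (# 1.0##, 2.0## #))
                         (packDoubleX2# (# 3.0##, 4.0## #))
                         (packDoubleX2# (# 5.0##, 6.0## #))) of
    (# a, b #) -> (D# a, D# b)
```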
- - - - -
60d3c203 by sheaf at 2024-07-03T13:59:20+02:00
Add vector shuffle primops
This adds vector shuffle primops, such as
```
shuffleFloatX4# :: FloatX4# -> FloatX4# -> (# Int#, Int#, Int#, Int# #) -> FloatX4#
```
which shuffle the components of the two input vectors into the output vector.
NB: the indices must be compile-time literals, to match the X86 SHUFPD
instruction immediate and the LLVM shufflevector instruction.
These are handled in the X86 NCG and the LLVM backend.
Tested in simd009.
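A hypothetical usage sketch (not part of this patch): indices 0-3 select lanes
from the first vector and 4-7 from the second, and they must appear literally
at the call site. `swapHalves` is an invented name; the import from GHC.Prim
reflects the GHC.Exts hiding list added below.
```haskell
{-# LANGUAGE MagicHash, UnboxedTuples #-}
import GHC.Prim (FloatX4#, shuffleFloatX4#)  -- new primop; hidden from GHC.Exts

-- Build a vector from the upper half of u and the lower half of v.
swapHalves :: FloatX4# -> FloatX4# -> FloatX4#
swapHalves u v = shuffleFloatX4# u v (# 2#, 3#, 4#, 5# #)
```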
- - - - -
751c6143 by sheaf at 2024-07-03T13:59:21+02:00
Add Broadcast MachOps
This adds proper MachOps for broadcast instructions, allowing us to
produce better code for broadcasting a value than simply packing that
value (doing many vector insertions in a row).
These are lowered in the X86 NCG and LLVM backends. In the LLVM backend,
the lowering uses the previously introduced shuffle instructions.
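For reference, a minimal source-level sketch (not part of this patch) of what
now lowers through the dedicated broadcast MachOps; the function name is
invented and the lane count is illustrative.
```haskell
{-# LANGUAGE MagicHash, UnboxedTuples #-}
import GHC.Exts (Double (D#), broadcastDoubleX2#, unpackDoubleX2#)

-- Splat one scalar into both lanes; with MO_VF_Broadcast this becomes a
-- single broadcast instead of a chain of vector insertions.
splat :: Double -> (Double, Double)
splat (D# d) =
  case unpackDoubleX2# (broadcastDoubleX2# d) of
    (# a, b #) -> (D# a, D# b)
```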
- - - - -
f47ef2d5 by sheaf at 2024-07-03T13:59:21+02:00
Fix treatment of signed zero in vector negation
This commit fixes the handling of signed zero in floating-point vector
negation.
A slight hack was introduced to work around the fact that Cmm doesn't
currently have a notion of signed floating-point literals
(see get_float_broadcast_value_reg). This can be removed once CmmFloat
can express the value -0.0.
The simd006 test has been updated to use a stricter notion of equality
of floating-point values, which ensures the validity of this change.
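For context, a small illustration (ordinary Haskell, not part of this patch) of
why bit-level comparison is needed: IEEE equality identifies -0.0 with 0.0, so
only comparing the bit patterns can detect a negation that drops the sign of
zero.
```haskell
import GHC.Float (castDoubleToWord64)

-- (-0.0) == 0.0 under IEEE equality, yet the bit patterns differ;
-- comparing the Word64 representations tells the two apart.
signedZeroVisible :: Bool
signedZeroVisible =
  ((-0.0 :: Double) == 0.0)
    && castDoubleToWord64 (-0.0) /= castDoubleToWord64 0.0
```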
- - - - -
27 changed files:
- compiler/GHC/Builtin/primops.txt.pp
- compiler/GHC/Cmm/MachOp.hs
- compiler/GHC/Cmm/Opt.hs
- compiler/GHC/Cmm/Parser.y
- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/PPC/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToC.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/Llvm/Ppr.hs
- compiler/GHC/Llvm/Syntax.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/StgToJS/Prim.hs
- docs/users_guide/9.12.1-notes.rst
- libraries/base/src/GHC/Base.hs
- libraries/base/src/GHC/Exts.hs
- testsuite/tests/simd/should_run/all.T
- testsuite/tests/simd/should_run/simd006.hs
- + testsuite/tests/simd/should_run/simd011.stdout
- utils/genprimopcode/Lexer.x
- utils/genprimopcode/Main.hs
- utils/genprimopcode/Parser.y
- utils/genprimopcode/ParserM.hs
- utils/genprimopcode/Syntax.hs
Changes:
=====================================
compiler/GHC/Builtin/primops.txt.pp
=====================================
@@ -4175,6 +4175,33 @@ primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
can_fail_warning = YesWarnCanFail
vector = ALL_VECTOR_TYPES
+primop VecFMAdd "fmadd#" GenPrimOp
+ VECTOR -> VECTOR -> VECTOR -> VECTOR
+ {Fused multiply-add operation @x*y+z at . See "GHC.Prim#fma".}
+ with
+ vector = FLOAT_VECTOR_TYPES
+primop VecFMSub "fmsub#" GenPrimOp
+ VECTOR -> VECTOR -> VECTOR -> VECTOR
+ {Fused multiply-subtract operation @x*y-z at . See "GHC.Prim#fma".}
+ with
+ vector = FLOAT_VECTOR_TYPES
+primop VecFNMAdd "fnmadd#" GenPrimOp
+ VECTOR -> VECTOR -> VECTOR -> VECTOR
+ {Fused negate-multiply-add operation @-x*y+z at . See "GHC.Prim#fma".}
+ with
+ vector = FLOAT_VECTOR_TYPES
+primop VecFNMSub "fnmsub#" GenPrimOp
+ VECTOR -> VECTOR -> VECTOR -> VECTOR
+ {Fused negate-multiply-subtract operation @-x*y-z at . See "GHC.Prim#fma".}
+ with
+ vector = FLOAT_VECTOR_TYPES
+
+primop VecShuffleOp "shuffle#" GenPrimOp
+ VECTOR -> VECTOR -> INTVECTUPLE -> VECTOR
+  {Shuffle elements of the concatenation of the two input vectors
+   into the result vector.}
+ with vector = ALL_VECTOR_TYPES
+
------------------------------------------------------------------------
section "Prefetch"
=====================================
compiler/GHC/Cmm/MachOp.hs
=====================================
@@ -116,7 +116,7 @@ data MachOp
-- Floating-point fused multiply-add operations
-- | Fused multiply-add, see 'FMASign'.
- | MO_FMA FMASign Width
+ | MO_FMA FMASign Length Width
-- Floating point comparison
| MO_F_Eq Width
@@ -158,6 +158,7 @@ data MachOp
| MO_FW_Bitcast Width -- Float/Double -> Word32/Word64
-- Vector element insertion and extraction operations
+ | MO_V_Broadcast Length Width -- Broadcast a scalar into a vector
| MO_V_Insert Length Width -- Insert scalar into vector
| MO_V_Extract Length Width -- Extract scalar from vector
@@ -175,7 +176,12 @@ data MachOp
| MO_VU_Quot Length Width
| MO_VU_Rem Length Width
+ -- Vector shuffles
+ | MO_V_Shuffle Length Width [Int]
+ | MO_VF_Shuffle Length Width [Int]
+
-- Floating point vector element insertion and extraction operations
+ | MO_VF_Broadcast Length Width -- Broadcast a scalar into a vector
| MO_VF_Insert Length Width -- Insert scalar into vector
| MO_VF_Extract Length Width -- Extract scalar from vector
@@ -459,7 +465,7 @@ machOpResultType platform mop tys =
MO_F_Quot r -> cmmFloat r
MO_F_Neg r -> cmmFloat r
- MO_FMA _ r -> cmmFloat r
+ MO_FMA _ l r -> if l == 1 then cmmFloat r else cmmVec l (cmmFloat r)
MO_F_Eq {} -> comparisonResultRep platform
MO_F_Ne {} -> comparisonResultRep platform
@@ -485,6 +491,7 @@ machOpResultType platform mop tys =
MO_WF_Bitcast w -> cmmFloat w
MO_FW_Bitcast w -> cmmBits w
+ MO_V_Broadcast l w -> cmmVec l (cmmBits w)
MO_V_Insert l w -> cmmVec l (cmmBits w)
MO_V_Extract _ w -> cmmBits w
@@ -499,6 +506,10 @@ machOpResultType platform mop tys =
MO_VU_Quot l w -> cmmVec l (cmmBits w)
MO_VU_Rem l w -> cmmVec l (cmmBits w)
+ MO_V_Shuffle l w _ -> cmmVec l (cmmBits w)
+ MO_VF_Shuffle l w _ -> cmmVec l (cmmFloat w)
+
+ MO_VF_Broadcast l w -> cmmVec l (cmmFloat w)
MO_VF_Insert l w -> cmmVec l (cmmFloat w)
MO_VF_Extract _ w -> cmmFloat w
@@ -556,7 +567,7 @@ machOpArgReps platform op =
MO_F_Quot r -> [r,r]
MO_F_Neg r -> [r]
- MO_FMA _ r -> [r,r,r]
+ MO_FMA _ l r -> [vecwidth l r, vecwidth l r, vecwidth l r]
MO_F_Eq r -> [r,r]
MO_F_Ne r -> [r,r]
@@ -582,8 +593,13 @@ machOpArgReps platform op =
MO_WF_Bitcast w -> [w]
MO_FW_Bitcast w -> [w]
+ MO_V_Shuffle l r _ -> [vecwidth l r, vecwidth l r]
+ MO_VF_Shuffle l r _ -> [vecwidth l r, vecwidth l r]
+
+ MO_V_Broadcast _ r -> [r]
MO_V_Insert l r -> [vecwidth l r, r, W32]
MO_V_Extract l r -> [vecwidth l r, W32]
+ MO_VF_Broadcast _ r -> [r]
MO_VF_Insert l r -> [vecwidth l r, r, W32]
MO_VF_Extract l r -> [vecwidth l r, W32]
-- SIMD vector indices are always 32 bit
=====================================
compiler/GHC/Cmm/Opt.hs
=====================================
@@ -79,6 +79,11 @@ cmmMachOpFoldM
-> MachOp
-> [CmmExpr]
-> Maybe CmmExpr
+cmmMachOpFoldM _ (MO_V_Broadcast {}) _ = Nothing
+cmmMachOpFoldM _ (MO_VF_Broadcast {}) _ = Nothing
+ -- SIMD NCG TODO: supporting constant folding for vector operations
+ -- would require augmenting getRegister' to handle them.
+ -- See the code for "getRegister' platform _ (CmmLit lit)".
cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
= Just $! case op of
MO_S_Neg _ -> CmmLit (CmmInt (-x) rep)
=====================================
compiler/GHC/Cmm/Parser.y
=====================================
@@ -1053,10 +1053,10 @@ machOps = listToUFM $
( "fmul", MO_F_Mul ),
( "fquot", MO_F_Quot ),
- ( "fmadd" , MO_FMA FMAdd ),
- ( "fmsub" , MO_FMA FMSub ),
- ( "fnmadd", MO_FMA FNMAdd ),
- ( "fnmsub", MO_FMA FNMSub ),
+ ( "fmadd" , MO_FMA FMAdd 1 ),
+ ( "fmsub" , MO_FMA FMSub 1 ),
+ ( "fnmadd", MO_FMA FNMAdd 1 ),
+ ( "fnmsub", MO_FMA FNMSub 1 ),
( "feq", MO_F_Eq ),
( "fne", MO_F_Ne ),
=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -813,6 +813,8 @@ getRegister' config plat expr
MO_VS_Neg {} -> notUnary
MO_VU_Quot {} -> notUnary
MO_VU_Rem {} -> notUnary
+ MO_V_Shuffle {} -> notUnary
+ MO_VF_Shuffle {} -> notUnary
MO_VF_Insert {} -> notUnary
MO_VF_Extract {} -> notUnary
MO_VF_Add {} -> notUnary
@@ -825,6 +827,8 @@ getRegister' config plat expr
MO_AlignmentCheck {} ->
pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
+ MO_V_Broadcast {} -> vectorsNeedLlvm
+ MO_VF_Broadcast {} -> vectorsNeedLlvm
MO_VF_Neg {} -> vectorsNeedLlvm
where
notUnary = pprPanic "getRegister' (non-unary CmmMachOp with 1 argument):" (pdoc plat expr)
@@ -1170,6 +1174,8 @@ getRegister' config plat expr
MO_FF_Conv {} -> notDyadic
MO_WF_Bitcast {} -> notDyadic
MO_FW_Bitcast {} -> notDyadic
+ MO_V_Broadcast {} -> notDyadic
+ MO_VF_Broadcast {} -> notDyadic
MO_V_Insert {} -> notDyadic
MO_VF_Insert {} -> notDyadic
MO_AlignmentCheck {} -> notDyadic
@@ -1191,6 +1197,8 @@ getRegister' config plat expr
MO_VF_Neg {} -> vectorsNeedLlvm
MO_VF_Mul {} -> vectorsNeedLlvm
MO_VF_Quot {} -> vectorsNeedLlvm
+ MO_V_Shuffle {} -> vectorsNeedLlvm
+ MO_VF_Shuffle {} -> vectorsNeedLlvm
where
notDyadic =
pprPanic "getRegister' (non-dyadic CmmMachOp with 2 arguments): " $
@@ -1210,11 +1218,15 @@ getRegister' config plat expr
-- x86 fnmadd - x * y + z <=> AArch64 fmsub : d = - r1 * r2 + r3
-- x86 fnmsub - x * y - z <=> AArch64 fnmadd: d = - r1 * r2 - r3
- MO_FMA var w -> case var of
- FMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMAdd d n m a)
- FMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
- FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a)
- FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
+ MO_FMA var l w
+ | l == 1
+ -> case var of
+ FMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMAdd d n m a)
+ FMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
+ FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a)
+ FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
+ | otherwise
+ -> vectorsNeedLlvm
MO_V_Insert {} -> vectorsNeedLlvm
MO_VF_Insert {} -> vectorsNeedLlvm
=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -24,6 +24,7 @@ module GHC.CmmToAsm.Format (
isVecFormat,
cmmTypeFormat,
formatToWidth,
+ scalarWidth,
formatInBytes,
isFloatScalarFormat,
scalarFormatFormat,
=====================================
compiler/GHC/CmmToAsm/PPC/CodeGen.hs
=====================================
@@ -538,6 +538,8 @@ getRegister' config platform (CmmMachOp mop [x]) -- unary MachOps
MO_XX_Conv _ to -> conversionNop (intFormat to) x
+ MO_V_Broadcast {} -> vectorsNeedLlvm
+ MO_VF_Broadcast {} -> vectorsNeedLlvm
MO_VF_Neg {} -> vectorsNeedLlvm
_ -> panic "PPC.CodeGen.getRegister: no match"
@@ -667,6 +669,8 @@ getRegister' _ _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
MO_VF_Neg {} -> vectorsNeedLlvm
MO_VF_Mul {} -> vectorsNeedLlvm
MO_VF_Quot {} -> vectorsNeedLlvm
+ MO_V_Shuffle {} -> vectorsNeedLlvm
+ MO_VF_Shuffle {} -> vectorsNeedLlvm
_ -> panic "PPC.CodeGen.getRegister: no match"
@@ -692,12 +696,14 @@ getRegister' _ _ (CmmMachOp mop [x, y, z]) -- ternary PrimOps
-- x86 fnmadd - x * y + z ~~ PPC fnmsub rt = -(ra * rc - rb)
-- x86 fnmsub - x * y - z ~~ PPC fnmadd rt = -(ra * rc + rb)
- MO_FMA variant w ->
+ MO_FMA variant l w | l == 1 ->
case variant of
FMAdd -> fma_code w (FMADD FMAdd) x y z
FMSub -> fma_code w (FMADD FMSub) x y z
FNMAdd -> fma_code w (FMADD FNMAdd) x y z
FNMSub -> fma_code w (FMADD FNMSub) x y z
+ | otherwise
+ -> vectorsNeedLlvm
MO_V_Insert {} -> vectorsNeedLlvm
MO_VF_Insert {} -> vectorsNeedLlvm
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -944,6 +944,7 @@ getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
float_const_sse2 where
float_const_sse2
| f == 0.0 = do
+ -- TODO: this mishandles negative zero floating point literals.
let
format = floatFormat w
code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
@@ -951,9 +952,7 @@ getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
-- They all appear to do the same thing --SDM
return (Any format code)
- | otherwise = do
- Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
- loadFloatAmode w addr code
+ | otherwise = getFloatLitRegister lit
-- catch simple cases of zero- or sign-extended load
getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _ _]) = do
@@ -1010,9 +1009,10 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal (GlobalRegUse
LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
- sse2 <- sse2Enabled
- sse <- sseEnabled
- avx <- avxEnabled
+ sse4_1 <- sse4_1Enabled
+ sse2 <- sse2Enabled
+ sse <- sseEnabled
+ avx <- avxEnabled
case mop of
MO_F_Neg w -> sse2NegCode w x
@@ -1099,13 +1099,27 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_FS_Truncate from to -> coerceFP2Int from to x
MO_SF_Round from to -> coerceInt2FP from to x
- MO_VF_Neg l w | avx -> vector_float_negate_avx l w x
- | sse && sse2 -> vector_float_negate_sse l w x
+ MO_VF_Neg l w | avx -> vector_float_negate_avx l w x
+ | sse && w == W32 -> vector_float_negate_sse l w x
+ | sse2 && w == W64 -> vector_float_negate_sse l w x
| otherwise
- -> sorry "Please enable the -mavx or -msse, -msse2 flag"
+ -> sorry "Please enable the -mavx or -msse, -msse2 flag"
-- SIMD NCG TODO: add integer negation
MO_VS_Neg {} -> needLlvm mop
+ MO_VF_Broadcast l W32 | avx -> vector_float_broadcast_avx l W32 x
+ | sse4_1 -> vector_float_broadcast_sse l W32 x
+ | otherwise
+ -> sorry "Please enable the -mavx or -msse4 flag"
+ MO_VF_Broadcast l W64 | sse2 -> vector_float_broadcast_sse l W64 x
+ | otherwise -> sorry "Please enable the -msse2 flag"
+ MO_VF_Broadcast {} -> incorrectOperands
+
+ MO_V_Broadcast l W64 | sse2 -> vector_int_broadcast l W64 x
+ | otherwise -> sorry "Please enable the -msse2 flag"
+ -- SIMD NCG TODO: W32, W16, W8
+ MO_V_Broadcast {} -> needLlvm mop
+
-- Binary MachOps
MO_Add {} -> incorrectOperands
MO_Sub {} -> incorrectOperands
@@ -1150,6 +1164,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_VS_Rem {} -> incorrectOperands
MO_VU_Quot {} -> incorrectOperands
MO_VU_Rem {} -> incorrectOperands
+ MO_V_Shuffle {} -> incorrectOperands
+ MO_VF_Shuffle {} -> incorrectOperands
MO_VF_Extract {} -> incorrectOperands
MO_VF_Add {} -> incorrectOperands
@@ -1206,37 +1222,117 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
vector_float_negate_avx :: Length -> Width -> CmmExpr -> NatM Register
vector_float_negate_avx l w expr = do
- tmp <- getNewRegNat (VecFormat l FmtFloat)
- (reg, exp) <- getSomeReg expr
- Amode addr addr_code <- memConstant (mkAlignment $ widthInBytes W32) (CmmFloat 0.0 W32)
- let format = case w of
- W32 -> VecFormat l FmtFloat
- W64 -> VecFormat l FmtDouble
- _ -> pprPanic "Cannot negate vector of width" (ppr w)
- code dst = case w of
- W32 -> exp `appOL` addr_code `snocOL`
- (VBROADCAST format addr tmp) `snocOL`
- (VSUB format (OpReg reg) tmp dst)
- W64 -> exp `appOL` addr_code `snocOL`
- (MOVL format (OpAddr addr) (OpReg tmp)) `snocOL`
- (MOVH format (OpAddr addr) (OpReg tmp)) `snocOL`
- (VSUB format (OpReg reg) tmp dst)
- _ -> pprPanic "Cannot negate vector of width" (ppr w)
- return (Any format code)
+ let fmt :: Format
+ mask :: CmmLit
+ (fmt, mask) = case w of
+ W32 -> (VecFormat l FmtFloat , CmmInt (bit 31) w) -- TODO: these should be negative 0 floating point literals,
+ W64 -> (VecFormat l FmtDouble, CmmInt (bit 63) w) -- but we don't currently have those in Cmm.
+ _ -> panic "AVX floating-point negation: elements must be FF32 or FF64"
+ (maskReg, maskCode) <- getSomeReg (CmmMachOp (MO_VF_Broadcast l w) [CmmLit mask])
+ (reg, exp) <- getSomeReg expr
+ let code dst = maskCode `appOL`
+ exp `snocOL`
+ (VMOVU fmt (OpReg reg) (OpReg dst)) `snocOL`
+ (VXOR fmt (OpReg maskReg) dst dst)
+ return (Any fmt code)
vector_float_negate_sse :: Length -> Width -> CmmExpr -> NatM Register
vector_float_negate_sse l w expr = do
- tmp <- getNewRegNat (VecFormat l FmtFloat)
- (reg, exp) <- getSomeReg expr
- let format = case w of
- W32 -> VecFormat l FmtFloat
- W64 -> VecFormat l FmtDouble
- _ -> pprPanic "Cannot negate vector of width" (ppr w)
+ let fmt :: Format
+ mask :: CmmLit
+ (fmt, mask) = case w of
+ W32 -> (VecFormat l FmtFloat , CmmInt (bit 31) w) -- Same comment as for vector_float_negate_avx,
+ W64 -> (VecFormat l FmtDouble, CmmInt (bit 63) w) -- these should be -0.0 CmmFloat values.
+ _ -> panic "SSE floating-point negation: elements must be FF32 or FF64"
+ (maskReg, maskCode) <- getSomeReg (CmmMachOp (MO_VF_Broadcast l w) [CmmLit mask])
+ (reg, exp) <- getSomeReg expr
+ let code dst = maskCode `appOL`
+ exp `snocOL`
+ (MOVU fmt (OpReg reg) (OpReg dst)) `snocOL`
+ (XOR fmt (OpReg maskReg) (OpReg dst))
+ return (Any fmt code)
+
+ -----------------------
+
+ -- Like 'getSomeReg', but with special handling for int literals
+ -- used as floating point values, to work around the fact that we don't
+ -- have negative zero floating point literals in Cmm yet.
+ --
+ -- This should get removed once we have negative zero in CmmFloat.
+ get_float_broadcast_value_reg expr = case expr of
+ CmmLit lit -> do
+ r <- getFloatLitRegister lit
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+ _ -> getSomeReg expr
+
+ vector_float_broadcast_avx :: Length
+ -> Width
+ -> CmmExpr
+ -> NatM Register
+ vector_float_broadcast_avx len W32 expr
+ = do
+ (reg, exp) <- get_float_broadcast_value_reg expr
+ let f = VecFormat len FmtFloat
+ addr = spRel platform 0
+ in return $ Any f (\dst -> exp `snocOL`
+ (VMOVU f (OpReg reg) (OpAddr addr)) `snocOL`
+ (VBROADCAST f addr dst))
+ vector_float_broadcast_avx l w _
+ -- NB: for some reason, VBROADCASTSD does not support xmm, only ymm.
+ = pprPanic "vector_float_broadcast_avx" (text "l" <+> ppr l $$ text "w" <+> ppr w)
+
+ vector_float_broadcast_sse :: Length
+ -> Width
+ -> CmmExpr
+ -> NatM Register
+ vector_float_broadcast_sse len W32 expr
+ = do
+ (reg, exp) <- get_float_broadcast_value_reg expr
+ let f = VecFormat len FmtFloat
+ addr = spRel platform 0
code dst = exp `snocOL`
- (XOR format (OpReg tmp) (OpReg tmp)) `snocOL`
- (MOVU format (OpReg tmp) (OpReg dst)) `snocOL`
- (SUB format (OpReg reg) (OpReg dst))
- return (Any format code)
+ (MOVU f (OpReg reg) (OpAddr addr)) `snocOL`
+ (insertps $ 0b1110) `snocOL`
+ (insertps $ 16) `snocOL`
+ (insertps $ 32) `snocOL`
+ (insertps $ 48)
+ where
+ insertps imm =
+ INSERTPS f (ImmInt imm) (OpAddr addr) dst
+
+ in return $ Any f code
+ vector_float_broadcast_sse len W64 expr
+ = do
+ (reg, exp) <- get_float_broadcast_value_reg expr
+ let f = VecFormat len FmtDouble
+ addr = spRel platform 0
+ in return $ Any f (\dst -> exp `snocOL`
+ (MOVU f (OpReg reg) (OpAddr addr)) `snocOL`
+ (MOVL f (OpAddr addr) (OpReg dst)) `snocOL`
+ (MOVH f (OpAddr addr) (OpReg dst)))
+ vector_float_broadcast_sse _ _ c
+ = pprPanic "Broadcast not supported for : " (pdoc platform c)
+
+ vector_int_broadcast :: Length
+ -> Width
+ -> CmmExpr
+ -> NatM Register
+ vector_int_broadcast len W64 expr
+ = do
+ (reg, exp) <- getSomeReg expr
+ let fmt = VecFormat len FmtInt64
+ return $ Any fmt (\dst -> exp `snocOL`
+ (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
+ (PUNPCKLQDQ fmt (OpReg dst) dst)
+ )
+ vector_int_broadcast _ _ c
+ = pprPanic "Broadcast not supported for : " (pdoc platform c)
+
getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
sse2 <- sse2Enabled
@@ -1294,6 +1390,16 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_U_Shr rep -> shift_code rep SHR x y {-False-}
MO_S_Shr rep -> shift_code rep SAR x y {-False-}
+ MO_VF_Shuffle l w is
+ | l * widthInBits w == 128
+ -> if
+ | avx
+ -> vector_shuffle_float l w x y is
+ | otherwise
+ -> sorry "Please enable the -mavx flag"
+ | otherwise
+ -> sorry "Please use -fllvm for wide shuffle instructions"
+
MO_VF_Extract l W32 | avx -> vector_float_extract l W32 x y
| sse -> vector_float_extract_sse l W32 x y
| otherwise
@@ -1332,6 +1438,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
-> sorry "Please enable the -mavx or -msse flag"
-- SIMD NCG TODO: integer vector operations
+ MO_V_Shuffle {} -> needLlvm mop
MO_V_Add {} -> needLlvm mop
MO_V_Sub {} -> needLlvm mop
MO_V_Mul {} -> needLlvm mop
@@ -1356,6 +1463,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_AlignmentCheck {} -> incorrectOperands
MO_VS_Neg {} -> incorrectOperands
MO_VF_Neg {} -> incorrectOperands
+ MO_V_Broadcast {} -> incorrectOperands
+ MO_VF_Broadcast {} -> incorrectOperands
-- Ternary MachOps
MO_FMA {} -> incorrectOperands
@@ -1674,13 +1783,95 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
vector_int_extract_sse _ w c e
= pprPanic "Unsupported SSE floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
+ vector_shuffle_float :: Length -> Width -> CmmExpr -> CmmExpr -> [Int] -> NatM Register
+ vector_shuffle_float l w v1 v2 is = do
+ (r1, exp1) <- getSomeReg v1
+ (r2, exp2) <- getSomeReg v2
+ let fmt = VecFormat l (if w == W32 then FmtFloat else FmtDouble)
+ code dst
+ = exp1 `appOL` (exp2 `appOL` shuffleInstructions fmt r1 r2 is dst)
+ return (Any fmt code)
+
+ shuffleInstructions :: Format -> Reg -> Reg -> [Int] -> Reg -> OrdList Instr
+ shuffleInstructions fmt v1 v2 is dst =
+ case fmt of
+ VecFormat 2 FmtDouble ->
+ case is of
+ [i1, i2] -> case (i1, i2) of
+ (0,0) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v1) v1 dst)
+ (1,1) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v1) v1 dst)
+ (2,2) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v2) v2 dst)
+ (3,3) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v2) v2 dst)
+ (0,1) -> unitOL (VMOVU fmt (OpReg v1) (OpReg dst))
+ (2,3) -> unitOL (VMOVU fmt (OpReg v2) (OpReg dst))
+ (1,0) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v1) v1 dst)
+ (3,2) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v2) v2 dst)
+ (0,2) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v2) v1 dst)
+ (2,0) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v1) v2 dst)
+ (0,3) -> unitOL (VSHUFPD fmt (ImmInt 0b10) (OpReg v2) v1 dst)
+ (3,0) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v1) v2 dst)
+ (1,2) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v2) v1 dst)
+ (2,1) -> unitOL (VSHUFPD fmt (ImmInt 0b10) (OpReg v1) v2 dst)
+ (1,3) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v2) v1 dst)
+ (3,1) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v1) v2 dst)
+ _ -> pprPanic "vector shuffle: indices out of bounds 0 <= i <= 3" (ppr is)
+ _ -> pprPanic "vector shuffle: wrong number of indices (expected 2)" (ppr is)
+ VecFormat 4 FmtFloat
+ -- indices 0 <= i <= 7
+ | all ( (>= 0) <&&> (<= 7) ) is ->
+ case is of
+ [i1, i2, i3, i4]
+ | all ( <= 3 ) is
+ , let imm = i1 + i2 `shiftL` 2 + i3 `shiftL` 4 + i4 `shiftL` 6
+ -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v1) v1 dst)
+ | all ( >= 4 ) is
+ , let [j1, j2, j3, j4] = map ( subtract 4 ) is
+ imm = j1 + j2 `shiftL` 2 + j3 `shiftL` 4 + j4 `shiftL` 6
+ -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v2) v2 dst)
+ | i1 <= 3, i2 <= 3
+ , i3 >= 4, i4 >= 4
+ , let imm = i1 + i2 `shiftL` 2 + (i3 - 4) `shiftL` 4 + (i4 - 4) `shiftL` 6
+ -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v2) v1 dst)
+ | i1 >= 4, i2 >= 4
+ , i3 <= 3, i4 <= 3
+ , let imm = (i1 - 4) + (i2 - 4) `shiftL` 2 + i3 `shiftL` 4 + i4 `shiftL` 6
+ -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v1) v2 dst)
+ | otherwise
+ ->
+ -- Fall-back code with 4 INSERTPS operations.
+ -- SIMD NCG TODO: handle more cases with better lowering.
+ let -- bits: ss_dd_zzzz
+ -- ss: pick source location
+ -- dd: pick destination location
+ -- zzzz: pick locations to be zeroed
+ insertImm src dst = shiftL ( src `mod` 4 ) 6
+ .|. shiftL dst 4
+ vec src = if src >= 4 then v2 else v1
+ in unitOL
+ (INSERTPS fmt (ImmInt $ insertImm i1 0 .|. 0b1110) (OpReg $ vec i1) dst)
+ `snocOL`
+ (INSERTPS fmt (ImmInt $ insertImm i2 1) (OpReg $ vec i2) dst)
+ `snocOL`
+ (INSERTPS fmt (ImmInt $ insertImm i3 2) (OpReg $ vec i3) dst)
+ `snocOL`
+ (INSERTPS fmt (ImmInt $ insertImm i4 3) (OpReg $ vec i4) dst)
+ _ -> pprPanic "vector shuffle: wrong number of indices (expected 4)" (ppr is)
+ | otherwise
+ -> pprPanic "vector shuffle: indices out of bounds 0 <= i <= 7" (ppr is)
+ _ ->
+ pprPanic "vector shuffle: unsupported format" (ppr fmt)
+
getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
sse4_1 <- sse4_1Enabled
sse2 <- sse2Enabled
sse <- sseEnabled
case mop of
-- Floating point fused multiply-add operations @ ± x*y ± z@
- MO_FMA var w -> genFMA3Code w var x y z
+ MO_FMA var l w
+ | l * widthInBits w > 256
+ -> sorry "Please use -fllvm for wide vector FMA support"
+ | otherwise
+ -> genFMA3Code l w var x y z
-- Ternary vector operations
MO_VF_Insert l W32 | sse4_1 && sse -> vector_float_insert_sse l W32 x y z
@@ -1875,33 +2066,48 @@ getRegister' platform is32Bit (CmmLit lit)
-- note2: all labels are small, because we're assuming the
-- small memory model. See Note [%rip-relative addressing on x86-64].
-getRegister' platform _ (CmmLit lit)
- | isVecType cmmtype = vectorRegister cmmtype
- | otherwise = standardRegister cmmtype
- where
- cmmtype = cmmLitType platform lit
- vectorRegister ctype
- | case lit of { CmmVec fs -> all (\case { CmmInt i _ -> i == 0; CmmFloat f _ -> f == 0; _ -> False }) fs; _ -> False }
- = -- NOTE:
- -- This operation is only used to zero a register. For loading a
- -- vector literal there are pack and broadcast operations
- let format = cmmTypeFormat ctype
- code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
- in return (Any format code)
+getRegister' platform _ (CmmLit lit) =
+ case fmt of
+ VecFormat l sFmt
+ | case lit of { CmmVec fs -> all is_zero fs; _ -> False }
+ -> let code dst = unitOL (XOR fmt (OpReg dst) (OpReg dst))
+ in return (Any fmt code)
+ | Just f <- case lit of { CmmVec (f:fs) | all (== f) fs -> Just f; _ -> Nothing }
+ -> do config <- getConfig
+ let w = scalarWidth sFmt
+ broadcast = if isFloatScalarFormat sFmt
+ then MO_VF_Broadcast l w
+ else MO_V_Broadcast l w
+ (valReg, valCode) <- getSomeReg (CmmMachOp broadcast [CmmLit f])
+ let code dst =
+ valCode `snocOL`
+ (mkRegRegMoveInstr config fmt valReg dst)
+ return $ Any fmt code
| otherwise
- = pprPanic "getRegister': no support for (nonzero) vector literals" $
- vcat [ text "lit:" <+> ppr lit ]
- -- SIMD NCG TODO: can we do better here?
- standardRegister ctype
- = do
- let format = cmmTypeFormat ctype
- imm = litToImm lit
- code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
- return (Any format code)
+ -- SIMD NCG TODO: handle this case as well.
+ -> pprPanic "getRegister': non-constant vector literals not supported"
+ (ppr lit)
+ where
+ is_zero (CmmInt i _) = i == 0
+ is_zero (CmmFloat f _) = f == 0 -- TODO: mishandles negative zero
+ is_zero _ = False
+
+ _ -> let imm = litToImm lit
+ code dst = unitOL (MOV fmt (OpImm imm) (OpReg dst))
+ in return (Any fmt code)
+ where
+ cmmTy = cmmLitType platform lit
+ fmt = cmmTypeFormat cmmTy
getRegister' platform _ other
= pprPanic "getRegister(x86)" (pdoc platform other)
+getFloatLitRegister :: CmmLit -> NatM Register
+getFloatLitRegister lit = do
+ let w :: Width
+      w = case lit of { CmmInt _ w -> w; CmmFloat _ w -> w; _ -> pprPanic "getFloatLitRegister" (ppr lit) }
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ loadFloatAmode w addr code
intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
-> NatM (Reg -> InstrBlock)
@@ -3852,10 +4058,11 @@ _ `regClashesWithOp` _ = False
-- | Generate code for a fused multiply-add operation, of the form @± x * y ± z@,
-- with 3 operands (FMA3 instruction set).
-genFMA3Code :: Width
+genFMA3Code :: Length
+ -> Width
-> FMASign
-> CmmExpr -> CmmExpr -> CmmExpr -> NatM Register
-genFMA3Code w signs x y z = do
+genFMA3Code l w signs x y z = do
config <- getConfig
-- For the FMA instruction, we want to compute x * y + z
--
@@ -3883,7 +4090,11 @@ genFMA3Code w signs x y z = do
-- only possible if the other arguments don't use the destination register.
-- We check for this and if there is a conflict we move the result only after
-- the computation. See #24496 how this went wrong in the past.
- let rep = floatFormat w
+ let rep
+ | l == 1
+ = floatFormat w
+ | otherwise
+ = vecFormat (cmmVec l $ cmmFloat w)
(y_reg, y_code) <- getNonClobberedReg y
(z_op, z_code) <- getNonClobberedOperand z
x_code <- getAnyReg x
=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -259,6 +259,8 @@ data Instr
| AND Format Operand Operand
| OR Format Operand Operand
| XOR Format Operand Operand
+ -- | AVX bitwise logical XOR operation
+ | VXOR Format Operand Reg Reg
| NOT Format Operand
| NEGI Format Operand -- NEG instruction (name clash with Cond)
| BSWAP Format Reg
@@ -295,8 +297,9 @@ data Instr
-- | FMA3 fused multiply-add operations.
| FMA3 Format FMASign FMAPermutation Operand Reg Reg
- -- src3 (r/m), src2 (r), dst/src1 (r)
- -- This is exactly reversed from how intel lists the arguments.
+ -- For the FMA213 permutation (the only one we use currently),
+ -- this is: src3 (r/m), src2 (r), dst/src1 (r)
+     -- (NB: this is exactly reversed from how Intel lists the arguments.)
-- use ADD, SUB, and SQRT for arithmetic. In both cases, operands
-- are Operand Reg.
@@ -476,9 +479,16 @@ regUsageOfInstr platform instr
OR fmt src dst -> usageRM fmt src dst
XOR fmt (OpReg src) (OpReg dst)
- | src == dst -> mkRU [] [mk fmt dst]
+ | src == dst
+ -> mkRU [] [mk fmt dst]
+ XOR fmt src dst
+ -> usageRM fmt src dst
+ VXOR fmt (OpReg src1) src2 dst
+ | src1 == src2, src1 == dst
+ -> mkRU [] [mk fmt dst]
+ VXOR fmt src1 src2 dst
+ -> mkRU (use_R fmt src1 [mk fmt src2]) [mk fmt dst]
- XOR fmt src dst -> usageRM fmt src dst
NOT fmt op -> usageM fmt op
BSWAP fmt reg -> mkRU [mk fmt reg] [mk fmt reg]
NEGI fmt op -> usageM fmt op
@@ -721,6 +731,7 @@ patchRegsOfInstr platform instr env
AND fmt src dst -> patch2 (AND fmt) src dst
OR fmt src dst -> patch2 (OR fmt) src dst
XOR fmt src dst -> patch2 (XOR fmt) src dst
+ VXOR fmt src1 src2 dst -> VXOR fmt (patchOp src1) (env src2) (env dst)
NOT fmt op -> patch1 (NOT fmt) op
BSWAP fmt reg -> BSWAP fmt (env reg)
NEGI fmt op -> patch1 (NEGI fmt) op
@@ -763,6 +774,8 @@ patchRegsOfInstr platform instr env
LOCATION {} -> instr
UNWIND {} -> instr
DELTA _ -> instr
+ LDATA {} -> instr
+ NEWBLOCK {} -> instr
JXX _ _ -> instr
JXX_GBL _ _ -> instr
@@ -829,8 +842,6 @@ patchRegsOfInstr platform instr env
PUNPCKLQDQ fmt src dst
-> PUNPCKLQDQ fmt (patchOp src) (env dst)
- _other -> panic "patchRegs: unrecognised instr"
-
where
patch1 :: (Operand -> a) -> Operand -> a
patch1 insn op = insn $! patchOp op
=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -752,11 +752,14 @@ pprInstr platform i = case i of
XOR format src dst
-> pprFormatOpOp (text "xor") format src dst
+ VXOR fmt src1 src2 dst
+ -> pprVxor fmt src1 src2 dst
+
POPCNT format src dst
-> pprOpOp (text "popcnt") format src (OpReg dst)
LZCNT format src dst
- -> pprOpOp (text "lzcnt") format src (OpReg dst)
+ -> pprOpOp (text "lzcnt") format src (OpReg dst)
TZCNT format src dst
-> pprOpOp (text "tzcnt") format src (OpReg dst)
@@ -1304,6 +1307,23 @@ pprInstr platform i = case i of
pprReg platform format reg3
]
+ pprVxor :: Format -> Operand -> Reg -> Reg -> doc
+ pprVxor fmt src1 src2 dst
+ = line $ hcat [
+ pprGenMnemonic mem fmt,
+ pprOperand platform fmt src1,
+ comma,
+ pprReg platform fmt src2,
+ comma,
+ pprReg platform fmt dst
+ ]
+ where
+ mem = case fmt of
+ VecFormat _ FmtFloat -> text "vxorps"
+ VecFormat _ FmtDouble -> text "vxorpd"
+          _ -> pprPanic "GHC.CmmToAsm.X86.Ppr.pprVxor: element type must be Float or Double"
+ (ppr fmt)
+
pprInsert :: Line doc -> Format -> Imm -> Operand -> Reg -> doc
pprInsert name format off src dst
= line $ hcat [
=====================================
compiler/GHC/CmmToC.hs
=====================================
@@ -727,7 +727,7 @@ pprMachOp_for_C platform mop = case mop of
MO_F_Quot _ -> char '/'
-- Floating-point fused multiply-add operations
- MO_FMA FMAdd w ->
+ MO_FMA FMAdd 1 w ->
case w of
W32 -> text "fmaf"
W64 -> text "fma"
@@ -736,10 +736,15 @@ pprMachOp_for_C platform mop = case mop of
(text "FMAdd")
(panic $ "PprC.pprMachOp_for_C: FMAdd unsupported"
++ "at width " ++ show w)
- MO_FMA var _ ->
- pprTrace "offending mop:"
+ MO_FMA var l width
+ | l == 1
+ -> pprTrace "offending mop:"
(text $ "FMA " ++ show var)
(panic $ "PprC.pprMachOp_for_C: should have been handled earlier!")
+ | otherwise
+ -> pprTrace "offending mop:"
+ (text $ "FMA " ++ show var ++ " " ++ show l ++ " " ++ show width)
+ (panic $ "PprC.pprMachOp_for_C: unsupported vector operation")
-- Signed comparisons
MO_S_Ge _ -> text ">="
@@ -828,6 +833,14 @@ pprMachOp_for_C platform mop = case mop of
MO_AlignmentCheck {} -> panic "-falignment-sanitisation not supported by unregisterised backend"
-- SIMD vector instructions: currently unsupported
+ MO_V_Shuffle {} -> pprTrace "offending mop:"
+ (text "MO_V_Shuffle")
+ (panic $ "PprC.pprMachOp_for_C: MO_V_Shuffle"
+ ++ "unsupported by the unregisterised backend")
+ MO_VF_Shuffle {} -> pprTrace "offending mop:"
+ (text "MO_VF_Shuffle")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Shuffle"
+ ++ "unsupported by the unregisterised backend")
MO_V_Insert {} -> pprTrace "offending mop:"
(text "MO_V_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_V_Insert"
@@ -868,6 +881,14 @@ pprMachOp_for_C platform mop = case mop of
(text "MO_VU_Rem")
(panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
++ "unsupported by the unregisterised backend")
+ MO_V_Broadcast {} -> pprTrace "offending mop:"
+ (text "MO_V_Broadcast")
+ (panic $ "PprC.pprMachOp_for_C: MO_V_Broadcast"
+ ++ "unsupported by the unregisterised backend")
+ MO_VF_Broadcast {} -> pprTrace "offending mop:"
+ (text "MO_VF_Broadcast")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Broadcast"
+ ++ "unsupported by the unregisterised backend")
MO_VF_Insert {} -> pprTrace "offending mop:"
(text "MO_VF_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1460,6 +1460,9 @@ genMachOp _ op [x] = case op of
all0s = LMLitVar $ LMVectorLit (replicate len all0)
in negateVec vecty all0s LM_MO_FSub
+ MO_V_Broadcast l w -> genBroadcastOp l w x
+ MO_VF_Broadcast l w -> genBroadcastOp l w x
+
MO_RelaxedRead w -> exprToVar (CmmLoad x (cmmBits w) NaturallyAligned)
MO_AlignmentCheck _ _ -> panic "-falignment-sanitisation is not supported by -fllvm"
@@ -1491,7 +1494,7 @@ genMachOp _ op [x] = case op of
MO_F_Mul _ -> panicOp
MO_F_Quot _ -> panicOp
- MO_FMA _ _ -> panicOp
+ MO_FMA _ _ _ -> panicOp
MO_F_Eq _ -> panicOp
MO_F_Ne _ -> panicOp
@@ -1523,6 +1526,9 @@ genMachOp _ op [x] = case op of
MO_VF_Insert _ _ -> panicOp
MO_VF_Extract _ _ -> panicOp
+ MO_V_Shuffle {} -> panicOp
+ MO_VF_Shuffle {} -> panicOp
+
MO_VF_Add _ _ -> panicOp
MO_VF_Sub _ _ -> panicOp
MO_VF_Mul _ _ -> panicOp
@@ -1676,7 +1682,7 @@ genMachOp_slow opt op [x, y] = case op of
MO_F_Mul _ -> genBinMach LM_MO_FMul
MO_F_Quot _ -> genBinMach LM_MO_FDiv
- MO_FMA _ _ -> panicOp
+ MO_FMA _ _ _ -> panicOp
MO_And _ -> genBinMach LM_MO_And
MO_Or _ -> genBinMach LM_MO_Or
@@ -1716,9 +1722,14 @@ genMachOp_slow opt op [x, y] = case op of
MO_VS_Neg {} -> panicOp
+ MO_VF_Broadcast {} -> panicOp
+ MO_V_Broadcast {} -> panicOp
MO_V_Insert {} -> panicOp
MO_VF_Insert {} -> panicOp
+ MO_V_Shuffle _ _ is -> genShuffleOp is x y
+ MO_VF_Shuffle _ _ is -> genShuffleOp is x y
+
MO_VF_Neg {} -> panicOp
MO_RelaxedRead {} -> panicOp
@@ -1815,7 +1826,7 @@ genMachOp_slow _opt op [x, y, z] = do
panicOp = panic $ "LLVM.CodeGen.genMachOp_slow: non-ternary op encountered "
++ "with three arguments! (" ++ show op ++ ")"
case op of
- MO_FMA var width ->
+ MO_FMA var lg width ->
case var of
-- LLVM only has the fmadd variant.
FMAdd -> genFmaOp x y z
@@ -1825,12 +1836,43 @@ genMachOp_slow _opt op [x, y, z] = do
FNMAdd -> genFmaOp (neg x) y z
FNMSub -> genFmaOp (neg x) y (neg z)
where
- neg x = CmmMachOp (MO_F_Neg width) [x]
+ neg x
+ | lg == 1
+ = CmmMachOp (MO_F_Neg width) [x]
+ | otherwise
+ = CmmMachOp (MO_VF_Neg lg width) [x]
_ -> panicOp
-- More than three expressions, invalid!
genMachOp_slow _ _ _ = panic "genMachOp_slow: More than 3 expressions in MachOp!"
+genBroadcastOp :: Int -> Width -> CmmExpr -> LlvmM ExprData
+genBroadcastOp lg _width x = runExprData $ do
+ -- To broadcast a scalar x as a vector v:
+ -- 1. insert x at the 0 position of the zero vector
+ -- 2. shuffle x into all positions
+ var_x <- exprToVarW x
+ let tx = getVarType var_x
+ tv = LMVector lg tx
+ z = if isFloat tx
+ then LMFloatLit 0 tx
+ else LMIntLit 0 tx
+ zs = LMLitVar $ LMVectorLit $ replicate lg z
+ w <- doExprW tv $ Insert zs var_x (LMLitVar $ LMIntLit 0 (LMInt 32))
+ doExprW tv $ Shuffle w w (replicate lg 0)
+
+genShuffleOp :: [Int] -> CmmExpr -> CmmExpr -> LlvmM ExprData
+genShuffleOp is x y = runExprData $ do
+ vx <- exprToVarW x
+ vy <- exprToVarW y
+ let tx = getVarType vx
+ ty = getVarType vy
+ Panic.massertPpr
+ (tx == ty)
+ (vcat [ text "shuffle: mismatched arg types"
+ , ppLlvmType tx, ppLlvmType ty ])
+ doExprW tx $ Shuffle vx vy is
+
-- | Generate code for a fused multiply-add operation.
genFmaOp :: CmmExpr -> CmmExpr -> CmmExpr -> LlvmM ExprData
genFmaOp x y z = runExprData $ do
@@ -1847,6 +1889,12 @@ genFmaOp x y z = runExprData $ do
let fname = case tx of
LMFloat -> fsLit "llvm.fma.f32"
LMDouble -> fsLit "llvm.fma.f64"
+ LMVector 4 LMFloat -> fsLit "llvm.fma.v4f32"
+ LMVector 8 LMFloat -> fsLit "llvm.fma.v8f32"
+ LMVector 16 LMFloat -> fsLit "llvm.fma.v16f32"
+ LMVector 2 LMDouble -> fsLit "llvm.fma.v2f64"
+ LMVector 4 LMDouble -> fsLit "llvm.fma.v4f64"
+ LMVector 8 LMDouble -> fsLit "llvm.fma.v8f64"
_ -> pprPanic "CmmToLlvm.genFmaOp: unsupported type" (ppLlvmType tx)
fptr <- liftExprData $ getInstrinct fname ty [tx, ty, tz]
doExprW tx $ Call StdCall fptr [vx, vy, vz] [ReadNone, NoUnwind]
=====================================
compiler/GHC/Llvm/Ppr.hs
=====================================
@@ -281,6 +281,7 @@ ppLlvmExpression opts expr
Extract vec idx -> ppExtract opts vec idx
ExtractV struct idx -> ppExtractV opts struct idx
Insert vec elt idx -> ppInsert opts vec elt idx
+ Shuffle v1 v2 idxs -> ppShuffle opts v1 v2 idxs
GetElemPtr inb ptr indexes -> ppGetElementPtr opts inb ptr indexes
Load ptr align -> ppLoad opts ptr align
ALoad ord st ptr -> ppALoad opts ord st ptr
@@ -577,6 +578,15 @@ ppInsert opts vec elt idx =
{-# SPECIALIZE ppInsert :: LlvmCgConfig -> LlvmVar -> LlvmVar -> LlvmVar -> SDoc #-}
{-# SPECIALIZE ppInsert :: LlvmCgConfig -> LlvmVar -> LlvmVar -> LlvmVar -> HLine #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
+ppShuffle :: IsLine doc => LlvmCgConfig -> LlvmVar -> LlvmVar -> [Int] -> doc
+ppShuffle opts v1 v2 idxs =
+ text "shufflevector"
+ <+> ppLlvmType (getVarType v1) <+> ppName opts v1 <> comma
+ <+> ppLlvmType (getVarType v2) <+> ppName opts v2 <> comma
+ <+> ppLlvmType (LMVector (length idxs) (LMInt 32)) <+> ppLit opts (LMVectorLit $ map ((`LMIntLit` (LMInt 32)) . fromIntegral) idxs)
+{-# SPECIALIZE ppShuffle :: LlvmCgConfig -> LlvmVar -> LlvmVar -> [Int] -> SDoc #-}
+{-# SPECIALIZE ppShuffle :: LlvmCgConfig -> LlvmVar -> LlvmVar -> [Int] -> HLine #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
+
ppMetaAnnotExpr :: IsLine doc => LlvmCgConfig -> [MetaAnnot] -> LlvmExpression -> doc
ppMetaAnnotExpr opts meta expr =
ppLlvmExpression opts expr <> ppMetaAnnots opts meta
=====================================
compiler/GHC/Llvm/Syntax.hs
=====================================
@@ -237,6 +237,10 @@ data LlvmExpression
-}
| Insert LlvmVar LlvmVar LlvmVar
+ {- | Shuffle two vectors into a destination vector using given indices
+ -}
+ | Shuffle LlvmVar LlvmVar [Int]
+
{- |
Allocate amount * sizeof(tp) bytes on the heap
* tp: LlvmType to reserve room for
=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -949,16 +949,8 @@ emitPrimOp cfg primop =
-- SIMD primops
(VecBroadcastOp vcat n w) -> \[e] -> opIntoRegs $ \[res] -> do
checkVecCompatibility cfg vcat n w
- doVecPackOp ty zeros (replicate n e) res
+ doVecBroadcastOp ty e res
where
- zeros :: CmmExpr
- zeros = CmmLit $ CmmVec (replicate n zero)
-
- zero :: CmmLit
- zero = case vcat of
- IntVec -> CmmInt 0 w
- WordVec -> CmmInt 0 w
- FloatVec -> CmmFloat 0 w
ty :: CmmType
ty = vecVmmType vcat n w
@@ -1085,6 +1077,10 @@ emitPrimOp cfg primop =
ty :: CmmType
ty = vecCmmCat vcat w
+ VecShuffleOp vcat n w -> \ args -> opIntoRegs $ \ [res] -> do
+ checkVecCompatibility cfg vcat n w
+ doShuffleOp (vecVmmType vcat n w) args res
+
-- Prefetch
PrefetchByteArrayOp3 -> \args -> opIntoRegs $ \[] ->
doPrefetchByteArrayOp 3 args
@@ -1491,10 +1487,10 @@ emitPrimOp cfg primop =
DoubleDivOp -> opTranslate (MO_F_Quot W64)
DoubleNegOp -> opTranslate (MO_F_Neg W64)
- DoubleFMAdd -> fmaOp FMAdd W64
- DoubleFMSub -> fmaOp FMSub W64
- DoubleFNMAdd -> fmaOp FNMAdd W64
- DoubleFNMSub -> fmaOp FNMSub W64
+ DoubleFMAdd -> fmaOp FMAdd 1 W64
+ DoubleFMSub -> fmaOp FMSub 1 W64
+ DoubleFNMAdd -> fmaOp FNMAdd 1 W64
+ DoubleFNMSub -> fmaOp FNMSub 1 W64
-- Float ops
@@ -1511,10 +1507,10 @@ emitPrimOp cfg primop =
FloatDivOp -> opTranslate (MO_F_Quot W32)
FloatNegOp -> opTranslate (MO_F_Neg W32)
- FloatFMAdd -> fmaOp FMAdd W32
- FloatFMSub -> fmaOp FMSub W32
- FloatFNMAdd -> fmaOp FNMAdd W32
- FloatFNMSub -> fmaOp FNMSub W32
+ FloatFMAdd -> fmaOp FMAdd 1 W32
+ FloatFMSub -> fmaOp FMSub 1 W32
+ FloatFNMAdd -> fmaOp FNMAdd 1 W32
+ FloatFNMSub -> fmaOp FNMSub 1 W32
-- Vector ops
@@ -1542,6 +1538,12 @@ emitPrimOp cfg primop =
(VecRemOp WordVec n w) -> opTranslate (MO_VU_Rem n w)
(VecNegOp WordVec _ _) -> \_ -> panic "unsupported primop"
+ -- Vector FMA instructions
+ VecFMAdd _ n w -> fmaOp FMAdd n w
+ VecFMSub _ n w -> fmaOp FMSub n w
+ VecFNMAdd _ n w -> fmaOp FNMAdd n w
+ VecFNMSub _ n w -> fmaOp FNMSub n w
+
-- Conversions
IntToDoubleOp -> opTranslate (MO_SF_Round (wordWidth platform) W64)
@@ -1839,10 +1841,11 @@ emitPrimOp cfg primop =
allowFMA = stgToCmmAllowFMAInstr cfg
- fmaOp :: FMASign -> Width -> [CmmActual] -> PrimopCmmEmit
- fmaOp signs w args@[arg_x, arg_y, arg_z]
- | allowFMA signs
- = opTranslate (MO_FMA signs w) args
+ fmaOp :: FMASign -> Length -> Width -> [CmmActual] -> PrimopCmmEmit
+ fmaOp signs l w args@[arg_x, arg_y, arg_z]
+ | allowFMA signs
+ || l > 1 -- (always use the MachOp for vector FMA)
+ = opTranslate (MO_FMA signs l w) args
| otherwise
= case signs of
@@ -1851,12 +1854,16 @@ emitPrimOp cfg primop =
-- Other fused multiply-add operations are implemented in terms of fmadd
-- This is sound: it does not lose any precision.
- FMSub -> fmaOp FMAdd w [arg_x, arg_y, neg arg_z]
- FNMAdd -> fmaOp FMAdd w [neg arg_x, arg_y, arg_z]
- FNMSub -> fmaOp FMAdd w [neg arg_x, arg_y, neg arg_z]
+ FMSub -> fmaOp FMAdd l w [arg_x, arg_y, neg arg_z]
+ FNMAdd -> fmaOp FMAdd l w [neg arg_x, arg_y, arg_z]
+ FNMSub -> fmaOp FMAdd l w [neg arg_x, arg_y, neg arg_z]
where
- neg x = CmmMachOp (MO_F_Neg w) [x]
- fmaOp _ _ _ = panic "fmaOp: wrong number of arguments (expected 3)"
+ neg x
+ | l == 1
+ = CmmMachOp (MO_F_Neg w) [x]
+ | otherwise
+ = CmmMachOp (MO_VF_Neg l w) [x]
+ fmaOp _ _ _ _ = panic "fmaOp: wrong number of arguments (expected 3)"
data PrimopCmmEmit
-- | Out of line fake primop that's actually just a foreign call to other
@@ -2588,6 +2595,21 @@ checkVecCompatibility cfg vcat l w =
------------------------------------------------------------------------------
-- Helpers for translating vector packing and unpacking.
+doVecBroadcastOp :: CmmType -- Type of vector
+ -> CmmExpr -- Element
+ -> CmmFormal -- Destination for result
+ -> FCode ()
+doVecBroadcastOp ty e dst
+ | isFloatType (vecElemType ty)
+ = emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Broadcast len wid) [e])
+ | otherwise
+ = emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Broadcast len wid) [e])
+ where
+ len :: Length
+ len = vecLength ty
+ wid :: Width
+ wid = typeWidth (vecElemType ty)
+
doVecPackOp :: CmmType -- Type of vector
-> [CmmExpr] -- Elements
-> CmmFormal -- Destination for result
@@ -2666,6 +2688,36 @@ doVecInsertOp ty src e idx res = do
wid :: Width
wid = typeWidth (vecElemType ty)
+------------------------------------------------------------------------------
+-- Shuffles
+
+doShuffleOp :: CmmType -> [CmmExpr] -> LocalReg -> FCode ()
+doShuffleOp ty (v1:v2:idxs) res
+ | isVecType ty
+ = case mapMaybe idx_maybe idxs of
+ is
+ | length is == len
+ -> emitAssign (CmmLocal res) (CmmMachOp (mo is) [v1,v2])
+ | otherwise
+ -> pprPanic "doShuffleOp" $
+ vcat [ text "shuffle indices must be literals, 0 <= i <" <+> ppr len ]
+ | otherwise
+ = pprPanic "doShuffleOp" $
+ vcat [ text "non-vector argument type:" <+> ppr ty ]
+ where
+ len = vecLength ty
+ wid = typeWidth $ vecElemType ty
+ mo = if isFloatType (vecElemType ty)
+ then MO_VF_Shuffle len wid
+ else MO_V_Shuffle len wid
+ idx_maybe (CmmLit (CmmInt i _))
+ | let j :: Int; j = fromInteger i
+ , j >= 0, j < 2 * len
+ = Just j
+ idx_maybe _ = Nothing
+doShuffleOp _ _ _ =
+ panic "doShuffleOp: wrong number of arguments"
+
------------------------------------------------------------------------------
-- Helpers for translating prefetching.
=====================================
compiler/GHC/StgToJS/Prim.hs
=====================================
@@ -1192,12 +1192,18 @@ genPrim prof bound ty op = case op of
VecReadOffAddrOp _ _ _ -> unhandledPrimop op
VecWriteOffAddrOp _ _ _ -> unhandledPrimop op
+ VecFMAdd {} -> unhandledPrimop op
+ VecFMSub {} -> unhandledPrimop op
+ VecFNMAdd {} -> unhandledPrimop op
+ VecFNMSub {} -> unhandledPrimop op
+
VecIndexScalarByteArrayOp _ _ _ -> unhandledPrimop op
VecReadScalarByteArrayOp _ _ _ -> unhandledPrimop op
VecWriteScalarByteArrayOp _ _ _ -> unhandledPrimop op
VecIndexScalarOffAddrOp _ _ _ -> unhandledPrimop op
VecReadScalarOffAddrOp _ _ _ -> unhandledPrimop op
VecWriteScalarOffAddrOp _ _ _ -> unhandledPrimop op
+ VecShuffleOp _ _ _ -> unhandledPrimop op
PrefetchByteArrayOp3 -> noOp
PrefetchMutableByteArrayOp3 -> noOp
=====================================
docs/users_guide/9.12.1-notes.rst
=====================================
@@ -111,6 +111,19 @@ Runtime system
- Usage of deprecated primops is now correctly reported (#19629).
+- New fused multiply-add instructions for vectors of floating-point values,
+ such as ``fmaddFloatX4# :: FloatX4# -> FloatX4# -> FloatX4# -> FloatX4#`` and
+ ``fnmsubDoubleX2# :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#``.
+ These follow the same semantics as ``fmadd``/``fmsub``/``fnmadd``/``fnmsub``,
+ operating in parallel on vectors of floating-point values.
+
+- New vector shuffle instructions, such as ``shuffleFloatX4# :: FloatX4# -> FloatX4# -> (# Int#, Int#, Int#, Int# #) -> FloatX4#``.
+ These instructions take two input vectors and a collection of indices (which must
+  be compile-time literal integers), and construct a result vector by extracting
+  the values at those indices. For instance, ``shuffleFloatX4#`` on input vectors with
+ components ``(# 0.1#, 11.1#, 22.1#, 33.1# #)`` and ``(# 44.1#, 55.1#, 66.1#, 77.1# #)``,
+ and indices ``(# 4#, 3#, 6#, 1# #)``, will return a vector with components
+ ``(# 44.1#, 33.1#, 66.1#, 11.1# #)``.
``ghc`` library
~~~~~~~~~~~~~~~
=====================================
libraries/base/src/GHC/Base.hs
=====================================
@@ -147,6 +147,62 @@ import GHC.Prim hiding
dataToTagSmall#, dataToTagLarge#
-- whereFrom# is similarly internal.
, whereFrom#
+ -- Don't re-export vector FMA instructions
+ , fmaddFloatX4#
+ , fmsubFloatX4#
+ , fnmaddFloatX4#
+ , fnmsubFloatX4#
+ , fmaddFloatX8#
+ , fmsubFloatX8#
+ , fnmaddFloatX8#
+ , fnmsubFloatX8#
+ , fmaddFloatX16#
+ , fmsubFloatX16#
+ , fnmaddFloatX16#
+ , fnmsubFloatX16#
+ , fmaddDoubleX2#
+ , fmsubDoubleX2#
+ , fnmaddDoubleX2#
+ , fnmsubDoubleX2#
+ , fmaddDoubleX4#
+ , fmsubDoubleX4#
+ , fnmaddDoubleX4#
+ , fnmsubDoubleX4#
+ , fmaddDoubleX8#
+ , fmsubDoubleX8#
+ , fnmaddDoubleX8#
+ , fnmsubDoubleX8#
+ -- Don't re-export SIMD shuffle primops
+ , shuffleDoubleX2#
+ , shuffleDoubleX4#
+ , shuffleDoubleX8#
+ , shuffleFloatX16#
+ , shuffleFloatX4#
+ , shuffleFloatX8#
+ , shuffleInt16X16#
+ , shuffleInt16X32#
+ , shuffleInt16X8#
+ , shuffleInt32X16#
+ , shuffleInt32X4#
+ , shuffleInt32X8#
+ , shuffleInt64X2#
+ , shuffleInt64X4#
+ , shuffleInt64X8#
+ , shuffleInt8X16#
+ , shuffleInt8X32#
+ , shuffleInt8X64#
+ , shuffleWord16X16#
+ , shuffleWord16X32#
+ , shuffleWord16X8#
+ , shuffleWord32X16#
+ , shuffleWord32X4#
+ , shuffleWord32X8#
+ , shuffleWord64X2#
+ , shuffleWord64X4#
+ , shuffleWord64X8#
+ , shuffleWord8X16#
+ , shuffleWord8X32#
+ , shuffleWord8X64#
)
import GHC.Prim.Ext
=====================================
libraries/base/src/GHC/Exts.hs
=====================================
@@ -120,6 +120,62 @@ import GHC.Prim hiding
, dataToTagSmall#, dataToTagLarge#
-- whereFrom# is similarly internal.
, whereFrom#
+ -- Don't re-export vector FMA instructions
+ , fmaddFloatX4#
+ , fmsubFloatX4#
+ , fnmaddFloatX4#
+ , fnmsubFloatX4#
+ , fmaddFloatX8#
+ , fmsubFloatX8#
+ , fnmaddFloatX8#
+ , fnmsubFloatX8#
+ , fmaddFloatX16#
+ , fmsubFloatX16#
+ , fnmaddFloatX16#
+ , fnmsubFloatX16#
+ , fmaddDoubleX2#
+ , fmsubDoubleX2#
+ , fnmaddDoubleX2#
+ , fnmsubDoubleX2#
+ , fmaddDoubleX4#
+ , fmsubDoubleX4#
+ , fnmaddDoubleX4#
+ , fnmsubDoubleX4#
+ , fmaddDoubleX8#
+ , fmsubDoubleX8#
+ , fnmaddDoubleX8#
+ , fnmsubDoubleX8#
+ -- Don't re-export SIMD shuffle primops
+ , shuffleDoubleX2#
+ , shuffleDoubleX4#
+ , shuffleDoubleX8#
+ , shuffleFloatX16#
+ , shuffleFloatX4#
+ , shuffleFloatX8#
+ , shuffleInt16X16#
+ , shuffleInt16X32#
+ , shuffleInt16X8#
+ , shuffleInt32X16#
+ , shuffleInt32X4#
+ , shuffleInt32X8#
+ , shuffleInt64X2#
+ , shuffleInt64X4#
+ , shuffleInt64X8#
+ , shuffleInt8X16#
+ , shuffleInt8X32#
+ , shuffleInt8X64#
+ , shuffleWord16X16#
+ , shuffleWord16X32#
+ , shuffleWord16X8#
+ , shuffleWord32X16#
+ , shuffleWord32X4#
+ , shuffleWord32X8#
+ , shuffleWord64X2#
+ , shuffleWord64X4#
+ , shuffleWord64X8#
+ , shuffleWord8X16#
+ , shuffleWord8X32#
+ , shuffleWord8X64#
)
import GHC.Prim.Ext
=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -22,4 +22,5 @@ test('simd009', [ req_th
]
, multimod_compile_and_run, ['simd009', ''])
test('simd010', [], compile_and_run, [''])
+test('simd011', [when(have_cpu_feature('fma'), extra_hc_opts('-mfma'))], compile_and_run, [''])
test('simd012', [], compile_and_run, [''])
=====================================
testsuite/tests/simd/should_run/simd006.hs
=====================================
@@ -123,17 +123,15 @@ instance Arbitrary Word32 where
newtype FloatNT = FloatNT Float
deriving newtype (Show, Num)
instance Eq FloatNT where
- FloatNT f1 == FloatNT f2 = f1 == f2
- -- TODO: tests fail with this equality due to signed zeros
- -- castFloatToWord32 f1 == castFloatToWord32 f2
+ FloatNT f1 == FloatNT f2 =
+ castFloatToWord32 f1 == castFloatToWord32 f2
instance Arbitrary FloatNT where
arbitrary = FloatNT . castWord32ToFloat <$> arbitrary
newtype DoubleNT = DoubleNT Double
deriving newtype (Show, Num)
instance Eq DoubleNT where
- DoubleNT d1 == DoubleNT d2 = d1 == d2
- -- TODO: tests fail with this equality due to signed zeros
- -- castDoubleToWord64 d1 == castDoubleToWord64 d2
+ DoubleNT d1 == DoubleNT d2 =
+ castDoubleToWord64 d1 == castDoubleToWord64 d2
instance Arbitrary DoubleNT where
arbitrary = DoubleNT . castWord64ToDouble <$> arbitrary
=====================================
testsuite/tests/simd/should_run/simd011.stdout
=====================================
@@ -0,0 +1,8 @@
+(1011.11,2044.4401,3099.99,4177.7603)
+(-988.89,-1955.5599,-2900.01,-3822.24)
+(988.89,1955.5599,2900.01,3822.24)
+(-1011.11,-2044.4401,-3099.99,-4177.7603)
+(1011.11,2044.44)
+(-988.89,-1955.56)
+(988.89,1955.56)
+(-1011.11,-2044.44)
=====================================
utils/genprimopcode/Lexer.x
=====================================
@@ -67,6 +67,7 @@ words :-
<0> "SCALAR" { mkT TSCALAR }
<0> "VECTOR" { mkT TVECTOR }
<0> "VECTUPLE" { mkT TVECTUPLE }
+ <0> "INTVECTUPLE" { mkT TINTVECTUPLE }
<0> [a-z][a-zA-Z0-9\#_]* { mkTv TLowerName }
<0> [A-Z][a-zA-Z0-9\#_]* { mkTv TUpperName }
<0> \-? [0-9][0-9]* { mkTv (TInteger . read) }
=====================================
utils/genprimopcode/Main.hs
=====================================
@@ -79,6 +79,8 @@ desugarVectorSpec i = case vecOptions i of
desugarTy (TyApp SCALAR []) = TyApp (TyCon repCon) []
desugarTy (TyApp VECTOR []) = TyApp (VecTyCon vecCons vecTyName) []
desugarTy (TyApp VECTUPLE []) = TyUTup (replicate n (TyApp (TyCon repCon) []))
+ desugarTy (TyApp INTVECTUPLE [])
+ = TyUTup (replicate n (TyApp (TyCon "Int#") []) )
desugarTy (TyApp tycon ts) = TyApp tycon (map desugarTy ts)
desugarTy t@(TyVar {}) = t
desugarTy (TyUTup ts) = TyUTup (map desugarTy ts)
=====================================
utils/genprimopcode/Parser.y
=====================================
@@ -58,6 +58,7 @@ import AccessOps
SCALAR { TSCALAR }
VECTOR { TVECTOR }
VECTUPLE { TVECTUPLE }
+ INTVECTUPLE { TINTVECTUPLE }
bytearray_access_ops { TByteArrayAccessOps }
addr_access_ops { TAddrAccessOps }
thats_all_folks { TThatsAllFolks }
@@ -215,6 +216,7 @@ pTycon : upperName { TyCon $1 }
| SCALAR { SCALAR }
| VECTOR { VECTOR }
| VECTUPLE { VECTUPLE }
+ | INTVECTUPLE { INTVECTUPLE }
{
parse :: String -> Either String Info
=====================================
utils/genprimopcode/ParserM.hs
=====================================
@@ -124,6 +124,7 @@ data Token = TEOF
| TSCALAR
| TVECTOR
| TVECTUPLE
+ | TINTVECTUPLE
deriving Show
-- Actions
=====================================
utils/genprimopcode/Syntax.hs
=====================================
@@ -87,6 +87,7 @@ data TyCon = TyCon String
| SCALAR
| VECTOR
| VECTUPLE
+ | INTVECTUPLE
| VecTyCon String String
deriving (Eq, Ord)
@@ -95,6 +96,7 @@ instance Show TyCon where
show SCALAR = "SCALAR"
show VECTOR = "VECTOR"
show VECTUPLE = "VECTUPLE"
+ show INTVECTUPLE = "INTVECTUPLE"
show (VecTyCon tc _) = tc
-- Follow definitions of Fixity and FixityDirection in GHC
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/e2b57729572cd502b27b91635e0f37376306d583...f47ef2d5f999e43af1dcc333142368d91ee5d3af
--
This project does not include diff previews in email notifications.
You're receiving this email because of your account on gitlab.haskell.org.