[Git][ghc/ghc][wip/ncg-simd] 3 commits: TODO: MOV stuff

sheaf (@sheaf) gitlab at gitlab.haskell.org
Thu Jun 13 15:10:13 UTC 2024



sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
64611d0d by sheaf at 2024-06-13T13:38:09+02:00
TODO: MOV stuff

- - - - -
f5032fc9 by sheaf at 2024-06-13T17:09:40+02:00
X86 NCG SIMD: refactoring

- - - - -
68d896f4 by sheaf at 2024-06-13T17:09:54+02:00
SIMD tests: fixup

- - - - -


11 changed files:

- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/PPC/Ppr.hs
- compiler/GHC/CmmToAsm/Reg/Linear.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/Platform/Reg/Class.hs
- compiler/GHC/StgToCmm/Prim.hs
- testsuite/tests/simd/should_run/Simd009c.hs
- testsuite/tests/simd/should_run/all.T


Changes:

=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -23,6 +23,7 @@ module GHC.CmmToAsm.Format (
     cmmTypeFormat,
     formatToWidth,
     formatInBytes,
+    scalarWidth,
     isIntScalarFormat,
 )
 
@@ -79,7 +80,6 @@ data Format
         | FF64
         | VecFormat !Length       -- ^ number of elements
                     !ScalarFormat -- ^ format of each element
-                    !Width        -- ^ size of each element
         deriving (Show, Eq, Ord)
 
 pattern IntegerFormat :: Format
@@ -98,13 +98,14 @@ isIntegerFormat = \case
 instance Outputable Format where
   ppr fmt = text (show fmt)
 
-data ScalarFormat = FmtInt8
-                  | FmtInt16
-                  | FmtInt32
-                  | FmtInt64
-                  | FmtFloat
-                  | FmtDouble
-                  deriving (Show, Eq, Ord)
+data ScalarFormat
+  = FmtInt8
+  | FmtInt16
+  | FmtInt32
+  | FmtInt64
+  | FmtFloat
+  | FmtDouble
+  deriving (Show, Eq, Ord)
 
 isIntScalarFormat :: ScalarFormat -> Bool
 isIntScalarFormat FmtInt8 = True
@@ -165,27 +166,37 @@ vecFormat ty =
       elemTy = vecElemType ty
    in if isFloatType elemTy
       then case typeWidth elemTy of
-             W32 -> VecFormat l FmtFloat  W32
-             W64 -> VecFormat l FmtDouble W64
+             W32 -> VecFormat l FmtFloat
+             W64 -> VecFormat l FmtDouble
              _   -> pprPanic "Incorrect vector element width" (ppr elemTy)
       else case typeWidth elemTy of
-             W8  -> VecFormat l FmtInt8  W8
-             W16 -> VecFormat l FmtInt16 W16
-             W32 -> VecFormat l FmtInt32 W32
-             W64 -> VecFormat l FmtInt64 W64
+             W8  -> VecFormat l FmtInt8
+             W16 -> VecFormat l FmtInt16
+             W32 -> VecFormat l FmtInt32
+             W64 -> VecFormat l FmtInt64
              _   -> pprPanic "Incorrect vector element width" (ppr elemTy)
 
 -- | Get the Width of a Format.
 formatToWidth :: Format -> Width
 formatToWidth format
  = case format of
-        II8             -> W8
-        II16            -> W16
-        II32            -> W32
-        II64            -> W64
-        FF32            -> W32
-        FF64            -> W64
-        VecFormat l _ w -> widthFromBytes (l*widthInBytes w)
+        II8  -> W8
+        II16 -> W16
+        II32 -> W32
+        II64 -> W64
+        FF32 -> W32
+        FF64 -> W64
+        VecFormat l s ->
+          widthFromBytes (l * widthInBytes (scalarWidth s))
+
+scalarWidth :: ScalarFormat -> Width
+scalarWidth = \case
+  FmtInt8   -> W8
+  FmtInt16  -> W16
+  FmtInt32  -> W32
+  FmtInt64  -> W64
+  FmtFloat  -> W32
+  FmtDouble -> W64
 
 formatInBytes :: Format -> Int
 formatInBytes = widthInBytes . formatToWidth
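
A note on the Format.hs change above: the element-width field of `VecFormat` was redundant with the `ScalarFormat`, so it is dropped and the new `scalarWidth` recovers the width on demand. Below is a minimal standalone sketch of that derivation (simplified mirror types for illustration only, not the real GHC.CmmToAsm.Format definitions, which use GHC's `Width`):

{-# LANGUAGE LambdaCase #-}

-- Simplified mirror of ScalarFormat; the element width is implied by the format.
data ScalarFormat = FmtInt8 | FmtInt16 | FmtInt32 | FmtInt64 | FmtFloat | FmtDouble
  deriving (Show, Eq)

-- Element size in bytes, playing the role of scalarWidth composed with widthInBytes.
scalarBytes :: ScalarFormat -> Int
scalarBytes = \case
  FmtInt8   -> 1
  FmtInt16  -> 2
  FmtInt32  -> 4
  FmtInt64  -> 8
  FmtFloat  -> 4
  FmtDouble -> 8

-- A vector format now carries only its length and element format.
data Format = VecFormat Int ScalarFormat
  deriving (Show, Eq)

-- Total size in bytes, as formatToWidth/formatInBytes compute it after the change.
formatBytes :: Format -> Int
formatBytes (VecFormat l s) = l * scalarBytes s

main :: IO ()
main = print (formatBytes (VecFormat 4 FmtFloat))  -- 16 bytes, i.e. a 128-bit XMM vector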


=====================================
compiler/GHC/CmmToAsm/PPC/Ppr.hs
=====================================
@@ -221,7 +221,7 @@ pprFormat x
                 II64 -> text "d"
                 FF32 -> text "fs"
                 FF64 -> text "fd"
-                VecFormat _ _ _ -> panic "PPC pprFormat: VecFormat"
+                VecFormat {} -> panic "PPC pprFormat: VecFormat"
 
 pprCond :: IsLine doc => Cond -> doc
 pprCond c
@@ -384,7 +384,7 @@ pprInstr platform instr = case instr of
                II64 -> text "d"
                FF32 -> text "fs"
                FF64 -> text "fd"
-               VecFormat _ _ _ -> panic "PPC pprInstr: VecFormat"
+               VecFormat {} -> panic "PPC pprInstr: VecFormat"
                ),
            case addr of AddrRegImm _ _ -> empty
                         AddrRegReg _ _ -> char 'x',
@@ -427,7 +427,7 @@ pprInstr platform instr = case instr of
                II64 -> text "d"
                FF32 -> text "fs"
                FF64 -> text "fd"
-               VecFormat _ _ _ -> panic "PPC pprInstr: VecFormat"
+               VecFormat {} -> panic "PPC pprInstr: VecFormat"
                ),
            case addr of AddrRegImm _ _ -> empty
                         AddrRegReg _ _ -> char 'x',


=====================================
compiler/GHC/CmmToAsm/Reg/Linear.hs
=====================================
@@ -399,17 +399,6 @@ linearRA block_live block_id = go [] []
         (accInstr', new_fixups) <- raInsn block_live accInstr block_id instr
         go accInstr' (new_fixups ++ accFixups) instrs
 
--- TODO: move to proper place
-classOfReg :: Platform -> Reg -> RegClass
-classOfReg platform (RegReal x) = targetClassOfRealReg platform x
-classOfReg _ (RegVirtual x) = classOfVirtualReg x
-
--- TODO: move to proper place
--- TODO: on some platforms we can be more lenient, 
---       e.g. on X86 Float/Double/Vec are compatible
-compatibleReg :: Platform -> Reg -> Reg -> Bool
-compatibleReg platform r1 r2 = classOfReg platform r1 == classOfReg platform r2
-
 -- | Do allocation for a single instruction.
 raInsn
         :: OutputableRegConstraint freeRegs instr
@@ -434,7 +423,6 @@ raInsn _     new_instrs _ (LiveInstr ii@(Instr i) Nothing)
 raInsn block_live new_instrs id (LiveInstr (Instr instr) (Just live))
  = do
     assig    <- getAssigR :: RegM freeRegs (UniqFM Reg Loc)
-    platform <- getPlatform
 
     -- If we have a reg->reg move between virtual registers, where the
     -- src register is not live after this instruction, and the dst
@@ -447,8 +435,7 @@ raInsn block_live new_instrs id (LiveInstr (Instr instr) (Just live))
         Just (src,dst)  | Just (_, fmt) <- lookupUFM (liveDieRead live) src,
                           isVirtualReg dst,
                           not (dst `elemUFM` assig),
-                          isRealReg src || isInReg src assig,
-                          compatibleReg platform src dst -> do
+                          isRealReg src || isInReg src assig -> do
            case src of
               RegReal rr -> setAssigR (addToUFM assig dst (InReg $ RealRegUsage rr fmt))
                 -- if src is a fixed reg, then we just map dest to this
@@ -888,7 +875,9 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
             -- trimmed down datatype that only keeps track of e.g.
             -- how many stack slots something uses up.
             vr_fmt = case r of
-                VirtualRegVec {} -> VecFormat 2 FmtDouble W64
+                VirtualRegVec {} -> VecFormat 2 FmtDouble
+                -- SIMD NCG TODO: handle 256 and 512 by adding
+                -- new virtual register constructors.
                 _ -> II64
 
         -- Can we put the variable into a register it already was?


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1046,6 +1046,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_WF_Bitcast W32 -> bitcast II32 FF32 x
       MO_FW_Bitcast W64 -> bitcast FF64 II64 x
       MO_WF_Bitcast W64 -> bitcast II64 FF64 x
+      MO_WF_Bitcast {}  -> incorrectOperands
+      MO_FW_Bitcast {}  -> incorrectOperands
 
       -- widenings
       MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
@@ -1084,42 +1086,84 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
 
       MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
-
-
       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
 
+      MO_FF_Conv {} -> incorrectOperands
+      MO_UU_Conv {} -> incorrectOperands
+      MO_SS_Conv {} -> incorrectOperands
+      MO_XX_Conv {} -> incorrectOperands
+
       MO_FS_Truncate from to -> coerceFP2Int from to x
       MO_SF_Round    from to -> coerceInt2FP from to x
 
-      -- SIMD NCG TODO (optional): implement these vector operations as well
-      -- (they are integer vector operations).
-      MO_V_Insert {}      -> needLlvm mop
-      MO_V_Extract {}     -> needLlvm mop
-      MO_V_Add {}         -> needLlvm mop
-      MO_V_Sub {}         -> needLlvm mop
-      MO_V_Mul {}         -> needLlvm mop
-      MO_VS_Quot {}       -> needLlvm mop
-      MO_VS_Rem {}        -> needLlvm mop
-      MO_VS_Neg {}        -> needLlvm mop
-      MO_VU_Quot {}       -> needLlvm mop
-      MO_VU_Rem {}        -> needLlvm mop
-      MO_V_Shuffle {}  -> incorrectOperands
-      MO_VF_Shuffle {}  -> incorrectOperands
-      MO_VF_Broadcast {}  -> incorrectOperands
-      MO_VF_Insert {}     -> incorrectOperands
+      MO_VF_Neg l w  | avx           -> vector_float_negate_avx l w x
+                     | sse && sse2   -> vector_float_negate_sse l w x
+                     | otherwise
+                       -> sorry "Please enable the -mavx or -msse, -msse2 flag"
+      -- SIMD NCG TODO
+      MO_VS_Neg {} -> needLlvm mop
+
+      -- Binary MachOps
+      MO_Add {}    -> incorrectOperands
+      MO_Sub {}    -> incorrectOperands
+      MO_Eq {}     -> incorrectOperands
+      MO_Ne {}     -> incorrectOperands
+      MO_Mul {}    -> incorrectOperands
+      MO_S_MulMayOflo {} -> incorrectOperands
+      MO_S_Quot {} -> incorrectOperands
+      MO_S_Rem {}  -> incorrectOperands
+      MO_U_Quot {} -> incorrectOperands
+      MO_U_Rem {}  -> incorrectOperands
+      MO_S_Ge {}   -> incorrectOperands
+      MO_S_Le {}   -> incorrectOperands
+      MO_S_Gt {}   -> incorrectOperands
+      MO_S_Lt {}   -> incorrectOperands
+      MO_U_Ge {}   -> incorrectOperands
+      MO_U_Le {}   -> incorrectOperands
+      MO_U_Gt {}   -> incorrectOperands
+      MO_U_Lt {}   -> incorrectOperands
+      MO_F_Add {}  -> incorrectOperands
+      MO_F_Sub {}  -> incorrectOperands
+      MO_F_Mul {}  -> incorrectOperands
+      MO_F_Quot {} -> incorrectOperands
+      MO_F_Eq {}   -> incorrectOperands
+      MO_F_Ne {}   -> incorrectOperands
+      MO_F_Ge {}   -> incorrectOperands
+      MO_F_Le {}   -> incorrectOperands
+      MO_F_Gt {}   -> incorrectOperands
+      MO_F_Lt {}   -> incorrectOperands
+      MO_And {}    -> incorrectOperands
+      MO_Or {}     -> incorrectOperands
+      MO_Xor {}    -> incorrectOperands
+      MO_Shl {}    -> incorrectOperands
+      MO_U_Shr {}  -> incorrectOperands
+      MO_S_Shr {}  -> incorrectOperands
+
+      MO_V_Extract {}     -> incorrectOperands
+      MO_V_Add {}         -> incorrectOperands
+      MO_V_Sub {}         -> incorrectOperands
+      MO_V_Mul {}         -> incorrectOperands
+      MO_VS_Quot {}       -> incorrectOperands
+      MO_VS_Rem {}        -> incorrectOperands
+      MO_VU_Quot {}       -> incorrectOperands
+      MO_VU_Rem {}        -> incorrectOperands
+      MO_V_Shuffle {}     -> incorrectOperands
+      MO_VF_Shuffle {}    -> incorrectOperands
+
       MO_VF_Extract {}    -> incorrectOperands
       MO_VF_Add {}        -> incorrectOperands
       MO_VF_Sub {}        -> incorrectOperands
       MO_VF_Mul {}        -> incorrectOperands
       MO_VF_Quot {}       -> incorrectOperands
+      MO_V_Broadcast {}   -> incorrectOperands
+      MO_VF_Broadcast {}  -> incorrectOperands
 
-      MO_VF_Neg l w  | avx           -> vector_float_negate_avx l w x
-                     | sse && sse2   -> vector_float_negate_sse l w x
-                     | otherwise
-                       -> sorry "Please enable the -mavx or -msse, -msse2 flag"
-
+      -- Ternary MachOps
+      MO_FMA {}           -> incorrectOperands
+      MO_VF_Insert {}     -> incorrectOperands
+      MO_V_Insert {}      -> incorrectOperands
 
-      _other -> pprPanic "getRegister" (pprMachOp mop)
+      --_other -> pprPanic "getRegister" (pprMachOp mop)
    where
         triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
         triv_ucode instr format = trivialUCode format (instr format) x
@@ -1163,12 +1207,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
 
         vector_float_negate_avx :: Length -> Width -> CmmExpr -> NatM Register
         vector_float_negate_avx l w expr = do
-          tmp                  <- getNewRegNat (VecFormat l FmtFloat w)
+          tmp                  <- getNewRegNat (VecFormat l FmtFloat)
           (reg, exp)           <- getSomeReg expr
           Amode addr addr_code <- memConstant (mkAlignment $ widthInBytes W32) (CmmFloat 0.0 W32)
           let format   = case w of
-                           W32 -> VecFormat l FmtFloat w
-                           W64 -> VecFormat l FmtDouble w
+                           W32 -> VecFormat l FmtFloat
+                           W64 -> VecFormat l FmtDouble
                            _ -> pprPanic "Cannot negate vector of width" (ppr w)
               code dst = case w of
                            W32 -> exp `appOL` addr_code `snocOL`
@@ -1183,11 +1227,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
 
         vector_float_negate_sse :: Length -> Width -> CmmExpr -> NatM Register
         vector_float_negate_sse l w expr = do
-          tmp                  <- getNewRegNat (VecFormat l FmtFloat w)
+          tmp                  <- getNewRegNat (VecFormat l FmtFloat)
           (reg, exp)           <- getSomeReg expr
           let format   = case w of
-                           W32 -> VecFormat l FmtFloat w
-                           W64 -> VecFormat l FmtDouble w
+                           W32 -> VecFormat l FmtFloat
+                           W64 -> VecFormat l FmtDouble
                            _ -> pprPanic "Cannot negate vector of width" (ppr w)
               code dst = exp `snocOL`
                          (XOR format (OpReg tmp) (OpReg tmp)) `snocOL`
@@ -1253,12 +1297,6 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_U_Shr rep -> shift_code rep SHR x y {-False-}
       MO_S_Shr rep -> shift_code rep SAR x y {-False-}
 
-      MO_V_Shuffle l w is
-        | avx
-        -> vector_shuffle_int l w x y is
-        | otherwise
-        -> sorry "Please enable the -mavx flag"
-
       MO_VF_Shuffle l w is
         | avx
         -> vector_shuffle_float l w x y is
@@ -1269,23 +1307,27 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                             | sse4_1    -> vector_float_broadcast_sse l W32 x y
                             | otherwise
                               -> sorry "Please enable the -mavx or -msse4 flag"
-
       MO_VF_Broadcast l W64 | sse2      -> vector_float_broadcast_avx l W64 x y
                             | otherwise -> sorry "Please enable the -msse2 flag"
-      
+      MO_VF_Broadcast {} -> incorrectOperands
+
       MO_V_Broadcast l W64  | sse2      -> vector_int_broadcast l W64 x y
                             | otherwise -> sorry "Please enable the -msse2 flag"
+      -- SIMD NCG TODO: W32, W16, W8
+      MO_V_Broadcast {} -> needLlvm mop
 
       MO_VF_Extract l W32   | avx       -> vector_float_unpack l W32 x y
                             | sse       -> vector_float_unpack_sse l W32 x y
                             | otherwise
                               -> sorry "Please enable the -mavx or -msse flag"
-
       MO_VF_Extract l W64   | sse2      -> vector_float_unpack l W64 x y
                             | otherwise -> sorry "Please enable the -msse2 flag"
-      
+      MO_VF_Extract {} -> incorrectOperands
+
       MO_V_Extract l W64    | sse2      -> vector_int_unpack_sse l W64 x y
                             | otherwise -> sorry "Please enable the -msse2 flag"
+      -- SIMD NCG TODO: W32, W16, W8
+      MO_V_Extract {} -> needLlvm mop
 
       MO_VF_Add l w         | avx              -> vector_float_op_avx VA_Add l w x y
                             | sse  && w == W32 -> vector_float_op_sse VA_Add l w x y
@@ -1311,10 +1353,38 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                             | otherwise
                               -> sorry "Please enable the -mavx or -msse flag"
 
-      MO_VF_Insert {}                  -> incorrectOperands
+      -- SIMD NCG TODO: integer vector operations
+      MO_V_Shuffle {} -> needLlvm mop
+      MO_V_Add {} -> needLlvm mop
+      MO_V_Sub {} -> needLlvm mop
+      MO_V_Mul {} -> needLlvm mop
+      MO_VS_Quot {} -> needLlvm mop
+      MO_VS_Rem {} -> needLlvm mop
+      MO_VU_Quot {} -> needLlvm mop
+      MO_VU_Rem {} -> needLlvm mop
+
+      -- Unary MachOps
+      MO_S_Neg {} -> incorrectOperands
+      MO_F_Neg {} -> incorrectOperands
+      MO_Not {} -> incorrectOperands
+      MO_SF_Round {} -> incorrectOperands
+      MO_FS_Truncate {} -> incorrectOperands
+      MO_SS_Conv {} -> incorrectOperands
+      MO_XX_Conv {} -> incorrectOperands
+      MO_FF_Conv {} -> incorrectOperands
+      MO_UU_Conv {} -> incorrectOperands
+      MO_WF_Bitcast {} -> incorrectOperands
+      MO_FW_Bitcast  {} -> incorrectOperands
+      MO_RelaxedRead {} -> incorrectOperands
+      MO_AlignmentCheck {} -> incorrectOperands
+      MO_VS_Neg {} -> incorrectOperands
+      MO_VF_Neg {} -> incorrectOperands
+
+      -- Ternary MachOps
+      MO_FMA {} -> incorrectOperands
+      MO_V_Insert {} -> incorrectOperands
+      MO_VF_Insert {} -> incorrectOperands
 
-
-      _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
   where
     --------------------
     triv_op width instr = trivialCode width op (Just op) x y
@@ -1511,8 +1581,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       (reg1, exp1) <- getSomeReg expr1
       (reg2, exp2) <- getSomeReg expr2
       let format   = case w of
-                       W32 -> VecFormat l FmtFloat  W32
-                       W64 -> VecFormat l FmtDouble W64
+                       W32 -> VecFormat l FmtFloat
+                       W64 -> VecFormat l FmtDouble
                        _ -> pprPanic "Operation not supported for width " (ppr w)
           code dst = case op of
             VA_Add -> arithInstr VADD
@@ -1535,8 +1605,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       (reg1, exp1) <- getSomeReg expr1
       (reg2, exp2) <- getSomeReg expr2
       let format   = case w of
-                       W32 -> VecFormat l FmtFloat  W32
-                       W64 -> VecFormat l FmtDouble W64
+                       W32 -> VecFormat l FmtFloat
+                       W64 -> VecFormat l FmtDouble
                        _ -> pprPanic "Operation not supported for width " (ppr w)
           code dst = case op of
             VA_Add -> arithInstr ADD
@@ -1559,26 +1629,26 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
     vector_float_unpack l W32 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
-      let format   = VecFormat l FmtFloat W32
+      let format   = VecFormat l FmtFloat
           imm      = litToImm lit
           code dst
             = case lit of
-                CmmInt 0 _ -> exp `snocOL` (MOV FF32 (OpReg r) (OpReg dst))
+                CmmInt 0 _ -> exp `snocOL` (MOVSD FF32 (OpReg r) (OpReg dst))
                 CmmInt _ _ -> exp `snocOL` (VPSHUFD format imm (OpReg r) dst)
                 _          -> panic "Error in offset while unpacking"
       return (Any format code)
     vector_float_unpack l W64 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
-      let format   = VecFormat l FmtDouble W64
+      let format   = VecFormat l FmtDouble
           addr     = spRel platform 0
           code dst
             = case lit of
                 CmmInt 0 _ -> exp `snocOL`
-                              (MOV FF64 (OpReg r) (OpReg dst))
+                              (MOVSD FF64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
                               (MOVH format (OpReg r) (OpAddr addr)) `snocOL`
-                              (MOV FF64 (OpAddr addr) (OpReg dst))
+                              (MOVSD FF64 (OpAddr addr) (OpReg dst))
                  -- SIMD NCG TODO: avoid going via the stack here?
                 _          -> panic "Error in offset while unpacking"
       return (Any format code)
@@ -1594,7 +1664,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
     vector_float_unpack_sse l W32 expr (CmmLit lit)
       = do
       (r,exp) <- getSomeReg expr
-      let format   = VecFormat l FmtFloat W32
+      let format   = VecFormat l FmtFloat
           imm      = litToImm lit
           code dst
             = case lit of
@@ -1614,7 +1684,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = do
       fn        <- getAnyReg expr1
       (r', exp) <- getSomeReg expr2
-      let f    = VecFormat len FmtFloat W32
+      let f    = VecFormat len FmtFloat
           addr = spRel platform 0
        in return $ Any f (\r -> exp    `appOL`
                                 (fn r) `snocOL`
@@ -1624,7 +1694,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = do
       fn        <- getAnyReg  expr1
       (r', exp) <- getSomeReg expr2
-      let f    = VecFormat len FmtDouble W64
+      let f    = VecFormat len FmtDouble
           addr = spRel platform 0
        in return $ Any f (\r -> exp    `appOL`
                                 (fn r) `snocOL`
@@ -1643,15 +1713,15 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = do
       fn       <- getAnyReg  expr1  -- destination
       (r, exp) <- getSomeReg expr2  -- source
-      let f        = VecFormat len FmtFloat W32
+      let f        = VecFormat len FmtFloat
           addr     = spRel platform 0
           code dst = exp `appOL`
                      (fn dst) `snocOL`
                      (MOVU f (OpReg r) (OpAddr addr)) `snocOL`
                      (insertps 0) `snocOL`
-                     (insertps 8) `snocOL`
                      (insertps 16) `snocOL`
-                     (insertps 24)
+                     (insertps 32) `snocOL`
+                     (insertps 48)
             where
               insertps off =
                 INSERTPS f (litToImm $ CmmInt off W32) (OpAddr addr) dst
@@ -1659,7 +1729,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
        in return $ Any f code
     vector_float_broadcast_sse _ _ c _
       = pprPanic "Broadcast not supported for : " (pdoc platform c)
-    
+
     vector_int_broadcast :: Length
                            -> Width
                            -> CmmExpr
@@ -1669,10 +1739,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = do
       fn        <- getAnyReg  expr1
       (val, exp) <- getSomeReg expr2
-      let fmt    = VecFormat len FmtInt64 W64
+      let fmt    = VecFormat len FmtInt64
       return $ Any fmt (\dst -> exp `appOL`
                                 (fn dst) `snocOL`
-                                (MOV fmt (OpReg val) (OpReg dst)) `snocOL`
+                                (MOV II64 (OpReg val) (OpReg dst)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg dst) dst))
     vector_int_broadcast _ _ c _
       = pprPanic "Broadcast not supported for : " (pdoc platform c)
@@ -1686,7 +1756,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
    vector_int_unpack_sse l@2 W64 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
-      let fmt = VecFormat l FmtInt64 W64
+      let fmt = VecFormat l FmtInt64
       tmp <- getNewRegNat fmt
       let code dst
             = case lit of
@@ -1700,14 +1770,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
     vector_int_unpack_sse _ w c e
       = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
 
-    vector_shuffle_int :: Length -> Width -> CmmExpr -> CmmExpr -> [Int] -> NatM Register
-    vector_shuffle_int = undefined
-
     vector_shuffle_float :: Length -> Width -> CmmExpr -> CmmExpr -> [Int] -> NatM Register
     vector_shuffle_float l w v1 v2 is = do
       (r1, exp1) <- getSomeReg v1
       (r2, exp2) <- getSomeReg v2
-      let fmt = VecFormat l (if w == W32 then FmtFloat else FmtDouble) w
+      let fmt = VecFormat l (if w == W32 then FmtFloat else FmtDouble)
           code dst
             = exp1 `appOL` (exp2 `appOL` shuffleInstructions fmt r1 r2 is dst)
       return (Any fmt code)
@@ -1715,7 +1782,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
     shuffleInstructions :: Format -> Reg -> Reg -> [Int] -> Reg -> OrdList Instr
     shuffleInstructions fmt v1 v2 is dst =
       case fmt of
-        VecFormat 2 FmtDouble _ ->
+        VecFormat 2 FmtDouble ->
           case is of
             [i1, i2] -> case (i1, i2) of
               (0,0) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v1) v1 dst)
@@ -1736,7 +1803,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
               (3,1) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v1) v2 dst)
               _ -> pprPanic "vector shuffle: indices out of bounds 0 <= i <= 3" (ppr is)
             _ -> pprPanic "vector shuffle: wrong number of indices (expected 2)" (ppr is)
-        VecFormat 4 FmtFloat  _ ->
+        VecFormat 4 FmtFloat ->
           case is of
             -- indices 0 <= i <= 7
             [i1, i2, i3, i4]
@@ -1815,7 +1882,7 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
       = do
       fn          <- getAnyReg vecExpr
       (r, exp)    <- getSomeReg valExpr
-      let fmt      = VecFormat len FmtFloat W32
+      let fmt      = VecFormat len FmtFloat
           imm      = litToImm offset
           code dst = exp `appOL`
                      (fn dst) `snocOL`
@@ -1826,17 +1893,17 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
       = do
         (valReg, valExp) <- getSomeReg valExpr
         (vecReg, vecExp) <- getSomeReg vecExpr
-        let fmt = VecFormat len FmtDouble W64
+        let fmt = VecFormat len FmtDouble
             code dst
               = case offset of
                   -- TODO: why not just index by element rather than by byte?
                   CmmInt 0  _ -> valExp `appOL`
                                  vecExp `snocOL`
-                                 (MOV fmt (OpReg valReg) (OpReg dst)) `snocOL`
+                                 (MOVSD FF64 (OpReg valReg) (OpReg dst)) `snocOL`
                                  (SHUFPD fmt (ImmInt 0b00) (OpReg vecReg) dst)
-                  CmmInt 8 _ -> valExp `appOL`
+                  CmmInt 16 _ -> valExp `appOL`
                                  vecExp `snocOL`
-                                 (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
+                                 (MOVSD FF64 (OpReg vecReg) (OpReg dst)) `snocOL`
                                  (SHUFPD fmt (ImmInt 0b00) (OpReg valReg) dst)
                   _ -> pprPanic "MO_VF_Insert DoubleX2: unsupported offset" (ppr offset)
          in return $ Any fmt code
@@ -1865,7 +1932,7 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
         pprTraceM "vecExpr:" (pdoc platform vecExpr)
         (valReg, valExp) <- getSomeReg valExpr
         (vecReg, vecExp) <- getSomeReg vecExpr
-        let fmt = VecFormat len FmtInt64 W64
+        let fmt = VecFormat len FmtInt64
         tmp <- getNewRegNat fmt
         pprTraceM "tmp:" (ppr tmp)
         let code dst
@@ -1875,11 +1942,11 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
                                  (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
                                  (MOV II64 (OpReg valReg) (OpReg dst)) `snocOL`
                                  (PUNPCKLQDQ fmt (OpReg tmp) dst)
-                  CmmInt 8 _ -> valExp `appOL`
-                                vecExp `snocOL`
-                                (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
-                                (MOV II64 (OpReg valReg) (OpReg tmp)) `snocOL`
-                                (PUNPCKLQDQ fmt (OpReg tmp) dst)
+                  CmmInt 16 _ -> valExp `appOL`
+                                 vecExp `snocOL`
+                                 (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
+                                 (MOV II64 (OpReg valReg) (OpReg tmp)) `snocOL`
+                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   _ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
          in return $ Any fmt code
     vector_int_insert_sse len width _ _ offset
@@ -2024,13 +2091,16 @@ getAnyReg expr = do
 
 anyReg :: Register -> NatM (Reg -> InstrBlock)
 anyReg (Any _ code)          = return code
-anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
+anyReg (Fixed rep reg fcode) = do
+  platform <- getPlatform
+  return (\dst -> fcode `snocOL` mkRegRegMoveInstr platform rep reg dst)
 
 -- A bit like getSomeReg, but we want a reg that can be byte-addressed.
 -- Fixed registers might not be byte-addressable, so we make sure we've
 -- got a temporary, inserting an extra reg copy if necessary.
 getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
 getByteReg expr = do
+  platform <- getPlatform
   is32Bit <- is32BitPlatform
   if is32Bit
       then do r <- getRegister expr
@@ -2042,7 +2112,7 @@ getByteReg expr = do
                     | isVirtualReg reg -> return (reg,code)
                     | otherwise -> do
                         tmp <- getNewRegNat rep
-                        return (tmp, code `snocOL` reg2reg rep reg tmp)
+                        return (tmp, code `snocOL` mkRegRegMoveInstr platform rep reg tmp)
                     -- ToDo: could optimise slightly by checking for
                     -- byte-addressable real registers, but that will
                     -- happen very rarely if at all.
@@ -2063,18 +2133,10 @@ getNonClobberedReg expr = do
         | reg `elem` instrClobberedRegs platform
         -> do
                 tmp <- getNewRegNat rep
-                return (tmp, code `snocOL` reg2reg rep reg tmp)
+                return (tmp, code `snocOL` mkRegRegMoveInstr platform rep reg tmp)
         | otherwise ->
                 return (reg, code)
 
-reg2reg :: Format -> Reg -> Reg -> Instr
-reg2reg format@(VecFormat _ FmtFloat W32) src dst
-  = VMOVU format (OpReg src) (OpReg dst)
-reg2reg format@(VecFormat _ FmtDouble W64) src dst
-  = VMOVU format (OpReg src) (OpReg dst)
-reg2reg format src dst
-  = MOV format (OpReg src) (OpReg dst)
-
 --------------------------------------------------------------------------------
 
 -- | Convert a 'CmmExpr' representing a memory address into an 'Amode'.
@@ -3427,7 +3489,7 @@ genCCall64 addr conv dest_regs args = do
                     tmp      <- getNewRegNat arg_fmt
                     let
                       code'  = code `appOL` arg_code tmp
-                      acode' = acode `snocOL` reg2reg arg_fmt tmp r
+                      acode' = acode `snocOL` mkRegRegMoveInstr platform arg_fmt tmp r
                     return (code',acode')
 
               arg_rep = cmmExprType platform arg
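
A note on the CodeGen.hs change above: the catch-all `_other -> pprPanic ...` branches are dropped (or commented out) in favour of explicit `incorrectOperands`/`needLlvm` alternatives for every remaining MachOp constructor. The apparent intent is to make the match exhaustive, so that adding a new MachOp later produces an incomplete-pattern warning at compile time instead of a panic at run time. A toy illustration of that idiom follows (hypothetical Op type, not GHC's MachOp):

{-# LANGUAGE LambdaCase #-}
{-# OPTIONS_GHC -Wincomplete-patterns #-}

-- Hypothetical stand-in for Cmm's MachOp.
data Op = OpAdd | OpSub | OpVecNeg
  deriving Show

-- With one alternative per constructor and no '_' fallback, adding a new
-- constructor to Op makes GHC warn here that the match is incomplete,
-- instead of falling through to a catch-all panic at run time.
describe :: Op -> String
describe = \case
  OpAdd    -> "integer addition"
  OpSub    -> "integer subtraction"
  OpVecNeg -> "vector negation"

main :: IO ()
main = mapM_ (putStrLn . describe) [OpAdd, OpSub, OpVecNeg]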


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -370,6 +370,9 @@ data Instr
         | MOVL        Format Operand Operand
         | MOVH        Format Operand Operand
         | MOVA        Format Operand Operand
+        | MOVDQU      Format Operand Operand
+        | VMOVDQU     Format Operand Operand
+        | MOVSD       Format Operand Operand
 
         -- logic operations
         | VPXOR       Format Reg Reg Reg
@@ -526,6 +529,10 @@ regUsageOfInstr platform instr
     MOVA         fmt src dst   -> mkRU fmt (use_R src []) (use_R dst [])
     MOVL         fmt src dst   -> mkRU fmt (use_R src []) (use_R dst [])
     MOVH         fmt src dst   -> mkRU fmt (use_R src []) (use_R dst [])
+    MOVDQU       fmt src dst   -> mkRU fmt (use_R src []) (use_R dst [])
+    VMOVDQU      fmt src dst   -> mkRU fmt (use_R src []) (use_R dst [])
+    MOVSD        fmt src dst   -> mkRU fmt (use_R src []) (use_R dst [])
+
     VPXOR        fmt s1 s2 dst -> mkRU fmt [s1,s2] [dst]
 
     VADD         fmt s1 s2 dst -> mkRU fmt ((use_R s1 []) ++ [s2]) [dst]
@@ -739,6 +746,10 @@ patchRegsOfInstr instr env
     MOVA       fmt src dst   -> MOVA  fmt (patchOp src) (patchOp dst)
     MOVL       fmt src dst   -> MOVL  fmt (patchOp src) (patchOp dst)
     MOVH       fmt src dst   -> MOVH  fmt (patchOp src) (patchOp dst)
+    MOVDQU     fmt src dst   -> MOVDQU  fmt (patchOp src) (patchOp dst)
+    VMOVDQU    fmt src dst   -> VMOVDQU fmt (patchOp src) (patchOp dst)
+    MOVSD      fmt src dst   -> MOVSD fmt (patchOp src) (patchOp dst)
+
     VPXOR      fmt s1 s2 dst -> VPXOR fmt (env s1) (env s2) (env dst)
 
     VADD       fmt s1 s2 dst -> VADD fmt (patchOp s1) (env s2) (env dst)
@@ -763,7 +774,7 @@ patchRegsOfInstr instr env
       -> PSLLDQ  fmt (patchOp off) (env dst)
     PSRLDQ       fmt off dst
       -> PSRLDQ  fmt (patchOp off) (env dst)
-    
+
     MOVHLPS    fmt src dst
       -> MOVHLPS fmt (patchOp src) (env dst)
     PUNPCKLQDQ fmt src dst
@@ -945,11 +956,20 @@ mkRegRegMoveInstr
     -> Reg
     -> Reg
     -> Instr
-
-mkRegRegMoveInstr _platform fmt src dst =
-  case fmt of
-    VecFormat {} -> MOVU fmt (OpReg src) (OpReg dst)
-    _ -> MOV fmt (OpReg src) (OpReg dst)
+mkRegRegMoveInstr _platform fmt@(VecFormat _ s) src dst
+  | isIntScalarFormat s
+  = if widthInBytes (formatToWidth fmt) <= 128
+    then MOVDQU  fmt (OpReg src) (OpReg dst)
+    else VMOVDQU fmt (OpReg src) (OpReg dst)
+  | otherwise
+  = if widthInBytes (formatToWidth fmt) <= 128
+    then MOVU fmt (OpReg src) (OpReg dst)
+    else VMOVU fmt (OpReg src) (OpReg dst)
+mkRegRegMoveInstr _platform fmt src dst
+  | isFloatFormat fmt
+  = MOVSD fmt (OpReg src) (OpReg dst)
+  | otherwise
+  = MOV fmt (OpReg src) (OpReg dst)
 
 -- | Check whether an instruction represents a reg-reg move.
 --      The register allocator attempts to eliminate reg->reg moves whenever it can,
@@ -959,8 +979,24 @@ takeRegRegMoveInstr
         :: Instr
         -> Maybe (Reg,Reg)
 
-takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2))
-        = Just (r1,r2)
+takeRegRegMoveInstr (MOV fmt (OpReg r1) (OpReg r2))
+  -- MOV zeroes the upper part of vector registers,
+  -- so it is not a real "move" in that case.
+  | not (isVecFormat fmt)
+  = Just (r1,r2)
+takeRegRegMoveInstr (MOVSD fmt (OpReg r1) (OpReg r2))
+  | not (isVecFormat fmt)
+  = Just (r1,r2)
+takeRegRegMoveInstr (MOVA _ (OpReg r1) (OpReg r2))
+  = Just (r1, r2)
+takeRegRegMoveInstr (MOVU _ (OpReg r1) (OpReg r2))
+  = Just (r1, r2)
+takeRegRegMoveInstr (VMOVU _ (OpReg r1) (OpReg r2))
+  = Just (r1, r2)
+takeRegRegMoveInstr (MOVDQU _ (OpReg r1) (OpReg r2))
+  = Just (r1, r2)
+takeRegRegMoveInstr (VMOVDQU _ (OpReg r1) (OpReg r2))
+  = Just (r1, r2)
 
 takeRegRegMoveInstr _  = Nothing
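
On the Instr.hs change above: `takeRegRegMoveInstr` tells the register allocator which instructions are plain register-to-register copies that it may coalesce away. Full-width vector moves (MOVA/MOVU/VMOVU/MOVDQU/VMOVDQU) now qualify, while MOV and MOVSD carrying a vector format are excluded because, per the new comment, they zero the upper part of the destination register and so are not true moves. A toy sketch of that filter (hypothetical types, not the real Instr/Reg definitions):

-- Hypothetical instruction fragment; the real logic lives in
-- GHC.CmmToAsm.X86.Instr.takeRegRegMoveInstr.
data Fmt = Scalar | Vector deriving (Eq, Show)

data Instr
  = Mov  Fmt Int Int  -- scalar move; per the patch comment, it zeroes the upper part of a vector register
  | MovU Fmt Int Int  -- full-width (unaligned) vector move
  deriving Show

-- Only instructions that copy the entire source register are reported as
-- coalescible moves; a Mov carrying a vector format is therefore rejected.
takeRegRegMove :: Instr -> Maybe (Int, Int)
takeRegRegMove (Mov fmt src dst) | fmt /= Vector = Just (src, dst)
takeRegRegMove (MovU _ src dst) = Just (src, dst)
takeRegRegMove _ = Nothing

main :: IO ()
main = print (takeRegRegMove (Mov Vector 1 2))  -- Nothing: not a coalescible move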
 


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -431,13 +431,13 @@ pprFormat x = case x of
   II64  -> text "q"
   FF32  -> text "ss"      -- "scalar single-precision float" (SSE2)
   FF64  -> text "sd"      -- "scalar double-precision float" (SSE2)
-  VecFormat _ FmtFloat W32  -> text "ps"
-  VecFormat _ FmtDouble W64 -> text "pd"
+  VecFormat _ FmtFloat  -> text "ps"
+  VecFormat _ FmtDouble -> text "pd"
   -- TODO: this is shady because it only works for certain instructions
-  VecFormat _ FmtInt8 W8    -> text "b"
-  VecFormat _ FmtInt16 W16  -> text "w"
-  VecFormat _ FmtInt32 W32  -> text "l"
-  VecFormat _ FmtInt64 W64  -> text "q"
+  VecFormat _ FmtInt8   -> text "b"
+  VecFormat _ FmtInt16  -> text "w"
+  VecFormat _ FmtInt32  -> text "l"
+  VecFormat _ FmtInt64  -> text "q"
 
 pprFormat_x87 :: IsLine doc => Format -> doc
 pprFormat_x87 x = case x of
@@ -715,9 +715,9 @@ pprInstr platform i = case i of
    XOR FF64 src dst
       ->  pprOpOp (text "xorpd") FF64 src dst
 
-   XOR format@(VecFormat _ sfmt _) src dst | isIntScalarFormat sfmt
+   XOR format@(VecFormat _ sfmt) src dst | isIntScalarFormat sfmt
        -> pprOpOp (text "pxor") format src dst
-   
+
    XOR format src dst
       -> pprFormatOpOp (text "xor") format src dst
 
@@ -968,6 +968,14 @@ pprInstr platform i = case i of
      -> pprFormatOpOp (text "movl") format from to
    MOVH format from to
      -> pprFormatOpOp (text "movh") format from to
+
+   MOVDQU  format from to
+     -> pprOpOp (text "movdqu") format from to
+   VMOVDQU format from to
+     -> pprOpOp (text "vmovdqu") format from to
+   MOVSD format from to
+     -> pprOpOp (text "movsd") format from to
+
    VPXOR format s1 s2 dst
      -> pprXor (text "vpxor") format s1 s2 dst
    VEXTRACT format offset from to
@@ -1045,9 +1053,9 @@ pprInstr platform i = case i of
    pprBroadcastFormat :: Format -> Line doc
    pprBroadcastFormat x
      = case x of
-         VecFormat _ FmtFloat W32  -> text "ss"
-         VecFormat _ FmtDouble W64 -> text "sd"
-         -- TODO: Add Ints and remove panic
+         VecFormat _ FmtFloat  -> text "ss"
+         VecFormat _ FmtDouble -> text "sd"
+         -- SIMD NCG TODO: Add Ints and remove panic
          VecFormat {} -> panic "Incorrect width"
          _ -> panic "Scalar Format invading vector operation"
 
@@ -1061,7 +1069,6 @@ pprInstr platform i = case i of
            pprOperand platform format op1
        ]
 
-
    pprFormatOp_ :: Line doc -> Format -> Operand -> doc
    pprFormatOp_ name format op1
      = line $ hcat [
@@ -1153,7 +1160,7 @@ pprInstr platform i = case i of
            comma,
            pprReg platform (archWordFormat (target32Bit platform)) reg2
        ]
-   
+
    pprOpReg :: Line doc -> Format -> Operand -> Reg -> doc
    pprOpReg name format op reg
      = line $ hcat [


=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1521,6 +1521,7 @@ genMachOp _ op [x] = case op of
     MO_VU_Rem     _ _ -> panicOp
 
     MO_VF_Broadcast _ _ -> panicOp
+    MO_V_Broadcast _ _ -> panicOp
     MO_VF_Insert  _ _ -> panicOp
     MO_VF_Extract _ _ -> panicOp
 
@@ -1722,6 +1723,7 @@ genMachOp_slow opt op [x, y] = case op of
 
     MO_VS_Neg {} -> panicOp
 
+    MO_V_Broadcast  {} -> panicOp
     MO_VF_Broadcast  {} -> panicOp
     MO_VF_Insert  {} -> panicOp
 


=====================================
compiler/GHC/Platform/Reg/Class.hs
=====================================
@@ -11,7 +11,7 @@ import GHC.Prelude
 
 import GHC.Utils.Outputable as Outputable
 import GHC.Types.Unique
-import GHC.Builtin.Uniques
+import GHC.Builtin.Uniques ( mkRegClassUnique )
 
 
 -- | The class of a register.


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -2653,7 +2653,7 @@ doVecPackOp ty z es res = do
       where
         -- vector indices are always 32-bits
         -- TODO: consider indexing by element rather than by byte
-        iLit = CmmLit (CmmInt ((toInteger i) * 8) W32)
+        iLit = CmmLit (CmmInt ((toInteger i) * 16) W32)
 
     len :: Length
     len = vecLength ty


=====================================
testsuite/tests/simd/should_run/Simd009c.hs
=====================================
@@ -13,6 +13,7 @@ module Simd009c where
 import Control.Monad ( unless )
 import Data.Foldable ( for_ )
 import GHC.Exts
+import GHC.Prim
 import Language.Haskell.TH ( CodeQ )
 import Language.Haskell.TH.Syntax ( Lift(liftTyped) )
 


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -1,6 +1,7 @@
 # N.B. Only the X86 NCG and LLVM backends support SIMD operations for now.
 setTestOpts([ unless(arch('x86_64'), only_ways(llvm_ways))
             , when(unregisterised(), skip)
+            , js_skip, when(arch('wasm32'), skip)
             ])
 
 test('simd000', [], compile_and_run, [''])
@@ -12,5 +13,5 @@ test('simd005', [], compile_and_run, [''])
 test('simd006', [], compile_and_run, [''])
 test('simd007', [], compile_and_run, [''])
 test('simd008', [], compile_and_run, [''])
-test('simd009', [extra_files(['Simd009b.hs', 'Simd009c.hs'])], multimod_compile_and_run, ['simd009', ''])
+test('simd009', [req_th, extra_files(['Simd009b.hs', 'Simd009c.hs'])], multimod_compile_and_run, ['simd009', ''])
 test('simd010', [], compile_and_run, [''])



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/db9a19a6e10287e60619751c76710e80aae5687a...68d896f4f1b1db0dcaa263503910de475906a72f


