[Git][ghc/ghc][wip/ncg-simd] WIP: lower vector shuffle instruction on X86

Mon Jun 10 16:09:53 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
be86b5a0 by sheaf at 2024-06-10T18:09:34+02:00
WIP: lower vector shuffle instruction on X86

- - - - -


6 changed files:

- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/StgToCmm/Prim.hs
- testsuite/tests/codeGen/should_run/all.T


Changes:

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1659,12 +1659,75 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       (r2, exp2) <- getSomeReg v2
       let fmt = VecFormat l (if w == W32 then FmtFloat else FmtDouble) w
           code dst
-            = exp1 `appOL` (exp2 `appOL` shuffleInstructions fmt r1 r2 is dst) --VSHUFPD format imm (OpReg r1) r2)
+            = exp1 `appOL` (exp2 `appOL` shuffleInstructions fmt r1 r2 is dst)
       return (Any fmt code)
 
     shuffleInstructions :: Format -> Reg -> Reg -> [Int] -> Reg -> OrdList Instr
-    shuffleInstructions _fmt _v1 _v2 _is _dst =
-      error "SIMD NCG TODO: lower to shuffle instructions (e.g. VSHUFPD)"
+    shuffleInstructions fmt v1 v2 is dst =
+      case fmt of
+        VecFormat 2 FmtDouble _ ->
+          case is of
+            [i1, i2] -> case (i1, i2) of
+              (i, j) | i == j
+                -> let v = if i < 2 then v1 else v2
+                       mov = if i == 0 || i == 2 then MOVL else MOVH
+                   in unitOL (mov fmt (OpReg v) (OpReg dst)) `snocOL`
+                      SHUFPD fmt (ImmInt 0b00) (OpReg dst) dst
+              (0,1) -> unitOL (MOVU fmt (OpReg v1) (OpReg dst))
+              (2,3) -> unitOL (MOVU fmt (OpReg v2) (OpReg dst))
+              (1,0) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v1) v1 dst)
+              (3,2) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v2) v2 dst)
+              (0,2) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v2) v1 dst)
+              (2,0) -> unitOL (VSHUFPD fmt (ImmInt 0b00) (OpReg v1) v2 dst)
+              (0,3) -> unitOL (VSHUFPD fmt (ImmInt 0b10) (OpReg v2) v1 dst)
+              (3,0) -> unitOL (VSHUFPD fmt (ImmInt 0b10) (OpReg v1) v2 dst)
+              (1,2) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v2) v1 dst)
+              (2,1) -> unitOL (VSHUFPD fmt (ImmInt 0b01) (OpReg v1) v2 dst)
+              (1,3) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v2) v1 dst)
+              (3,1) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v1) v2 dst)
+              _ -> pprPanic "vector shuffle: indices out of bounds 0 <= i <= 3" (ppr is)
+            _ -> pprPanic "vector shuffle: wrong number of indices (expected 2)" (ppr is)
+        VecFormat 4 FmtFloat  _ ->
+          case is of
+            -- indices 0 <= i <= 7
+            [i1, i2, i3, i4]
+              | all ( <= 3 ) is
+              , let imm = i1 + i2 `shiftL` 2 + i3 `shiftL` 4 + i4 `shiftL` 6
+              -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v1) v1 dst)
+              | all ( >= 4 ) is
+              , let [j1, j2, j3, j4] = map ( subtract 4 ) is
+                    imm = j1 + j2 `shiftL` 2 + j3 `shiftL` 4 + j4 `shiftL` 6
+              -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v2) v2 dst)
+              | i1 <= 3, i2 <= 3
+              , i3 >= 4, i4 >= 4
+              , let imm = i1 + i2 `shiftL` 2 + (i3 - 4) `shiftL` 4 + (i4 - 4) `shiftL` 6
+              -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v1) v2 dst)
+              | i1 >= 4, i2 >= 4
+              , i3 <= 3, i4 <= 3
+              , let imm = i3 + i4 `shiftL` 2 + (i1 - 4) `shiftL` 4 + (i2 - 4) `shiftL` 6
+              -> unitOL (VSHUFPS fmt (ImmInt imm) (OpReg v2) v1 dst)
+              | otherwise
+              ->
+              -- Fall-back code with 4 INSERTPS operations.
+              -- SIMD NCG TODO: handle more cases with better lowering.
+              let -- bits: ss_dd_zzzz
+                  -- ss: pick source location
+                  -- dd: pick destination location
+                  -- zzzz: pick locations to be zeroed
+                  insertImm src dst = shiftL   ( src `mod` 4 ) 6
+                                    .|. shiftL dst 4
+                  vec src = if src >= 4 then v2 else v1
+              in unitOL
+                (INSERTPS fmt (OpImm $ ImmInt $ insertImm i1 0 .|. 0b1110) (OpReg $ vec i1) dst)
+                `snocOL`
+                (INSERTPS fmt (OpImm $ ImmInt $ insertImm i2 1) (OpReg $ vec i2) dst)
+                `snocOL`
+                (INSERTPS fmt (OpImm $ ImmInt $ insertImm i3 2) (OpReg $ vec i3) dst)
+                `snocOL`
+                (INSERTPS fmt (OpImm $ ImmInt $ insertImm i4 3) (OpReg $ vec i4) dst)
+            _ -> pprPanic "vector shuffle: wrong number of indices (expected 4)" (ppr is)
+        _ ->
+          pprPanic "vector shuffle: unsupported format" (ppr fmt)
 
 getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
   sse4_1 <- sse4_1Enabled


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -378,9 +378,9 @@ data Instr
         | VPSHUFD    Format Imm Operand Reg
         | PSHUFD     Format Imm Operand Reg
         | SHUFPS     Format Imm Operand Reg
-        | VSHUFPS    Format Imm Operand Reg
+        | VSHUFPS    Format Imm Operand Reg Reg
         | SHUFPD     Format Imm Operand Reg
-        | VSHUFPD    Format Imm Operand Reg
+        | VSHUFPD    Format Imm Operand Reg Reg
         -- SIMD NCG TODO: don't store the Format (or only what we need)
         -- in order to emit these instructions.
 
@@ -522,10 +522,10 @@ regUsageOfInstr platform instr
       -> mkRU fmt (use_R src [dst]) [dst]
     SHUFPS      fmt _off src dst
       -> mkRU fmt (use_R src [dst]) [dst]
-    VSHUFPD     fmt _off src dst
-      -> mkRU fmt (use_R src [dst]) [dst]
-    VSHUFPS     fmt _off src dst
-      -> mkRU fmt (use_R src [dst]) [dst]
+    VSHUFPD     fmt _off src1 src2 dst
+      -> mkRU fmt (use_R src1 [src2]) [dst]
+    VSHUFPS     fmt _off src1 src2 dst
+      -> mkRU fmt (use_R src1 [src2]) [dst]
 
     PSLLDQ       fmt off dst -> mkRU fmt (use_R off []) [dst]
 
@@ -729,10 +729,10 @@ patchRegsOfInstr instr env
       -> SHUFPS fmt off (patchOp src) (env dst)
     SHUFPD      fmt off src dst
       -> SHUFPD fmt off (patchOp src) (env dst)
-    VSHUFPS      fmt off src dst
-      -> VSHUFPS fmt off (patchOp src) (env dst)
-    VSHUFPD      fmt off src dst
-      -> VSHUFPD fmt off (patchOp src) (env dst)
+    VSHUFPS      fmt off src1 src2 dst
+      -> VSHUFPS fmt off (patchOp src1) (env src2) (env dst)
+    VSHUFPD      fmt off src1 src2 dst
+      -> VSHUFPD fmt off (patchOp src1) (env src2) (env dst)
 
     PSLLDQ       fmt off dst
       -> PSLLDQ  fmt (patchOp off) (env dst)


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -973,10 +973,10 @@ pprInstr platform i = case i of
      -> pprShuf (text "shufps") format offset src dst
    SHUFPD format offset src dst
      -> pprShuf (text "shufpd") format offset src dst
-   VSHUFPS format offset src dst
-     -> pprShuf (text "vshufps") format offset src dst
-   VSHUFPD format offset src dst
-     -> pprShuf (text "vshufpd") format offset src dst
+   VSHUFPS format offset src1 src2 dst
+     -> pprVShuf (text "vshufps") format offset src1 src2 dst
+   VSHUFPD format offset src1 src2 dst
+     -> pprVShuf (text "vshufpd") format offset src1 src2 dst
    PSLLDQ format offset dst
      -> pprShiftLeft (text "pslldq") format offset dst
    PSRLDQ format offset dst
@@ -1248,6 +1248,19 @@ pprInstr platform i = case i of
            pprReg platform format reg3
        ]
 
+   pprVShuf :: Line doc -> Format -> Imm -> Operand -> Reg -> Reg -> doc
+   pprVShuf name format imm1 op2 reg3 reg4
+     = line $ hcat [
+           pprGenMnemonic name format,
+           pprDollImm imm1,
+           comma,
+           pprOperand platform format op2,
+           comma,
+           pprReg platform format reg3,
+           comma,
+           pprReg platform format reg4
+       ]
+
    pprShiftLeft :: Line doc -> Format -> Operand -> Reg -> doc
    pprShiftLeft name format off reg
      = line $ hcat [


=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1841,16 +1841,16 @@ genShuffleOp :: [Int] -> CmmExpr -> CmmExpr -> LlvmM ExprData
 genShuffleOp is x y = runExprData $ do
   vx <- exprToVarW x
   vy <- exprToVarW y
+  mask <- exprToVarW $ CmmLit $ CmmVec $ map ((`CmmInt` W32) . fromIntegral) is
   let tx = getVarType vx
       ty = getVarType vy
   Panic.massertPpr
     (tx == ty)
     (vcat [ text "shuffle: mismatched arg types"
           , ppLlvmType tx, ppLlvmType ty ])
-  let fname = fsLit "__builtin_shufflevector"
-  error "SIMD NCG TODO: generate a call to __builtin_shufflevector"
-  --fptr <- liftExprData $ getInstrinct fname ty [tx, ty]
-  --doExprW tx $ Call StdCall fptr (vx: vy: map ?? is) [ReadNone, NoUnwind]
+  let fname = fsLit "shufflevector"
+  fptr <- liftExprData $ getInstrinct fname ty [tx, ty]
+  doExprW tx $ Call StdCall fptr [vx, vy, mask] [ReadNone, NoUnwind]
 
 -- | Generate code for a fused multiply-add operation.
 genFmaOp :: CmmExpr -> CmmExpr -> CmmExpr -> LlvmM ExprData


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1095,10 +1095,7 @@ emitPrimOp cfg primop =
 
   VecShuffleOp vcat n w -> \ args -> opIntoRegs $ \ [res] -> do
     checkVecCompatibility cfg vcat n w
-    doShuffleOp ty args res
-   where
-    ty :: CmmType
-    ty = vecCmmCat vcat w
+    doShuffleOp (vecVmmType vcat n w) args res
 
 -- Prefetch
   PrefetchByteArrayOp3         -> \args -> opIntoRegs $ \[] ->


=====================================
testsuite/tests/codeGen/should_run/all.T
=====================================
@@ -101,6 +101,8 @@ test('simd004', when(unregisterised(), skip), compile_and_run, [''])
 test('simd005', when(unregisterised(), skip), compile_and_run, [''])
 test('simd006', when(unregisterised(), skip), compile_and_run, [''])
 test('simd007', when(unregisterised(), skip), compile_and_run, [''])
+test('simd009', [when(unregisterised(), skip), extra_files(['Simd009b.hs', 'Simd009c.hs'])]
+    ,multimod_compile_and_run, ['simd009', ''])
 
 test('T1852', normal, compile_and_run, [''])
 test('T1861', extra_run_opts('0'), compile_and_run, [''])



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/be86b5a0e500a8654bde98ef347d14b69a27cca1

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/be86b5a0e500a8654bde98ef347d14b69a27cca1
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240610/29adc99f/attachment-0001.html>