[Git][ghc/ghc][wip/ncg-simd] SIMD: cleanup

Fri Jun 14 13:50:35 UTC 2024

sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
6669eee7 by sheaf at 2024-06-14T15:50:19+02:00
SIMD: cleanup

- - - - -


18 changed files:

- compiler/GHC/CmmToAsm/AArch64/Instr.hs
- compiler/GHC/CmmToAsm/PPC/Instr.hs
- compiler/GHC/CmmToAsm/PPC/Ppr.hs
- compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs
- compiler/GHC/CmmToAsm/Reg/Linear.hs
- compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs
- compiler/GHC/CmmToAsm/Reg/Liveness.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToAsm/X86/RegInfo.hs
- compiler/GHC/CmmToAsm/X86/Regs.hs
- compiler/GHC/CmmToLlvm/CodeGen.hs
- compiler/GHC/Llvm/Ppr.hs
- compiler/GHC/Llvm/Syntax.hs
- compiler/GHC/Platform/Reg.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/Types/Unique/FM.hs


Changes:

=====================================
compiler/GHC/CmmToAsm/AArch64/Instr.hs
=====================================
@@ -151,7 +151,9 @@ regUsageOfInstr platform instr = case instr of
         -- registers as well, as they show up.
         usage (src, dst) = RU (map (,II64) $ filter (interesting platform) src)
                               (map (,II64) $ filter (interesting platform) dst)
-                       -- SIMD NCG TODO: remove this hack
+          -- SIMD NCG TODO: the format here is used for register spilling/unspilling.
+          -- As the AArch64 NCG does not currently support SIMD registers,
+          -- we simply use II64 format for all instructions.
 
         regAddr :: AddrMode -> [Reg]
         regAddr (AddrRegReg r1 r2) = [r1, r2]


=====================================
compiler/GHC/CmmToAsm/PPC/Instr.hs
=====================================
@@ -393,7 +393,9 @@ regUsageOfInstr platform instr
   where
     usage (src, dst) = RU (map (,II64) $ filter (interesting platform) src)
                           (map (,II64) $ filter (interesting platform) dst)
-                       -- SIMD NCG TODO: remove this hack
+      -- SIMD NCG TODO: the format here is used for register spilling/unspilling.
+      -- As the PowerPC NCG does not currently support SIMD registers,
+      -- we simply use II64 format for all instructions.
     regAddr (AddrRegReg r1 r2) = [r1, r2]
     regAddr (AddrRegImm r1 _)  = [r1]
 


=====================================
compiler/GHC/CmmToAsm/PPC/Ppr.hs
=====================================
@@ -196,12 +196,12 @@ pprReg :: forall doc. IsLine doc => Reg -> doc
 
 pprReg r
   = case r of
-      RegReal    (RealRegSingle i) -> ppr_reg_no i
-      RegVirtual (VirtualRegI  u)  -> text "%vI_"   <> pprUniqueAlways u
-      RegVirtual (VirtualRegHi u)  -> text "%vHi_"  <> pprUniqueAlways u
-      RegVirtual (VirtualRegF  u)  -> text "%vF_"   <> pprUniqueAlways u
-      RegVirtual (VirtualRegD  u)  -> text "%vD_"   <> pprUniqueAlways u
-      RegVirtual (VirtualRegVec u) -> text "%vVec_" <> pprUniqueAlways u
+      RegReal    (RealRegSingle  i) -> ppr_reg_no i
+      RegVirtual (VirtualRegI    u) -> text "%vI_"   <> pprUniqueAlways u
+      RegVirtual (VirtualRegHi   u) -> text "%vHi_"  <> pprUniqueAlways u
+      RegVirtual (VirtualRegF    u) -> text "%vF_"   <> pprUniqueAlways u
+      RegVirtual (VirtualRegD    u) -> text "%vD_"   <> pprUniqueAlways u
+      RegVirtual (VirtualRegV128 u) -> text "%vV128_" <> pprUniqueAlways u
 
   where
     ppr_reg_no :: Int -> doc


=====================================
compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs
=====================================
@@ -168,7 +168,6 @@ cleanForward platform blockId assoc acc (li1 : li2 : instrs)
                 cleanForward platform blockId assoc acc
                  $ li1 : LiveInstr (mkRegRegMoveInstr platform fmt reg1 reg2) Nothing
                        : instrs
-                   -- SIMD NCG TODO: is this "fmt" correct?
 
 cleanForward platform blockId assoc acc (li@(LiveInstr i1 _) : instrs)
         | Just (r1, r2) <- takeRegRegMoveInstr platform i1
@@ -249,7 +248,6 @@ cleanReload platform blockId assoc li@(LiveInstr (RELOAD slot reg fmt) _)
 
                 return  ( assoc'
                         , Just $ LiveInstr (mkRegRegMoveInstr platform fmt reg2 reg) Nothing)
-            -- SIMD NCG TODO: is this fmt correct?
 
         -- Gotta keep this instr.
         | otherwise


=====================================
compiler/GHC/CmmToAsm/Reg/Linear.hs
=====================================
@@ -871,14 +871,12 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
         freeRegs <- getFreeRegsR
         let regclass = classOfVirtualReg r
             freeRegs_thisClass  = frGetFreeRegs platform regclass freeRegs :: [RealReg]
-            -- SIMD NCG TODO: this is not the right thing to be doing,
-            -- and is indicative we should not use Format but a more
-            -- trimmed down datatype that only keeps track of e.g.
-            -- how many stack slots something uses up.
             vr_fmt = case r of
-                VirtualRegVec {} -> VecFormat 2 FmtDouble
-                -- SIMD NCG TODO: handle 256 and 512 by adding
-                -- new virtual register constructors.
+                VirtualRegV128 {} -> VecFormat 2 FmtDouble
+                -- It doesn't really matter whether we use e.g. v2f64 or v4f32
+                -- or v4i32 etc here. This is perhaps a sign that 'Format'
+                -- is not the right type to use here, but that is a battle
+                -- for another day.
                 _ -> II64
 
         -- Can we put the variable into a register it already was?


=====================================
compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs
=====================================
@@ -55,9 +55,7 @@ getStackSlotFor fs@(StackMap _ reserved) _fmt regUnique
 
 getStackSlotFor (StackMap freeSlot reserved) fmt regUnique =
   let
-    nbSlots = case fmt of
-      VecFormat {} -> 2 -- SIMD NCG TODO: panic for unsupported vectors
-      _ -> 1
+    nbSlots = formatInBytes fmt `div` 8
   in
     (StackMap (freeSlot+nbSlots) (addToUFM reserved regUnique freeSlot), freeSlot)
 


=====================================
compiler/GHC/CmmToAsm/Reg/Liveness.hs
=====================================
@@ -64,7 +64,6 @@ import Data.List (mapAccumL, partition)
 import Data.Maybe
 import Data.IntSet              (IntSet)
 import GHC.CmmToAsm.Format
-import GHC.Types.Unique (Uniquable)
 
 -----------------------------------------------------------------------------
 type RegSet = UniqSet Reg
@@ -681,10 +680,6 @@ patchRegsLiveInstr patchF li
                         , liveDieWrite  = mapKeysUFM patchF $ liveDieWrite live })
                           -- See Note [Unique Determinism and code generation]

--- SIMD NCG TODO: move this to Unique.FM module
-mapKeysUFM :: Uniquable a => (t -> a) -> UniqFM key (t, b) -> UniqFM a (a, b)
-mapKeysUFM f m = listToUFM $ map ( \ (r, fmt) -> let r' = f r in (r', (r', fmt)) ) $ nonDetEltsUFM m
-
 --------------------------------------------------------------------------------
 -- | Convert a NatCmmDecl to a LiveCmmDecl, with liveness information
 


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1,4 +1,5 @@
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE MultiWayIf #-}
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE NondecreasingIndentation #-}
 
@@ -1298,10 +1299,14 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_S_Shr rep -> shift_code rep SAR x y {-False-}
 
       MO_VF_Shuffle l w is
-        | avx
-        -> vector_shuffle_float l w x y is
+        | l * widthInBytes w == 128
+        -> if
+            | avx
+            -> vector_shuffle_float l w x y is
+            | otherwise
+            -> sorry "Please enable the -mavx flag"
         | otherwise
-        -> sorry "Please enable the -mavx flag"
+        -> sorry "Please use -fllvm for wide shuffle instructions"
 
       MO_VF_Broadcast l W32 | avx       -> vector_float_broadcast_avx l W32 x y
                             | sse4_1    -> vector_float_broadcast_sse l W32 x y
@@ -1641,15 +1646,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = do
       (r, exp) <- getSomeReg expr
       let format   = VecFormat l FmtDouble
-          addr     = spRel platform 0
           code dst
             = case lit of
                 CmmInt 0 _ -> exp `snocOL`
                               (MOVSD FF64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
-                              (MOVH format (OpReg r) (OpAddr addr)) `snocOL`
-                              (MOVSD FF64 (OpAddr addr) (OpReg dst))
-                 -- SIMD NCG TODO: avoid going via the stack here?
+                              (MOVHLPS format (OpReg r) dst)
                 _          -> panic "Error in offset while unpacking"
       return (Any format code)
     vector_float_unpack _ w c e
@@ -1676,10 +1678,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
     -----------------------
     vector_float_broadcast_avx :: Length
-                           -> Width
-                           -> CmmExpr
-                           -> CmmExpr
-                           -> NatM Register
+                               -> Width
+                               -> CmmExpr
+                               -> CmmExpr
+                               -> NatM Register
     vector_float_broadcast_avx len W32 expr1 expr2
       = do
       fn        <- getAnyReg expr1
@@ -1731,10 +1733,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       = pprPanic "Broadcast not supported for : " (pdoc platform c)
 
     vector_int_broadcast :: Length
-                           -> Width
-                           -> CmmExpr
-                           -> CmmExpr
-                           -> NatM Register
+                         -> Width
+                         -> CmmExpr
+                         -> CmmExpr
+                         -> NatM Register
     vector_int_broadcast len W64 expr1 expr2
       = do
       fn        <- getAnyReg  expr1
@@ -1749,10 +1751,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
     -----------------------
 
     vector_int_unpack_sse :: Length
-                        -> Width
-                        -> CmmExpr
-                        -> CmmExpr
-                        -> NatM Register
+                          -> Width
+                          -> CmmExpr
+                          -> CmmExpr
+                          -> NatM Register
     vector_int_unpack_sse l at 2 W64 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
@@ -1803,9 +1805,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
               (3,1) -> unitOL (VSHUFPD fmt (ImmInt 0b11) (OpReg v1) v2 dst)
               _ -> pprPanic "vector shuffle: indices out of bounds 0 <= i <= 3" (ppr is)
             _ -> pprPanic "vector shuffle: wrong number of indices (expected 2)" (ppr is)
-        VecFormat 4 FmtFloat ->
+        VecFormat 4 FmtFloat
+          -- indices 0 <= i <= 7
+          | all ( (>= 0) <&&> (<= 7) ) is ->
           case is of
-            -- indices 0 <= i <= 7
             [i1, i2, i3, i4]
               | all ( <= 3 ) is
               , let imm = i1 + i2 `shiftL` 2 + i3 `shiftL` 4 + i4 `shiftL` 6
@@ -1842,6 +1845,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                 `snocOL`
                 (INSERTPS fmt (ImmInt $ insertImm i4 3) (OpReg $ vec i4) dst)
             _ -> pprPanic "vector shuffle: wrong number of indices (expected 4)" (ppr is)
+          | otherwise
+          -> pprPanic "vector shuffle: indices out of bounds 0 <= i <= 7" (ppr is)
         _ ->
           pprPanic "vector shuffle: unsupported format" (ppr fmt)
 
@@ -1882,12 +1887,12 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
                         -> CmmExpr
                         -> NatM Register
     -- FloatX4
-    vector_float_insert len at 4 W32 vecExpr valExpr (CmmLit offset)
+    vector_float_insert len at 4 W32 vecExpr valExpr (CmmLit (CmmInt offset _))
       = do
       fn          <- getAnyReg vecExpr
       (r, exp)    <- getSomeReg valExpr
       let fmt      = VecFormat len FmtFloat
-          imm      = litToImm offset
+          imm      = litToImm (CmmInt (offset `shiftL` 4) W32)
           code dst = exp `appOL`
                      (fn dst) `snocOL`
                      (INSERTPS fmt imm (OpReg r) dst)
@@ -1900,15 +1905,14 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
         let fmt = VecFormat len FmtDouble
             code dst
               = case offset of
-                  -- TODO: why not just index by element rather than by byte?
-                  CmmInt 0  _ -> valExp `appOL`
-                                 vecExp `snocOL`
-                                 (MOVSD FF64 (OpReg valReg) (OpReg dst)) `snocOL`
-                                 (SHUFPD fmt (ImmInt 0b00) (OpReg vecReg) dst)
-                  CmmInt 16 _ -> valExp `appOL`
-                                 vecExp `snocOL`
-                                 (MOVSD FF64 (OpReg vecReg) (OpReg dst)) `snocOL`
-                                 (SHUFPD fmt (ImmInt 0b00) (OpReg valReg) dst)
+                  CmmInt 0 _ -> valExp `appOL`
+                                vecExp `snocOL`
+                                (MOVSD FF64 (OpReg valReg) (OpReg dst)) `snocOL`
+                                (SHUFPD fmt (ImmInt 0b00) (OpReg vecReg) dst)
+                  CmmInt 1 _ -> valExp `appOL`
+                                vecExp `snocOL`
+                                (MOVSD FF64 (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (SHUFPD fmt (ImmInt 0b00) (OpReg valReg) dst)
                   _ -> pprPanic "MO_VF_Insert DoubleX2: unsupported offset" (ppr offset)
          in return $ Any fmt code
     -- For DoubleX4: use VSHUFPD.
@@ -1941,16 +1945,16 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
         pprTraceM "tmp:" (ppr tmp)
         let code dst
               = case offset of
-                  CmmInt 0  _ -> valExp `appOL`
-                                 vecExp `snocOL`
-                                 (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
-                                 (MOV II64 (OpReg valReg) (OpReg dst)) `snocOL`
-                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
-                  CmmInt 16 _ -> valExp `appOL`
-                                 vecExp `snocOL`
-                                 (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
-                                 (MOV II64 (OpReg valReg) (OpReg tmp)) `snocOL`
-                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
+                  CmmInt 0 _ -> valExp `appOL`
+                                vecExp `snocOL`
+                                (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
+                                (MOV II64 (OpReg valReg) (OpReg dst)) `snocOL`
+                                (PUNPCKLQDQ fmt (OpReg tmp) dst)
+                  CmmInt 1 _ -> valExp `appOL`
+                                vecExp `snocOL`
+                                (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (MOV II64 (OpReg valReg) (OpReg tmp)) `snocOL`
+                                (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   _ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
          in return $ Any fmt code
     vector_int_insert_sse len width _ _ offset


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -391,8 +391,6 @@ data Instr
         | VSHUFPS    Format Imm Operand Reg Reg
         | SHUFPD     Format Imm Operand Reg
         | VSHUFPD    Format Imm Operand Reg Reg
-        -- SIMD NCG TODO: don't store the Format (or only what we need)
-        -- in order to emit these instructions.
 
         | MOVHLPS    Format Operand Reg
         | PUNPCKLQDQ Format Operand Reg
@@ -874,9 +872,11 @@ mkSpillInstr config reg fmt delta slot
   = let off = spillSlotToOffset platform slot - delta
     in case fmt of
         VecFormat {}
-          -- SIMD NCG TODO: panic for unsupported VecFormats (& same in mkLoadInstr)
+          | formatInBytes fmt > 16
+          -> [VMOVU fmt (OpReg reg) (OpAddr (spRel platform off))]
+          | otherwise
           -> [MOVU fmt (OpReg reg) (OpAddr (spRel platform off))]
-            -- NB: not using MOVA because we have no guarantees about the stack
+            -- NB: not using MOVA, because we have no guarantees about the stack
             -- being sufficiently aligned, including even numbered stack slots.
         _ -> [MOV fmt (OpReg reg) (OpAddr (spRel platform off))]
     where platform = ncgPlatform config
@@ -894,9 +894,11 @@ mkLoadInstr config reg fmt delta slot
   = let off = spillSlotToOffset platform slot - delta
     in case fmt of
           VecFormat {}
-            -- SIMD NCG TODO: panic for unsupported VecFormats (& same in mkLoadInstr)
+            | formatInBytes fmt > 16
+            -> [VMOVU fmt (OpAddr (spRel platform off)) (OpReg reg)]
+            | otherwise
             -> [MOVU fmt (OpAddr (spRel platform off)) (OpReg reg)]
-            -- NB: not using MOVA because we have no guarantees about the stack
+            -- NB: not using MOVA, because we have no guarantees about the stack
             -- being sufficiently aligned, including even numbered stack slots.
           _ -> [MOV fmt (OpAddr (spRel platform off)) (OpReg reg)]
 
@@ -985,8 +987,9 @@ takeRegRegMoveInstr platform (MOV fmt (OpReg r1) (OpReg r2))
   -- MOV zeroes the upper part of vector registers,
   -- so it is not a real "move" in that case.
   | not (isVecFormat fmt)
-  -- Don't eliminate a move between e.g. RAX and XMM,
-  -- even though we might be using XMM to store a scalar integer value.
+  -- Don't eliminate a move between e.g. RAX and XMM:
+  -- even though we might be using XMM to store a scalar integer value,
+  -- some instructions only support XMM registers.
   , targetClassOfReg platform r1 == targetClassOfReg platform r2
   = Just (r1,r2)
 takeRegRegMoveInstr _ (MOVSD fmt (OpReg r1) (OpReg r2))


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -295,7 +295,7 @@ pprReg platform f r
       RegVirtual (VirtualRegHi u)  -> text "%vHi_"  <> pprUniqueAlways u
       RegVirtual (VirtualRegF  u)  -> text "%vF_"   <> pprUniqueAlways u
       RegVirtual (VirtualRegD  u)  -> text "%vD_"   <> pprUniqueAlways u
-      RegVirtual (VirtualRegVec u) -> text "%vVec_" <> pprUniqueAlways u
+      RegVirtual (VirtualRegV128 u) -> text "%vVec_" <> pprUniqueAlways u
 
   where
     ppr32_reg_no :: Format -> Int -> doc
@@ -1051,13 +1051,15 @@ pprInstr platform i = case i of
       char '\t' <> name <> pprBroadcastFormat format <> space
 
    pprBroadcastFormat :: Format -> Line doc
-   pprBroadcastFormat x
-     = case x of
-         VecFormat _ FmtFloat  -> text "ss"
-         VecFormat _ FmtDouble -> text "sd"
-         -- SIMD NCG TODO: Add Ints and remove panic
-         VecFormat {} -> panic "Incorrect width"
-         _ -> panic "Scalar Format invading vector operation"
+   pprBroadcastFormat (VecFormat _ f)
+     = case f of
+         FmtFloat  -> text "ss"
+         FmtDouble -> text "sd"
+         FmtInt8   -> text "b"
+         FmtInt16  -> text "w"
+         FmtInt32  -> text "d"
+         FmtInt64  -> text "q"
+   pprBroadcastFormat _ = panic "Scalar Format invading vector operation"
 
    pprFormatImmOp :: Line doc -> Format -> Imm -> Operand -> doc
    pprFormatImmOp name format imm op1


=====================================
compiler/GHC/CmmToAsm/X86/RegInfo.hs
=====================================
@@ -31,7 +31,7 @@ mkVirtualReg u format
         FF64    -> VirtualRegD u
         --TODO:
         -- Add VirtualRegAVX and inspect VecFormat and allocate
-        VecFormat {} -> VirtualRegVec u
+        VecFormat {} -> VirtualRegV128 u
         _other  -> VirtualRegI u
 
 regDotColor :: Platform -> RealReg -> SDoc


=====================================
compiler/GHC/CmmToAsm/X86/Regs.hs
=====================================
@@ -81,7 +81,7 @@ virtualRegSqueeze cls vr
          -> case vr of
                 VirtualRegD{}           -> 1
                 VirtualRegF{}           -> 0
-                VirtualRegVec{}         -> 1
+                VirtualRegV128{}         -> 1
                 _other                  -> 0
 
 


=====================================
compiler/GHC/CmmToLlvm/CodeGen.hs
=====================================
@@ -1850,16 +1850,13 @@ genShuffleOp :: [Int] -> CmmExpr -> CmmExpr -> LlvmM ExprData
 genShuffleOp is x y = runExprData $ do
   vx <- exprToVarW x
   vy <- exprToVarW y
-  mask <- exprToVarW $ CmmLit $ CmmVec $ map ((`CmmInt` W32) . fromIntegral) is
   let tx = getVarType vx
       ty = getVarType vy
   Panic.massertPpr
     (tx == ty)
     (vcat [ text "shuffle: mismatched arg types"
           , ppLlvmType tx, ppLlvmType ty ])
-  let fname = fsLit "shufflevector"
-  fptr <- liftExprData $ getInstrinct fname ty [tx, ty]
-  doExprW tx $ Call StdCall fptr [vx, vy, mask] [ReadNone, NoUnwind]
+  doExprW tx $ Shuffle vx vy is
 
 -- | Generate code for a fused multiply-add operation.
 genFmaOp :: CmmExpr -> CmmExpr -> CmmExpr -> LlvmM ExprData


=====================================
compiler/GHC/Llvm/Ppr.hs
=====================================
@@ -281,6 +281,7 @@ ppLlvmExpression opts expr
         Extract    vec idx          -> ppExtract opts vec idx
         ExtractV   struct idx       -> ppExtractV opts struct idx
         Insert     vec elt idx      -> ppInsert opts vec elt idx
+        Shuffle    v1 v2 idxs       -> ppShuffle opts v1 v2 idxs
         GetElemPtr inb ptr indexes  -> ppGetElementPtr opts inb ptr indexes
         Load       ptr align        -> ppLoad opts ptr align
         ALoad      ord st ptr       -> ppALoad opts ord st ptr
@@ -577,6 +578,15 @@ ppInsert opts vec elt idx =
 {-# SPECIALIZE ppInsert :: LlvmCgConfig -> LlvmVar -> LlvmVar -> LlvmVar -> SDoc #-}
 {-# SPECIALIZE ppInsert :: LlvmCgConfig -> LlvmVar -> LlvmVar -> LlvmVar -> HLine #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
 
+ppShuffle :: IsLine doc => LlvmCgConfig -> LlvmVar -> LlvmVar -> [Int] -> doc
+ppShuffle opts v1 v2 idxs =
+    text "shufflevector"
+    <+> ppLlvmType (getVarType v1) <+> ppName opts v1 <> comma
+    <+> ppLlvmType (getVarType v2) <+> ppName opts v2 <> comma
+    <+> ppLlvmType (LMVector (length idxs) (LMInt 32)) <+> ppLit opts (LMVectorLit $ map ((`LMIntLit` (LMInt 32)) . fromIntegral) idxs)
+{-# SPECIALIZE ppShuffle :: LlvmCgConfig -> LlvmVar -> LlvmVar -> [Int] -> SDoc #-}
+{-# SPECIALIZE ppShuffle :: LlvmCgConfig -> LlvmVar -> LlvmVar -> [Int] -> HLine #-} -- see Note [SPECIALIZE to HDoc] in GHC.Utils.Outputable
+
 ppMetaAnnotExpr :: IsLine doc => LlvmCgConfig -> [MetaAnnot] -> LlvmExpression -> doc
 ppMetaAnnotExpr opts meta expr =
    ppLlvmExpression opts expr <> ppMetaAnnots opts meta


=====================================
compiler/GHC/Llvm/Syntax.hs
=====================================
@@ -237,6 +237,10 @@ data LlvmExpression
   -}
   | Insert LlvmVar LlvmVar LlvmVar
 
+  {- | Shuffle two vectors into a destination vector using given indices
+  -}
+  | Shuffle LlvmVar LlvmVar [Int]
+
   {- |
     Allocate amount * sizeof(tp) bytes on the heap
       * tp:     LlvmType to reserve room for


=====================================
compiler/GHC/Platform/Reg.hs
=====================================
@@ -1,3 +1,5 @@
+{-# LANGUAGE MagicHash #-}
+
 -- | An architecture independent description of a register.
 --      This needs to stay architecture independent because it is used
 --      by NCGMonad and the register allocators, which are shared
@@ -27,6 +29,7 @@ module GHC.Platform.Reg (
 where
 
 import GHC.Prelude
+import GHC.Exts ( Int(I#), dataToTag# )
 
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
@@ -53,66 +56,44 @@ type RegNo
 --      Virtual regs can be of either class, so that info is attached.
 --
 data VirtualReg
-        = VirtualRegI   {-# UNPACK #-} !Unique
-        | VirtualRegHi  {-# UNPACK #-} !Unique  -- High part of 2-word register
-        | VirtualRegF   {-# UNPACK #-} !Unique
-        | VirtualRegD   {-# UNPACK #-} !Unique
-        | VirtualRegVec {-# UNPACK #-} !Unique
-        deriving (Eq, Show)
-
--- This is laborious, but necessary. We can't derive Ord because
--- Unique doesn't have an Ord instance. Note nonDetCmpUnique in the
--- implementation. See Note [No Ord for Unique]
+   -- | Integer virtual register
+   = VirtualRegI    { virtualRegUnique :: {-# UNPACK #-} !Unique }
+   -- | High part of 2-word virtual register
+   | VirtualRegHi   { virtualRegUnique :: {-# UNPACK #-} !Unique }
+   -- | Float virtual register
+   | VirtualRegF    { virtualRegUnique :: {-# UNPACK #-} !Unique }
+   -- | Double virtual register
+   | VirtualRegD    { virtualRegUnique :: {-# UNPACK #-} !Unique }
+   -- | 128-bit wide vector virtual register
+   | VirtualRegV128 { virtualRegUnique :: {-# UNPACK #-} !Unique }
+   deriving (Eq, Show)
+
+-- We can't derive Ord, because Unique doesn't have an Ord instance.
+-- Note nonDetCmpUnique in the implementation. See Note [No Ord for Unique].
 -- This is non-deterministic but we do not currently support deterministic
 -- code-generation. See Note [Unique Determinism and code generation]
 instance Ord VirtualReg where
-  compare (VirtualRegI a) (VirtualRegI b) = nonDetCmpUnique a b
-  compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
-  compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
-  compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
-  compare (VirtualRegVec a) (VirtualRegVec b) = nonDetCmpUnique a b
-
-  compare VirtualRegI{} _ = LT
-  compare _ VirtualRegI{} = GT
-  compare VirtualRegHi{} _ = LT
-  compare _ VirtualRegHi{} = GT
-  compare VirtualRegF{} _ = LT
-  compare _ VirtualRegF{} = GT
-  compare VirtualRegVec{} _ = LT
-  compare _ VirtualRegVec{} = GT
-
+  compare vr1 vr2 =
+    case compare (I# (dataToTag# vr1)) (I# (dataToTag# vr2)) of
+      LT -> LT
+      GT -> GT
+      EQ -> nonDetCmpUnique (virtualRegUnique vr1) (virtualRegUnique vr2)
 
 instance Uniquable VirtualReg where
-        getUnique reg
-         = case reg of
-                VirtualRegI u   -> u
-                VirtualRegHi u  -> u
-                VirtualRegF u   -> u
-                VirtualRegD u   -> u
-                VirtualRegVec u -> u
+        getUnique = virtualRegUnique
 
 instance Outputable VirtualReg where
         ppr reg
          = case reg of
-                VirtualRegI  u  -> text "%vI_"   <> pprUniqueAlways u
-                VirtualRegHi u  -> text "%vHi_"  <> pprUniqueAlways u
-                -- this code is kinda wrong on x86
-                -- because float and double occupy the same register set
-                -- namely SSE2 register xmm0 .. xmm15
-                VirtualRegF  u  -> text "%vFloat_"  <> pprUniqueAlways u
-                VirtualRegD  u  -> text "%vDouble_" <> pprUniqueAlways u
-                VirtualRegVec u -> text "%vVec_"    <> pprUniqueAlways u
-
+                VirtualRegI    u -> text "%vI_"   <> pprUniqueAlways u
+                VirtualRegHi   u -> text "%vHi_"  <> pprUniqueAlways u
+                VirtualRegF    u -> text "%vFloat_"  <> pprUniqueAlways u
+                VirtualRegD    u -> text "%vDouble_" <> pprUniqueAlways u
+                VirtualRegV128 u -> text "%vV128_"   <> pprUniqueAlways u
 
 
 renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
-renameVirtualReg u r
- = case r of
-        VirtualRegI _   -> VirtualRegI  u
-        VirtualRegHi _  -> VirtualRegHi u
-        VirtualRegF _   -> VirtualRegF  u
-        VirtualRegD _   -> VirtualRegD  u
-        VirtualRegVec _ -> VirtualRegVec u
+renameVirtualReg u r = r { virtualRegUnique = u }
 
 classOfVirtualReg :: VirtualReg -> RegClass
 classOfVirtualReg vr
@@ -121,7 +102,7 @@ classOfVirtualReg vr
         VirtualRegHi{}  -> RcInteger
         VirtualRegF{}   -> RcFloatOrVector
         VirtualRegD{}   -> RcFloatOrVector
-        VirtualRegVec{} -> RcFloatOrVector
+        VirtualRegV128{} -> RcFloatOrVector
 
 
 -- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -2651,7 +2651,7 @@ doVecPackOp ty z es res = do
     vecPack src [] _ =
         emitAssign (CmmLocal res) (CmmReg (CmmLocal src))
 
-    -- SIMD NCG TODO (optional): it should be possible to emit better code
+    -- SIMD NCG TODO: it should be possible to emit better code
     -- for "pack" than doing a bunch of vector insertions in a row.
     vecPack src (e : es) i = do
         dst <- newTemp ty
@@ -2663,8 +2663,7 @@ doVecPackOp ty z es res = do
         vecPack dst es (i + 1)
       where
         -- vector indices are always 32-bits
-        -- TODO: consider indexing by element rather than by byte
-        iLit = CmmLit (CmmInt ((toInteger i) * 16) W32)
+        iLit = CmmLit (CmmInt (toInteger i) W32)
 
     len :: Length
     len = vecLength ty


=====================================
compiler/GHC/Types/Unique/FM.hs
=====================================
@@ -71,6 +71,7 @@ module GHC.Types.Unique.FM (
         nonDetStrictFoldUFM_Directly,
         anyUFM, allUFM, seqEltsUFM,
         mapUFM, mapUFM_Directly, strictMapUFM,
+        mapKeysUFM,
         mapMaybeUFM, mapMaybeWithKeyUFM,
         elemUFM, elemUFM_Directly,
         filterUFM, filterUFM_Directly, partitionUFM,
@@ -397,6 +398,10 @@ mapMaybeWithKeyUFM f (UFM m) = UFM (M.mapMaybeWithKey (f . mkUniqueGrimily) m)
 mapUFM_Directly :: (Unique -> elt1 -> elt2) -> UniqFM key elt1 -> UniqFM key elt2
 mapUFM_Directly f (UFM m) = UFM (M.mapWithKey (f . mkUniqueGrimily) m)
 
+-- | Map over the keys in a 'UniqFM'.
+mapKeysUFM :: Uniquable key' => (key -> key') -> UniqFM key (key, b) -> UniqFM key' (key', b)
+mapKeysUFM f m = listToUFM $ map ( \ (r, fmt) -> let r' = f r in (r', (r', fmt)) ) $ nonDetEltsUFM m
+
 strictMapUFM :: (a -> b) -> UniqFM k a -> UniqFM k b
 strictMapUFM f (UFM a) = UFM $ MS.map f a
 



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/6669eee7d5d242044fc0b17ff72c42755c4f4778

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/6669eee7d5d242044fc0b17ff72c42755c4f4778
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240614/349833d8/attachment-0001.html>