[Git][ghc/ghc][wip/ncg-simd] 2 commits: rework X86 MOV instruction

sheaf (@sheaf) gitlab at gitlab.haskell.org
Wed Jun 26 08:51:57 UTC 2024



sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
37d40f25 by sheaf at 2024-06-26T10:51:41+02:00
rework X86 MOV instruction

- - - - -
125fb19c by sheaf at 2024-06-26T10:51:42+02:00
SIMD cleanups

- - - - -


6 changed files:

- compiler/GHC/Cmm/Opt.hs
- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/CmmToC.hs


Changes:

=====================================
compiler/GHC/Cmm/Opt.hs
=====================================
@@ -80,10 +80,10 @@ cmmMachOpFoldM
     -> [CmmExpr]
     -> Maybe CmmExpr
 cmmMachOpFoldM _ (MO_V_Broadcast {}) _ = Nothing
-  -- SIMD NCG TODO: constant folding doesn't work correctly for Broadcast instructions,
-  -- perhaps due to the fact that the argument is a scalar but the result is a vector.
 cmmMachOpFoldM _ (MO_VF_Broadcast {}) _ = Nothing
-
+  -- SIMD NCG TODO: supporting constant folding for vector operations
+  -- would require augmenting getRegister' to handle the resulting
+  -- vector literals; see "getRegister' platform _ (CmmLit lit)".
 cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
   = Just $! case op of
       MO_S_Neg _ -> CmmLit (CmmInt (-x) rep)

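For reference, the scalar folding that remains in place can be sketched as follows (a minimal illustration, not part of this commit; foldNegExample is a hypothetical name, the constructors are those used in the hunk above):

    -- Negating an integer literal folds to a literal. The Broadcast cases
    -- above return Nothing because getRegister' cannot yet materialise the
    -- folded vector literal.
    foldNegExample :: MachOp -> [CmmExpr] -> Maybe CmmExpr
    foldNegExample (MO_S_Neg _) [CmmLit (CmmInt x rep)]
      = Just (CmmLit (CmmInt (negate x) rep))
    foldNegExample _ _ = Nothing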

=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -25,6 +25,8 @@ module GHC.CmmToAsm.Format (
     formatToWidth,
     formatInBytes,
     isIntScalarFormat,
+    isFloatScalarFormat,
+    scalarFormatFormat,
     VirtualRegFormat(..),
     RegFormat(..),
     takeVirtualRegs,
@@ -101,12 +103,23 @@ data ScalarFormat
   | FmtDouble
   deriving (Show, Eq, Ord)
 
+scalarFormatFormat :: ScalarFormat -> Format
+scalarFormatFormat = \case
+  FmtInt8 -> II8
+  FmtInt16 -> II16
+  FmtInt32 -> II32
+  FmtInt64 -> II64
+  FmtFloat -> FF32
+  FmtDouble -> FF64
+
+isFloatScalarFormat :: ScalarFormat -> Bool
+isFloatScalarFormat = \case
+  FmtFloat -> True
+  FmtDouble -> True
+  _ -> False
+
 isIntScalarFormat :: ScalarFormat -> Bool
-isIntScalarFormat FmtInt8 = True
-isIntScalarFormat FmtInt16 = True
-isIntScalarFormat FmtInt32 = True
-isIntScalarFormat FmtInt64 = True
-isIntScalarFormat _ = False
+isIntScalarFormat = not . isFloatScalarFormat
 
 -- | Get the integer format of this width.
 intFormat :: Width -> Format

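For reference, a sketch of how the two new helpers compose (laneFormat is a hypothetical name, not part of this commit; the Ppr.hs change below uses this exact pattern to pick a scalar mnemonic for a vector-formatted MOV):

    -- Recover the per-lane scalar Format of a vector format, falling back
    -- to the format itself for scalar formats.
    laneFormat :: Format -> Format
    laneFormat (VecFormat _len sFmt) = scalarFormatFormat sFmt
    laneFormat fmt                   = fmt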

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -717,8 +717,8 @@ iselExpr64 (CmmMachOp (MO_S_MulMayOflo W64) _) = do
    -- We always return a (usually false) positive.
    Reg64 rhi rlo <- getNewReg64
    let code = toOL   [
-                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rhi),
-                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rlo)
+                       MOV II32 (OpImm (ImmInt 1)) (OpReg rhi),
+                       MOV II32 (OpImm (ImmInt 1)) (OpReg rlo)
                      ]
    return (RegCode64 code rhi rlo)
 
@@ -1067,11 +1067,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       -- has 8-bit version). So for 32-bit code, we'll just zero-extend.
       MO_XX_Conv W8  W32
           | is32Bit   -> integerExtend W8 W32 MOVZxL x
-          | otherwise -> integerExtend W8 W32 (MOV) x
+          | otherwise -> integerExtend W8 W32 MOV x
       MO_XX_Conv W8  W16
           | is32Bit   -> integerExtend W8 W16 MOVZxL x
-          | otherwise -> integerExtend W8 W16 (MOV) x
-      MO_XX_Conv W16 W32 -> integerExtend W16 W32 (MOV) x
+          | otherwise -> integerExtend W8 W16 MOV x
+      MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
 
       MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
       MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
@@ -1085,9 +1085,9 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       -- the form of a movzl and print it as a movl later.
       -- This doesn't apply to MO_XX_Conv since in this case we don't care about
       -- the upper bits. So we can just use MOV.
-      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 (MOV) x
-      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 (MOV) x
-      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 (MOV) x
+      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOV x
+      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
 
       MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
@@ -1104,7 +1104,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
                      | sse && sse2   -> vector_float_negate_sse l w x
                      | otherwise
                        -> sorry "Please enable the -mavx or -msse, -msse2 flag"
-      -- SIMD NCG TODO
+      -- SIMD NCG TODO: add integer negation
       MO_VS_Neg {} -> needLlvm mop
 
       MO_VF_Broadcast l W32 | avx       -> vector_float_broadcast_avx l W32 x
@@ -1311,7 +1311,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getSomeReg expr
           let fmt = VecFormat len FmtInt64
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOV2 II64 fmt (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
                                     (PUNPCKLQDQ fmt (OpReg dst) dst)
                                     )
         vector_int_broadcast _ _ c
@@ -1384,15 +1384,15 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
         | otherwise
         -> sorry "Please use -fllvm for wide shuffle instructions"
 
-      MO_VF_Extract l W32   | avx       -> vector_float_unpack l W32 x y
-                            | sse       -> vector_float_unpack_sse l W32 x y
+      MO_VF_Extract l W32   | avx       -> vector_float_extract l W32 x y
+                            | sse       -> vector_float_extract_sse l W32 x y
                             | otherwise
                               -> sorry "Please enable the -mavx or -msse flag"
-      MO_VF_Extract l W64   | sse2      -> vector_float_unpack l W64 x y
+      MO_VF_Extract l W64   | sse2      -> vector_float_extract l W64 x y
                             | otherwise -> sorry "Please enable the -msse2 flag"
       MO_VF_Extract {} -> incorrectOperands
 
-      MO_V_Extract l W64    | sse2      -> vector_int_unpack_sse l W64 x y
+      MO_V_Extract l W64    | sse2      -> vector_int_extract_sse l W64 x y
                             | otherwise -> sorry "Please enable the -msse2 flag"
       -- SIMD NCG TODO: W32, W16, W8
       MO_V_Extract {} -> needLlvm mop
@@ -1653,7 +1653,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let format   = case w of
                        W32 -> VecFormat l FmtFloat
                        W64 -> VecFormat l FmtDouble
-                       _ -> pprPanic "Operation not supported for width " (ppr w)
+                       _ -> pprPanic "Floating-point AVX vector operation not supported at this width"
+                             (text "width:" <+> ppr w)
           code dst = case op of
             VA_Add -> arithInstr VADD
             VA_Sub -> arithInstr VSUB
@@ -1677,7 +1678,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let format   = case w of
                        W32 -> VecFormat l FmtFloat
                        W64 -> VecFormat l FmtDouble
-                       _ -> pprPanic "Operation not supported for width " (ppr w)
+                       _ -> pprPanic "Floating-point SSE vector operation not supported at this width"
+                             (text "width:" <+> ppr w)
           code dst = case op of
             VA_Add -> arithInstr ADD
             VA_Sub -> arithInstr SUB
@@ -1691,12 +1693,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                   (instr format (OpReg reg2) (OpReg dst))
       return (Any format code)
     --------------------
-    vector_float_unpack :: Length
-                        -> Width
-                        -> CmmExpr
-                        -> CmmExpr
-                        -> NatM Register
-    vector_float_unpack l W32 expr (CmmLit lit)
+    vector_float_extract :: Length
+                         -> Width
+                         -> CmmExpr
+                         -> CmmExpr
+                         -> NatM Register
+    vector_float_extract l W32 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
       let format   = VecFormat l FmtFloat
@@ -1705,9 +1707,9 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
             = case lit of
                 CmmInt 0 _ -> exp `snocOL` (MOV FF32 (OpReg r) (OpReg dst))
                 CmmInt _ _ -> exp `snocOL` (VPSHUFD format imm (OpReg r) dst)
-                _          -> panic "Error in offset while unpacking"
+                _          -> pprPanic "Unsupported AVX floating-point vector extract offset" (ppr lit)
       return (Any format code)
-    vector_float_unpack l W64 expr (CmmLit lit)
+    vector_float_extract l W64 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
       let format   = VecFormat l FmtDouble
@@ -1716,19 +1718,19 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                 CmmInt 0 _ -> exp `snocOL`
                               (MOV FF64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
-                              (MOVHLPS format (OpReg r) dst)
-                _          -> panic "Error in offset while unpacking"
+                              (MOVHLPS format r dst)
+                _          -> pprPanic "Unsupported AVX floating-point vector extract offset" (ppr lit)
       return (Any format code)
-    vector_float_unpack _ w c e
-      = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
+    vector_float_extract _ w c e
+      = pprPanic "Unsupported AVX floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
     -----------------------
 
-    vector_float_unpack_sse :: Length
-                            -> Width
-                            -> CmmExpr
-                            -> CmmExpr
-                            -> NatM Register
-    vector_float_unpack_sse l W32 expr (CmmLit lit)
+    vector_float_extract_sse :: Length
+                             -> Width
+                             -> CmmExpr
+                             -> CmmExpr
+                             -> NatM Register
+    vector_float_extract_sse l W32 expr (CmmLit lit)
       = do
       (r,exp) <- getSomeReg expr
       let format   = VecFormat l FmtFloat
@@ -1737,18 +1739,18 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
             = case lit of
                 CmmInt 0 _ -> exp `snocOL` (MOVU format (OpReg r) (OpReg dst))
                 CmmInt _ _ -> exp `snocOL` (PSHUFD format imm (OpReg r) dst)
-                _          -> panic "Error in offset while unpacking"
+                _          -> pprPanic "Unsupported SSE floating-point vector extract offset" (ppr lit)
       return (Any format code)
-    vector_float_unpack_sse _ w c e
-      = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
+    vector_float_extract_sse _ w c e
+      = pprPanic "Unsupported SSE floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
     -----------------------
 
-    vector_int_unpack_sse :: Length
-                          -> Width
-                          -> CmmExpr
-                          -> CmmExpr
-                          -> NatM Register
-    vector_int_unpack_sse l@2 W64 expr (CmmLit lit)
+    vector_int_extract_sse :: Length
+                           -> Width
+                           -> CmmExpr
+                           -> CmmExpr
+                           -> NatM Register
+    vector_int_extract_sse l@2 W64 expr (CmmLit lit)
       = do
       (r, exp) <- getSomeReg expr
       let fmt = VecFormat l FmtInt64
@@ -1756,14 +1758,14 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let code dst
             = case lit of
                 CmmInt 0 _ -> exp `snocOL`
-                              (MOV2 fmt II64 (OpReg r) (OpReg dst))
+                              (MOVD II64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
-                              (MOVHLPS fmt (OpReg r) tmp) `snocOL`
-                              (MOV2 fmt II64 (OpReg tmp) (OpReg dst))
+                              (MOVHLPS fmt r tmp) `snocOL`
+                              (MOVD II64 (OpReg tmp) (OpReg dst))
                 _          -> panic "Error in offset while unpacking"
       return (Any fmt code)
-    vector_int_unpack_sse _ w c e
-      = pprPanic "Unpack not supported for : " (pdoc platform c $$ pdoc platform e $$ ppr w)
+    vector_int_extract_sse _ w c e
+      = pprPanic "Unsupported SSE floating-point vector extract" (pdoc platform c $$ pdoc platform e $$ ppr w)
 
     vector_shuffle_float :: Length -> Width -> CmmExpr -> CmmExpr -> [Int] -> NatM Register
     vector_shuffle_float l w v1 v2 is = do
@@ -1938,13 +1940,13 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
               = case offset of
                   CmmInt 0 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
-                                (MOV2 II64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
+                                (MOVHLPS fmt vecReg tmp) `snocOL`
+                                (MOVD II64 (OpReg valReg) (OpReg dst)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   CmmInt 1 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
-                                (MOV2 II64 fmt (OpReg valReg) (OpReg tmp)) `snocOL`
+                                (MOV II64 (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (MOVD II64 (OpReg valReg) (OpReg tmp)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   _ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
          in return $ Any fmt code
@@ -1978,26 +1980,29 @@ getRegister' _ _ (CmmLoad mem pk _)
     loadFloatAmode  (typeWidth pk) addr mem_code
 
 getRegister' _ is32Bit (CmmLoad mem pk _)
+  -- SIMD NCG TODO: what about vectors?
   | is32Bit && not (isWord64 pk)
   = do
     let
       instr = case width of
                 W8     -> MOVZxL II8
+                  -- We always zero-extend 8-bit loads, if we
+                  -- can't think of anything better.  This is because
+                  -- we can't guarantee access to an 8-bit variant of every register
+                  -- (esi and edi don't have 8-bit variants), so to make things
+                  -- simpler we do our 8-bit arithmetic with full 32-bit registers.
                 _other -> MOV format
     code <- intLoadCode instr mem
     return (Any format code)
   where
     width = typeWidth pk
     format = intFormat width
-        -- We always zero-extend 8-bit loads, if we
-        -- can't think of anything better.  This is because
-        -- we can't guarantee access to an 8-bit variant of every register
-        -- (esi and edi don't have 8-bit variants), so to make things
-        -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
 
 -- Simpler memory load code on x86_64
 getRegister' _ is32Bit (CmmLoad mem pk _)
- | not is32Bit
+  -- SIMD NCG TODO: what about vectors?
+  | not is32Bit
   = do
     code <- intLoadCode (MOV format) mem
     return (Any format code)
@@ -2058,13 +2063,17 @@ getRegister' platform _ (CmmLit lit)
   where
     cmmtype = cmmLitType platform lit
     vectorRegister ctype
-      = do
-      --NOTE:
-      -- This operation is only used to zero a register. For loading a
-      -- vector literal there are pack and broadcast operations
-      let format = cmmTypeFormat ctype
-          code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
-      return (Any format code)
+      | CmmVec fs <- lit
+      , all (\case { CmmInt i _ -> i == 0; CmmFloat f _ -> f == 0; _ -> False }) fs
+      = -- NOTE:
+        -- This operation is only used to zero a register. For loading a
+        -- vector literal there are pack and broadcast operations
+        let format = cmmTypeFormat ctype
+            code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+        in return (Any format code)
+      | otherwise
+      = pprPanic "getRegister': no support for (nonzero) vector literals" $
+          vcat [ text "lit:" <+> ppr lit ]
+      -- SIMD NCG TODO: can we do better here?
     standardRegister ctype
       = do
       let format = cmmTypeFormat ctype
@@ -3519,10 +3528,7 @@ genCCall64 addr conv dest_regs args = do
                                 -- If we are calling a varargs function
                                 -- then we need to define ireg as well
                                 -- as freg
-                                CVTTSD2SIQ II64 (OpReg freg) ireg)
-                                  -- SLD TODO: I changed this from MOV FF64 (OpReg freg) (OpReg ireg)
-                                  -- to CVTTSD2SIQ ...
-                                  -- because it is going between two different types of register
+                                MOVD FF64 (OpReg freg) (OpReg ireg))
             | otherwise = do
                  arg_code <- getAnyReg arg
                  load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
@@ -3538,10 +3544,11 @@ genCCall64 addr conv dest_regs args = do
              (arg_reg, arg_code) <- getSomeReg arg
              delta <- getDeltaNat
              setDeltaNat (delta-arg_size)
-             let code' = code `appOL` arg_code `appOL` toOL [
+             let fmt = floatFormat width
+                 code' = code `appOL` arg_code `appOL` toOL [
                             SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
                             DELTA (delta-arg_size),
-                            MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel platform 0))]
+                            MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
              push_args rest code'
 
            | otherwise = do
@@ -3650,17 +3657,13 @@ genCCall64 addr conv dest_regs args = do
         -- assign the results, if necessary
         assign_code []     = nilOL
         assign_code [dest] =
-          case typeWidth rep of
-                W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
-                                                     (OpReg xmm0)
-                                                     (OpReg r_dest))
-                W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
-                                                     (OpReg xmm0)
-                                                     (OpReg r_dest))
-                _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+          unitOL $
+            movInstr config fmt (OpReg reg) (OpReg r_dest)
           where
-                rep = localRegType dest
-                r_dest = getRegisterReg platform  (CmmLocal dest)
+            reg = if isIntFormat fmt then rax else xmm0
+            fmt = cmmTypeFormat rep
+            rep = localRegType dest
+            r_dest = getRegisterReg platform (CmmLocal dest)
         assign_code _many = panic "genForeignCall.assign_code many"
 
     return (adjust_rsp          `appOL`
@@ -3769,9 +3772,10 @@ genSwitch expr targets = do
             tableReg <- getNewRegNat (intFormat (platformWordWidth platform))
             targetReg <- getNewRegNat (intFormat (platformWordWidth platform))
             let op = OpAddr (AddrBaseIndex (EABaseReg tableReg) (EAIndex reg (platformWordSizeInBytes platform)) (ImmInt 0))
+                fmt = archWordFormat is32bit
                 code = e_code `appOL` toOL
-                    [ LEA (archWordFormat is32bit) (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
-                    , MOV (archWordFormat is32bit) op (OpReg targetReg)
+                    [ LEA fmt (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
+                    , MOV fmt op (OpReg targetReg)
                     , JMP_TBL (OpReg targetReg) ids (Section ReadOnlyData lbl) lbl
                     ]
             return code
@@ -4432,10 +4436,10 @@ genCtzGeneric width dst src = do
       src_r <- getNewRegNat format
       tmp_r <- getNewRegNat format
       let instrs = code_src src_r `appOL` toOL
-               ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-                [ BSF     format (OpReg src_r) tmp_r
+               ([ MOVZxL  II8    (OpReg src_r)       (OpReg src_r) | width == W8 ] ++
+                [ BSF     format (OpReg src_r)       tmp_r
                 , MOV     II32   (OpImm (ImmInt bw)) (OpReg dst_r)
-                , CMOV NE format (OpReg tmp_r) dst_r
+                , CMOV NE format (OpReg tmp_r)       dst_r
                 ]) -- NB: We don't need to zero-extend the result for the
                    -- W8/W16 cases because the 'MOV' insn already
                    -- took care of implicitly clearing the upper bits
@@ -4509,11 +4513,11 @@ genMemCpyInlineMaybe align dst src n = do
               go dst src tmp (i - 4)
           | i >= 2 =
               unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOV    II16  (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 2)
           | i >= 1 =
               unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOV    II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 1)
           | otherwise = nilOL
         where
@@ -4789,8 +4793,8 @@ genPext bid width dst src mask = do
               toOL
                 [ MOVZxL format (OpReg src_r ) (OpReg src_r )
                 , MOVZxL format (OpReg mask_r) (OpReg mask_r)
-                , PEXT   II32 (OpReg mask_r) (OpReg src_r ) dst_r
-                , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+                , PEXT   II32   (OpReg mask_r) (OpReg src_r ) dst_r
+                , MOVZxL format (OpReg dst_r)  (OpReg dst_r) -- Truncate to op width
                 ]
             else
               unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
@@ -4832,11 +4836,11 @@ genClz bid width dst src = do
           src_r <- getNewRegNat format
           tmp_r <- getNewRegNat format
           return $ code_src src_r `appOL` toOL
-                   ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-                    [ BSR     format (OpReg src_r) tmp_r
-                    , MOV     II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
-                    , CMOV NE format (OpReg tmp_r) dst_r
-                    , XOR     format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+                   ([ MOVZxL   II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+                    [ BSR      format (OpReg src_r) tmp_r
+                    , MOV      II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+                    , CMOV NE  format (OpReg tmp_r) dst_r
+                    , XOR      format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
                     ]) -- NB: We don't need to zero-extend the result for the
                        -- W8/W16 cases because the 'MOV' insn already
                        -- took care of implicitly clearing the upper bits
@@ -4848,7 +4852,8 @@ genWordToFloat bid width dst src =
 
 genAtomicRead :: Width -> MemoryOrdering -> LocalReg -> CmmExpr -> NatM InstrBlock
 genAtomicRead width _mord dst addr = do
-  load_code <- intLoadCode (MOV (intFormat width)) addr
+  let fmt = intFormat width
+  load_code <- intLoadCode (MOV fmt) addr
   return (load_code (getLocalRegReg dst))
 
 genAtomicWrite :: Width -> MemoryOrdering -> CmmExpr -> CmmExpr -> NatM InstrBlock

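For reference, the MOV2-to-MOVD rework in the broadcast and extract code above reduces to a two-instruction idiom. A sketch of the instructions selected for broadcasting a 64-bit integer into an Int64X2 vector, using constructors from this diff (broadcastInt64X2, reg and dst are illustrative):

    broadcastInt64X2 reg dst = toOL
      [ MOVD II64 (OpReg reg) (OpReg dst)
          -- bitcast: general-purpose register -> low 64 bits of the xmm dst
      , PUNPCKLQDQ (VecFormat 2 FmtInt64) (OpReg dst) dst
          -- interleave low quadwords: duplicate dst's low lane into both lanes
      ]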

=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -32,6 +32,7 @@ module GHC.CmmToAsm.X86.Instr
    , mkStackDeallocInstr
    , mkSpillInstr
    , mkRegRegMoveInstr
+   , movInstr
    , jumpDestsOfInstr
    , canFallthroughTo
    , patchRegsOfInstr
@@ -201,20 +202,18 @@ data Instr
         -- This carries a BlockId so it can be used in unwinding information.
         | DELTA  Int
 
-        -- Moves.
-        | MOV         Format Operand Operand
-             -- ^ N.B. Due to AT&T assembler quirks, when used with 'II64'
+        -- | X86 scalar move instruction.
+        --
+        -- When used at a vector format, this only moves the low scalar lane
+        -- (e.g. 32 bits for MOVSS, 64 bits for MOVSD/MOVQ); the rest of the
+        -- destination is either zeroed or preserved, depending on the format.
+        | MOV Format Operand Operand
+             -- N.B. Due to AT&T assembler quirks, when used with 'II64'
              -- 'Format' immediate source and memory target operand, the source
              -- operand is interpreted to be a 32-bit sign-extended value.
              -- True 64-bit operands need to be either first moved to a register or moved
              -- with @MOVABS@; we currently do not use this instruction in GHC.
              -- See https://stackoverflow.com/questions/52434073/whats-the-difference-between-the-x86-64-att-instructions-movq-and-movabsq.
-        | MOV2 Format Format Operand Operand
-         -- ^ Like MOV, but between two different kinds of registers
-         -- (e.g. moving rax to xmm1)
-         --
-         -- SIMD NCG TODO: this is a bit of a hack, but the alternative would
-         -- be to have MOV store two Formats to handle xmm -> rax and rax -> xmm.
 
         | MOVD   Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
                                         -- (bitcast between a general purpose
@@ -378,12 +377,20 @@ data Instr
         | INSERTPS    Format Imm Operand Reg
 
         -- move operations
-        | VMOVU       Format Operand Operand
+
+        -- | SSE2 unaligned move of floating-point vectors
         | MOVU        Format Operand Operand
+        -- | AVX unaligned move of floating-point vectors
+        | VMOVU       Format Operand Operand
+        -- | SSE2 move between memory and low-part of an xmm register
         | MOVL        Format Operand Operand
+        -- | SSE move between memory and high-part of an xmm register
         | MOVH        Format Operand Operand
+        -- | SSE aligned move of floating-point vectors
         | MOVA        Format Operand Operand
+        -- | SSE2 unaligned move of integer vectors
         | MOVDQU      Format Operand Operand
+        -- | AVX unaligned move of integer vectors
         | VMOVDQU     Format Operand Operand
 
         -- logic operations
@@ -403,7 +410,9 @@ data Instr
         | SHUFPD     Format Imm Operand Reg
         | VSHUFPD    Format Imm Operand Reg Reg
 
-        | MOVHLPS    Format Operand Reg
+        -- | Move the high 64 bits of an xmm register
+        -- into the low 64 bits of another xmm register.
+        | MOVHLPS    Format Reg Reg
         | PUNPCKLQDQ Format Operand Reg
 
         -- Shift
@@ -426,9 +435,21 @@ data FMAPermutation = FMA132 | FMA213 | FMA231
 regUsageOfInstr :: Platform -> Instr -> RegUsage
 regUsageOfInstr platform instr
  = case instr of
-    MOV    fmt src dst    -> usageRW fmt src dst
-    MOV2   srcFmt dstFmt src dst -> mkRU (use_R srcFmt src []) (use_R dstFmt dst [])
-    MOVD   fmt src dst    -> mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
+    MOV fmt src dst
+      -- MOVSS/MOVSD preserve the upper half of vector registers,
+      -- but only for reg-2-reg moves
+      | VecFormat _ sFmt <- fmt
+      , isFloatScalarFormat sFmt
+      , OpReg {} <- src
+      , OpReg {} <- dst
+      -> usageRM fmt src dst
+      -- other MOV instructions zero any remaining upper part of the destination
+      -- (largely to avoid partial register stalls)
+      | otherwise
+      -> usageRW fmt src dst
+    MOVD   fmt src dst    ->
+      -- NB: MOVD/MOVQ always zero any remaining upper part of destination
+      mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
     CMOV _ fmt src dst    -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
     MOVZxL fmt src dst    -> usageRW fmt src dst
     MOVSxL fmt src dst    -> usageRW fmt src dst
@@ -566,7 +587,7 @@ regUsageOfInstr platform instr
     PSLLDQ fmt off dst -> mkRU (use_R fmt off []) [mk fmt dst]
 
     MOVHLPS    fmt src dst
-      -> mkRU (use_R fmt src []) [mk fmt dst]
+      -> mkRU [mk fmt src] [mk fmt dst]
     PUNPCKLQDQ fmt src dst
       -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
 
@@ -680,7 +701,6 @@ patchRegsOfInstr :: HasDebugCallStack => Platform -> Instr -> (Reg -> Reg) -> In
 patchRegsOfInstr platform instr env
   = case instr of
     MOV fmt src dst      -> MOV fmt (patchOp src) (patchOp dst)
-    MOV2 srcFmt dstFmt src dst -> MOV2 srcFmt dstFmt (patchOp src) (patchOp dst)
     MOVD fmt src dst     -> patch2 (MOVD fmt) src dst
     CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
     MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
@@ -805,7 +825,7 @@ patchRegsOfInstr platform instr env
       -> PSRLDQ  fmt (patchOp off) (env dst)
 
     MOVHLPS    fmt src dst
-      -> MOVHLPS fmt (patchOp src) (env dst)
+      -> MOVHLPS fmt (env src) (env dst)
     PUNPCKLQDQ fmt src dst
       -> PUNPCKLQDQ fmt (patchOp src) (env dst)
 
@@ -899,8 +919,14 @@ mkSpillInstr
     -> [Instr]
 
 mkSpillInstr config (RegFormat reg fmt) delta slot =
-  [ movInstr config fmt (OpReg reg) (OpAddr (spRel platform off)) ]
+  [ movInstr config fmt' (OpReg reg) (OpAddr (spRel platform off)) ]
   where
+    fmt'
+      | isVecFormat fmt
+      = fmt
+      | otherwise
+      = scalarMoveFormat platform fmt
+      -- Spill the platform word size, at a minimum
     platform = ncgPlatform config
     off = spillSlotToOffset platform slot - delta
 
@@ -914,16 +940,27 @@ mkLoadInstr
     -> [Instr]
 
 mkLoadInstr config (RegFormat reg fmt) delta slot =
-  [ movInstr config fmt (OpAddr (spRel platform off)) (OpReg reg) ]
+  [ movInstr config fmt' (OpAddr (spRel platform off)) (OpReg reg) ]
   where
+    fmt'
+      | isVecFormat fmt
+      = fmt
+      | otherwise
+      = scalarMoveFormat platform fmt
+        -- Load the platform word size, at a minimum
     platform = ncgPlatform config
     off = spillSlotToOffset platform slot - delta
 
+-- | A move instruction for moving the entire contents of an operand
+-- at the given 'Format'.
 movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
 movInstr config fmt =
   case fmt of
     -- NB: we are using {V}MOVU and not {V}MOVA, because we have no guarantees
     -- about the stack being sufficiently aligned (even for even numbered stack slots).
+    --
+    -- (Ben Gamari told me that using MOVA instead of MOVU does not make a
+    -- difference in practice when moving between registers.)
     VecFormat {}
       | formatInBytes fmt > 16
       -> if avx
@@ -936,9 +973,8 @@ movInstr config fmt =
             -> MOVU fmt
             | otherwise
             -> sorry "128-bit wide vectors require either -msse2 or -mavx"
-    _ -> MOV (scalarMoveFormat platform fmt)
+    _ -> MOV fmt
   where
-    platform = ncgPlatform config
     avx = ncgAvxEnabled config
     sse = ncgSseVersion config
 
@@ -1002,11 +1038,18 @@ mkRegRegMoveInstr
 mkRegRegMoveInstr config fmt src dst =
   assertPpr (targetClassOfReg platform src == targetClassOfReg platform dst)
     (vcat [ text "mkRegRegMoveInstr: incompatible register classes"
+          , text "fmt:" <+> ppr fmt
           , text "src:" <+> ppr src
           , text "dst:" <+> ppr dst ]) $
-    movInstr config fmt (OpReg src) (OpReg dst)
+    movInstr config fmt' (OpReg src) (OpReg dst)
+      -- Move the platform word size, at a minimum
   where
     platform = ncgPlatform config
+    fmt'
+      | isVecFormat fmt
+      = fmt
+      | otherwise
+      = scalarMoveFormat platform fmt
 
 scalarMoveFormat :: Platform -> Format -> Format
 scalarMoveFormat platform fmt
@@ -1028,32 +1071,40 @@ takeRegRegMoveInstr
 
 takeRegRegMoveInstr platform = \case
   MOV fmt (OpReg r1) (OpReg r2)
-    -- MOV zeroes the upper part of vector registers,
-    -- so it is not a real "move" in that case.
-    | not (isVecFormat fmt)
-    -> go r1 r2
-  MOVD _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -- When used with vector registers, MOV only deals with the lower part,
+    -- so it is not a real move. For example, MOVSS/MOVSD between xmm registers
+    -- preserves the upper half, and MOVQ between xmm registers zeroes the upper half.
+    | not $ isVecFormat fmt
+    -- Don't eliminate a move between e.g. RAX and XMM:
+    -- even though we might be using XMM to store a scalar integer value,
+    -- some instructions only support XMM registers.
+    , targetClassOfReg platform r1 == targetClassOfReg platform r2
+    -> Just (r1, r2)
+  MOVD {}
+    -- MOVD moves between xmm registers and general-purpose registers,
+    -- and we don't want to eliminate those moves (as noted for MOV).
+    -> Nothing
+
+  -- SSE2/AVX move instructions always move the full register.
   MOVA _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   MOVU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   VMOVU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   MOVDQU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   VMOVDQU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
+
+  -- MOVL, MOVH and MOVHLPS preserve some part of the destination register,
+  -- so are not simple moves.
+  MOVL {} -> Nothing
+  MOVH {} -> Nothing
+  MOVHLPS {} -> Nothing
+
+  -- Other instructions are not moves.
   _ -> Nothing
-  where
-    go r1 r2
-      -- Don't eliminate a move between e.g. RAX and XMM:
-      -- even though we might be using XMM to store a scalar integer value,
-      -- some instructions only support XMM registers.
-      | targetClassOfReg platform r1 == targetClassOfReg platform r2
-      = Just (r1, r2)
-      | otherwise
-      = Nothing
 
 -- | Make an unconditional branch instruction.
 mkJumpInstr

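For reference, the reworked takeRegRegMoveInstr only reports moves that copy an entire register within a single register class: a MOVSS/MOVSD between xmm registers writes only the low lane, so eliminating it and rewriting later uses of the destination to the source would lose the destination's upper lanes. A sketch of how an allocator might consume it (isCoalescibleMove is illustrative, not GHC's actual coalescing pass):

    import Data.Maybe (isJust)

    -- A move is safe to coalesce away only if it copies the whole register
    -- within one register class; takeRegRegMoveInstr encodes exactly that.
    isCoalescibleMove :: Platform -> Instr -> Bool
    isCoalescibleMove platform = isJust . takeRegRegMoveInstr platform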

=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -619,15 +619,12 @@ pprInstr platform i = case i of
                 II64 -> II32          -- 32-bit version is equivalent, and smaller
                 _    -> format
 
-   MOV format src dst
-     -> pprFormatOpOp (text "mov") format src dst
-
-   MOV2 srcFmt dstFmt src dst
-     -> pprFormatOpOp (text "mov") fmt src dst
-     where
-      fmt = if formatInBytes srcFmt <= formatInBytes dstFmt
-            then srcFmt
-            else dstFmt
+   MOV fmt src dst
+     -> pprFormatOpOp (text "mov") fmt' src dst
+       where
+          fmt' = case fmt of
+            VecFormat _l sFmt -> scalarFormatFormat sFmt
+            _ -> fmt
 
    CMOV cc format src dst
      -> pprCondOpReg (text "cmov") format cc src dst
@@ -1004,7 +1001,7 @@ pprInstr platform i = case i of
      -> pprDoubleShift (text "psrldq") format offset dst
 
    MOVHLPS format from to
-     -> pprOpReg (text "movhlps") format from to
+     -> pprOpReg (text "movhlps") format (OpReg from) to
    PUNPCKLQDQ format from to
      -> pprOpReg (text "punpcklqdq") format from to
 
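Assuming the usual AT&T mnemonic suffixes produced by pprFormatOpOp (l for II32, q for II64, ss for FF32, sd for FF64 — an assumption about the pretty-printer, not shown in this diff), the new lane-format lookup makes a vector-formatted MOV render as a scalar move:

    --   MOV (VecFormat 4 FmtFloat)  src dst   ==>   movss
    --   MOV (VecFormat 2 FmtDouble) src dst   ==>   movsd
    --   MOV (VecFormat 2 FmtInt64)  src dst   ==>   movq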


=====================================
compiler/GHC/CmmToC.hs
=====================================
@@ -820,9 +820,6 @@ pprMachOp_for_C platform mop = case mop of
         MO_SF_Round    _from to -> parens (machRep_F_CType to)
         MO_FS_Truncate _from to -> parens (machRep_S_CType platform to)
 
-        MO_V_Shuffle {} -> text "__builtin_shufflevector"
-        MO_VF_Shuffle {} -> text "__builtin_shufflevector"
-
         MO_RelaxedRead _ -> pprTrace "offending mop:"
                                 (text "MO_RelaxedRead")
                                 (panic $ "PprC.pprMachOp_for_C: MO_S_MulMayOflo"
@@ -833,88 +830,93 @@ pprMachOp_for_C platform mop = case mop of
                                 (panic $ "PprC.pprMachOp_for_C: MO_S_MulMayOflo"
                                       ++ " should have been handled earlier!")
 
+        MO_AlignmentCheck {} -> panic "-falignment-sanitisation not supported by unregisterised backend"
+
+        -- SIMD vector operations: currently unsupported
+        MO_V_Shuffle {} -> pprTrace "offending mop:"
+                                (text "MO_V_Shuffle")
+                                (panic $ "PprC.pprMachOp_for_C: MO_V_Shuffle"
+                                      ++ "unsupported by the unregisterised backend")
+        MO_VF_Shuffle {} -> pprTrace "offending mop:"
+                                (text "MO_VF_Shuffle")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Shuffle"
+                                      ++ "unsupported by the unregisterised backend")
         MO_V_Insert {}    -> pprTrace "offending mop:"
                                 (text "MO_V_Insert")
                                 (panic $ "PprC.pprMachOp_for_C: MO_V_Insert"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_V_Extract {}   -> pprTrace "offending mop:"
                                 (text "MO_V_Extract")
                                 (panic $ "PprC.pprMachOp_for_C: MO_V_Extract"
-                                      ++ " should have been handled earlier!")
-
+                                      ++ "unsupported by the unregisterised backend")
         MO_V_Add {}       -> pprTrace "offending mop:"
                                 (text "MO_V_Add")
                                 (panic $ "PprC.pprMachOp_for_C: MO_V_Add"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_V_Sub {}       -> pprTrace "offending mop:"
                                 (text "MO_V_Sub")
                                 (panic $ "PprC.pprMachOp_for_C: MO_V_Sub"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_V_Mul {}       -> pprTrace "offending mop:"
                                 (text "MO_V_Mul")
                                 (panic $ "PprC.pprMachOp_for_C: MO_V_Mul"
-                                      ++ " should have been handled earlier!")
-
+                                      ++ "unsupported by the unregisterised backend")
         MO_VS_Quot {}     -> pprTrace "offending mop:"
                                 (text "MO_VS_Quot")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VS_Quot"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VS_Rem {}      -> pprTrace "offending mop:"
                                 (text "MO_VS_Rem")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VS_Rem"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VS_Neg {}      -> pprTrace "offending mop:"
                                 (text "MO_VS_Neg")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
-                                      ++ " should have been handled earlier!")
-
+                                      ++ "unsupported by the unregisterised backend")
         MO_VU_Quot {}     -> pprTrace "offending mop:"
                                 (text "MO_VU_Quot")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VU_Quot"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VU_Rem {}      -> pprTrace "offending mop:"
                                 (text "MO_VU_Rem")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_V_Broadcast {} -> pprTrace "offending mop:"
                                  (text "MO_V_Broadcast")
                                  (panic $ "PprC.pprMachOp_for_C: MO_V_Broadcast"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Broadcast {} -> pprTrace "offending mop:"
                                  (text "MO_VF_Broadcast")
                                  (panic $ "PprC.pprMachOp_for_C: MO_VF_Broadcast"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Insert {}   -> pprTrace "offending mop:"
                                 (text "MO_VF_Insert")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Extract {}  -> pprTrace "offending mop:"
                                 (text "MO_VF_Extract")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Extract"
-                                      ++ " should have been handled earlier!")
-
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Add {}      -> pprTrace "offending mop:"
                                 (text "MO_VF_Add")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Add"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Sub {}      -> pprTrace "offending mop:"
                                 (text "MO_VF_Sub")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Sub"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Neg {}      -> pprTrace "offending mop:"
                                 (text "MO_VF_Neg")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Neg"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Mul {}      -> pprTrace "offending mop:"
                                 (text "MO_VF_Mul")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Mul"
-                                      ++ " should have been handled earlier!")
+                                      ++ "unsupported by the unregisterised backend")
         MO_VF_Quot {}     -> pprTrace "offending mop:"
                                 (text "MO_VF_Quot")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Quot"
-                                      ++ " should have been handled earlier!")
-
-        MO_AlignmentCheck {} -> panic "-falignment-sanitisation not supported by unregisterised backend"
+                                      ++ "unsupported by the unregisterised backend")
 
 signedOp :: MachOp -> Bool      -- Argument type(s) are signed ints
 signedOp (MO_S_Quot _)    = True



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/6fe8b90c2d137b60387af1d18169a7c8c1bddbcc...125fb19cec33bdd5bb0de9117222b6dbac665830
