[Git][ghc/ghc][wip/ncg-simd] rework X86 MOV instruction

Tue Jun 25 15:05:44 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
6ac63edb by sheaf at 2024-06-25T17:05:27+02:00
rework X86 MOV instruction

- - - - -


4 changed files:

- compiler/GHC/CmmToAsm/Format.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs


Changes:

=====================================
compiler/GHC/CmmToAsm/Format.hs
=====================================
@@ -25,6 +25,8 @@ module GHC.CmmToAsm.Format (
     formatToWidth,
     formatInBytes,
     isIntScalarFormat,
+    isFloatScalarFormat,
+    scalarFormatFormat,
     VirtualRegFormat(..),
     RegFormat(..),
     takeVirtualRegs,
@@ -101,12 +103,23 @@ data ScalarFormat
   | FmtDouble
   deriving (Show, Eq, Ord)
 
+scalarFormatFormat :: ScalarFormat -> Format
+scalarFormatFormat = \case
+  FmtInt8 -> II8
+  FmtInt16 -> II16
+  FmtInt32 -> II32
+  FmtInt64 -> II64
+  FmtFloat -> FF32
+  FmtDouble -> FF64
+
+isFloatScalarFormat :: ScalarFormat -> Bool
+isFloatScalarFormat = \case
+  FmtFloat -> True
+  FmtDouble -> True
+  _ -> False
+
 isIntScalarFormat :: ScalarFormat -> Bool
-isIntScalarFormat FmtInt8 = True
-isIntScalarFormat FmtInt16 = True
-isIntScalarFormat FmtInt32 = True
-isIntScalarFormat FmtInt64 = True
-isIntScalarFormat _ = False
+isIntScalarFormat = not . isFloatScalarFormat
 
 -- | Get the integer format of this width.
 intFormat :: Width -> Format


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -717,8 +717,8 @@ iselExpr64 (CmmMachOp (MO_S_MulMayOflo W64) _) = do
    -- We always return a (usually false) positive.
    Reg64 rhi rlo <- getNewReg64
    let code = toOL   [
-                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rhi),
-                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rlo)
+                       MOV II32 (OpImm (ImmInt 1)) (OpReg rhi),
+                       MOV II32 (OpImm (ImmInt 1)) (OpReg rlo)
                      ]
    return (RegCode64 code rhi rlo)
 
@@ -995,7 +995,7 @@ getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _ _])
 
 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _ _])
  | not is32Bit = do
-  code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
+  code <- intLoadCode (MOV II64) addr -- 32-bit loads zero-extend
   return (Any II64 code)
 
 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _ _])
@@ -1067,11 +1067,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       -- has 8-bit version). So for 32-bit code, we'll just zero-extend.
       MO_XX_Conv W8  W32
           | is32Bit   -> integerExtend W8 W32 MOVZxL x
-          | otherwise -> integerExtend W8 W32 (MOV) x
+          | otherwise -> integerExtend W8 W32 MOV x
       MO_XX_Conv W8  W16
           | is32Bit   -> integerExtend W8 W16 MOVZxL x
-          | otherwise -> integerExtend W8 W16 (MOV) x
-      MO_XX_Conv W16 W32 -> integerExtend W16 W32 (MOV) x
+          | otherwise -> integerExtend W8 W16 MOV x
+      MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
 
       MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
       MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
@@ -1085,9 +1085,9 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       -- the form of a movzl and print it as a movl later.
       -- This doesn't apply to MO_XX_Conv since in this case we don't care about
       -- the upper bits. So we can just use MOV.
-      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 (MOV) x
-      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 (MOV) x
-      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 (MOV) x
+      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOV x
+      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
 
       MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
@@ -1311,7 +1311,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getSomeReg expr
           let fmt = VecFormat len FmtInt64
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOV2 II64 fmt (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
                                     (PUNPCKLQDQ fmt (OpReg dst) dst)
                                     )
         vector_int_broadcast _ _ c
@@ -1716,7 +1716,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
                 CmmInt 0 _ -> exp `snocOL`
                               (MOV FF64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
-                              (MOVHLPS format (OpReg r) dst)
+                              (MOVHLPS format r dst)
                 _          -> panic "Error in offset while unpacking"
       return (Any format code)
     vector_float_unpack _ w c e
@@ -1756,10 +1756,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let code dst
             = case lit of
                 CmmInt 0 _ -> exp `snocOL`
-                              (MOV2 fmt II64 (OpReg r) (OpReg dst))
+                              (MOVD II64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
-                              (MOVHLPS fmt (OpReg r) tmp) `snocOL`
-                              (MOV2 fmt II64 (OpReg tmp) (OpReg dst))
+                              (MOVHLPS fmt r tmp) `snocOL`
+                              (MOVD II64 (OpReg tmp) (OpReg dst))
                 _          -> panic "Error in offset while unpacking"
       return (Any fmt code)
     vector_int_unpack_sse _ w c e
@@ -1938,13 +1938,13 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
               = case offset of
                   CmmInt 0 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
-                                (MOV2 II64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
+                                (MOVHLPS fmt vecReg tmp) `snocOL`
+                                (MOVD II64 (OpReg valReg) (OpReg dst)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   CmmInt 1 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
-                                (MOV2 II64 fmt (OpReg valReg) (OpReg tmp)) `snocOL`
+                                (MOV II64 (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (MOVD II64 (OpReg valReg) (OpReg tmp)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   _ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
          in return $ Any fmt code
@@ -1978,26 +1978,29 @@ getRegister' _ _ (CmmLoad mem pk _)
     loadFloatAmode  (typeWidth pk) addr mem_code
 
 getRegister' _ is32Bit (CmmLoad mem pk _)
+  -- SIMD NCG TODO: what about vectors?
   | is32Bit && not (isWord64 pk)
   = do
     let
       instr = case width of
                 W8     -> MOVZxL II8
+                  -- We always zero-extend 8-bit loads, if we
+                  -- can't think of anything better.  This is because
+                  -- we can't guarantee access to an 8-bit variant of every register
+                  -- (esi and edi don't have 8-bit variants), so to make things
+                  -- simpler we do our 8-bit arithmetic with full 32-bit registers.
                 _other -> MOV format
     code <- intLoadCode instr mem
     return (Any format code)
   where
     width = typeWidth pk
     format = intFormat width
-        -- We always zero-extend 8-bit loads, if we
-        -- can't think of anything better.  This is because
-        -- we can't guarantee access to an 8-bit variant of every register
-        -- (esi and edi don't have 8-bit variants), so to make things
-        -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
 
 -- Simpler memory load code on x86_64
 getRegister' _ is32Bit (CmmLoad mem pk _)
- | not is32Bit
+  -- SIMD NCG TODO: what about vectors?
+  | not is32Bit
   = do
     code <- intLoadCode (MOV format) mem
     return (Any format code)
@@ -2058,13 +2061,17 @@ getRegister' platform _ (CmmLit lit)
   where
     cmmtype = cmmLitType platform lit
     vectorRegister ctype
-      = do
-      --NOTE:
-      -- This operation is only used to zero a register. For loading a
-      -- vector literal there are pack and broadcast operations
-      let format = cmmTypeFormat ctype
-          code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
-      return (Any format code)
+      | case lit of { CmmVec fs -> all (\case { CmmInt i _ -> i == 0; CmmFloat f _ -> f == 0; _ -> False }) fs; _ -> False }
+      = -- NOTE:
+        -- This operation is only used to zero a register. For loading a
+        -- vector literal there are pack and broadcast operations
+        let format = cmmTypeFormat ctype
+            code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+        in return (Any format code)
+      | otherwise
+      = pprPanic "getRegister': no support for (nonzero) vector literals" $
+          vcat [ text "lit:" <+> ppr lit ]
+      -- SIMD NCG TODO: can we do better here?
     standardRegister ctype
       = do
       let format = cmmTypeFormat ctype
@@ -3519,10 +3526,7 @@ genCCall64 addr conv dest_regs args = do
                                 -- If we are calling a varargs function
                                 -- then we need to define ireg as well
                                 -- as freg
-                                CVTTSD2SIQ II64 (OpReg freg) ireg)
-                                  -- SLD TODO: I changed this from MOV FF64 (OpReg freg) (OpReg ireg)
-                                  -- to CVTTSD2SIQ ...
-                                  -- because it is going between two different types of register
+                                MOVD FF64 (OpReg freg) (OpReg ireg))
             | otherwise = do
                  arg_code <- getAnyReg arg
                  load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
@@ -3538,10 +3542,11 @@ genCCall64 addr conv dest_regs args = do
              (arg_reg, arg_code) <- getSomeReg arg
              delta <- getDeltaNat
              setDeltaNat (delta-arg_size)
-             let code' = code `appOL` arg_code `appOL` toOL [
+             let fmt = floatFormat width
+                 code' = code `appOL` arg_code `appOL` toOL [
                             SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
                             DELTA (delta-arg_size),
-                            MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel platform 0))]
+                            MOV fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
              push_args rest code'
 
            | otherwise = do
@@ -3650,17 +3655,13 @@ genCCall64 addr conv dest_regs args = do
         -- assign the results, if necessary
         assign_code []     = nilOL
         assign_code [dest] =
-          case typeWidth rep of
-                W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
-                                                     (OpReg xmm0)
-                                                     (OpReg r_dest))
-                W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
-                                                     (OpReg xmm0)
-                                                     (OpReg r_dest))
-                _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+          unitOL $
+            mkRegRegMoveInstr config fmt reg r_dest
           where
-                rep = localRegType dest
-                r_dest = getRegisterReg platform  (CmmLocal dest)
+            reg = if isIntFormat fmt then rax else xmm0
+            fmt = cmmTypeFormat rep
+            rep = localRegType dest
+            r_dest = getRegisterReg platform (CmmLocal dest)
         assign_code _many = panic "genForeignCall.assign_code many"
 
     return (adjust_rsp          `appOL`
@@ -3769,9 +3770,10 @@ genSwitch expr targets = do
             tableReg <- getNewRegNat (intFormat (platformWordWidth platform))
             targetReg <- getNewRegNat (intFormat (platformWordWidth platform))
             let op = OpAddr (AddrBaseIndex (EABaseReg tableReg) (EAIndex reg (platformWordSizeInBytes platform)) (ImmInt 0))
+                fmt = archWordFormat is32bit
                 code = e_code `appOL` toOL
-                    [ LEA (archWordFormat is32bit) (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
-                    , MOV (archWordFormat is32bit) op (OpReg targetReg)
+                    [ LEA fmt (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
+                    , MOV fmt op (OpReg targetReg)
                     , JMP_TBL (OpReg targetReg) ids (Section ReadOnlyData lbl) lbl
                     ]
             return code
@@ -4385,9 +4387,9 @@ genCtz64_32 bid dst src = do
   --    dst = 64;
   --  }
   let instrs = vcode `appOL` toOL
-           ([ MOV      II32 (OpReg rhi)         (OpReg tmp_r)
+           ([ MOV II32 (OpReg rhi)         (OpReg tmp_r)
             , OR       II32 (OpReg rlo)         (OpReg tmp_r)
-            , MOV      II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+            , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
             , JXX EQQ    lbl2
             , JXX ALWAYS lbl1
 
@@ -4432,10 +4434,10 @@ genCtzGeneric width dst src = do
       src_r <- getNewRegNat format
       tmp_r <- getNewRegNat format
       let instrs = code_src src_r `appOL` toOL
-               ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-                [ BSF     format (OpReg src_r) tmp_r
-                , MOV     II32   (OpImm (ImmInt bw)) (OpReg dst_r)
-                , CMOV NE format (OpReg tmp_r) dst_r
+               ([ MOVZxL   II8    (OpReg src_r)       (OpReg src_r) | width == W8 ] ++
+                [ BSF      format (OpReg src_r)       tmp_r
+                , MOV      II32   (OpImm (ImmInt bw)) (OpReg dst_r)
+                , CMOV NE  format (OpReg tmp_r)       dst_r
                 ]) -- NB: We don't need to zero-extend the result for the
                    -- W8/W16 cases because the 'MOV' insn already
                    -- took care of implicitly clearing the upper bits
@@ -4509,11 +4511,11 @@ genMemCpyInlineMaybe align dst src n = do
               go dst src tmp (i - 4)
           | i >= 2 =
               unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOV    II16  (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 2)
           | i >= 1 =
               unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOV    II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 1)
           | otherwise = nilOL
         where
@@ -4789,8 +4791,8 @@ genPext bid width dst src mask = do
               toOL
                 [ MOVZxL format (OpReg src_r ) (OpReg src_r )
                 , MOVZxL format (OpReg mask_r) (OpReg mask_r)
-                , PEXT   II32 (OpReg mask_r) (OpReg src_r ) dst_r
-                , MOVZxL format (OpReg dst_r) (OpReg dst_r) -- Truncate to op width
+                , PEXT   II32   (OpReg mask_r) (OpReg src_r ) dst_r
+                , MOVZxL format (OpReg dst_r)  (OpReg dst_r) -- Truncate to op width
                 ]
             else
               unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)
@@ -4832,11 +4834,11 @@ genClz bid width dst src = do
           src_r <- getNewRegNat format
           tmp_r <- getNewRegNat format
           return $ code_src src_r `appOL` toOL
-                   ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-                    [ BSR     format (OpReg src_r) tmp_r
-                    , MOV     II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
-                    , CMOV NE format (OpReg tmp_r) dst_r
-                    , XOR     format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+                   ([ MOVZxL   II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+                    [ BSR      format (OpReg src_r) tmp_r
+                    , MOV      II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+                    , CMOV NE  format (OpReg tmp_r) dst_r
+                    , XOR      format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
                     ]) -- NB: We don't need to zero-extend the result for the
                        -- W8/W16 cases because the 'MOV' insn already
                        -- took care of implicitly clearing the upper bits
@@ -4848,7 +4850,8 @@ genWordToFloat bid width dst src =
 
 genAtomicRead :: Width -> MemoryOrdering -> LocalReg -> CmmExpr -> NatM InstrBlock
 genAtomicRead width _mord dst addr = do
-  load_code <- intLoadCode (MOV (intFormat width)) addr
+  let fmt = intFormat width
+  load_code <- intLoadCode (MOV fmt) addr
   return (load_code (getLocalRegReg dst))
 
 genAtomicWrite :: Width -> MemoryOrdering -> CmmExpr -> CmmExpr -> NatM InstrBlock


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -32,6 +32,7 @@ module GHC.CmmToAsm.X86.Instr
    , mkStackDeallocInstr
    , mkSpillInstr
    , mkRegRegMoveInstr
+   , movInstr
    , jumpDestsOfInstr
    , canFallthroughTo
    , patchRegsOfInstr
@@ -202,19 +203,13 @@ data Instr
         | DELTA  Int
 
         -- Moves.
-        | MOV         Format Operand Operand
+        | MOV Format Operand Operand
              -- ^ N.B. Due to AT&T assembler quirks, when used with 'II64'
              -- 'Format' immediate source and memory target operand, the source
              -- operand is interpreted to be a 32-bit sign-extended value.
              -- True 64-bit operands need to be either first moved to a register or moved
              -- with @MOVABS@; we currently do not use this instruction in GHC.
              -- See https://stackoverflow.com/questions/52434073/whats-the-difference-between-the-x86-64-att-instructions-movq-and-movabsq.
-        | MOV2 Format Format Operand Operand
-         -- ^ Like MOV, but between two different kinds of registers
-         -- (e.g. moving rax to xmm1)
-         --
-         -- SIMD NCG TODO: this is a bit of a hack, but the alternative would
-         -- be to have MOV store two Formats to handle xmm -> rax and rax -> xmm.
 
         | MOVD   Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
                                         -- (bitcast between a general purpose
@@ -378,12 +373,20 @@ data Instr
         | INSERTPS    Format Imm Operand Reg
 
         -- move operations
-        | VMOVU       Format Operand Operand
+
+        -- | SSE2 unaligned move of floating-point vectors
         | MOVU        Format Operand Operand
+        -- | AVX unaligned move of floating-point vectors
+        | VMOVU       Format Operand Operand
+        -- | SSE2 move between memory and low-part of an xmm register
         | MOVL        Format Operand Operand
+        -- | SSE move between memory and high-part of an xmm register
         | MOVH        Format Operand Operand
+        -- | SSE aligned move of floating-point vectors
         | MOVA        Format Operand Operand
+        -- | SSE2 unaligned move of integer vectors
         | MOVDQU      Format Operand Operand
+        -- | AVX unaligned move of integer vectors
         | VMOVDQU     Format Operand Operand
 
         -- logic operations
@@ -403,7 +406,9 @@ data Instr
         | SHUFPD     Format Imm Operand Reg
         | VSHUFPD    Format Imm Operand Reg Reg
 
-        | MOVHLPS    Format Operand Reg
+        -- | Move two 32-bit floats from the high part of an xmm register
+        -- to the low part of another xmm register.
+        | MOVHLPS    Format Reg Reg
         | PUNPCKLQDQ Format Operand Reg
 
         -- Shift
@@ -426,9 +431,21 @@ data FMAPermutation = FMA132 | FMA213 | FMA231
 regUsageOfInstr :: Platform -> Instr -> RegUsage
 regUsageOfInstr platform instr
  = case instr of
-    MOV    fmt src dst    -> usageRW fmt src dst
-    MOV2   srcFmt dstFmt src dst -> mkRU (use_R srcFmt src []) (use_R dstFmt dst [])
-    MOVD   fmt src dst    -> mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
+    MOV fmt src dst
+      -- MOVSS/MOVSD preserve the upper half of vector registers,
+      -- but only for reg-2-reg moves
+      | VecFormat _ sFmt <- fmt
+      , isFloatScalarFormat sFmt
+      , OpReg {} <- src
+      , OpReg {} <- dst
+      -> usageRM fmt src dst
+      -- other MOV instructions zero any remaining upper part of the destination
+      -- (largely to avoid partial register stalls)
+      | otherwise
+      -> usageRW fmt src dst
+    MOVD   fmt src dst    ->
+      -- NB: MOVD/MOVQ always zero any remaining upper part of destination
+      mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
     CMOV _ fmt src dst    -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
     MOVZxL fmt src dst    -> usageRW fmt src dst
     MOVSxL fmt src dst    -> usageRW fmt src dst
@@ -566,7 +583,7 @@ regUsageOfInstr platform instr
     PSLLDQ fmt off dst -> mkRU (use_R fmt off []) [mk fmt dst]
 
     MOVHLPS    fmt src dst
-      -> mkRU (use_R fmt src []) [mk fmt dst]
+      -> mkRU [mk fmt src] [mk fmt dst]
     PUNPCKLQDQ fmt src dst
       -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
 
@@ -680,7 +697,6 @@ patchRegsOfInstr :: HasDebugCallStack => Platform -> Instr -> (Reg -> Reg) -> In
 patchRegsOfInstr platform instr env
   = case instr of
     MOV fmt src dst      -> MOV fmt (patchOp src) (patchOp dst)
-    MOV2 srcFmt dstFmt src dst -> MOV2 srcFmt dstFmt (patchOp src) (patchOp dst)
     MOVD fmt src dst     -> patch2 (MOVD fmt) src dst
     CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
     MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
@@ -805,7 +821,7 @@ patchRegsOfInstr platform instr env
       -> PSRLDQ  fmt (patchOp off) (env dst)
 
     MOVHLPS    fmt src dst
-      -> MOVHLPS fmt (patchOp src) (env dst)
+      -> MOVHLPS fmt (env src) (env dst)
     PUNPCKLQDQ fmt src dst
       -> PUNPCKLQDQ fmt (patchOp src) (env dst)
 
@@ -914,11 +930,12 @@ mkLoadInstr
     -> [Instr]
 
 mkLoadInstr config (RegFormat reg fmt) delta slot =
-  [ movInstr config fmt (OpAddr (spRel platform off)) (OpReg reg) ]
+  [ movInstr config fmt  (OpAddr (spRel platform off)) (OpReg reg) ]
   where
     platform = ncgPlatform config
     off = spillSlotToOffset platform slot - delta
 
+-- | A move instruction for moving the entire contents of an operand.
 movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
 movInstr config fmt =
   case fmt of
@@ -1028,32 +1045,40 @@ takeRegRegMoveInstr
 
 takeRegRegMoveInstr platform = \case
   MOV fmt (OpReg r1) (OpReg r2)
-    -- MOV zeroes the upper part of vector registers,
-    -- so it is not a real "move" in that case.
-    | not (isVecFormat fmt)
-    -> go r1 r2
-  MOVD _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -- When used with vector registers, MOV only deals with the lower part,
+    -- so it is not a real move. For example, MOVSS/MOVSD between xmm registers
+    -- preserves the upper half, and MOVQ between xmm registers zeroes the upper half.
+    | not $ isVecFormat fmt
+    -- Don't eliminate a move between e.g. RAX and XMM:
+    -- even though we might be using XMM to store a scalar integer value,
+    -- some instructions only support XMM registers.
+    , targetClassOfReg platform r1 == targetClassOfReg platform r2
+    -> Just (r1, r2)
+  MOVD {}
+    -- MOVD moves between xmm registers and general-purpose registers,
+    -- and we don't want to eliminate those moves (as noted for MOV).
+    -> Nothing
+
+  -- SSE2/AVX move instructions always move the full register.
   MOVA _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   MOVU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   VMOVU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   MOVDQU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   VMOVDQU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
+
+  -- MOVL, MOVH and MOVHLPS preserve some part of the destination register,
+  -- so are not simple moves.
+  MOVL {} -> Nothing
+  MOVH {} -> Nothing
+  MOVHLPS {} -> Nothing
+
+  -- Other instructions are not moves.
   _ -> Nothing
-  where
-    go r1 r2
-      -- Don't eliminate a move between e.g. RAX and XMM:
-      -- even though we might be using XMM to store a scalar integer value,
-      -- some instructions only support XMM registers.
-      | targetClassOfReg platform r1 == targetClassOfReg platform r2
-      = Just (r1, r2)
-      | otherwise
-      = Nothing
 
 -- | Make an unconditional branch instruction.
 mkJumpInstr


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -619,15 +619,12 @@ pprInstr platform i = case i of
                 II64 -> II32          -- 32-bit version is equivalent, and smaller
                 _    -> format
 
-   MOV format src dst
-     -> pprFormatOpOp (text "mov") format src dst
-
-   MOV2 srcFmt dstFmt src dst
-     -> pprFormatOpOp (text "mov") fmt src dst
-     where
-      fmt = if formatInBytes srcFmt <= formatInBytes dstFmt
-            then srcFmt
-            else dstFmt
+   MOV fmt src dst
+     -> pprFormatOpOp (text "mov") fmt' src dst
+       where
+          fmt' = case fmt of
+            VecFormat _l sFmt -> scalarFormatFormat sFmt
+            _ -> fmt
 
    CMOV cc format src dst
      -> pprCondOpReg (text "cmov") format cc src dst
@@ -1004,7 +1001,7 @@ pprInstr platform i = case i of
      -> pprDoubleShift (text "psrldq") format offset dst
 
    MOVHLPS format from to
-     -> pprOpReg (text "movhlps") format from to
+     -> pprOpReg (text "movhlps") format (OpReg from) to
    PUNPCKLQDQ format from to
      -> pprOpReg (text "punpcklqdq") format from to
 



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/6ac63edbd1272a6d75f7a43d9df7f99a32e80856

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/6ac63edbd1272a6d75f7a43d9df7f99a32e80856
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240625/742ba1c9/attachment-0001.html>