[Git][ghc/ghc][wip/ncg-simd] rework X86 MOV instruction

Tue Jun 25 11:32:39 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
79003024 by sheaf at 2024-06-25T13:32:20+02:00
rework X86 MOV instruction

- - - - -


3 changed files:

- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs


Changes:

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -526,8 +526,8 @@ assignMem_I64Code addrTree valueTree = do
   RegCode64 vcode rhi rlo <- iselExpr64 valueTree
   let
         -- Little-endian store
-        mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
-        mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
+        mov_lo = MOV II32 II32 (OpReg rlo) (OpAddr addr)
+        mov_hi = MOV II32 II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
   return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
 
 
@@ -536,8 +536,8 @@ assignReg_I64Code (CmmLocal dst) valueTree = do
    RegCode64 vcode r_src_hi r_src_lo <- iselExpr64 valueTree
    let
          Reg64 r_dst_hi r_dst_lo = localReg64 dst
-         mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
-         mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
+         mov_lo = MOV II32 II32 (OpReg r_src_lo) (OpReg r_dst_lo)
+         mov_hi = MOV II32 II32 (OpReg r_src_hi) (OpReg r_dst_hi)
    return (
         vcode `snocOL` mov_lo `snocOL` mov_hi
      )
@@ -552,8 +552,8 @@ iselExpr64 (CmmLit (CmmInt i _)) = do
         r = fromIntegral (fromIntegral i :: Word32)
         q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
         code = toOL [
-                MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
-                MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
+                MOV II32 II32 (OpImm (ImmInteger r)) (OpReg rlo),
+                MOV II32 II32 (OpImm (ImmInteger q)) (OpReg rhi)
                 ]
   return (RegCode64 code rhi rlo)
 
@@ -561,8 +561,8 @@ iselExpr64 (CmmLoad addrTree ty _) | isWord64 ty = do
    Amode addr addr_code <- getAmode addrTree
    Reg64 rhi rlo <- getNewReg64
    let
-        mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
-        mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
+        mov_lo = MOV II32 II32 (OpAddr addr) (OpReg rlo)
+        mov_hi = MOV II32 II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
    return (
             RegCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi) rhi rlo
      )
@@ -578,9 +578,9 @@ iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
         r = fromIntegral (fromIntegral i :: Word32)
         q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
         code =  code1 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
                        ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
                        ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
    return (RegCode64 code rhi rlo)
 
@@ -591,9 +591,9 @@ iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
    let
         code =  code1 `appOL`
                 code2 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
                        ADD II32 (OpReg r2lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
                        ADC II32 (OpReg r2hi) (OpReg rhi) ]
    return (RegCode64 code rhi rlo)
 
@@ -604,9 +604,9 @@ iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
    let
         code =  code1 `appOL`
                 code2 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
                        SUB II32 (OpReg r2lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
                        SBB II32 (OpReg r2hi) (OpReg rhi) ]
    return (RegCode64 code rhi rlo)
 
@@ -642,10 +642,10 @@ iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
      code <- getAnyReg expr
      Reg64 r_dst_hi r_dst_lo <- getNewReg64
      return $ RegCode64 (code r_dst_lo `snocOL`
-                          MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+                          MOV II32 II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
                           CLTD II32 `snocOL`
-                          MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
-                          MOV II32 (OpReg edx) (OpReg r_dst_hi))
+                          MOV II32 II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+                          MOV II32 II32 (OpReg edx) (OpReg r_dst_hi))
                           r_dst_hi
                           r_dst_lo
 
@@ -655,8 +655,8 @@ iselExpr64 (CmmMachOp (MO_SS_Conv W16 W64) [expr]) = do
      return $ RegCode64 (code `appOL` toOL [
                           MOVSxL II16 (OpReg r) (OpReg eax),
                           CLTD II32,
-                          MOV II32 (OpReg eax) (OpReg r_dst_lo),
-                          MOV II32 (OpReg edx) (OpReg r_dst_hi)])
+                          MOV II32 II32 (OpReg eax) (OpReg r_dst_lo),
+                          MOV II32 II32 (OpReg edx) (OpReg r_dst_hi)])
                           r_dst_hi
                           r_dst_lo
 
@@ -666,8 +666,8 @@ iselExpr64 (CmmMachOp (MO_SS_Conv W8 W64) [expr]) = do
      return $ RegCode64 (code `appOL` toOL [
                           MOVSxL II8 (OpReg r) (OpReg eax),
                           CLTD II32,
-                          MOV II32 (OpReg eax) (OpReg r_dst_lo),
-                          MOV II32 (OpReg edx) (OpReg r_dst_hi)])
+                          MOV II32 II32 (OpReg eax) (OpReg r_dst_lo),
+                          MOV II32 II32 (OpReg edx) (OpReg r_dst_hi)])
                           r_dst_hi
                           r_dst_lo
 
@@ -676,7 +676,7 @@ iselExpr64 (CmmMachOp (MO_S_Neg _) [expr]) = do
    Reg64 rohi rolo <- getNewReg64
    let
         ocode = code `appOL`
-                toOL [ MOV II32 (OpReg rlo) (OpReg rolo),
+                toOL [ MOV II32 II32 (OpReg rlo) (OpReg rolo),
                        XOR II32 (OpReg rohi) (OpReg rohi),
                        NEGI II32 (OpReg rolo),
                        SBB II32 (OpReg rhi) (OpReg rohi) ]
@@ -699,16 +699,16 @@ iselExpr64 (CmmMachOp (MO_Mul _) [e1,e2]) = do
    let
         code =  code1 `appOL`
                 code2 `appOL`
-                toOL [ MOV  II32 (OpReg r1lo) (OpReg eax),
-                       MOV  II32 (OpReg r2lo) (OpReg tmp),
-                       MOV  II32 (OpReg r1hi) (OpReg rhi),
-                       IMUL II32 (OpReg tmp) (OpReg rhi),
-                       MOV  II32 (OpReg r2hi) (OpReg rlo),
-                       IMUL II32 (OpReg eax) (OpReg rlo),
-                       ADD  II32 (OpReg rlo) (OpReg rhi),
-                       MUL2 II32 (OpReg tmp),
-                       ADD  II32 (OpReg edx) (OpReg rhi),
-                       MOV  II32 (OpReg eax) (OpReg rlo)
+                toOL [ MOV  II32 II32 (OpReg r1lo) (OpReg eax),
+                       MOV  II32 II32 (OpReg r2lo) (OpReg tmp),
+                       MOV  II32 II32 (OpReg r1hi) (OpReg rhi),
+                       IMUL II32      (OpReg tmp) (OpReg rhi),
+                       MOV  II32 II32 (OpReg r2hi) (OpReg rlo),
+                       IMUL II32      (OpReg eax) (OpReg rlo),
+                       ADD  II32      (OpReg rlo) (OpReg rhi),
+                       MUL2 II32      (OpReg tmp),
+                       ADD  II32      (OpReg edx) (OpReg rhi),
+                       MOV  II32 II32 (OpReg eax) (OpReg rlo)
                      ]
    return (RegCode64 code rhi rlo)
 
@@ -717,8 +717,8 @@ iselExpr64 (CmmMachOp (MO_S_MulMayOflo W64) _) = do
    -- We always return a (usually false) positive.
    Reg64 rhi rlo <- getNewReg64
    let code = toOL   [
-                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rhi),
-                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rlo)
+                       MOV  II32 II32 (OpImm (ImmInt 1)) (OpReg rhi),
+                       MOV  II32 II32 (OpImm (ImmInt 1)) (OpReg rlo)
                      ]
    return (RegCode64 code rhi rlo)
 
@@ -740,15 +740,15 @@ iselExpr64 (CmmMachOp (MO_Shl _) [e1,e2]) = do
    let
         code =  code1 `appOL`
                 code2 ecx `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
                        SHLD II32 (OpReg ecx) (OpReg rlo) (OpReg rhi),
                        SHL II32 (OpReg ecx) (OpReg rlo),
                        TEST II32 (OpImm (ImmInt 32)) (OpReg ecx),
                        JXX EQQ lbl2,
                        JXX ALWAYS lbl1,
                        NEWBLOCK lbl1,
-                       MOV II32 (OpReg rlo) (OpReg rhi),
+                       MOV II32 II32 (OpReg rlo) (OpReg rhi),
                        XOR II32 (OpReg rlo) (OpReg rlo),
                        JXX ALWAYS lbl2,
                        NEWBLOCK lbl2
@@ -770,16 +770,16 @@ iselExpr64 (CmmMachOp (MO_S_Shr _) [e1,e2]) = do
    let
         code =  code1 `appOL`
                 code2 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
-                       MOV II32 (OpReg r2) (OpReg ecx),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 II32 (OpReg r2) (OpReg ecx),
                        SHRD II32 (OpReg ecx) (OpReg rhi) (OpReg rlo),
                        SAR II32 (OpReg ecx) (OpReg rhi),
                        TEST II32 (OpImm (ImmInt 32)) (OpReg ecx),
                        JXX EQQ lbl2,
                        JXX ALWAYS lbl1,
                        NEWBLOCK lbl1,
-                       MOV II32 (OpReg rhi) (OpReg rlo),
+                       MOV II32 II32 (OpReg rhi) (OpReg rlo),
                        SAR II32 (OpImm (ImmInt 31)) (OpReg rhi),
                        JXX ALWAYS lbl2,
                        NEWBLOCK lbl2
@@ -797,16 +797,16 @@ iselExpr64 (CmmMachOp (MO_U_Shr _) [e1,e2]) = do
    let
         code =  code1 `appOL`
                 code2 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
-                       MOV II32 (OpReg r2) (OpReg ecx),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 II32 (OpReg r2) (OpReg ecx),
                        SHRD II32 (OpReg ecx) (OpReg rhi) (OpReg rlo),
                        SHR II32 (OpReg ecx) (OpReg rhi),
                        TEST II32 (OpImm (ImmInt 32)) (OpReg ecx),
                        JXX EQQ lbl2,
                        JXX ALWAYS lbl1,
                        NEWBLOCK lbl1,
-                       MOV II32 (OpReg rhi) (OpReg rlo),
+                       MOV II32 II32 (OpReg rhi) (OpReg rlo),
                        XOR II32 (OpReg rhi) (OpReg rhi),
                        JXX ALWAYS lbl2,
                        NEWBLOCK lbl2
@@ -822,8 +822,8 @@ iselExpr64 (CmmMachOp (MO_Not _) [e1]) = do
    Reg64 rhi rlo <- getNewReg64
    let
         code =  code1 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
                        NOT II32 (OpReg rlo),
                        NOT II32 (OpReg rhi)
                      ]
@@ -845,8 +845,8 @@ iselExpr64ParallelBin op e1 e2 = do
    let
         code =  code1 `appOL`
                 code2 `appOL`
-                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
-                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                toOL [ MOV II32 II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 II32 (OpReg r1hi) (OpReg rhi),
                        op  II32 (OpReg r2lo) (OpReg rlo),
                        op  II32 (OpReg r2hi) (OpReg rhi)
                      ]
@@ -995,7 +995,7 @@ getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _ _])
 
 getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _ _])
  | not is32Bit = do
-  code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
+  code <- intLoadCode (MOV II32 II64) addr -- 32-bit loads zero-extend
   return (Any II64 code)
 
 getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _ _])
@@ -1054,40 +1054,40 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_FW_Bitcast {}  -> incorrectOperands
 
       -- widenings
-      MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
-      MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
-      MO_UU_Conv W8  W16 -> integerExtend W8  W16 MOVZxL x
+      MO_UU_Conv W8  W32 -> integerExtend W8  W32 (const . MOVZxL) x
+      MO_UU_Conv W16 W32 -> integerExtend W16 W32 (const . MOVZxL) x
+      MO_UU_Conv W8  W16 -> integerExtend W8  W16 (const . MOVZxL) x
 
-      MO_SS_Conv W8  W32 -> integerExtend W8  W32 MOVSxL x
-      MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
-      MO_SS_Conv W8  W16 -> integerExtend W8  W16 MOVSxL x
+      MO_SS_Conv W8  W32 -> integerExtend W8  W32 (const . MOVSxL) x
+      MO_SS_Conv W16 W32 -> integerExtend W16 W32 (const . MOVSxL) x
+      MO_SS_Conv W8  W16 -> integerExtend W8  W16 (const . MOVSxL) x
 
       -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit we
       -- have 8-bit registers only for a few registers (as opposed to x86-64 where every register
       -- has 8-bit version). So for 32-bit code, we'll just zero-extend.
       MO_XX_Conv W8  W32
-          | is32Bit   -> integerExtend W8 W32 MOVZxL x
-          | otherwise -> integerExtend W8 W32 (MOV) x
+          | is32Bit   -> integerExtend W8 W32 (const . MOVZxL) x
+          | otherwise -> integerExtend W8 W32 MOV x
       MO_XX_Conv W8  W16
-          | is32Bit   -> integerExtend W8 W16 MOVZxL x
-          | otherwise -> integerExtend W8 W16 (MOV) x
-      MO_XX_Conv W16 W32 -> integerExtend W16 W32 (MOV) x
-
-      MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVZxL x
-      MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
-      MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
-      MO_SS_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOVSxL x
-      MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
-      MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
+          | is32Bit   -> integerExtend W8 W16 (const . MOVZxL) x
+          | otherwise -> integerExtend W8 W16 MOV x
+      MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
+
+      MO_UU_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 (const . MOVZxL) x
+      MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 (const . MOVZxL) x
+      MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 (const . MOVZxL) x
+      MO_SS_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 (const . MOVSxL) x
+      MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 (const . MOVSxL) x
+      MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 (const . MOVSxL) x
       -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
       -- However, we don't want the register allocator to throw it
       -- away as an unnecessary reg-to-reg move, so we keep it in
       -- the form of a movzl and print it as a movl later.
       -- This doesn't apply to MO_XX_Conv since in this case we don't care about
       -- the upper bits. So we can just use MOV.
-      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 (MOV) x
-      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 (MOV) x
-      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 (MOV) x
+      MO_XX_Conv W8  W64 | not is32Bit -> integerExtend W8  W64 MOV x
+      MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+      MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
 
       MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
@@ -1185,7 +1185,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
 
         -- signed or unsigned extension.
         integerExtend :: Width -> Width
-                      -> (Format -> Operand -> Operand -> Instr)
+                      -> (Format -> Format -> Operand -> Operand -> Instr)
                       -> CmmExpr -> NatM Register
         integerExtend from to instr expr = do
             (reg,e_code) <- if from == W8 then getByteReg expr
@@ -1193,7 +1193,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
             let
                 code dst =
                   e_code `snocOL`
-                  instr (intFormat from) (OpReg reg) (OpReg dst)
+                  instr (intFormat from) (intFormat to) (OpReg reg) (OpReg dst)
             return (Any (intFormat to) code)
 
         bitcast :: Format -> Format -> CmmExpr -> NatM Register
@@ -1311,7 +1311,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getSomeReg expr
           let fmt = VecFormat len FmtInt64
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOV2 II64 fmt (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
                                     (PUNPCKLQDQ fmt (OpReg dst) dst)
                                     )
         vector_int_broadcast _ _ c
@@ -1703,7 +1703,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
           imm      = litToImm lit
           code dst
             = case lit of
-                CmmInt 0 _ -> exp `snocOL` (MOV FF32 (OpReg r) (OpReg dst))
+                CmmInt 0 _ -> exp `snocOL` (MOV FF32 format (OpReg r) (OpReg dst))
                 CmmInt _ _ -> exp `snocOL` (VPSHUFD format imm (OpReg r) dst)
                 _          -> panic "Error in offset while unpacking"
       return (Any format code)
@@ -1714,7 +1714,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
           code dst
             = case lit of
                 CmmInt 0 _ -> exp `snocOL`
-                              (MOV FF64 (OpReg r) (OpReg dst))
+                              (MOV FF64 format (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
                               (MOVHLPS format (OpReg r) dst)
                 _          -> panic "Error in offset while unpacking"
@@ -1756,10 +1756,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let code dst
             = case lit of
                 CmmInt 0 _ -> exp `snocOL`
-                              (MOV2 fmt II64 (OpReg r) (OpReg dst))
+                              (MOVD II64 (OpReg r) (OpReg dst))
                 CmmInt 1 _ -> exp `snocOL`
                               (MOVHLPS fmt (OpReg r) tmp) `snocOL`
-                              (MOV2 fmt II64 (OpReg tmp) (OpReg dst))
+                              (MOVD II64 (OpReg tmp) (OpReg dst))
                 _          -> panic "Error in offset while unpacking"
       return (Any fmt code)
     vector_int_unpack_sse _ w c e
@@ -1900,11 +1900,11 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
               = case offset of
                   CmmInt 0 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOV FF64 (OpReg valReg) (OpReg dst)) `snocOL`
+                                (MOV FF64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
                                 (SHUFPD fmt (ImmInt 0b00) (OpReg vecReg) dst)
                   CmmInt 1 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOV FF64 (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (MOV FF64 fmt (OpReg vecReg) (OpReg dst)) `snocOL`
                                 (SHUFPD fmt (ImmInt 0b00) (OpReg valReg) dst)
                   _ -> pprPanic "MO_VF_Insert DoubleX2: unsupported offset" (ppr offset)
          in return $ Any fmt code
@@ -1939,12 +1939,12 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
                   CmmInt 0 _ -> valExp `appOL`
                                 vecExp `snocOL`
                                 (MOVHLPS fmt (OpReg vecReg) tmp) `snocOL`
-                                (MOV2 II64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
+                                (MOVD II64 (OpReg valReg) (OpReg dst)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   CmmInt 1 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOV fmt (OpReg vecReg) (OpReg dst)) `snocOL`
-                                (MOV2 II64 fmt (OpReg valReg) (OpReg tmp)) `snocOL`
+                                (MOV II64 fmt (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (MOVD II64 (OpReg valReg) (OpReg tmp)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   _ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
          in return $ Any fmt code
@@ -1983,23 +1983,24 @@ getRegister' _ is32Bit (CmmLoad mem pk _)
     let
       instr = case width of
                 W8     -> MOVZxL II8
-                _other -> MOV format
+                  -- We always zero-extend 8-bit loads, if we
+                  -- can't think of anything better.  This is because
+                  -- we can't guarantee access to an 8-bit variant of every register
+                  -- (esi and edi don't have 8-bit variants), so to make things
+                  -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+                _other -> MOV format format
     code <- intLoadCode instr mem
     return (Any format code)
   where
     width = typeWidth pk
     format = intFormat width
-        -- We always zero-extend 8-bit loads, if we
-        -- can't think of anything better.  This is because
-        -- we can't guarantee access to an 8-bit variant of every register
-        -- (esi and edi don't have 8-bit variants), so to make things
-        -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
 
 -- Simpler memory load code on x86_64
 getRegister' _ is32Bit (CmmLoad mem pk _)
  | not is32Bit
   = do
-    code <- intLoadCode (MOV format) mem
+    code <- intLoadCode (MOV format format) mem
     return (Any format code)
   where format = intFormat $ typeWidth pk
 
@@ -2040,7 +2041,7 @@ getRegister' platform is32Bit (CmmLit lit)
   | not is32Bit, isWord64 (cmmLitType platform lit), not (isBigLit lit)
   = let
         imm = litToImm lit
-        code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
+        code dst = unitOL (MOV II32 II64 (OpImm imm) (OpReg dst))
     in
         return (Any II64 code)
   where
@@ -2069,7 +2070,7 @@ getRegister' platform _ (CmmLit lit)
       = do
       let format = cmmTypeFormat ctype
           imm = litToImm lit
-          code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
+          code dst = unitOL (MOV format format (OpImm imm) (OpReg dst))
       return (Any format code)
 
 getRegister' platform _ other
@@ -2410,7 +2411,7 @@ loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
 loadFloatAmode w addr addr_code = do
   let format = floatFormat w
       code dst = addr_code `snocOL`
-                    MOV format (OpAddr addr) (OpReg dst)
+                    MOV format format (OpAddr addr) (OpReg dst)
 
   return (Any format code)
 
@@ -2583,19 +2584,19 @@ condIntCode' platform cond x y
         cmpExact :: OrdList Instr
         cmpExact =
           toOL
-            [ MOV II32 (OpReg r1_hi) (OpReg tmp1)
-            , MOV II32 (OpReg r1_lo) (OpReg tmp2)
+            [ MOV II32 II32 (OpReg r1_hi) (OpReg tmp1)
+            , MOV II32 II32 (OpReg r1_lo) (OpReg tmp2)
             , XOR II32 (OpReg r2_hi) (OpReg tmp1)
             , XOR II32 (OpReg r2_lo) (OpReg tmp2)
             , OR  II32 (OpReg tmp1)  (OpReg tmp2)
             ]
         cmpGE = toOL
-            [ MOV II32 (OpReg r1_hi) (OpReg tmp1)
+            [ MOV II32 II32 (OpReg r1_hi) (OpReg tmp1)
             , CMP II32 (OpReg r2_lo) (OpReg r1_lo)
             , SBB II32 (OpReg r2_hi) (OpReg tmp1)
             ]
         cmpLE = toOL
-            [ MOV II32 (OpReg r2_hi) (OpReg tmp1)
+            [ MOV II32 II32 (OpReg r2_hi) (OpReg tmp1)
             , CMP II32 (OpReg r1_lo) (OpReg r2_lo)
             , SBB II32 (OpReg r1_hi) (OpReg tmp1)
             ]
@@ -2735,7 +2736,7 @@ assignMem_IntCode pk addr src = do
     let
         code = code_src `appOL`
                code_addr `snocOL`
-                  MOV pk op_src (OpAddr addr)
+                  MOV pk pk op_src (OpAddr addr)
         -- NOTE: op_src is stable, so it will still be valid
         -- after code_addr.  This may involve the introduction
         -- of an extra MOV to a temporary register, but we hope
@@ -2753,7 +2754,7 @@ assignMem_IntCode pk addr src = do
 
 -- Assign; dst is a reg, rhs is mem
 assignReg_IntCode pk reg (CmmLoad src _ _) = do
-  load_code <- intLoadCode (MOV pk) src
+  load_code <- intLoadCode (MOV pk pk) src
   platform <- ncgPlatform <$> getConfig
   return (load_code (getRegisterReg platform reg))
 
@@ -2771,7 +2772,7 @@ assignMem_FltCode pk addr src = do
   let
         code = src_code `appOL`
                addr_code `snocOL`
-               MOV pk (OpReg src_reg) (OpAddr addr)
+               MOV pk pk (OpReg src_reg) (OpAddr addr)
 
   return code
 
@@ -3323,7 +3324,7 @@ genCCall32 addr _ dest_regs args = do
                                       in
 
                                       -- assume SSE2
-                                       MOV format (OpReg reg) (OpAddr addr)
+                                       MOV format format (OpReg reg) (OpAddr addr)
 
                                      ]
                                )
@@ -3405,12 +3406,12 @@ genCCall32 addr _ dest_regs args = do
                                    -- NB: This code will need to be
                                    -- revisited once GHC does more work around
                                    -- SIGFPE f
-                                   MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
+                                   MOV fmt fmt (OpAddr tmp_amode) (OpReg r_dest),
                                    ADD II32 (OpImm (ImmInt b)) (OpReg esp),
                                    DELTA delta0]
-              | isWord64 ty    = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
-                                        MOV II32 (OpReg edx) (OpReg r_dest_hi)]
-              | otherwise      = unitOL (MOV (intFormat w)
+              | isWord64 ty    = toOL [MOV II32 II32 (OpReg eax) (OpReg r_dest),
+                                        MOV II32 II32 (OpReg edx) (OpReg r_dest_hi)]
+              | otherwise      = unitOL (MOV (intFormat w) (intFormat w)
                                              (OpReg eax)
                                              (OpReg r_dest))
               where
@@ -3519,10 +3520,7 @@ genCCall64 addr conv dest_regs args = do
                                 -- If we are calling a varargs function
                                 -- then we need to define ireg as well
                                 -- as freg
-                                CVTTSD2SIQ II64 (OpReg freg) ireg)
-                                  -- SLD TODO: I changed this from MOV FF64 (OpReg freg) (OpReg ireg)
-                                  -- to CVTTSD2SIQ ...
-                                  -- because it is going between two different types of register
+                                MOVD FF64 (OpReg freg) (OpReg ireg))
             | otherwise = do
                  arg_code <- getAnyReg arg
                  load_args_win rest (RegFormat ireg II64: usedInt) usedFP regs
@@ -3538,10 +3536,11 @@ genCCall64 addr conv dest_regs args = do
              (arg_reg, arg_code) <- getSomeReg arg
              delta <- getDeltaNat
              setDeltaNat (delta-arg_size)
-             let code' = code `appOL` arg_code `appOL` toOL [
+             let fmt = floatFormat width
+                 code' = code `appOL` arg_code `appOL` toOL [
                             SUB (intFormat (wordWidth platform)) (OpImm (ImmInt arg_size)) (OpReg rsp),
                             DELTA (delta-arg_size),
-                            MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel platform 0))]
+                            MOV fmt fmt (OpReg arg_reg) (OpAddr (spRel platform 0))]
              push_args rest code'
 
            | otherwise = do
@@ -3634,7 +3633,7 @@ genCCall64 addr conv dest_regs args = do
         -- It's not safe to omit this assignment, even if the number
         -- of SSE2 regs in use is zero.  If %al is larger than 8
         -- on entry to a varargs function, seg faults ensue.
-        assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
+        assign_eax n = unitOL (MOV II32 II32 (OpImm (ImmInt n)) (OpReg eax))
 
     let call = callinsns `appOL`
                toOL (
@@ -3650,17 +3649,13 @@ genCCall64 addr conv dest_regs args = do
         -- assign the results, if necessary
         assign_code []     = nilOL
         assign_code [dest] =
-          case typeWidth rep of
-                W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
-                                                     (OpReg xmm0)
-                                                     (OpReg r_dest))
-                W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
-                                                     (OpReg xmm0)
-                                                     (OpReg r_dest))
-                _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+          unitOL $
+            mkRegRegMoveInstr config fmt reg r_dest
           where
-                rep = localRegType dest
-                r_dest = getRegisterReg platform  (CmmLocal dest)
+            reg = if isIntFormat fmt then rax else xmm0
+            fmt = cmmTypeFormat rep
+            rep = localRegType dest
+            r_dest = getRegisterReg platform (CmmLocal dest)
         assign_code _many = panic "genForeignCall.assign_code many"
 
     return (adjust_rsp          `appOL`
@@ -3769,9 +3764,10 @@ genSwitch expr targets = do
             tableReg <- getNewRegNat (intFormat (platformWordWidth platform))
             targetReg <- getNewRegNat (intFormat (platformWordWidth platform))
             let op = OpAddr (AddrBaseIndex (EABaseReg tableReg) (EAIndex reg (platformWordSizeInBytes platform)) (ImmInt 0))
+                fmt = archWordFormat is32bit
                 code = e_code `appOL` toOL
-                    [ LEA (archWordFormat is32bit) (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
-                    , MOV (archWordFormat is32bit) op (OpReg targetReg)
+                    [ LEA fmt (OpAddr (AddrBaseIndex EABaseRip EAIndexNone (ImmCLbl lbl))) (OpReg tableReg)
+                    , MOV fmt fmt op (OpReg targetReg)
                     , JMP_TBL (OpReg targetReg) ids (Section ReadOnlyData lbl) lbl
                     ]
             return code
@@ -4019,7 +4015,7 @@ genTrivialCode rep instr a b = do
      code dst
         | dst `regClashesWithOp` b_op =
                 b_code `appOL`
-                unitOL (MOV rep b_op (OpReg tmp)) `appOL`
+                unitOL (MOV rep rep b_op (OpReg tmp)) `appOL`
                 a_code dst `snocOL`
                 instr (OpReg tmp) (OpReg dst)
         | otherwise =
@@ -4192,7 +4188,7 @@ sse2NegCode w x = do
   tmp <- getNewRegNat fmt
   let
     code dst = x_code dst `appOL` amode_code `appOL` toOL [
-        MOV fmt (OpAddr amode) (OpReg tmp),
+        MOV fmt fmt (OpAddr amode) (OpReg tmp),
         XOR fmt (OpReg tmp) (OpReg dst)
         ]
   --
@@ -4295,11 +4291,11 @@ genAtomicRMW bid width amop dst addr n = do
           -- final move should go away, because it's the last use of arg
           -- and the first use of dst_r.
           AMO_Add  -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
-                                     , MOV format (OpReg arg) (OpReg dst_r)
+                                     , MOV format format (OpReg arg) (OpReg dst_r)
                                      ], bid)
           AMO_Sub  -> return $ (toOL [ NEGI format (OpReg arg)
                                      , LOCK (XADD format (OpReg arg) (OpAddr amode))
-                                     , MOV format (OpReg arg) (OpReg dst_r)
+                                     , MOV format format (OpReg arg) (OpReg dst_r)
                                      ], bid)
           -- In these cases we need a new block id, and have to return it so
           -- that later instruction selection can reference it.
@@ -4327,12 +4323,12 @@ genAtomicRMW bid width amop dst addr n = do
             updateCfgNat (addWeightEdge lbl1 lbl1 0)
 
             return $ (toOL
-                [ MOV format (OpAddr amode) (OpReg eax)
+                [ MOV format format (OpAddr amode) (OpReg eax)
                 , JXX ALWAYS lbl1
                 , NEWBLOCK lbl1
                   -- Keep old value so we can return it:
-                , MOV format (OpReg eax) (OpReg dst_r)
-                , MOV format (OpReg eax) (OpReg tmp)
+                , MOV format format (OpReg eax) (OpReg dst_r)
+                , MOV format format (OpReg eax) (OpReg tmp)
                 ]
                 `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
                 [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
@@ -4385,9 +4381,9 @@ genCtz64_32 bid dst src = do
   --    dst = 64;
   --  }
   let instrs = vcode `appOL` toOL
-           ([ MOV      II32 (OpReg rhi)         (OpReg tmp_r)
+           ([ MOV II32 II32 (OpReg rhi)         (OpReg tmp_r)
             , OR       II32 (OpReg rlo)         (OpReg tmp_r)
-            , MOV      II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+            , MOV II32 II32 (OpImm (ImmInt 64)) (OpReg dst_r)
             , JXX EQQ    lbl2
             , JXX ALWAYS lbl1
 
@@ -4432,10 +4428,10 @@ genCtzGeneric width dst src = do
       src_r <- getNewRegNat format
       tmp_r <- getNewRegNat format
       let instrs = code_src src_r `appOL` toOL
-               ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-                [ BSF     format (OpReg src_r) tmp_r
-                , MOV     II32   (OpImm (ImmInt bw)) (OpReg dst_r)
-                , CMOV NE format (OpReg tmp_r) dst_r
+               ([ MOVZxL   II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+                [ BSF      format (OpReg src_r) tmp_r
+                , MOV II32 II32   (OpImm (ImmInt bw)) (OpReg dst_r)
+                , CMOV NE  format (OpReg tmp_r) dst_r
                 ]) -- NB: We don't need to zero-extend the result for the
                    -- W8/W16 cases because the 'MOV' insn already
                    -- took care of implicitly clearing the upper bits
@@ -4499,21 +4495,21 @@ genMemCpyInlineMaybe align dst src n = do
       go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
       go dst src tmp i
           | i >= sizeBytes =
-              unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOV format format (OpAddr src_addr) (OpReg tmp)) `appOL`
+              unitOL (MOV format format (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - sizeBytes)
           -- Deal with remaining bytes.
           | i >= 4 =  -- Will never happen on 32-bit
-              unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOV II32 II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
+              unitOL (MOV II32 II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 4)
           | i >= 2 =
-              unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOVZxL   II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
+              unitOL (MOV II16 II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 2)
           | i >= 1 =
-              unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
-              unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+              unitOL (MOVZxL  II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
+              unitOL (MOV II8 II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
               go dst src tmp (i - 1)
           | otherwise = nilOL
         where
@@ -4591,17 +4587,17 @@ genMemSetInlineMaybe align dst c n = do
     gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
     gen4 addr size
         | size >= 4 =
-            (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+            (unitOL (MOV II32 II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
         | size >= 2 =
-            (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+            (unitOL (MOV II16 II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
         | size >= 1 =
-            (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+            (unitOL (MOV II8 II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
         | otherwise = (nilOL, 0)
 
     -- Generates a 64-bit wide MOV instruction from REG to MEM.
     gen8 :: AddrMode -> Reg -> InstrBlock
     gen8 addr reg8byte =
-      unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+      unitOL (MOV format format (OpReg reg8byte) (OpAddr addr))
 
     -- Unrolls memset when the widest MOV is <= 4 bytes.
     go4 :: Reg -> Integer -> InstrBlock
@@ -4687,8 +4683,8 @@ genByteSwap width dst src = do
         let Reg64 dst_hi dst_lo = localReg64 dst
         RegCode64 vcode rhi rlo <- iselExpr64 src
         return $ vcode `appOL`
-                 toOL [ MOV II32 (OpReg rlo) (OpReg dst_hi),
-                        MOV II32 (OpReg rhi) (OpReg dst_lo),
+                 toOL [ MOV II32 II32 (OpReg rlo) (OpReg dst_hi),
+                        MOV II32 II32 (OpReg rhi) (OpReg dst_lo),
                         BSWAP II32 dst_hi,
                         BSWAP II32 dst_lo ]
       W16 -> do
@@ -4832,11 +4828,11 @@ genClz bid width dst src = do
           src_r <- getNewRegNat format
           tmp_r <- getNewRegNat format
           return $ code_src src_r `appOL` toOL
-                   ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-                    [ BSR     format (OpReg src_r) tmp_r
-                    , MOV     II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
-                    , CMOV NE format (OpReg tmp_r) dst_r
-                    , XOR     format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+                   ([ MOVZxL   II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+                    [ BSR      format (OpReg src_r) tmp_r
+                    , MOV II32 II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+                    , CMOV NE  format (OpReg tmp_r) dst_r
+                    , XOR      format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
                     ]) -- NB: We don't need to zero-extend the result for the
                        -- W8/W16 cases because the 'MOV' insn already
                        -- took care of implicitly clearing the upper bits
@@ -4848,7 +4844,8 @@ genWordToFloat bid width dst src =
 
 genAtomicRead :: Width -> MemoryOrdering -> LocalReg -> CmmExpr -> NatM InstrBlock
 genAtomicRead width _mord dst addr = do
-  load_code <- intLoadCode (MOV (intFormat width)) addr
+  let fmt = intFormat width
+  load_code <- intLoadCode (MOV fmt fmt) addr
   return (load_code (getLocalRegReg dst))
 
 genAtomicWrite :: Width -> MemoryOrdering -> CmmExpr -> CmmExpr -> NatM InstrBlock
@@ -4885,9 +4882,9 @@ genCmpXchg bid width dst addr old new = do
       platform <- getPlatform
       let dst_r    = getRegisterReg platform  (CmmLocal dst)
           code     = toOL
-                     [ MOV format (OpReg oldval) (OpReg eax)
+                     [ MOV format format (OpReg oldval) (OpReg eax)
                      , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
-                     , MOV format (OpReg eax) (OpReg dst_r)
+                     , MOV format format (OpReg eax) (OpReg dst_r)
                      ]
       return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
           `appOL` code
@@ -4909,7 +4906,7 @@ genXchg width dst addr value = do
   let dst_r    = getLocalRegReg dst
   -- Copy the value into the target register, perform the exchange.
   let code     = toOL
-                 [ MOV format (OpReg newval) (OpReg dst_r)
+                 [ MOV format format (OpReg newval) (OpReg dst_r)
                   -- On X86 xchg implies a lock prefix if we use a memory argument.
                   -- so this is atomic.
                  , XCHG format (OpAddr amode) dst_r
@@ -4930,7 +4927,7 @@ genFloatAbs width dst src = do
   tmp <- getNewRegNat format
   let dst_r = getLocalRegReg dst
   pure $ src_code dst_r `appOL` amode_code `appOL` toOL
-           [ MOV format (OpAddr amode) (OpReg tmp)
+           [ MOV format format (OpAddr amode) (OpReg tmp)
            , AND format (OpReg tmp) (OpReg dst_r)
            ]
 
@@ -5005,8 +5002,8 @@ genSignedLargeMul width res_c res_h res_l arg_x arg_y = do
       code = y_code `appOL`
              x_code rax `appOL`
              toOL [ IMUL2 format y_reg
-                  , MOV format (OpReg rdx) (OpReg reg_h)
-                  , MOV format (OpReg rax) (OpReg reg_l)
+                  , MOV format format (OpReg rdx) (OpReg reg_h)
+                  , MOV format format (OpReg rax) (OpReg reg_l)
                   , SETCC CARRY (OpReg reg_tmp)
                   , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
                   ]
@@ -5028,8 +5025,8 @@ genUnsignedLargeMul width res_h res_l arg_x arg_y = do
       code = y_code `appOL`
              x_code rax `appOL`
              toOL [MUL2 format y_reg,
-                   MOV format (OpReg rdx) (OpReg reg_h),
-                   MOV format (OpReg rax) (OpReg reg_l)]
+                   MOV format format (OpReg rdx) (OpReg reg_h),
+                   MOV format format (OpReg rax) (OpReg reg_l)]
   return code
 
 
@@ -5072,5 +5069,5 @@ genQuotRem width signed res_q res_r m_arg_x_high arg_x_low arg_y = do
                x_low_code rax `appOL`
                x_high_code rdx `appOL`
                toOL [instr format y_reg,
-                     MOV format (OpReg rax) (OpReg reg_q),
-                     MOV format (OpReg rdx) (OpReg reg_r)]
+                     MOV format format (OpReg rax) (OpReg reg_q),
+                     MOV format format (OpReg rdx) (OpReg reg_r)]


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -202,19 +202,17 @@ data Instr
         | DELTA  Int
 
         -- Moves.
-        | MOV         Format Operand Operand
+        | MOV
+           Format -- ^ format of data being moved
+           Format -- ^ format of the two operands
+           Operand -- ^ src
+           Operand -- ^ dst
              -- ^ N.B. Due to AT&T assembler quirks, when used with 'II64'
              -- 'Format' immediate source and memory target operand, the source
              -- operand is interpreted to be a 32-bit sign-extended value.
              -- True 64-bit operands need to be either first moved to a register or moved
              -- with @MOVABS@; we currently do not use this instruction in GHC.
              -- See https://stackoverflow.com/questions/52434073/whats-the-difference-between-the-x86-64-att-instructions-movq-and-movabsq.
-        | MOV2 Format Format Operand Operand
-         -- ^ Like MOV, but between two different kinds of registers
-         -- (e.g. moving rax to xmm1)
-         --
-         -- SIMD NCG TODO: this is a bit of a hack, but the alternative would
-         -- be to have MOV store two Formats to handle xmm -> rax and rax -> xmm.
 
         | MOVD   Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
                                         -- (bitcast between a general purpose
@@ -426,9 +424,21 @@ data FMAPermutation = FMA132 | FMA213 | FMA231
 regUsageOfInstr :: Platform -> Instr -> RegUsage
 regUsageOfInstr platform instr
  = case instr of
-    MOV    fmt src dst    -> usageRW fmt src dst
-    MOV2   srcFmt dstFmt src dst -> mkRU (use_R srcFmt src []) (use_R dstFmt dst [])
-    MOVD   fmt src dst    -> mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
+    MOV instrFmt opFmt src dst
+      -- MOVSS/MOVSD preserve the upper part of the destination vector register,
+      -- but only for register-to-register moves
+      | isFloatFormat instrFmt
+      , isVecFormat opFmt
+      , OpReg {} <- src
+      , OpReg {} <- dst
+      -> usageRM opFmt src dst
+      -- other MOV instructions zero any remaining upper part of the destination
+      -- (largely to avoid partial register stalls)
+      | otherwise
+      -> usageRW opFmt src dst
+    MOVD   fmt src dst    ->
+      -- NB: MOVD/MOVQ always zero any remaining upper part of destination
+      mkRU (use_R fmt src []) (use_R (movdOutFormat fmt) dst [])
     CMOV _ fmt src dst    -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
     MOVZxL fmt src dst    -> usageRW fmt src dst
     MOVSxL fmt src dst    -> usageRW fmt src dst
@@ -679,8 +689,7 @@ movdOutFormat format = case format of
 patchRegsOfInstr :: HasDebugCallStack => Platform -> Instr -> (Reg -> Reg) -> Instr
 patchRegsOfInstr platform instr env
   = case instr of
-    MOV fmt src dst      -> MOV fmt (patchOp src) (patchOp dst)
-    MOV2 srcFmt dstFmt src dst -> MOV2 srcFmt dstFmt (patchOp src) (patchOp dst)
+    MOV fmt fmt' src dst -> MOV fmt fmt' (patchOp src) (patchOp dst)
     MOVD fmt src dst     -> patch2 (MOVD fmt) src dst
     CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
     MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
@@ -914,11 +923,12 @@ mkLoadInstr
     -> [Instr]
 
 mkLoadInstr config (RegFormat reg fmt) delta slot =
-  [ movInstr config fmt (OpAddr (spRel platform off)) (OpReg reg) ]
+  [ movInstr config fmt  (OpAddr (spRel platform off)) (OpReg reg) ]
   where
     platform = ncgPlatform config
     off = spillSlotToOffset platform slot - delta
 
+-- | A move instruction for moving the entire contents of an operand.
 movInstr :: NCGConfig -> Format -> (Operand -> Operand -> Instr)
 movInstr config fmt =
   case fmt of
@@ -936,7 +946,7 @@ movInstr config fmt =
             -> MOVU fmt
             | otherwise
             -> sorry "128-bit wide vectors require either -msse2 or -mavx"
-    _ -> MOV (scalarMoveFormat platform fmt)
+    _ -> MOV (scalarMoveFormat platform fmt) fmt
   where
     platform = ncgPlatform config
     avx = ncgAvxEnabled config
@@ -1027,33 +1037,40 @@ takeRegRegMoveInstr
         -> Maybe (Reg,Reg)
 
 takeRegRegMoveInstr platform = \case
-  MOV fmt (OpReg r1) (OpReg r2)
-    -- MOV zeroes the upper part of vector registers,
-    -- so it is not a real "move" in that case.
-    | not (isVecFormat fmt)
-    -> go r1 r2
-  MOVD _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+  MOV movFmt regFmt (OpReg r1) (OpReg r2)
+    -- Only a move covering the whole register is a plain copy: a narrower one
+    -- zeroes the upper part of dst (or preserves it, for MOVSS/MOVSD).
+    | formatToWidth movFmt >= formatToWidth regFmt
+    -- Don't eliminate a move between e.g. RAX and XMM:
+    -- even though we might be using XMM to store a scalar integer value,
+    -- some instructions only support XMM registers.
+    , targetClassOfReg platform r1 == targetClassOfReg platform r2
+    -> Just (r1, r2)
+  MOVD {}
+    -- MOVD moves between xmm registers and general-purpose registers,
+    -- and we don't want to eliminate those moves (as noted for MOV).
+    -> Nothing
+
+  -- SSE2/AVX move instructions always move the full register.
   MOVA _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   MOVU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   VMOVU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   MOVDQU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
   VMOVDQU _ (OpReg r1) (OpReg r2)
-    -> go r1 r2
+    -> Just (r1, r2)
+
+  -- MOVL, MOVH and MOVHLPS preserve some part of the destination register,
+  -- so are not simple moves.
+  MOVL {} -> Nothing
+  MOVH {} -> Nothing
+  MOVHLPS {} -> Nothing
+
+  -- Other instructions are not moves.
   _ -> Nothing
-  where
-    go r1 r2
-      -- Don't eliminate a move between e.g. RAX and XMM:
-      -- even though we might be using XMM to store a scalar integer value,
-      -- some instructions only support XMM registers.
-      | targetClassOfReg platform r1 == targetClassOfReg platform r2
-      = Just (r1, r2)
-      | otherwise
-      = Nothing
 
 -- | Make an unconditional branch instruction.
 mkJumpInstr
@@ -1131,7 +1148,7 @@ mkStackAllocInstr platform amount
         -- See Note [Windows stack layout]
         case platformArch platform of
             ArchX86_64 | needs_probe_call platform amount ->
-                           [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
+                           [ MOV II64 II64 (OpImm (ImmInt amount)) (OpReg rax)
                            , CALL (Left $ strImmLit (fsLit "___chkstk_ms")) [RegFormat rax II64]
                            , SUB II64 (OpReg rax) (OpReg rsp)
                            ]


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -613,21 +613,14 @@ pprInstr platform i = case i of
 
    -- Replace 'mov $0x0,%reg' by 'xor %reg,%reg', which is smaller and cheaper.
    -- The code generator catches most of these already, but not all.
-   MOV format (OpImm (ImmInt 0)) dst@(OpReg _)
+   MOV format _ (OpImm (ImmInt 0)) dst@(OpReg _)
      -> pprInstr platform (XOR format' dst dst)
         where format' = case format of
                 II64 -> II32          -- 32-bit version is equivalent, and smaller
                 _    -> format
 
-   MOV format src dst
-     -> pprFormatOpOp (text "mov") format src dst
-
-   MOV2 srcFmt dstFmt src dst
-     -> pprFormatOpOp (text "mov") fmt src dst
-     where
-      fmt = if formatInBytes srcFmt <= formatInBytes dstFmt
-            then srcFmt
-            else dstFmt
+   MOV instrFmt opFmt src dst
+     -> pprFormatFormatOpOp (text "mov") instrFmt opFmt src dst
 
    CMOV cc format src dst
      -> pprCondOpReg (text "cmov") format cc src dst
@@ -1099,6 +1092,15 @@ pprInstr platform i = case i of
            pprOperand platform format op2
        ]
 
+   pprFormatFormatOpOp :: Line doc -> Format -> Format -> Operand -> Operand -> doc
+   pprFormatFormatOpOp name instrFmt opFmt op1 op2
+     = line $ hcat [
+           pprMnemonic name instrFmt,
+           pprOperand platform opFmt op1,
+           comma,
+           pprOperand platform opFmt op2
+       ]
+
    pprMovdOpOp :: Line doc -> Format -> Operand -> Operand -> doc
    pprMovdOpOp name format op1 op2
      = let instr = case format of



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/790030244fcc060eaa904cd9ab92f53735109c38

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/790030244fcc060eaa904cd9ab92f53735109c38
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240625/31ad734b/attachment-0001.html>