[Git][ghc/ghc][master] x86 NCG: Make MOVD's output format explicit

Mon Jan 27 21:23:41 UTC 2025


Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
62760367 by ARATA Mizuki at 2025-01-27T16:23:06-05:00
x86 NCG: Make MOVD's output format explicit

The old design led to inference of a wrong format,
losing upper bits of a vector register.

Fixes #25659

Co-authored-by: sheaf <sam.derbyshire at gmail.com>

- - - - -


6 changed files:

- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- + testsuite/tests/simd/should_run/T25659.hs
- + testsuite/tests/simd/should_run/T25659.stdout
- testsuite/tests/simd/should_run/all.T


Changes:

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -1171,7 +1171,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
         bitcast :: Format -> Format -> CmmExpr -> NatM Register
         bitcast fmt rfmt expr =
           do (src, e_code) <- getSomeReg expr
-             let code = \dst -> e_code `snocOL` (MOVD fmt (OpReg src) (OpReg dst))
+             let code = \dst -> e_code `snocOL` (MOVD fmt rfmt (OpReg src) (OpReg dst))
              return (Any rfmt code)
 
         toI8Reg :: Width -> CmmExpr -> NatM Register
@@ -1262,7 +1262,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
               code dst = exp `snocOL`
                          -- VPBROADCAST from GPR requires AVX-512,
                          -- so we use an additional MOVD.
-                         (MOVD movFormat (OpReg reg) (OpReg dst)) `snocOL`
+                         (MOVD movFormat fmt (OpReg reg) (OpReg dst)) `snocOL`
                          (VPBROADCAST fmt fmt (OpReg dst) dst)
           return $ Any fmt code
 
@@ -1272,7 +1272,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getNonClobberedReg expr
           let fmt = VecFormat 16 FmtInt8
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOVD II32 (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II32 fmt (OpReg reg) (OpReg dst)) `snocOL`
                                     (PUNPCKLBW fmt (OpReg dst) dst) `snocOL`
                                     (PUNPCKLWD (VecFormat 8 FmtInt16) (OpReg dst) dst) `snocOL`
                                     (PSHUFD fmt (ImmInt 0x00) (OpReg dst) dst)
@@ -1284,7 +1284,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getNonClobberedReg expr
           let fmt = VecFormat 8 FmtInt16
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOVD II32 (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II32 fmt (OpReg reg) (OpReg dst)) `snocOL`
                                     (PUNPCKLWD fmt (OpReg dst) dst) `snocOL`
                                     (PSHUFD fmt (ImmInt 0x00) (OpReg dst) dst)
                                     )
@@ -1295,7 +1295,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getNonClobberedReg expr
           let fmt = VecFormat 4 FmtInt32
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOVD II32 (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II32 fmt (OpReg reg) (OpReg dst)) `snocOL`
                                     (PSHUFD fmt (ImmInt 0x00) (OpReg dst) dst)
                                     )
 
@@ -1305,7 +1305,7 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
           (reg, exp) <- getNonClobberedReg expr
           let fmt = VecFormat 2 FmtInt64
           return $ Any fmt (\dst -> exp `snocOL`
-                                    (MOVD II64 (OpReg reg) (OpReg dst)) `snocOL`
+                                    (MOVD II64 fmt (OpReg reg) (OpReg dst)) `snocOL`
                                     (PUNPCKLQDQ fmt (OpReg dst) dst)
                                     )
 
@@ -1793,16 +1793,16 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let code dst =
             case i of
               0 -> exp `snocOL`
-                   (MOVD FF32 (OpReg r) (OpReg dst))
+                   (MOVD fmt II32 (OpReg r) (OpReg dst))
               1 -> exp `snocOL`
                    (PSHUFD fmt (ImmInt 0b01_01_01_01) (OpReg r) tmp) `snocOL` -- tmp <- (r[1],r[1],r[1],r[1])
-                   (MOVD FF32 (OpReg tmp) (OpReg dst))
+                   (MOVD fmt II32 (OpReg tmp) (OpReg dst))
               2 -> exp `snocOL`
                    (PSHUFD fmt (ImmInt 0b11_10_11_10) (OpReg r) tmp) `snocOL` -- tmp <- (r[2],r[3],r[2],r[3])
-                   (MOVD FF32 (OpReg tmp) (OpReg dst))
+                   (MOVD fmt II32 (OpReg tmp) (OpReg dst))
               _ -> exp `snocOL`
                    (PSHUFD fmt (ImmInt 0b11_11_11_11) (OpReg r) tmp) `snocOL` -- tmp <- (r[3],r[3],r[3],r[3])
-                   (MOVD FF32 (OpReg tmp) (OpReg dst))
+                   (MOVD fmt II32 (OpReg tmp) (OpReg dst))
       return (Any II32 code)
     vector_int32x4_extract_sse2 _ offset
       = pprPanic "Unsupported offset" (pdoc platform offset)
@@ -1818,10 +1818,10 @@ getRegister' platform is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       let code dst =
             case lit of
               CmmInt 0 _ -> exp `snocOL`
-                            (MOVD FF64 (OpReg r) (OpReg dst))
+                            (MOVD fmt II64 (OpReg r) (OpReg dst))
               CmmInt 1 _ -> exp `snocOL`
                             (MOVHLPS fmt r tmp) `snocOL`
-                            (MOVD FF64 (OpReg tmp) (OpReg dst))
+                            (MOVD fmt II64 (OpReg tmp) (OpReg dst))
               _          -> panic "Error in offset while unpacking"
       return (Any II64 code)
     vector_int64x2_extract_sse2 _ offset
@@ -2103,22 +2103,22 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
               = case offset of
                   0 -> valExp `appOL`
                        (vecCode dst) `snocOL`
-                       (MOVD II32 (OpReg valReg) (OpReg tmp1)) `snocOL`
+                       (MOVD II32 vectorFormat (OpReg valReg) (OpReg tmp1)) `snocOL`
                        (MOV floatVectorFormat (OpReg tmp1) (OpReg dst)) -- MOVSS; dst <- (tmp1[0],dst[1],dst[2],dst[3])
                   1 -> valExp `appOL`
                        (vecCode tmp1) `snocOL`
-                       (MOVD II32 (OpReg valReg) (OpReg dst)) `snocOL` -- dst <- (val,0,0,0)
+                       (MOVD II32 vectorFormat (OpReg valReg) (OpReg dst)) `snocOL` -- dst <- (val,0,0,0)
                        (PUNPCKLQDQ vectorFormat (OpReg tmp1) dst) `snocOL` -- dst <- (dst[0],dst[1],tmp1[0],tmp1[1])
                        (SHUF floatVectorFormat (ImmInt 0b11_10_00_10) (OpReg tmp1) dst) -- SHUFPS; dst <- (dst[2],dst[0],tmp1[2],tmp1[3])
                   2 -> valExp `appOL`
                        (vecCode dst) `snocOL`
-                       (MOVD II32 (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
+                       (MOVD II32 vectorFormat (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
                        (MOVU floatVectorFormat (OpReg dst) (OpReg tmp2)) `snocOL` -- MOVUPS; tmp2 <- dst
                        (SHUF floatVectorFormat (ImmInt 0b01_00_01_11) (OpReg tmp1) tmp2) `snocOL` -- SHUFPS; tmp2 <- (tmp2[3],tmp2[1],tmp1[0],tmp1[1])
                        (SHUF floatVectorFormat (ImmInt 0b00_10_01_00) (OpReg tmp2) dst) -- SHUFPS; dst <- (dst[0],dst[1],tmp2[2],tmp2[0])
                   _ -> valExp `appOL`
                        (vecCode dst) `snocOL`
-                       (MOVD II32 (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
+                       (MOVD II32 vectorFormat (OpReg valReg) (OpReg tmp1)) `snocOL` -- tmp1 <- (val,0,0,0)
                        (SHUF floatVectorFormat (ImmInt 0b11_10_01_00) (OpReg dst) tmp1) `snocOL` -- SHUFPS; tmp1 <- (tmp1[0],tmp1[1],dst[2],dst[3])
                        (SHUF floatVectorFormat (ImmInt 0b00_10_01_00) (OpReg tmp1) dst) -- SHUFPS; dst <- (dst[0],dst[1],tmp1[2],tmp1[0])
         return $ Any vectorFormat code
@@ -2139,12 +2139,12 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
                   CmmInt 0 _ -> valExp `appOL`
                                 vecExp `snocOL`
                                 (MOVHLPS fmt vecReg tmp) `snocOL`
-                                (MOVD II64 (OpReg valReg) (OpReg dst)) `snocOL`
+                                (MOVD II64 fmt (OpReg valReg) (OpReg dst)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   CmmInt 1 _ -> valExp `appOL`
                                 vecExp `snocOL`
-                                (MOV II64 (OpReg vecReg) (OpReg dst)) `snocOL`
-                                (MOVD II64 (OpReg valReg) (OpReg tmp)) `snocOL`
+                                (MOVDQU fmt (OpReg vecReg) (OpReg dst)) `snocOL`
+                                (MOVD II64 fmt (OpReg valReg) (OpReg tmp)) `snocOL`
                                 (PUNPCKLQDQ fmt (OpReg tmp) dst)
                   _ -> pprPanic "MO_V_Insert Int64X2: unsupported offset" (ppr offset)
          in return $ Any fmt code
@@ -4083,7 +4083,7 @@ loadArgsWin config (arg:rest) = do
            -- arguments in both fp and integer registers.
            let (assign_code', regs')
                 | isFloatFormat arg_fmt =
-                    ( assign_code `snocOL` MOVD FF64 (OpReg freg) (OpReg ireg),
+                    ( assign_code `snocOL` MOVD FF64 II64 (OpReg freg) (OpReg ireg),
                       [ RegWithFormat freg FF64
                       , RegWithFormat ireg II64 ])
                 | otherwise = (assign_code, [RegWithFormat ireg II64])


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -39,7 +39,6 @@ module GHC.CmmToAsm.X86.Instr
    , patchJumpInstr
    , isMetaInstr
    , isJumpishInstr
-   , movdOutFormat
    , MinOrMax(..), MinMaxType(..)
    )
 where
@@ -127,11 +126,16 @@ data Instr
              -- with @MOVABS@; we currently do not use this instruction in GHC.
              -- See https://stackoverflow.com/questions/52434073/whats-the-difference-between-the-x86-64-att-instructions-movq-and-movabsq.
 
-        | MOVD   Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
-                                        -- (bitcast between a general purpose
-                                        -- register and a float register).
-                                        -- Format is input format, output format is
-                                        -- calculated in the 'movdOutFormat' function.
+        -- | MOVD/MOVQ SSE2 instructions
+        -- (bitcast between a general purpose register and a float register).
+        | MOVD
+           Format -- ^ input format
+           Format -- ^ output format
+           Operand Operand
+           -- NB: MOVD stores both the input and output formats. This is because
+           -- neither format fully determines the other, as either might be
+           -- a vector format, and we need to know the exact format in order to
+           -- correctly spill/unspill. See #25659.
         | CMOV   Cond Format Operand Reg
         | MOVZxL      Format Operand Operand
               -- ^ The format argument is the size of operand 1 (the number of bits we keep)
@@ -377,10 +381,10 @@ regUsageOfInstr platform instr
       -- (largely to avoid partial register stalls)
       | otherwise
       -> usageRW fmt src dst
-    MOVD   fmt src dst    ->
+    MOVD fmt1 fmt2 src dst    ->
       -- NB: MOVD and MOVQ always zero any remaining upper part of destination,
       -- so the destination is "written" not "modified".
-      usageRW' fmt (movdOutFormat fmt) src dst
+      usageRW' fmt1 fmt2 src dst
     CMOV _ fmt src dst    -> mkRU (use_R fmt src [mk fmt dst]) [mk fmt dst]
     MOVZxL fmt src dst    -> usageRW fmt src dst
     MOVSxL fmt src dst    -> usageRW fmt src dst
@@ -650,14 +654,6 @@ interesting :: Platform -> Reg -> Bool
 interesting _        (RegVirtual _)              = True
 interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
 
-movdOutFormat :: Format -> Format
-movdOutFormat format = case format of
-  II32 -> FF32
-  II64 -> FF64
-  FF32 -> II32
-  FF64 -> II64
-  _    -> pprPanic "X86: improper format for movd/movq" (ppr format)
-
 
 -- | Applies the supplied function to all registers in instructions.
 -- Typically used to change virtual registers to real registers.
@@ -665,7 +661,7 @@ patchRegsOfInstr :: HasDebugCallStack => Platform -> Instr -> (Reg -> Reg) -> In
 patchRegsOfInstr platform instr env
   = case instr of
     MOV fmt src dst      -> MOV fmt (patchOp src) (patchOp dst)
-    MOVD fmt src dst     -> patch2 (MOVD fmt) src dst
+    MOVD fmt1 fmt2 src dst -> patch2 (MOVD fmt1 fmt2) src dst
     CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
     MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
     MOVSxL fmt src dst   -> patch2 (MOVSxL fmt) src dst


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -657,8 +657,8 @@ pprInstr platform i = case i of
    CMOV cc format src dst
      -> pprCondOpReg (text "cmov") format cc src dst
 
-   MOVD format src dst
-     -> pprMovdOpOp (text "mov") format src dst
+   MOVD format1 format2 src dst
+     -> pprMovdOpOp (text "mov") format1 format2 src dst
 
    MOVZxL II32 src dst
       -> pprFormatOpOp (text "mov") II32 src dst
@@ -1151,21 +1151,21 @@ pprInstr platform i = case i of
            pprOperand platform format op2
        ]
 
-   pprMovdOpOp :: Line doc -> Format -> Operand -> Operand -> doc
-   pprMovdOpOp name format op1 op2
-     = let instr = case format of
+   pprMovdOpOp :: Line doc -> Format -> Format -> Operand -> Operand -> doc
+   pprMovdOpOp name format1 format2 op1 op2
+     = let instr = case (format1, format2) of
              -- bitcasts to/from a general purpose register to a floating point
              -- register require II32 or II64.
-             II32 -> text "d"
-             II64 -> text "q"
-             FF32 -> text "d"
-             FF64 -> text "q"
-             _    -> panic "X86.Ppr.pprMovdOpOp: improper format for movd/movq."
+             (II32, _) -> text "d"
+             (II64, _) -> text "q"
+             (_, II32) -> text "d"
+             (_, II64) -> text "q"
+             _ -> panic "X86.Ppr.pprMovdOpOp: improper format for movd/movq."
        in line $ hcat [
            char '\t' <> name <> instr <> space,
-           pprOperand platform format op1,
+           pprOperand platform format1 op1,
            comma,
-           pprOperand platform (movdOutFormat format) op2
+           pprOperand platform format2 op2
            ]
 
    pprFormatImmRegOp :: Line doc -> Format -> Imm -> Reg -> Operand -> doc


=====================================
testsuite/tests/simd/should_run/T25659.hs
=====================================
@@ -0,0 +1,19 @@
+{-# LANGUAGE MagicHash, UnboxedTuples, ExtendedLiterals #-}
+import GHC.Int
+import GHC.Prim
+
+test :: (Int64X2# -> Int64X2# -> Int64X2#) -> IO ()
+test f = do
+  let a = packInt64X2# (# 0#Int64, 11#Int64 #)
+      b = packInt64X2# (# 22#Int64, 33#Int64 #)
+      c = f a b
+      (# x0, x1 #) = unpackInt64X2# a
+      (# y0, y1 #) = unpackInt64X2# b
+      (# z0, z1 #) = unpackInt64X2# c
+  putStrLn $ "a = " ++ show (I64# x0, I64# x1)
+  putStrLn $ "b = " ++ show (I64# y0, I64# y1)
+  putStrLn $ "c = " ++ show (I64# z0, I64# z1)
+{-# NOINLINE test #-}
+
+main :: IO ()
+main = test (\_ b -> b)


=====================================
testsuite/tests/simd/should_run/T25659.stdout
=====================================
@@ -0,0 +1,3 @@
+a = (0,11)
+b = (22,33)
+c = (22,33)


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -26,6 +26,7 @@ test('word32x4_basic_baseline', [], compile_and_run, [''])
 test('word64x2_basic_baseline', [], compile_and_run, [''])
 
 test('T25658', [], compile_and_run, ['']) # #25658 is a bug with SSE2 code generation
+test('T25659', [], compile_and_run, [''])
 
 # Ensure we set the CPU features we have available.
 #



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/62760367dca36da5828875c5423109f9a5523337

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/62760367dca36da5828875c5423109f9a5523337
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20250127/7a65a087/attachment-0001.html>