[Git][ghc/ghc][wip/ncg-simd] realArgRegsCover: say we need xmm registers

Fri Aug 23 15:46:54 UTC 2024


sheaf pushed to branch wip/ncg-simd at Glasgow Haskell Compiler / GHC


Commits:
d665d078 by sheaf at 2024-08-23T17:46:39+02:00
realArgRegsCover: say we need xmm registers

This commit specifies that all XMM registers used for argument passing
are live in Cmm jumps annotated with `[*]`. This ensures that we don't
drop the upper 64-bits of a 128-bit wide XMM register on the floor if
a subsequent function needs it.

However, this does not address a similar problem with YMM/ZMM, as this
causes significant difficulties with the LLVM backend as it is currently
structured.

- - - - -


6 changed files:

- compiler/GHC/Cmm/CallConv.hs
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- testsuite/tests/simd/should_run/all.T
- + testsuite/tests/simd/should_run/simd014.hs
- + testsuite/tests/simd/should_run/simd014.stdout
- + testsuite/tests/simd/should_run/simd014Cmm.cmm


Changes:

=====================================
compiler/GHC/Cmm/CallConv.hs
=====================================
@@ -14,6 +14,7 @@ import GHC.Cmm (Convention(..))
 
 import GHC.Platform
 import GHC.Platform.Profile
+import GHC.Platform.Reg.Class (RegArch(..), registerArch)
 import GHC.Utils.Outputable
 import GHC.Utils.Panic
 import GHC.Data.List.SetOps (nubOrdBy)
@@ -221,120 +222,126 @@ nodeOnly = noAvailRegs { availVanillaRegs = [VanillaReg 1] }
 -- this set of registers is guaranteed to preserve the contents of all live
 -- registers. We only use this functionality in hand-written C-- code in the RTS.
 realArgRegsCover :: Platform -> [GlobalRegUse]
-realArgRegsCover platform
-    | passFloatArgsInXmm platform
-    = [ GlobalRegUse r (globalRegSpillType platform r) | r <- realVanillaRegs platform ]
-   ++ [ GlobalRegUse r (globalRegSpillType platform r) | r <- realLongRegs    platform ]
-   ++ [ GlobalRegUse r (globalRegSpillType platform r) | r <- realDoubleRegs  platform ]
-        -- The above seems wrong, as it means we only save the low 64 bits
-        -- of XMM/YMM/ZMM registers on X86_64, which is probably wrong.
-        --
-        -- Challenge: change the realDoubleRegs line to use ZmmReg instead,
-        -- and fix the resulting compiler errors.
-
-    | otherwise
-    = [ GlobalRegUse r (globalRegSpillType platform r)
-      | r <- realVanillaRegs platform ++ realFloatRegs platform ++ realDoubleRegs platform ++ realLongRegs platform
-      ] -- we don't save XMM registers if they are not used for parameter passing
-
-{-
-
-  Note [GHCi and native call registers]
-  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-  The GHCi bytecode interpreter does not have access to the STG registers
-  that the native calling convention uses for passing arguments. It uses
-  helper stack frames to move values between the stack and registers.
-
-  If only a single register needs to be moved, GHCi uses a specific stack
-  frame. For example stg_ctoi_R1p saves a heap pointer value from STG register
-  R1 and stg_ctoi_D1 saves a double precision floating point value from D1.
-  In the other direction, helpers stg_ret_p and stg_ret_d move a value from
-  the stack to the R1 and D1 registers, respectively.
-
-  When GHCi needs to move more than one register it cannot use a specific
-  helper frame. It would simply be impossible to create a helper for all
-  possible combinations of register values. Instead, there are generic helper
-  stack frames that use a call_info word that describes the active registers
-  and the number of stack words used by the arguments of a call.
-
-  These helper stack frames are currently:
-
-      - stg_ret_t:    return a tuple to the continuation at the top of
-                          the stack
-      - stg_ctoi_t:   convert a tuple return value to be used in
-                          bytecode
-      - stg_primcall: call a function
-
-
-  The call_info word contains a bitmap of the active registers
-  for the call and and a stack offset. The layout is as follows:
-
-      - bit 0-23:  Bitmap of active registers for the call, the
-                   order corresponds to the list returned by
-                   allArgRegsCover. For example if bit 0 (the least
-                   significant bit) is set, the first register in the
-                   allArgRegsCover list is active. Bit 1 for the
-                   second register in the list and so on.
-
-      - bit 24-31: Unsigned byte indicating the stack offset
-                   of the continuation in words. For tuple returns
-                   this is the number of words returned on the
-                   stack. For primcalls this field is unused, since
-                   we don't jump to a continuation.
-
-    The upper 32 bits on 64 bit platforms are currently unused.
-
-    If a register is smaller than a word on the stack (for example a
-    single precision float on a 64 bit system), then the stack slot
-    is padded to a whole word.
-
-    Example:
-
-        If a tuple is returned in three registers and an additional two
-        words on the stack, then three bits in the register bitmap
-        (bits 0-23) would be set. And bit 24-31 would be
-        00000010 (two in binary).
-
-        The values on the stack before a call to POP_ARG_REGS would
-        be as follows:
-
-            ...
-            continuation
-            stack_arg_1
-            stack_arg_2
-            register_arg_3
-            register_arg_2
-            register_arg_1 <- Sp
-
-        A call to POP_ARG_REGS(call_info) would move register_arg_1
-        to the register corresponding to the lowest set bit in the
-        call_info word. register_arg_2 would be moved to the register
-        corresponding to the second lowest set bit, and so on.
-
-        After POP_ARG_REGS(call_info), the stack pointer Sp points
-        to the topmost stack argument, so the stack looks as follows:
-
-            ...
-            continuation
-            stack_arg_1
-            stack_arg_2 <- Sp
-
-        At this point all the arguments are in place and we are ready
-        to jump to the continuation, the location (offset from Sp) of
-        which is found by inspecting the value of bits 24-31. In this
-        case the offset is two words.
-
-    On x86_64, the double precision (Dn) and single precision
-    floating (Fn) point registers overlap, e.g. D1 uses the same
-    physical register as F1. On this platform, the list returned
-    by allArgRegsCover contains only entries for the double
-    precision registers. If an argument is passed in register
-    Fn, the bit corresponding to Dn should be set.
-
-  Note: if anything changes in how registers for native calls overlap,
-           make sure to also update GHC.StgToByteCode.layoutNativeCall
- -}
+realArgRegsCover platform =
+  map ( \ r -> GlobalRegUse r ( globalRegSpillType platform r ) ) $
+    case registerArch ( platformArch platform ) of
+      Unified ->
+            realVanillaRegs platform
+         ++ realLongRegs    platform
+         ++ map XmmReg ( realXmmRegNos platform )
+         -- NB: this means that we will preserve 128 bit XMM registers.
+         --
+         -- However, we don't preserve YMM/ZMM, even though this constitutes a
+         -- serious correctness bug, because it presents many challenges, in
+         -- particular with the LLVM backend (which currently does not support
+         -- register aliasing): the LLVM code generator would have to insert
+         -- casts when it generates function prologues loading *_Arg parameters
+         -- into *_Var local variables for each live register.
+         --
+         -- See #25169. More background: #14251, #17920, comments on GitLab MR !3419.
+      Separate ->
+          -- Float and vector registers are separate: include both in the list.
+            realVanillaRegs platform
+         ++ realLongRegs    platform
+         ++ realDoubleRegs  platform
+         ++ map XmmReg ( realXmmRegNos platform )
+              -- See comment above.
+
+{- Note [GHCi and native call registers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The GHCi bytecode interpreter does not have access to the STG registers
+that the native calling convention uses for passing arguments. It uses
+helper stack frames to move values between the stack and registers.
+
+If only a single register needs to be moved, GHCi uses a specific stack
+frame. For example stg_ctoi_R1p saves a heap pointer value from STG register
+R1 and stg_ctoi_D1 saves a double precision floating point value from D1.
+In the other direction, helpers stg_ret_p and stg_ret_d move a value from
+the stack to the R1 and D1 registers, respectively.
+
+When GHCi needs to move more than one register it cannot use a specific
+helper frame. It would simply be impossible to create a helper for all
+possible combinations of register values. Instead, there are generic helper
+stack frames that use a call_info word that describes the active registers
+and the number of stack words used by the arguments of a call.
+
+These helper stack frames are currently:
+
+    - stg_ret_t:    return a tuple to the continuation at the top of
+                        the stack
+    - stg_ctoi_t:   convert a tuple return value to be used in
+                        bytecode
+    - stg_primcall: call a function
+
+
+The call_info word contains a bitmap of the active registers
+for the call and and a stack offset. The layout is as follows:
+
+  - bit 0-23:  Bitmap of active registers for the call, the
+               order corresponds to the list returned by
+               allArgRegsCover. For example if bit 0 (the least
+               significant bit) is set, the first register in the
+               allArgRegsCover list is active. Bit 1 for the
+               second register in the list and so on.
+
+  - bit 24-31: Unsigned byte indicating the stack offset
+               of the continuation in words. For tuple returns
+               this is the number of words returned on the
+               stack. For primcalls this field is unused, since
+               we don't jump to a continuation.
+
+The upper 32 bits on 64 bit platforms are currently unused.
+
+If a register is smaller than a word on the stack (for example a
+single precision float on a 64 bit system), then the stack slot
+is padded to a whole word.
+
+  Example:
+
+    If a tuple is returned in three registers and an additional two
+    words on the stack, then three bits in the register bitmap
+    (bits 0-23) would be set. And bit 24-31 would be
+    00000010 (two in binary).
+
+    The values on the stack before a call to POP_ARG_REGS would
+    be as follows:
+
+      ...
+      continuation
+      stack_arg_1
+      stack_arg_2
+      register_arg_3
+      register_arg_2
+      register_arg_1 <- Sp
+
+    A call to POP_ARG_REGS(call_info) would move register_arg_1
+    to the register corresponding to the lowest set bit in the
+    call_info word. register_arg_2 would be moved to the register
+    corresponding to the second lowest set bit, and so on.
+
+    After POP_ARG_REGS(call_info), the stack pointer Sp points
+    to the topmost stack argument, so the stack looks as follows:
+
+      ...
+      continuation
+      stack_arg_1
+      stack_arg_2 <- Sp
+
+    At this point all the arguments are in place and we are ready
+    to jump to the continuation, the location (offset from Sp) of
+    which is found by inspecting the value of bits 24-31. In this
+    case the offset is two words.
+
+On x86_64, the double precision (Dn) and single precision
+floating (Fn) point registers overlap, e.g. D1 uses the same
+physical register as F1. On this platform, the list returned
+by allArgRegsCover contains only entries for the double
+precision registers. If an argument is passed in register
+Fn, the bit corresponding to Dn should be set.
+
+Note: if anything changes in how registers for native calls overlap,
+         make sure to also update GHC.StgToByteCode.layoutNativeCall
+-}
 
 -- | Like 'realArgRegsCover' but always includes the node. This covers all real
 -- and virtual registers actually used for passing arguments.


=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -993,6 +993,9 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal (GlobalRegUse
       return $ Any II64 (\dst -> unitOL $
         LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
 
+getRegister' _ _ (CmmMachOp mop []) =
+  pprPanic "getRegister(x86): nullary MachOp" (text $ show mop)
+
 getRegister' platform is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
     sse4_1 <- sse4_1Enabled
     avx    <- avxEnabled
@@ -1967,6 +1970,9 @@ getRegister' platform _is32Bit (CmmMachOp mop [x, y, z]) = do -- ternary MachOps
             , text "width:" <+> ppr width
             , text "offset:" <+> pdoc platform offset ]
 
+getRegister' _ _ (CmmMachOp mop (_:_:_:_:_)) =
+  pprPanic "getRegister(x86): MachOp with >= 4 arguments" (text $ show mop)
+
 getRegister' platform is32Bit load@(CmmLoad mem ty _)
   | isVecType ty
   = do
@@ -2002,7 +2008,7 @@ getRegister' platform is32Bit load@(CmmLoad mem ty _)
     return (Any format code)
 
   | otherwise
-  = pprPanic "getRegister(x86)" (pdoc platform load)
+  = pprPanic "getRegister(x86) CmmLoad" (pdoc platform load)
   where
     format = cmmTypeFormat ty
     width = typeWidth ty
@@ -2090,8 +2096,8 @@ getRegister' platform _ (CmmLit lit) =
     cmmTy = cmmLitType platform lit
     fmt = cmmTypeFormat cmmTy
 
-getRegister' platform _ other
-  = pprPanic "getRegister(x86)" (pdoc platform other)
+getRegister' platform _ slot@(CmmStackSlot {}) =
+  pprPanic "getRegister(x86) CmmStackSlot" (pdoc platform slot)
 
 getFloatLitRegister :: CmmLit -> NatM Register
 getFloatLitRegister lit = do


=====================================
testsuite/tests/simd/should_run/all.T
=====================================
@@ -14,6 +14,10 @@ setTestOpts(
   , js_skip
 
   # Ensure we set the CPU features we have available.
+  #
+  # This is especially important with the LLVM backend, as LLVM can otherwise
+  # produce ABI-incompatible code, e.g. when compiling usage of YMM registers
+  # with or without -mavx2.
   , when(have_cpu_feature('sse4_1'), extra_hc_opts('-msse4'))
   , when(have_cpu_feature('sse4_2'), extra_hc_opts('-msse4.2'))
   , when(have_cpu_feature('avx'), extra_hc_opts('-mavx'))
@@ -45,11 +49,10 @@ test('simd013',
      , unless(arch('x86_64'), skip) # because the C file uses Intel intrinsics
      ],
      compile_and_run, ['simd013C.c'])
+test('simd014', [], compile_and_run, ['simd014Cmm.cmm'])
 
-
-
-test('T22187',[],compile,[''])
-test('T22187_run',[],compile_and_run,[''])
+test('T22187', [],compile,[''])
+test('T22187_run', [],compile_and_run,[''])
 
 test('T25062_V16', [], compile_and_run, [''])
 test('T25062_V32', [ unless(have_cpu_feature('avx2'), skip)


=====================================
testsuite/tests/simd/should_run/simd014.hs
=====================================
@@ -0,0 +1,35 @@
+{-# LANGUAGE GHCForeignImportPrim #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+module Main where
+
+-- base
+import GHC.Exts
+  ( Double(..), DoubleX2#
+  , packDoubleX2#, unpackDoubleX2#
+  )
+
+-- Test for handwritten Cmm code and realArgsRegCover (relates to #25169).
+
+--------------------------------------------------------------------------------
+
+data DoubleX2 = DX2# DoubleX2#
+
+instance Show DoubleX2 where
+  show ( DX2# d ) = case unpackDoubleX2# d of
+    (# a, b #) -> show ( D# a, D# b )
+
+foreign import prim "f1"
+  f1 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
+     -> (# DoubleX2#, DoubleX2#, DoubleX2#, DoubleX2# #)
+
+main :: IO ()
+main = do
+  let !x1 = packDoubleX2# (# 1.1##, 1.2## #)
+      !x2 = packDoubleX2# (# 2.1##, 2.2## #)
+      !x3 = packDoubleX2# (# 3.1##, 3.2## #)
+      !x4 = packDoubleX2# (# 4.1##, 4.2## #)
+      !(# y1, y2, y3, y4 #) = f1 x1 x2 x3 x4
+  print [ DX2# y1, DX2# y2, DX2# y3, DX2# y4 ]


=====================================
testsuite/tests/simd/should_run/simd014.stdout
=====================================
@@ -0,0 +1 @@
+[(3.1,3.2),(2.1,2.2),(1.1,1.2),(4.1,4.2)]


=====================================
testsuite/tests/simd/should_run/simd014Cmm.cmm
=====================================
@@ -0,0 +1,28 @@
+#include "Cmm.h"
+
+f1
+{
+  // Switch XMM1 and XMM2
+  XMM6 = XMM1 ;
+  XMM1 = XMM2 ;
+  XMM2 = XMM6 ;
+  jump f2 [ XMM1, XMM2, XMM3, XMM4 ];
+}
+
+f2
+{
+  // Switch XMM2 and XMM3
+  XMM6 = XMM2 ;
+  XMM2 = XMM3 ;
+  XMM3 = XMM6 ;
+  jump f3 [ XMM1, XMM2, XMM3, XMM4 ];
+}
+
+f3
+{
+  // Switch XMM1 and XMM2
+  XMM6 = XMM1 ;
+  XMM1 = XMM2 ;
+  XMM2 = XMM6 ;
+  jump %ENTRY_CODE(Sp(0)) [ XMM1, XMM2, XMM3, XMM4 ];
+}



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/d665d07837d9f1b1335384c4d84a6a07bc9eb5f2

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/d665d07837d9f1b1335384c4d84a6a07bc9eb5f2
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240823/ed9214d4/attachment-0001.html>