[Git][ghc/ghc][wip/T23576] Add NCG support for common 64bit operations to the x86 backend.

Thu Nov 2 18:47:50 UTC 2023

Andreas Klebinger pushed to branch wip/T23576 at Glasgow Haskell Compiler / GHC


Commits:
1b0dfcbe by Jaro Reinders at 2023-11-02T19:36:25+01:00
Add NCG support for common 64bit operations to the x86 backend.

These used to be implemented via C calls which was obviously quite bad
for performance for operations like simple addition.

Co-authored-by: Andreas Klebinger

- - - - -


9 changed files:

- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs
- compiler/GHC/CmmToAsm/X86/Ppr.hs
- compiler/GHC/Driver/Config/StgToCmm.hs
- compiler/GHC/StgToCmm/Config.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/Utils/Unique.hs
- docs/users_guide/9.6.1-notes.rst
- testsuite/tests/cmm/should_run/all.T


Changes:

=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -523,7 +523,6 @@ assignReg_I64Code (CmmLocal dst) valueTree = do
 assignReg_I64Code _ _
    = panic "assignReg_I64Code(i386): invalid lvalue"
 
-
 iselExpr64 :: HasDebugCallStack => CmmExpr -> NatM (RegCode64 InstrBlock)
 iselExpr64 (CmmLit (CmmInt i _)) = do
   Reg64 rhi rlo <- getNewReg64
@@ -550,7 +549,6 @@ iselExpr64 (CmmReg (CmmLocal local_reg)) = do
   let Reg64 hi lo = localReg64 local_reg
   return (RegCode64 nilOL hi lo)

--- we handle addition, but rather badly
 iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
    RegCode64 code1 r1hi r1lo <- iselExpr64 e1
    Reg64 rhi rlo <- getNewReg64
@@ -590,11 +588,31 @@ iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
                        SBB II32 (OpReg r2hi) (OpReg rhi) ]
    return (RegCode64 code rhi rlo)
 
-iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
+iselExpr64 (CmmMachOp (MO_UU_Conv W32 W64) [expr]) = do
      code <- getAnyReg expr
      Reg64 r_dst_hi r_dst_lo <- getNewReg64
      return $ RegCode64 (code r_dst_lo `snocOL`
-                          MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
+                          XOR II32 (OpReg r_dst_hi) (OpReg r_dst_hi))
+                          r_dst_hi
+                          r_dst_lo
+
+iselExpr64 (CmmMachOp (MO_UU_Conv W16 W64) [expr]) = do
+     (rsrc, code) <- getByteReg expr
+     Reg64 r_dst_hi r_dst_lo <- getNewReg64
+     return $ RegCode64 (code `appOL` toOL [
+                          MOVZxL II16 (OpReg rsrc) (OpReg r_dst_lo),
+                          XOR    II32 (OpReg r_dst_hi) (OpReg r_dst_hi)
+                          ])
+                          r_dst_hi
+                          r_dst_lo
+
+iselExpr64 (CmmMachOp (MO_UU_Conv W8 W64) [expr]) = do
+     (rsrc, code) <- getByteReg expr
+     Reg64 r_dst_hi r_dst_lo <- getNewReg64
+     return $ RegCode64 (code `appOL` toOL [
+                          MOVZxL II8 (OpReg rsrc) (OpReg r_dst_lo),
+                          XOR    II32 (OpReg r_dst_hi) (OpReg r_dst_hi)
+                          ])
                           r_dst_hi
                           r_dst_lo
 
@@ -609,11 +627,208 @@ iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
                           r_dst_hi
                           r_dst_lo
 
+iselExpr64 (CmmMachOp (MO_SS_Conv W16 W64) [expr]) = do
+     (r, code) <- getByteReg expr
+     Reg64 r_dst_hi r_dst_lo <- getNewReg64
+     return $ RegCode64 (code `appOL` toOL [
+                          MOVSxL II16 (OpReg r) (OpReg eax),
+                          CLTD II32,
+                          MOV II32 (OpReg eax) (OpReg r_dst_lo),
+                          MOV II32 (OpReg edx) (OpReg r_dst_hi)])
+                          r_dst_hi
+                          r_dst_lo
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W8 W64) [expr]) = do
+     (r, code) <- getByteReg expr
+     Reg64 r_dst_hi r_dst_lo <- getNewReg64
+     return $ RegCode64 (code `appOL` toOL [
+                          MOVSxL II8 (OpReg r) (OpReg eax),
+                          CLTD II32,
+                          MOV II32 (OpReg eax) (OpReg r_dst_lo),
+                          MOV II32 (OpReg edx) (OpReg r_dst_hi)])
+                          r_dst_hi
+                          r_dst_lo
+
+iselExpr64 (CmmMachOp (MO_S_Neg _) [expr]) = do
+   RegCode64 code rhi rlo <- iselExpr64 expr
+   Reg64 rohi rolo <- getNewReg64
+   let
+        ocode = code `appOL`
+                toOL [ MOV II32 (OpReg rlo) (OpReg rolo),
+                       XOR II32 (OpReg rohi) (OpReg rohi),
+                       NEGI II32 (OpReg rolo),
+                       SBB II32 (OpReg rhi) (OpReg rohi) ]
+   return (RegCode64 ocode rohi rolo)
+
+-- To multiply two 64-bit numbers we use the following decomposition (in C notation):
+--
+--     ((r1hi << 32) + r1lo) * ((r2hi << 32) + r2lo)
+--      = ((r2lo * r1hi) << 32)
+--      + ((r1lo * r2hi) << 32)
+--      + r1lo * r2lo
+--
+-- Note that @(r1hi * r2hi) << 64@ can be dropped because it overflows completely.
+
+iselExpr64 (CmmMachOp (MO_Mul _) [e1,e2]) = do
+   RegCode64 code1 r1hi r1lo <- iselExpr64 e1
+   RegCode64 code2 r2hi r2lo <- iselExpr64 e2
+   Reg64 rhi rlo <- getNewReg64
+   tmp <- getNewRegNat II32
+   let
+        code =  code1 `appOL`
+                code2 `appOL`
+                toOL [ MOV  II32 (OpReg r1lo) (OpReg eax),
+                       MOV  II32 (OpReg r2lo) (OpReg tmp),
+                       MOV  II32 (OpReg r1hi) (OpReg rhi),
+                       IMUL II32 (OpReg tmp) (OpReg rhi),
+                       MOV  II32 (OpReg r2hi) (OpReg rlo),
+                       IMUL II32 (OpReg eax) (OpReg rlo),
+                       ADD  II32 (OpReg rlo) (OpReg rhi),
+                       MUL2 II32 (OpReg tmp),
+                       ADD  II32 (OpReg edx) (OpReg rhi),
+                       MOV  II32 (OpReg eax) (OpReg rlo)
+                     ]
+   return (RegCode64 code rhi rlo)
+
+iselExpr64 (CmmMachOp (MO_S_MulMayOflo W64) [e1,e2]) = do
+   -- Performance sensitive users won't use 32 bit so let's keep it simple:
+   -- We always return a (usually false) positive.
+   Reg64 rhi rlo <- getNewReg64
+   let code = toOL   [
+                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rhi),
+                       MOV  II32 (OpImm (ImmInt 1)) (OpReg rlo)
+                     ]
+   return (RegCode64 code rhi rlo)
+
+
+-- To shift a 64-bit number to the left we use the SHLD and SHL instructions.
+-- We use SHLD to shift the bits in @rhi@ to the left while copying
+-- high bits from @rlo@ to fill the new space in the low bits of @rhi at .
+-- That leaves @rlo@ unchanged, so we use SHL to shift the bits of @rlo@ left.
+-- However, both these instructions only use the lowest 5 bits from %cl to do
+-- their shifting. So if the sixth bit (0x32) is set then we additionally move
+-- the contents of @rlo@ to @rhi@ and clear @rlo at .
+
+iselExpr64 (CmmMachOp (MO_Shl _) [e1,e2]) = do
+   RegCode64 code1 r1hi r1lo <- iselExpr64 e1
+   code2 <- getAnyReg e2
+   Reg64 rhi rlo <- getNewReg64
+   lbl1 <- newBlockId
+   lbl2 <- newBlockId
+   let
+        code =  code1 `appOL`
+                code2 ecx `appOL`
+                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       SHLD II32 (OpReg ecx) (OpReg rlo) (OpReg rhi),
+                       SHL II32 (OpReg ecx) (OpReg rlo),
+                       TEST II32 (OpImm (ImmInt 32)) (OpReg ecx),
+                       JXX EQQ lbl2,
+                       JXX ALWAYS lbl1,
+                       NEWBLOCK lbl1,
+                       MOV II32 (OpReg rlo) (OpReg rhi),
+                       XOR II32 (OpReg rlo) (OpReg rlo),
+                       JXX ALWAYS lbl2,
+                       NEWBLOCK lbl2
+                     ]
+   return (RegCode64 code rhi rlo)
+
+-- Similar to above, however now we're shifting to the right
+-- and we're doing a signed shift which means that @rhi@ needs
+-- to be set to either 0 if @rhi@ is positive or 0xffffffff otherwise,
+-- and if the sixth bit of %cl is set (so the shift amount is more than 32).
+-- To accomplish that we shift @rhi@ by 31.
+
+iselExpr64 (CmmMachOp (MO_S_Shr _) [e1,e2]) = do
+   RegCode64 code1 r1hi r1lo <- iselExpr64 e1
+   (r2, code2) <- getSomeReg e2
+   Reg64 rhi rlo <- getNewReg64
+   lbl1 <- newBlockId
+   lbl2 <- newBlockId
+   let
+        code =  code1 `appOL`
+                code2 `appOL`
+                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 (OpReg r2) (OpReg ecx),
+                       SHRD II32 (OpReg ecx) (OpReg rhi) (OpReg rlo),
+                       SAR II32 (OpReg ecx) (OpReg rhi),
+                       TEST II32 (OpImm (ImmInt 32)) (OpReg ecx),
+                       JXX EQQ lbl2,
+                       JXX ALWAYS lbl1,
+                       NEWBLOCK lbl1,
+                       MOV II32 (OpReg rhi) (OpReg rlo),
+                       SAR II32 (OpImm (ImmInt 31)) (OpReg rhi),
+                       JXX ALWAYS lbl2,
+                       NEWBLOCK lbl2
+                     ]
+   return (RegCode64 code rhi rlo)
+
+-- Similar to the above.
+
+iselExpr64 (CmmMachOp (MO_U_Shr _) [e1,e2]) = do
+   RegCode64 code1 r1hi r1lo <- iselExpr64 e1
+   (r2, code2) <- getSomeReg e2
+   Reg64 rhi rlo <- getNewReg64
+   lbl1 <- newBlockId
+   lbl2 <- newBlockId
+   let
+        code =  code1 `appOL`
+                code2 `appOL`
+                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       MOV II32 (OpReg r2) (OpReg ecx),
+                       SHRD II32 (OpReg ecx) (OpReg rhi) (OpReg rlo),
+                       SHR II32 (OpReg ecx) (OpReg rhi),
+                       TEST II32 (OpImm (ImmInt 32)) (OpReg ecx),
+                       JXX EQQ lbl2,
+                       JXX ALWAYS lbl1,
+                       NEWBLOCK lbl1,
+                       MOV II32 (OpReg rhi) (OpReg rlo),
+                       XOR II32 (OpReg rhi) (OpReg rhi),
+                       JXX ALWAYS lbl2,
+                       NEWBLOCK lbl2
+                     ]
+   return (RegCode64 code rhi rlo)
+
+iselExpr64 (CmmMachOp (MO_And _) [e1,e2]) = iselExpr64ParallelBin AND e1 e2
+iselExpr64 (CmmMachOp (MO_Or  _) [e1,e2]) = iselExpr64ParallelBin OR  e1 e2
+iselExpr64 (CmmMachOp (MO_Xor _) [e1,e2]) = iselExpr64ParallelBin XOR e1 e2
+
+iselExpr64 (CmmMachOp (MO_Not _) [e1]) = do
+   RegCode64 code1 r1hi r1lo <- iselExpr64 e1
+   Reg64 rhi rlo <- getNewReg64
+   let
+        code =  code1 `appOL`
+                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       NOT II32 (OpReg rlo),
+                       NOT II32 (OpReg rhi)
+                     ]
+   return (RegCode64 code rhi rlo)
+
+iselExpr64 (CmmRegOff r i) = iselExpr64 (mangleIndexTree r i)
+
 iselExpr64 expr
    = do
       platform <- getPlatform
-      pprPanic "iselExpr64(i386)" (pdoc platform expr)
+      pprPanic "iselExpr64(i386)" (pdoc platform expr $+$ text (show expr))
 
+iselExpr64ParallelBin :: (Format -> Operand -> Operand -> Instr)
+                      -> CmmExpr -> CmmExpr -> NatM (RegCode64 (OrdList Instr))
+iselExpr64ParallelBin op e1 e2 = do
+   RegCode64 code1 r1hi r1lo <- iselExpr64 e1
+   RegCode64 code2 r2hi r2lo <- iselExpr64 e2
+   Reg64 rhi rlo <- getNewReg64
+   let
+        code =  code1 `appOL`
+                code2 `appOL`
+                toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+                       MOV II32 (OpReg r1hi) (OpReg rhi),
+                       op  II32 (OpReg r2lo) (OpReg rlo),
+                       op  II32 (OpReg r2hi) (OpReg rhi)
+                     ]
+   return (RegCode64 code rhi rlo)
 
 --------------------------------------------------------------------------------
 getRegister :: CmmExpr -> NatM Register
@@ -675,6 +890,18 @@ getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
   RegCode64 code _rhi rlo <- iselExpr64 x
   return $ Fixed II32 rlo code
 
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W8) [x])
+ | is32Bit = do
+  RegCode64 code _rhi rlo <- iselExpr64 x
+  ro <- getNewRegNat II8
+  return $ Fixed II8 ro (code `appOL` toOL [ MOVZxL II8 (OpReg rlo) (OpReg ro) ])
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W16) [x])
+ | is32Bit = do
+  RegCode64 code _rhi rlo <- iselExpr64 x
+  ro <- getNewRegNat II16
+  return $ Fixed II16 ro (code `appOL` toOL [ MOVZxL II16 (OpReg rlo) (OpReg ro) ])
+
 getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
   float_const_sse2  where
   float_const_sse2
@@ -1657,6 +1884,27 @@ machOpToCond mo = case mo of
   MO_U_Le _ -> LEU
   _other -> pprPanic "machOpToCond" (pprMachOp mo)
 
+{-  Note [64-bit integer comparisons on 32-bit]
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    When doing these comparisons there are 2 kinds of
+    comparisons.
+
+    * Comparison for equality (or lack thereof)
+
+    We use xor to check if high/low bits are
+    equal. Then combine the results using or.
+
+    * Other comparisons:
+
+    We first compare the low registers
+    and use a subtraction with borrow to compare the high registers.
+
+    For signed numbers the condition is determined by
+    the sign and overflow flags agreeing or not
+    and for unsigned numbers the condition is the carry flag.
+
+-}
 
 -- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
 -- passed back up the tree.
@@ -1667,6 +1915,68 @@ condIntCode cond x y = do platform <- getPlatform
 
 condIntCode' :: Platform -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
 
+-- 64-bit integer comparisons on 32-bit
+-- See Note [64-bit integer comparisons on 32-bit]
+condIntCode' platform cond x y
+  | target32Bit platform && isWord64 (cmmExprType platform x) = do
+
+  RegCode64 code1 r1hi r1lo <- iselExpr64 x
+  RegCode64 code2 r2hi r2lo <- iselExpr64 y
+
+  -- we mustn't clobber r1/r2 so we use temporaries
+  tmp1 <- getNewRegNat II32
+  tmp2 <- getNewRegNat II32
+
+  let (cond', cmpCode) = intComparison cond r1hi r1lo r2hi r2lo tmp1 tmp2
+  return $ CondCode False cond' (code1 `appOL` code2 `appOL` cmpCode)
+
+  where
+    intComparison cond r1_hi r1_lo r2_hi r2_lo tmp1 tmp2 =
+      case cond of
+        -- These don't occur as argument of condIntCode'
+        ALWAYS  -> panic "impossible"
+        NEG     -> panic "impossible"
+        POS     -> panic "impossible"
+        CARRY   -> panic "impossible"
+        OFLO    -> panic "impossible"
+        PARITY  -> panic "impossible"
+        NOTPARITY -> panic "impossible"
+        -- Special case #1 x == y and x != y
+        EQQ -> (EQQ, cmpExact)
+        NE  -> (NE, cmpExact)
+        -- [x >= y]
+        GE  -> (GE, cmpGE)
+        GEU -> (GEU, cmpGE)
+        -- [x >  y]
+        GTT -> (LTT, cmpLE)
+        GU  -> (LU, cmpLE)
+        -- [x <= y]
+        LE  -> (GE, cmpLE)
+        LEU -> (GEU, cmpLE)
+        -- [x <  y]
+        LTT -> (LTT, cmpGE)
+        LU  -> (LU, cmpGE)
+      where
+        cmpExact :: OrdList Instr
+        cmpExact =
+          toOL
+            [ MOV II32 (OpReg r1_hi) (OpReg tmp1)
+            , MOV II32 (OpReg r1_lo) (OpReg tmp2)
+            , XOR II32 (OpReg r2_hi) (OpReg tmp1)
+            , XOR II32 (OpReg r2_lo) (OpReg tmp2)
+            , OR  II32 (OpReg tmp1)  (OpReg tmp2)
+            ]
+        cmpGE = toOL
+            [ MOV II32 (OpReg r1_hi) (OpReg tmp1)
+            , CMP II32 (OpReg r2_lo) (OpReg r1_lo)
+            , SBB II32 (OpReg r2_hi) (OpReg tmp1)
+            ]
+        cmpLE = toOL
+            [ MOV II32 (OpReg r2_hi) (OpReg tmp1)
+            , CMP II32 (OpReg r1_lo) (OpReg r2_lo)
+            , SBB II32 (OpReg r1_hi) (OpReg tmp1)
+            ]
+
 -- memory vs immediate
 condIntCode' platform cond (CmmLoad x pk _) (CmmLit lit)
  | is32BitLit platform lit = do
@@ -1881,36 +2191,6 @@ I386: First, we have to ensure that the condition
 codes are set according to the supplied comparison operation.
 -}
 
-{-  Note [64-bit integer comparisons on 32-bit]
-    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    When doing these comparisons there are 2 kinds of
-    comparisons.
-
-    * Comparison for equality (or lack thereof)
-
-    We use xor to check if high/low bits are
-    equal. Then combine the results using or and
-    perform a single conditional jump based on the
-    result.
-
-    * Other comparisons:
-
-    We map all other comparisons to the >= operation.
-    Why? Because it's easy to encode it with a single
-    conditional jump.
-
-    We do this by first computing [r1_lo - r2_lo]
-    and use the carry flag to compute
-    [r1_high - r2_high - CF].
-
-    At which point if r1 >= r2 then the result will be
-    positive. Otherwise negative so we can branch on this
-    condition.
-
--}
-
-
 genCondBranch
     :: BlockId      -- the source of the jump
     -> BlockId      -- the true branch target
@@ -1926,67 +2206,6 @@ genCondBranch bid id false expr = do
 genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
                -> NatM InstrBlock
 
--- 64-bit integer comparisons on 32-bit
--- See Note [64-bit integer comparisons on 32-bit]
-genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
-  | is32Bit, Just W64 <- maybeIntComparison mop = do
-
-  RegCode64 code1 r1hi r1lo <- iselExpr64 e1
-  RegCode64 code2 r2hi r2lo <- iselExpr64 e2
-  let cond = machOpToCond mop :: Cond
-
-  -- we mustn't clobber r1/r2 so we use temporaries
-  tmp1 <- getNewRegNat II32
-  tmp2 <- getNewRegNat II32
-
-  let cmpCode = intComparison cond true false r1hi r1lo r2hi r2lo tmp1 tmp2
-  return $ code1 `appOL` code2 `appOL` cmpCode
-
-  where
-    intComparison cond true false r1_hi r1_lo r2_hi r2_lo tmp1 tmp2 =
-      case cond of
-        -- Impossible results of machOpToCond
-        ALWAYS  -> panic "impossible"
-        NEG     -> panic "impossible"
-        POS     -> panic "impossible"
-        CARRY   -> panic "impossible"
-        OFLO    -> panic "impossible"
-        PARITY  -> panic "impossible"
-        NOTPARITY -> panic "impossible"
-        -- Special case #1 x == y and x != y
-        EQQ -> cmpExact
-        NE  -> cmpExact
-        -- [x >= y]
-        GE  -> cmpGE
-        GEU -> cmpGE
-        -- [x >  y] <==> ![y >= x]
-        GTT -> intComparison GE  false true r2_hi r2_lo r1_hi r1_lo tmp1 tmp2
-        GU  -> intComparison GEU false true r2_hi r2_lo r1_hi r1_lo tmp1 tmp2
-        -- [x <= y] <==> [y >= x]
-        LE  -> intComparison GE  true false r2_hi r2_lo r1_hi r1_lo tmp1 tmp2
-        LEU -> intComparison GEU true false r2_hi r2_lo r1_hi r1_lo tmp1 tmp2
-        -- [x <  y] <==> ![x >= x]
-        LTT -> intComparison GE  false true r1_hi r1_lo r2_hi r2_lo tmp1 tmp2
-        LU  -> intComparison GEU false true r1_hi r1_lo r2_hi r2_lo tmp1 tmp2
-      where
-        cmpExact :: OrdList Instr
-        cmpExact =
-          toOL
-            [ MOV II32 (OpReg r1_hi) (OpReg tmp1)
-            , MOV II32 (OpReg r1_lo) (OpReg tmp2)
-            , XOR II32 (OpReg r2_hi) (OpReg tmp1)
-            , XOR II32 (OpReg r2_lo) (OpReg tmp2)
-            , OR  II32 (OpReg tmp1)  (OpReg tmp2)
-            , JXX cond true
-            , JXX ALWAYS false
-            ]
-        cmpGE = toOL
-            [ MOV II32 (OpReg r1_hi) (OpReg tmp1)
-            , CMP II32 (OpReg r2_lo) (OpReg r1_lo)
-            , SBB II32 (OpReg r2_hi) (OpReg tmp1)
-            , JXX cond true
-            , JXX ALWAYS false ]
-
 genCondBranch' _ bid id false bool = do
   CondCode is_float cond cond_code <- getCondCode bool
   use_sse2 <- sse2Enabled


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -249,6 +249,8 @@ data Instr
         | SHL         Format Operand{-amount-} Operand
         | SAR         Format Operand{-amount-} Operand
         | SHR         Format Operand{-amount-} Operand
+        | SHRD        Format Operand{-amount-} Operand Operand
+        | SHLD        Format Operand{-amount-} Operand Operand
 
         | BT          Format Imm Operand
         | NOP
@@ -399,6 +401,8 @@ regUsageOfInstr platform instr
     SHL    _ imm dst    -> usageRM imm dst
     SAR    _ imm dst    -> usageRM imm dst
     SHR    _ imm dst    -> usageRM imm dst
+    SHLD   _ imm dst1 dst2 -> usageRMM imm dst1 dst2
+    SHRD   _ imm dst1 dst2 -> usageRMM imm dst1 dst2
     BT     _ _   src    -> mkRUR (use_R src [])
 
     PUSH   _ op         -> mkRUR (use_R op [])
@@ -568,6 +572,8 @@ patchRegsOfInstr instr env
     SHL  fmt imm dst     -> patch1 (SHL fmt imm) dst
     SAR  fmt imm dst     -> patch1 (SAR fmt imm) dst
     SHR  fmt imm dst     -> patch1 (SHR fmt imm) dst
+    SHLD fmt imm dst1 dst2 -> patch2 (SHLD fmt imm) dst1 dst2
+    SHRD fmt imm dst1 dst2 -> patch2 (SHRD fmt imm) dst1 dst2
     BT   fmt imm src     -> patch1 (BT  fmt imm) src
     TEST fmt src dst     -> patch2 (TEST fmt) src dst
     CMP  fmt src dst     -> patch2 (CMP  fmt) src dst


=====================================
compiler/GHC/CmmToAsm/X86/Ppr.hs
=====================================
@@ -732,6 +732,12 @@ pprInstr platform i = case i of
    SHR format src dst
       -> pprShift (text "shr") format src dst
 
+   SHLD format src dst1 dst2
+      -> pprShift2 (text "shld") format src dst1 dst2
+
+   SHRD format src dst1 dst2
+      -> pprShift2 (text "shrd") format src dst1 dst2
+
    BT format imm src
       -> pprFormatImmOp (text "bt") format imm src
 
@@ -1070,6 +1076,17 @@ pprInstr platform i = case i of
            pprOperand platform format dest
        ]
 
+   pprShift2 :: Line doc -> Format -> Operand -> Operand -> Operand -> doc
+   pprShift2 name format src dest1 dest2
+     = line $ hcat [
+           pprMnemonic name format,
+           pprOperand platform II8 src,  -- src is 8-bit sized
+           comma,
+           pprOperand platform format dest1,
+           comma,
+           pprOperand platform format dest2
+       ]
+
 
    pprFormatOpOpCoerce :: Line doc -> Format -> Format -> Operand -> Operand -> doc
    pprFormatOpOpCoerce name format1 format2 op1 op2


=====================================
compiler/GHC/Driver/Config/StgToCmm.hs
=====================================
@@ -53,7 +53,8 @@ initStgToCmmConfig dflags mod = StgToCmmConfig
   , stgToCmmDoBoundsCheck = gopt Opt_DoBoundsChecking      dflags
   , stgToCmmDoTagCheck    = gopt Opt_DoTagInferenceChecks  dflags
   -- backend flags
-  , stgToCmmAllowBigArith             = not ncg || platformArch platform == ArchWasm32
+  , stgToCmmAllowBigArith             = not ncg || platformArch platform == ArchWasm32 || platformArch platform == ArchX86
+  , stgToCmmAllowBigQuot              = not ncg || platformArch platform == ArchWasm32
   , stgToCmmAllowQuotRemInstr         = ncg  && (x86ish || ppc)
   , stgToCmmAllowQuotRem2             = (ncg && (x86ish || ppc)) || llvm
   , stgToCmmAllowExtendedAddSubInstrs = (ncg && (x86ish || ppc)) || llvm


=====================================
compiler/GHC/StgToCmm/Config.hs
=====================================
@@ -65,6 +65,7 @@ data StgToCmmConfig = StgToCmmConfig
   , stgToCmmDoTagCheck     :: !Bool              -- ^ Verify tag inference predictions.
   ------------------------------ Backend Flags ----------------------------------
   , stgToCmmAllowBigArith             :: !Bool   -- ^ Allowed to emit larger than native size arithmetic (only LLVM and C backends)
+  , stgToCmmAllowBigQuot              :: !Bool   -- ^ Allowed to emit larger than native size division operations
   , stgToCmmAllowQuotRemInstr         :: !Bool   -- ^ Allowed to generate QuotRem instructions
   , stgToCmmAllowQuotRem2             :: !Bool   -- ^ Allowed to generate QuotRem
   , stgToCmmAllowExtendedAddSubInstrs :: !Bool   -- ^ Allowed to generate AddWordC, SubWordC, Add2, etc.


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1682,6 +1682,13 @@ emitPrimOp cfg primop =
     let stmt = mkAssign (CmmLocal res) (CmmMachOp mop args)
     emit stmt
 
+  isQuottishOp :: CallishMachOp -> Bool
+  isQuottishOp MO_I64_Quot = True
+  isQuottishOp MO_I64_Rem = True
+  isQuottishOp MO_W64_Quot = True
+  isQuottishOp MO_W64_Rem = True
+  isQuottishOp _ = False
+
   opTranslate64
     :: [CmmExpr]
     -> (Width -> MachOp)
@@ -1690,7 +1697,8 @@ emitPrimOp cfg primop =
   opTranslate64 args mkMop callish =
     case platformWordSize platform of
       -- LLVM and C `can handle larger than native size arithmetic natively.
-      _ | stgToCmmAllowBigArith cfg -> opTranslate args $ mkMop W64
+      _ | not (isQuottishOp callish), stgToCmmAllowBigArith cfg -> opTranslate args $ mkMop W64
+        | isQuottishOp callish, stgToCmmAllowBigQuot cfg -> opTranslate args $ mkMop W64
       PW4 -> opCallish args callish
       PW8 -> opTranslate args $ mkMop W64
 


=====================================
compiler/GHC/Utils/Unique.hs
=====================================
@@ -2,12 +2,12 @@
 
 {- Work around #23537
 
-On 32 bit systems, GHC's codegen around 64 bit numbers is not quite
-complete. This led to panics mentioning missing cases in iselExpr64.
-Now that GHC uses Word64 for its uniques, these panics have started
-popping up whenever a unique is compared to many other uniques in one
-function. As a workaround we use these two functions which are not
-inlined on 32 bit systems, thus preventing the panics.
+On 32 bit systems, GHC's codegen around 64 bit numbers used to be incomplete
+before GHC 9.10. This led to panics mentioning missing cases in iselExpr64.
+Now that GHC uses Word64 for its uniques, these panics have started popping up
+whenever a unique is compared to many other uniques in one function. As a
+workaround we use these two functions which are not inlined, on 32 bit systems
+and if compiled with versions before GHC 9.9, thus preventing the panics.
 -}
 
 module GHC.Utils.Unique (sameUnique, anyOfUnique) where
@@ -18,7 +18,7 @@ import GHC.Prelude.Basic (Bool, Eq((==)), Foldable(elem))
 import GHC.Types.Unique (Unique, Uniquable (getUnique))
 
 
-#if WORD_SIZE_IN_BITS == 32
+#if WORD_SIZE_IN_BITS == 32 && !MIN_VERSION_GLASGOW_HASKELL(9,9,0,0)
 {-# NOINLINE sameUnique #-}
 #else
 {-# INLINE sameUnique #-}
@@ -26,7 +26,7 @@ import GHC.Types.Unique (Unique, Uniquable (getUnique))
 sameUnique :: Uniquable a => a -> a -> Bool
 sameUnique x y = getUnique x == getUnique y
 
-#if WORD_SIZE_IN_BITS == 32
+#if WORD_SIZE_IN_BITS == 32 && !MIN_VERSION_GLASGOW_HASKELL(9,9,0,0)
 {-# NOINLINE anyOfUnique #-}
 #else
 {-# INLINE anyOfUnique #-}


=====================================
docs/users_guide/9.6.1-notes.rst
=====================================
@@ -122,7 +122,7 @@ Compiler
   that targets ``wasm32-wasi`` and compiles Haskell code to
   self-contained WebAssembly modules that can be executed on a variety
   of different runtimes. There are a few caveats to be aware of:
-  
+
   - To use the WebAssembly backend, one would need to follow the
     instructions on `ghc-wasm-meta
     <https://gitlab.haskell.org/ghc/ghc-wasm-meta>`_. The WebAssembly
@@ -149,6 +149,9 @@ Compiler
 
 - The :ghc-flag:`-Wstar-is-type` warning is now enabled by default.
 
+- The 32bit x86 NCG backend will now generate inline assembly for most common 64bit
+  operations. This improves Int64/Word64 performance substantially on this platform.
+
 GHCi
 ~~~~
 


=====================================
testsuite/tests/cmm/should_run/all.T
=====================================
@@ -27,7 +27,6 @@ test('ByteSwitch',
 test('T22871',
      [    extra_run_opts('"' + config.libdir + '"')
      ,    req_cmm
-     ,    when(arch('i386'), skip) # x86 NCG panics with "iselExpr64(i386)"
      ],
      multi_compile_and_run,
      ['T22871', [('T22871_cmm.cmm', '')], ''])



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1b0dfcbe14c5ec52d1eb71cbe43b21b52b4187e4

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1b0dfcbe14c5ec52d1eb71cbe43b21b52b4187e4
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20231102/8167b84c/attachment-0001.html>