[Git][ghc/ghc][master] 4 commits: Revert "Division by constants optimization"

Marge Bot (@marge-bot) gitlab at gitlab.haskell.org
Sat Jan 18 17:39:21 UTC 2025



Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
b6f54188 by Ben Gamari at 2025-01-18T12:38:46-05:00
Revert "Division by constants optimization"

This appears to be responsible for the regression described in #25653.

This reverts commit daff1e30219d136977c71f42e82ccc58c9013cfb.

- - - - -
0fd90de8 by Ben Gamari at 2025-01-18T12:38:46-05:00
testsuite: Introduce div2 test

This is a useful test from !8392 which is worth keeping around.

- - - - -
32680979 by Ben Gamari at 2025-01-18T12:38:46-05:00
testsuite: Test shift correctness in mul2 test

- - - - -
163aa50a by Ben Gamari at 2025-01-18T12:38:46-05:00
testsuite: Add regression test for #25653

- - - - -


10 changed files:

- compiler/GHC/Cmm/Config.hs
- compiler/GHC/Cmm/MachOp.hs
- compiler/GHC/Cmm/Opt.hs
- compiler/GHC/Cmm/Pipeline.hs
- compiler/GHC/Cmm/Sink.hs
- compiler/GHC/Driver/Config/Cmm.hs
- compiler/GHC/StgToCmm/Prim.hs
- + testsuite/tests/numeric/should_run/T25653.hs
- + testsuite/tests/numeric/should_run/T25653.stdout
- testsuite/tests/numeric/should_run/all.T


Changes:

=====================================
compiler/GHC/Cmm/Config.hs
=====================================
@@ -24,8 +24,6 @@ data CmmConfig = CmmConfig
   , cmmExternalDynamicRefs :: !Bool    -- ^ Generate code to link against dynamic libraries
   , cmmDoCmmSwitchPlans    :: !Bool    -- ^ Should the Cmm pass replace Stg switch statements
   , cmmSplitProcPoints     :: !Bool    -- ^ Should Cmm split proc points or not
-  , cmmAllowMul2           :: !Bool    -- ^ Does this platform support mul2
-  , cmmOptConstDivision    :: !Bool    -- ^ Should we optimize constant divisors
   }
 
 -- | retrieve the target Cmm platform


=====================================
compiler/GHC/Cmm/MachOp.hs
=====================================
@@ -7,7 +7,6 @@ module GHC.Cmm.MachOp
     , pprMachOp, isCommutableMachOp, isAssociativeMachOp
     , isComparisonMachOp, maybeIntComparison, machOpResultType
     , machOpArgReps, maybeInvertComparison, isFloatComparison
-    , isCommutableCallishMachOp
 
     -- MachOp builders
     , mo_wordAdd, mo_wordSub, mo_wordEq, mo_wordNe,mo_wordMul, mo_wordSQuot
@@ -846,17 +845,3 @@ machOpMemcpyishAlign op = case op of
   MO_Memmove align -> Just align
   MO_Memcmp  align -> Just align
   _                -> Nothing
-
-isCommutableCallishMachOp :: CallishMachOp -> Bool
-isCommutableCallishMachOp op =
-  case op of
-    MO_x64_Add  -> True
-    MO_x64_Mul  -> True
-    MO_x64_Eq   -> True
-    MO_x64_Ne   -> True
-    MO_x64_And  -> True
-    MO_x64_Or   -> True
-    MO_x64_Xor  -> True
-    MO_S_Mul2 _ -> True
-    MO_U_Mul2 _ -> True
-    _ -> False


=====================================
compiler/GHC/Cmm/Opt.hs
=====================================
@@ -5,52 +5,27 @@
 -- (c) The University of Glasgow 2006
 --
 -----------------------------------------------------------------------------
-{-# LANGUAGE TupleSections #-}
-{-# LANGUAGE TypeApplications #-}
-{-# LANGUAGE PatternSynonyms #-}
 module GHC.Cmm.Opt (
         constantFoldNode,
         constantFoldExpr,
         cmmMachOpFold,
-        cmmMachOpFoldM,
-        Opt, runOpt
+        cmmMachOpFoldM
  ) where
 
 import GHC.Prelude
 
-import GHC.Cmm.Dataflow.Block
 import GHC.Cmm.Utils
 import GHC.Cmm
-import GHC.Cmm.Config
-import GHC.Types.Unique.DSM
-
 import GHC.Utils.Misc
+
 import GHC.Utils.Panic
 import GHC.Platform
 
 import Data.Maybe
-import Data.Word
-import GHC.Exts (oneShot)
-import Control.Monad
-
-constantFoldNode :: CmmNode e x -> Opt (CmmNode e x)
-constantFoldNode (CmmUnsafeForeignCall (PrimTarget op) res args)
-  = traverse constantFoldExprOpt args >>= cmmCallishMachOpFold op res
-constantFoldNode node
-  = mapExpOpt constantFoldExprOpt node
-
-constantFoldExprOpt :: CmmExpr -> Opt CmmExpr
-constantFoldExprOpt e = wrapRecExpOpt f e
-  where
-    f (CmmMachOp op args)
-      = do
-        cfg <- getConfig
-        case cmmMachOpFold (cmmPlatform cfg) op args of
-          CmmMachOp op' args' -> fromMaybe (CmmMachOp op' args') <$> cmmMachOpFoldOptM cfg op' args'
-          e -> pure e
-    f (CmmRegOff r 0) = pure (CmmReg r)
-    f (CmmLit (CmmInt x rep)) = pure (CmmLit $ CmmInt (narrowU rep x) rep)
-    f e = pure e
+
+
+constantFoldNode :: Platform -> CmmNode e x -> CmmNode e x
+constantFoldNode platform = mapExp (constantFoldExpr platform)
 
 constantFoldExpr :: Platform -> CmmExpr -> CmmExpr
 constantFoldExpr platform = wrapRecExp f
@@ -321,7 +296,7 @@ cmmMachOpFoldM platform cmp [CmmMachOp conv [x], CmmLit (CmmInt i _)]
     maybe_comparison (MO_S_Le _) rep False = Just (MO_U_Le rep)
     maybe_comparison _ _ _ = Nothing
 
--- We can often do something with constants of 0, 1 and (-1) ...
+-- We can often do something with constants of 0 and 1 ...
 -- See Note [Comparison operators]
 
 cmmMachOpFoldM platform mop [x, y@(CmmLit (CmmInt 0 _))]
@@ -392,8 +367,6 @@ cmmMachOpFoldM platform mop [x, (CmmLit (CmmInt n _))]
         MO_Mul rep
            | Just p <- exactLog2 n ->
                  Just $! (cmmMachOpFold platform (MO_Shl rep) [x, CmmLit (CmmInt p $ wordWidth platform)])
-        -- The optimization for division by power of 2 is technically duplicated, but since at least one other part of ghc uses
-        -- the pure `constantFoldExpr` this remains
         MO_U_Quot rep
            | Just p <- exactLog2 n ->
                  Just $! (cmmMachOpFold platform (MO_U_Shr rep) [x, CmmLit (CmmInt p $ wordWidth platform)])
@@ -402,19 +375,46 @@ cmmMachOpFoldM platform mop [x, (CmmLit (CmmInt n _))]
                  Just $! (cmmMachOpFold platform (MO_And rep) [x, CmmLit (CmmInt (n - 1) rep)])
         MO_S_Quot rep
            | Just p <- exactLog2 n,
-             CmmReg _ <- x ->
+             CmmReg _ <- x ->   -- We duplicate x in signedQuotRemHelper, hence require
+                                -- it is a reg.  FIXME: remove this restriction.
                 Just $! (cmmMachOpFold platform (MO_S_Shr rep)
-                  [signedQuotRemHelper platform n x rep p, CmmLit (CmmInt p $ wordWidth platform)])
+                  [signedQuotRemHelper rep p, CmmLit (CmmInt p $ wordWidth platform)])
         MO_S_Rem rep
            | Just p <- exactLog2 n,
-             CmmReg _ <- x ->
+             CmmReg _ <- x ->   -- We duplicate x in signedQuotRemHelper, hence require
+                                -- it is a reg.  FIXME: remove this restriction.
                 -- We replace (x `rem` 2^p) by (x - (x `quot` 2^p) * 2^p).
                 -- Moreover, we fuse MO_S_Shr (last operation of MO_S_Quot)
                 -- and MO_S_Shl (multiplication by 2^p) into a single MO_And operation.
                 Just $! (cmmMachOpFold platform (MO_Sub rep)
                     [x, cmmMachOpFold platform (MO_And rep)
-                      [signedQuotRemHelper platform n x rep p, CmmLit (CmmInt (- n) rep)]])
+                      [signedQuotRemHelper rep p, CmmLit (CmmInt (- n) rep)]])
         _ -> Nothing
+  where
+    -- In contrast with unsigned integers, for signed ones
+    -- shift right is not the same as quot, because it rounds
+    -- to minus infinity, whereas quot rounds toward zero.
+    -- To fix this up, we add one less than the divisor to the
+    -- dividend if it is a negative number.
+    --
+    -- to avoid a test/jump, we use the following sequence:
+    --      x1 = x >> word_size-1  (all 1s if -ve, all 0s if +ve)
+    --      x2 = y & (divisor-1)
+    --      result = x + x2
+    -- this could be done a bit more simply using conditional moves,
+    -- but we're processor independent here.
+    --
+    -- we optimise the divide by 2 case slightly, generating
+    --      x1 = x >> word_size-1  (unsigned)
+    --      return = x + x1
+    signedQuotRemHelper :: Width -> Integer -> CmmExpr
+    signedQuotRemHelper rep p = CmmMachOp (MO_Add rep) [x, x2]
+      where
+        bits = fromIntegral (widthInBits rep) - 1
+        shr = if p == 1 then MO_U_Shr rep else MO_S_Shr rep
+        x1 = CmmMachOp shr [x, CmmLit (CmmInt bits $ wordWidth platform)]
+        x2 = if p == 1 then x1 else
+             CmmMachOp (MO_And rep) [x1, CmmLit (CmmInt (n-1) rep)]
 
 -- ToDo (#7116): optimise floating-point multiplication, e.g. x*2.0 -> x+x
 -- Unfortunately this needs a unique supply because x might not be a
@@ -448,533 +448,3 @@ That's what the constant-folding operations on comparison operators do above.
 isPicReg :: CmmExpr -> Bool
 isPicReg (CmmReg (CmmGlobal (GlobalRegUse PicBaseReg _))) = True
 isPicReg _ = False
-
-canOptimizeDivision :: CmmConfig -> Width -> Bool
-canOptimizeDivision cfg rep = cmmOptConstDivision cfg &&
-  -- we can either widen the arguments to simulate mul2 or use mul2 directly for the platform word size
-  (rep < wordWidth platform || (rep == wordWidth platform && cmmAllowMul2 cfg))
-  where platform = cmmPlatform cfg
-
--- -----------------------------------------------------------------------------
--- Folding callish machops
-
-cmmCallishMachOpFold :: CallishMachOp -> [CmmFormal] -> [CmmActual] -> Opt (CmmNode O O)
-cmmCallishMachOpFold op res args =
-  fromMaybe (CmmUnsafeForeignCall (PrimTarget op) res args) <$> (getConfig >>= \cfg -> cmmCallishMachOpFoldM cfg op res args)
-
-cmmCallishMachOpFoldM :: CmmConfig -> CallishMachOp -> [CmmFormal] -> [CmmActual] -> Opt (Maybe (CmmNode O O))
-
--- If possible move the literals to the right, the following cases assume that to be the case
-cmmCallishMachOpFoldM cfg op res [x@(CmmLit _),y]
-  | isCommutableCallishMachOp op && not (isLit y) = cmmCallishMachOpFoldM cfg op res [y,x]
-
--- Both arguments are literals, replace with the result
-cmmCallishMachOpFoldM _ op res [CmmLit (CmmInt x _), CmmLit (CmmInt y _)]
-  = case op of
-    MO_S_Mul2 rep
-      | [rHiNeeded,rHi,rLo] <- res -> do
-          let resSz = widthInBits rep
-              resVal = (narrowS rep x) * (narrowS rep y)
-              high = resVal `shiftR` resSz
-              low = narrowS rep resVal
-              isHiNeeded = high /= low `shiftR` resSz
-              isHiNeededVal = if isHiNeeded then 1 else 0
-          prependNode $! CmmAssign (CmmLocal rHiNeeded) (CmmLit $ CmmInt isHiNeededVal rep)
-          prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt high rep)
-          pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt low rep)
-    MO_U_Mul2 rep
-      | [rHi,rLo] <- res -> do
-          let resSz = widthInBits rep
-              resVal = (narrowU rep x) * (narrowU rep y)
-              high = resVal `shiftR` resSz
-              low = narrowU rep resVal
-          prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt high rep)
-          pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt low rep)
-    MO_S_QuotRem rep
-      | [rQuot, rRem] <- res,
-        y /= 0 -> do
-          let (q,r) = quotRem (narrowS rep x) (narrowS rep y)
-          prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt q rep)
-          pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt r rep)
-    MO_U_QuotRem rep
-      | [rQuot, rRem] <- res,
-        y /= 0 -> do
-          let (q,r) = quotRem (narrowU rep x) (narrowU rep y)
-          prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt q rep)
-          pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt r rep)
-    _ -> pure Nothing
-
--- 0, 1 or -1 as one of the constants
-
-cmmCallishMachOpFoldM _ op res [_, CmmLit (CmmInt 0 _)]
-  = case op of
-    -- x * 0 == 0
-    MO_S_Mul2 rep
-      | [rHiNeeded, rHi, rLo] <- res -> do
-        prependNode $! CmmAssign (CmmLocal rHiNeeded) (CmmLit $ CmmInt 0 rep)
-        prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt 0 rep)
-        pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt 0 rep)
-    -- x * 0 == 0
-    MO_U_Mul2 rep
-      | [rHi, rLo] <- res -> do
-        prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt 0 rep)
-        pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt 0 rep)
-    _ -> pure Nothing
-
-cmmCallishMachOpFoldM _ op res [CmmLit (CmmInt 0 _), _]
-  = case op of
-    -- 0 quotRem d == (0,0)
-    MO_S_QuotRem rep
-      | [rQuot, rRem] <- res -> do
-      prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt 0 rep)
-      pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
-    -- 0 quotRem d == (0,0)
-    MO_U_QuotRem rep
-      | [rQuot,rRem] <- res -> do
-      prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt 0 rep)
-      pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
-    _ -> pure Nothing
-
-cmmCallishMachOpFoldM cfg op res [x, CmmLit (CmmInt 1 _)]
-  = case op of
-    -- x * 1 == x -- Note: The high word needs to be a sign extension of the low word, so we use a sign extending shift
-    MO_S_Mul2 rep
-      | [rHiNeeded, rHi, rLo] <- res -> do
-        let platform = cmmPlatform cfg
-            wordRep = wordWidth platform
-            repInBits = toInteger $ widthInBits rep
-        prependNode $! CmmAssign (CmmLocal rHiNeeded) (CmmLit $ CmmInt 0 rep)
-        prependNode $! CmmAssign (CmmLocal rHi) (cmmMachOpFold platform (MO_S_Shr rep) [x, CmmLit $ CmmInt (repInBits - 1) wordRep])
-        pure . Just $! CmmAssign (CmmLocal rLo) x
-    -- x * 1 == x
-    MO_U_Mul2 rep
-      | [rHi, rLo] <- res -> do
-        prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt 0 rep)
-        pure . Just $! CmmAssign (CmmLocal rLo) x
-    -- x quotRem 1 == (x, 0)
-    MO_S_QuotRem rep
-      | [rQuot, rRem] <- res -> do
-        prependNode $! CmmAssign (CmmLocal rQuot) x
-        pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
-    -- x quotRem 1 == (x, 0)
-    MO_U_QuotRem rep
-      | [rQuot, rRem] <- res -> do
-        prependNode $! CmmAssign (CmmLocal rQuot) x
-        pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
-    _ -> pure Nothing
-
--- handle quotRem with a constant divisor
-
-cmmCallishMachOpFoldM cfg op res [n, CmmLit (CmmInt d' _)]
-  = case op of
-    MO_S_QuotRem rep
-      | Just p <- exactLog2 d,
-        [rQuot,rRem] <- res -> do
-          n' <- intoRegister n (cmmBits rep)
-          -- first prepend the optimized division by a power 2
-          prependNode $! CmmAssign (CmmLocal rQuot)
-            (cmmMachOpFold platform (MO_S_Shr rep)
-              [signedQuotRemHelper platform d n' rep p, CmmLit (CmmInt p $ wordWidth platform)])
-          -- then output an optimized remainder by a power of 2
-          pure . Just $! CmmAssign (CmmLocal rRem)
-            (cmmMachOpFold platform (MO_Sub rep)
-              [n', cmmMachOpFold platform (MO_And rep)
-                [signedQuotRemHelper platform d n' rep p, CmmLit (CmmInt (- d) rep)]])
-      | canOptimizeDivision cfg rep,
-        d /= (-1), d /= 0, d /= 1,
-        [rQuot,rRem] <- res -> do
-          -- we are definitely going to use n multiple times, so put it into a register
-          n' <- intoRegister n (cmmBits rep)
-          -- generate an optimized (signed) division of n by d
-          q <- generateDivisionBySigned platform cfg rep n' d
-          -- we also need the result multiple times to calculate the remainder
-          q' <- intoRegister q (cmmBits rep)
-
-          prependNode $! CmmAssign (CmmLocal rQuot) q'
-          -- The remainder now becomes n - q * d
-          pure . Just $! CmmAssign (CmmLocal rRem) $ CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q', CmmLit $ CmmInt d rep]]
-      where
-        platform = cmmPlatform cfg
-        d = narrowS rep d'
-    MO_U_QuotRem rep
-      | Just p <- exactLog2 d,
-        [rQuot,rRem] <- res -> do
-          -- first prepend the optimized division by a power 2
-          prependNode $! CmmAssign (CmmLocal rQuot) $ CmmMachOp (MO_U_Shr rep) [n, CmmLit (CmmInt p $ wordWidth platform)]
-          -- then output an optimized remainder by a power of 2
-          pure . Just $! CmmAssign (CmmLocal rRem) $ CmmMachOp (MO_And rep) [n, CmmLit (CmmInt (d - 1) rep)]
-      | canOptimizeDivision cfg rep,
-        d /= 0, d /= 1,
-        [rQuot,rRem] <- res -> do
-          -- we are definitely going to use n multiple times, so put it into a register
-          n' <- intoRegister n (cmmBits rep)
-          -- generate an optimized (unsigned) division of n by d
-          q <- generateDivisionByUnsigned platform cfg rep n' d
-          -- we also need the result multiple times to calculate the remainder
-          q' <- intoRegister q (cmmBits rep)
-
-          prependNode $! CmmAssign (CmmLocal rQuot) q'
-          -- The remainder now becomes n - q * d
-          pure . Just $! CmmAssign (CmmLocal rRem) $ CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q', CmmLit $ CmmInt d rep]]
-      where
-        platform = cmmPlatform cfg
-        d = narrowU rep d'
-    _ -> pure Nothing
-
-cmmCallishMachOpFoldM _ _ _ _ = pure Nothing
-
--- -----------------------------------------------------------------------------
--- Specialized constant folding for MachOps which sometimes need to expand into multiple nodes
-
-cmmMachOpFoldOptM :: CmmConfig -> MachOp -> [CmmExpr] -> Opt (Maybe CmmExpr)
-
-cmmMachOpFoldOptM cfg op [n, CmmLit (CmmInt d' _)] =
-  case op of
-    MO_S_Quot rep
-      -- recheck for power of 2 division. This may not be handled by cmmMachOpFoldM if n is not in a register
-      | Just p <- exactLog2 d -> do
-        n' <- intoRegister n (cmmBits rep)
-        pure . Just $! cmmMachOpFold platform (MO_S_Shr rep)
-          [ signedQuotRemHelper platform d n' rep p
-          , CmmLit (CmmInt p $ wordWidth platform)
-          ]
-      | canOptimizeDivision cfg rep,
-        d /= (-1), d /= 0, d /= 1 -> Just <$!> generateDivisionBySigned platform cfg rep n d
-      where d = narrowS rep d'
-    MO_S_Rem rep
-      -- recheck for power of 2 remainder. This may not be handled by cmmMachOpFoldM if n is not in a register
-      | Just p <- exactLog2 d -> do
-        n' <- intoRegister n (cmmBits rep)
-        pure . Just $! cmmMachOpFold platform (MO_Sub rep)
-          [ n'
-          , cmmMachOpFold platform (MO_And rep)
-              [ signedQuotRemHelper platform d n' rep p
-              , CmmLit (CmmInt (- d) rep)
-              ]
-          ]
-      | canOptimizeDivision cfg rep,
-        d /= (-1), d /= 0, d /= 1 -> do
-        n' <- intoRegister n (cmmBits rep)
-        -- first generate the division
-        q <- generateDivisionBySigned platform cfg rep n' d
-        -- then calculate the remainder by n - q * d
-        pure . Just $! CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q, CmmLit $ CmmInt d rep]]
-      where d = narrowS rep d'
-    MO_U_Quot rep
-      -- No need to recheck power of 2 division because cmmMachOpFoldM always handles that case
-      | canOptimizeDivision cfg rep,
-        d /= 0, d /= 1, Nothing <- exactLog2 d -> Just <$!> generateDivisionByUnsigned platform cfg rep n d
-      where d = narrowU rep d'
-    MO_U_Rem rep
-      -- No need to recheck power of 2 remainder because cmmMachOpFoldM always handles that case
-      | canOptimizeDivision cfg rep,
-        d /= 0, d /= 1, Nothing <- exactLog2 d -> do
-        n' <- intoRegister n (cmmBits rep)
-        -- first generate the division
-        q <- generateDivisionByUnsigned platform cfg rep n d
-        -- then calculate the remainder by n - q * d
-        pure . Just $! CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q, CmmLit $ CmmInt d rep]]
-      where d = narrowU rep d'
-    _ -> pure Nothing
-  where platform = cmmPlatform cfg
-
-cmmMachOpFoldOptM _ _ _ = pure Nothing
-
--- -----------------------------------------------------------------------------
--- Utils for prepending new nodes
-
--- Move an expression into a register to possibly use it multiple times
-intoRegister :: CmmExpr -> CmmType -> Opt CmmExpr
-intoRegister e@(CmmReg _) _ = pure e
-intoRegister expr ty = do
-  u <- getUniqueM
-  let reg = LocalReg u ty
-  CmmReg (CmmLocal reg) <$ prependNode (CmmAssign (CmmLocal reg) expr)
-
-prependNode :: CmmNode O O -> Opt ()
-prependNode n = Opt $ \_ xs -> pure (xs ++ [n], ())
-
--- -----------------------------------------------------------------------------
--- Division by constants utils
-
--- Helper for division by a power of 2
--- In contrast with unsigned integers, for signed ones
--- shift right is not the same as quot, because it rounds
--- to minus infinity, whereas quot rounds toward zero.
--- To fix this up, we add one less than the divisor to the
--- dividend if it is a negative number.
---
--- to avoid a test/jump, we use the following sequence:
---      x1 = x >> word_size-1  (all 1s if -ve, all 0s if +ve)
---      x2 = y & (divisor-1)
---      result = x + x2
--- this could be done a bit more simply using conditional moves,
--- but we're processor independent here.
---
--- we optimize the divide by 2 case slightly, generating
---      x1 = x >> word_size-1  (unsigned)
---      return = x + x1
-signedQuotRemHelper :: Platform -> Integer -> CmmExpr -> Width -> Integer -> CmmExpr
-signedQuotRemHelper platform n x rep p = CmmMachOp (MO_Add rep) [x, x2]
-  where
-    bits = fromIntegral (widthInBits rep) - 1
-    shr = if p == 1 then MO_U_Shr rep else MO_S_Shr rep
-    x1 = CmmMachOp shr [x, CmmLit (CmmInt bits $ wordWidth platform)]
-    x2 = if p == 1 then x1 else
-          CmmMachOp (MO_And rep) [x1, CmmLit (CmmInt (n-1) rep)]
-
-{- Note: [Division by constants]
-
-Integer division is floor(n / d), the goal is to find m,p
-such that floor((m * n) / 2^p) = floor(n / d).
-
-The idea being: n/d = n * (1/d). But we cannot store 1/d in an integer without
-some error, so we choose some 2^p / d such that the error ends up small and
-thus vanishes when we divide by 2^p again.
-
-The algorithm below to generate these numbers is taken from Hacker's Delight
-Second Edition Chapter 10 "Integer division by constants". The chapter also
-contains proof that this method does indeed produce correct results.
-
-However this is a much more literal interpretation of the algorithm,
-which we can use because of the unbounded Integer type. Hacker's Delight
-also provides a much more complex algorithm which computes these numbers
-without the need to exceed the word size, but that is not necessary here.
--}
-
-generateDivisionBySigned :: Platform -> CmmConfig -> Width -> CmmExpr -> Integer -> Opt CmmExpr
-
--- Sanity checks, division will generate incorrect results or undesirable code for these cases
--- cmmMachOpFoldM and cmmMachOpFoldOptM should have already handled these cases!
-generateDivisionBySigned _ _ _ _ 0 = panic "generate signed division with 0"
-generateDivisionBySigned _ _ _ _ 1 = panic "generate signed division with 1"
-generateDivisionBySigned _ _ _ _ (-1) = panic "generate signed division with -1"
-generateDivisionBySigned _ _ _ _ d | Just _ <- exactLog2 d = panic $ "generate signed division with " ++ show d
-
-generateDivisionBySigned platform _cfg rep n divisor = do
-  -- We only duplicate n' if we actually need to add/subtract it, so we may not need it in a register
-  n' <- if sign == 0 then pure n else intoRegister n resRep
-
-  -- Set up mul2
-  (shift', qExpr) <- mul2 n'
-
-  -- add/subtract n if necessary
-  let qExpr' = case sign of
-        1  -> CmmMachOp (MO_Add rep) [qExpr, n']
-        -1 -> CmmMachOp (MO_Sub rep) [qExpr, n']
-        _  -> qExpr
-
-  qExpr'' <- intoRegister (cmmMachOpFold platform (MO_S_Shr rep) [qExpr', CmmLit $ CmmInt shift' wordRep]) resRep
-
-  -- Lastly add the sign of the quotient to correct for negative results
-  pure $! cmmMachOpFold platform
-    (MO_Add rep) [qExpr'', cmmMachOpFold platform (MO_U_Shr rep) [qExpr'', CmmLit $ CmmInt (toInteger $ widthInBits rep - 1) wordRep]]
-  where
-    resRep = cmmBits rep
-    wordRep = wordWidth platform
-    (magic, sign, shift) = divisionMagicS rep divisor
-    -- generate the multiply with the magic number
-    mul2 n
-      -- Using mul2 for sub-word sizes regresses for signed integers only
-      | rep == wordWidth platform = do
-        (r1, r2, r3) <- (,,) <$> getUniqueM <*> getUniqueM <*> getUniqueM
-        let rg1    = LocalReg r1 resRep
-            resReg = LocalReg r2 resRep
-            rg3    = LocalReg r3 resRep
-        res <- CmmReg (CmmLocal resReg) <$ prependNode (CmmUnsafeForeignCall (PrimTarget (MO_S_Mul2 rep)) [rg1, resReg, rg3] [n, CmmLit $ CmmInt magic rep])
-        pure (shift, res)
-      -- widen the register and multiply without the MUL2 instruction
-      -- if we don't need an additional add after this we can combine the shifts
-      | otherwise = pure (if sign == 0 then 0 else shift, res)
-          where
-            wordRep = wordWidth platform
-            -- (n * magic) >> widthInBits + (if sign == 0 then shift else 0) -- With conversion in between to not overflow
-            res = cmmMachOpFold platform (MO_SS_Conv wordRep rep)
-                    [ cmmMachOpFold platform (MO_S_Shr wordRep)
-                      [ cmmMachOpFold platform (MO_Mul wordRep)
-                        [ cmmMachOpFold platform (MO_SS_Conv rep wordRep) [n]
-                        , CmmLit $ CmmInt magic wordRep
-                        ]
-                      -- Check if we need to generate an add/subtract later. If not we can combine this with the postshift
-                      , CmmLit $ CmmInt ((if sign == 0 then toInteger shift else 0) + (toInteger $ widthInBits rep)) wordRep
-                      ]
-                    ]
-
--- See hackers delight for how and why this works (chapter in note [Division by constants])
-divisionMagicS :: Width -> Integer -> (Integer, Integer, Integer)
-divisionMagicS rep divisor = (magic, sign, toInteger $ p - wSz)
-  where
-    sign = if divisor > 0
-      then if magic < 0 then 1 else 0
-      else if magic < 0 then 0 else -1
-    wSz = widthInBits rep
-    ad = abs divisor
-    t = (1 `shiftL` (wSz - 1)) + if divisor > 0 then 0 else 1
-    anc = t - 1 - rem t ad
-    go p'
-      | twoP > anc * (ad - rem twoP ad) = p'
-      | otherwise = go (p' + 1)
-      where twoP = 1 `shiftL` p'
-    p = go wSz
-    am = (twoP + ad - rem twoP ad) `quot` ad
-      where twoP = 1 `shiftL` p
-    magic = narrowS rep $ if divisor > 0 then am else -am
-
-generateDivisionByUnsigned :: Platform -> CmmConfig -> Width -> CmmExpr -> Integer -> Opt CmmExpr
--- Sanity checks, division will generate incorrect results or undesirable code for these cases
--- cmmMachOpFoldM and cmmMachOpFoldOptM should have already handled these cases!
-generateDivisionByUnsigned _ _ _ _ 0 = panic "generate signed division with 0"
-generateDivisionByUnsigned _ _ _ _ 1 = panic "generate signed division with 1"
-generateDivisionByUnsigned _ _ _ _ d | Just _ <- exactLog2 d = panic $ "generate signed division with " ++ show d
-
-generateDivisionByUnsigned platform cfg rep n divisor = do
-  -- We only duplicate n' if we actually need to add/subtract it, so we may not need it in a register
-  n' <- if not needsAdd -- Invariant: We also never preshift if we need an add, thus we don't need n in a register
-    then pure $! cmmMachOpFold platform (MO_U_Shr rep) [n, CmmLit $ CmmInt preShift wordRep]
-    else intoRegister n resRep
-
-  -- Set up mul2
-  (postShift', qExpr) <- mul2 n'
-
-  -- add/subtract n if necessary
-  let qExpr' = if needsAdd
-        -- This is qExpr + (n - qExpr) / 2 = (qExpr + n) / 2 but with a guarantee that it'll not overflow
-        then cmmMachOpFold platform (MO_Add rep)
-          [ cmmMachOpFold platform (MO_U_Shr rep)
-            [ cmmMachOpFold platform (MO_Sub rep) [n', qExpr]
-            , CmmLit $ CmmInt 1 wordRep
-            ]
-          , qExpr
-          ]
-        else qExpr
-      -- If we already divided by 2 in the add, remember to shift one bit less
-      -- Hacker's Delight, Edition 2 Page 234: postShift > 0 if we needed an add, except if the divisor
-      -- is 1, which we checked for above
-      finalShift = if needsAdd then postShift' - 1 else postShift'
-
-  -- apply the final postShift
-  pure $! cmmMachOpFold platform (MO_U_Shr rep) [qExpr', CmmLit $ CmmInt finalShift wordRep]
-  where
-    resRep = cmmBits rep
-    wordRep = wordWidth platform
-    (preShift, magic, needsAdd, postShift) =
-        let withPre = divisionMagicU rep True  divisor
-            noPre   = divisionMagicU rep False divisor
-        in case (withPre, noPre) of
-          -- Use whatever does not cause us to take the expensive case
-          ((_, _, False, _), (_, _, True, _)) -> withPre
-          -- If we cannot avoid the expensive case, don't bother with the pre shift
-          _ -> noPre
-    -- generate the multiply with the magic number
-    mul2 n
-      | rep == wordWidth platform || (cmmAllowMul2 cfg && needsAdd) = do
-        (r1, r2) <- (,) <$> getUniqueM <*> getUniqueM
-        let rg1    = LocalReg r1 resRep
-            resReg = LocalReg r2 resRep
-        res <- CmmReg (CmmLocal resReg) <$ prependNode (CmmUnsafeForeignCall (PrimTarget (MO_U_Mul2 rep)) [resReg, rg1] [n, CmmLit $ CmmInt magic rep])
-        pure (postShift, res)
-      | otherwise = do
-        pure (if needsAdd then postShift else 0, res)
-          where
-            wordRep = wordWidth platform
-            -- (n * magic) >> widthInBits + (if sign == 0 then shift else 0) -- With conversion in between to not overflow
-            res = cmmMachOpFold platform (MO_UU_Conv wordRep rep)
-              [ cmmMachOpFold platform (MO_U_Shr wordRep)
-                [ cmmMachOpFold platform (MO_Mul wordRep)
-                  [ cmmMachOpFold platform (MO_UU_Conv rep wordRep) [n]
-                  , CmmLit $ CmmInt magic wordRep
-                  ]
-                -- Check if we need to generate an add later. If not we can combine this with the postshift
-                , CmmLit $ CmmInt ((if needsAdd then 0 else postShift) + (toInteger $ widthInBits rep)) wordRep
-                ]
-              ]
-
--- See hackers delight for how and why this works (chapter in note [Division by constants])
--- The preshift isn't described there, but the idea is:
--- If a divisor d has n trailing zeros, then d is a multiple of 2^n. Since we want to divide x by d
--- we can also calculate (x / 2^n) / (d / 2^n) which may then not require an extra addition.
---
--- The addition performs: quotient + dividend, but we need to avoid overflows, so we actually need to
--- calculate: quotient + (dividend - quotient) / 2 = (quotient + dividend) / 2
--- Thus if the preshift can avoid all of this, we have 1 operation in place of 3.
---
--- The decision to use the preshift is made somewhere else, here we only report if the addition is needed
-divisionMagicU :: Width -> Bool -> Integer -> (Integer, Integer, Bool, Integer)
-divisionMagicU rep doPreShift divisor = (toInteger zeros, magic, needsAdd, toInteger $ p - wSz)
-  where
-    wSz = widthInBits rep
-    zeros = if doPreShift then countTrailingZeros $ fromInteger @Word64 divisor else 0
-    d = divisor `shiftR` zeros
-    ones = ((1 `shiftL` wSz) - 1) `shiftR` zeros
-    nc = ones - rem (ones - d) d
-    go p'
-      | twoP > nc * (d - 1 - rem (twoP - 1) d) = p'
-      | otherwise = go (p' + 1)
-      where twoP = 1 `shiftL` p'
-    p = go wSz
-    m = (twoP + d - 1 - rem (twoP - 1) d) `quot` d
-      where twoP = 1 `shiftL` p
-    needsAdd = d < 1 `shiftL` (p - wSz)
-    magic = if needsAdd then m - (ones + 1) else m
-
--- -----------------------------------------------------------------------------
--- Opt monad
-
-newtype Opt a = OptI { runOptI :: CmmConfig -> [CmmNode O O] -> UniqDSM ([CmmNode O O], a) }
-
--- | Pattern synonym for 'Opt', as described in Note [The one-shot state
--- monad trick].
-pattern Opt :: (CmmConfig -> [CmmNode O O] -> UniqDSM ([CmmNode O O], a)) -> Opt a
-pattern Opt f <- OptI f
-  where Opt f = OptI . oneShot $ \cfg -> oneShot $ \out -> f cfg out
-{-# COMPLETE Opt #-}
-
-runOpt :: CmmConfig -> Opt a -> UniqDSM ([CmmNode O O], a)
-runOpt cf (Opt g) = g cf []
-
-getConfig :: Opt CmmConfig
-getConfig = Opt $ \cf xs -> pure (xs, cf)
-
-instance Functor Opt where
-  fmap f (Opt g) = Opt $ \cf xs -> fmap (fmap f) (g cf xs)
-
-instance Applicative Opt where
-  pure a = Opt $ \_ xs -> pure (xs, a)
-  ff <*> fa = do
-    f <- ff
-    f <$> fa
-
-instance Monad Opt where
-  Opt g >>= f = Opt $ \cf xs -> do
-    (ys, a) <- g cf xs
-    runOptI (f a) cf ys
-
-instance MonadGetUnique Opt where
-  getUniqueM = Opt $ \_ xs -> (xs,) <$> getUniqueDSM
-
-mapForeignTargetOpt :: (CmmExpr -> Opt CmmExpr) -> ForeignTarget -> Opt ForeignTarget
-mapForeignTargetOpt exp   (ForeignTarget e c) = flip ForeignTarget c <$> exp e
-mapForeignTargetOpt _   m@(PrimTarget _)      = pure m
-
-wrapRecExpOpt :: (CmmExpr -> Opt CmmExpr) -> CmmExpr -> Opt CmmExpr
-wrapRecExpOpt f (CmmMachOp op es)       = traverse (wrapRecExpOpt f) es >>= f . CmmMachOp op
-wrapRecExpOpt f (CmmLoad addr ty align) = wrapRecExpOpt f addr >>= \newAddr -> f (CmmLoad newAddr ty align)
-wrapRecExpOpt f e                       = f e
-
-mapExpOpt :: (CmmExpr -> Opt CmmExpr) -> CmmNode e x -> Opt (CmmNode e x)
-mapExpOpt _ f@(CmmEntry{})                          = pure f
-mapExpOpt _ m@(CmmComment _)                        = pure m
-mapExpOpt _ m@(CmmTick _)                           = pure m
-mapExpOpt f   (CmmUnwind regs)                      = CmmUnwind <$> traverse (traverse (traverse f)) regs
-mapExpOpt f   (CmmAssign r e)                       = CmmAssign r <$> f e
-mapExpOpt f   (CmmStore addr e align)               = CmmStore <$> f addr <*> f e <*> pure align
-mapExpOpt f   (CmmUnsafeForeignCall tgt fs as)      = CmmUnsafeForeignCall <$> mapForeignTargetOpt f tgt <*> pure fs <*> traverse f as
-mapExpOpt _ l@(CmmBranch _)                         = pure l
-mapExpOpt f   (CmmCondBranch e ti fi l)             = f e >>= \newE -> pure (CmmCondBranch newE ti fi l)
-mapExpOpt f   (CmmSwitch e ids)                     = flip CmmSwitch ids <$> f e
-mapExpOpt f   n at CmmCall {cml_target=tgt}            = f tgt >>= \newTgt -> pure n{cml_target = newTgt}
-mapExpOpt f   (CmmForeignCall tgt fs as succ ret_args updfr intrbl)
-                                                    = do
-                                                      newTgt <- mapForeignTargetOpt f tgt
-                                                      newAs <- traverse f as
-                                                      pure $ CmmForeignCall newTgt fs newAs succ ret_args updfr intrbl


=====================================
compiler/GHC/Cmm/Pipeline.hs
=====================================
@@ -137,12 +137,9 @@ cpsTop logger platform cfg dus proc =
       dump Opt_D_dump_cmm_sp "Layout Stack" g
 
       ----------- Sink and inline assignments  --------------------------------
-      (g, dus) <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
-           if cmmOptSink cfg
-              then pure $ runUniqueDSM dus $ cmmSink cfg g
-              else return (g, dus)
-      dump Opt_D_dump_cmm_sink "Sink assignments" g
-
+      g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
+           condPass (cmmOptSink cfg) (cmmSink platform) g
+                    Opt_D_dump_cmm_sink "Sink assignments"
 
       ------------- CAF analysis ----------------------------------------------
       let cafEnv = {-# SCC "cafAnal" #-} cafAnal platform call_pps l g


=====================================
compiler/GHC/Cmm/Sink.hs
=====================================
@@ -20,8 +20,6 @@ import GHC.Platform.Regs
 
 import GHC.Platform
 import GHC.Types.Unique.FM
-import GHC.Types.Unique.DSM
-import GHC.Cmm.Config
 
 import Data.List (partition)
 import Data.Maybe
@@ -152,10 +150,9 @@ type Assignments = [Assignment]
   --     y = e2
   --     x = e1
 
-cmmSink :: CmmConfig -> CmmGraph -> UniqDSM CmmGraph
-cmmSink cfg graph = ofBlockList (g_entry graph) <$> sink mapEmpty blocks
+cmmSink :: Platform -> CmmGraph -> CmmGraph
+cmmSink platform graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
   where
-  platform = cmmPlatform cfg
   liveness = cmmLocalLivenessL platform graph
   getLive l = mapFindWithDefault emptyLRegSet l liveness
 
@@ -163,41 +160,11 @@ cmmSink cfg graph = ofBlockList (g_entry graph) <$> sink mapEmpty blocks
 
   join_pts = findJoinPoints blocks
 
-  sink :: LabelMap Assignments -> [CmmBlock] -> UniqDSM [CmmBlock]
-  sink _ [] = pure []
-  sink sunk (b:bs) = do
-    -- Now sink and inline in this block
-    (prepend, last_fold) <- runOpt cfg $ constantFoldNode last
-
-    (middle', assigs) <- walk cfg (ann_middles ++ annotate platform live_middle prepend) (mapFindWithDefault [] lbl sunk)
-
-    let (final_last, assigs') = tryToInline platform live last_fold assigs
-        -- Now, drop any assignments that we will not sink any further.
-        (dropped_last, assigs'') = dropAssignments platform drop_if init_live_sets assigs'
-        drop_if :: (LocalReg, CmmExpr, AbsMem)
-                      -> [LRegSet] -> (Bool, [LRegSet])
-        drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
-            where
-              should_drop =  conflicts platform a final_last
-                          || not (isTrivial platform rhs) && live_in_multi live_sets r
-                          || r `elemLRegSet` live_in_joins
-
-              live_sets' | should_drop = live_sets
-                        | otherwise   = map upd live_sets
-
-              upd set | r `elemLRegSet` set = set `unionLRegSet` live_rhs
-                      | otherwise           = set
-
-              live_rhs = foldRegsUsed platform (flip insertLRegSet) emptyLRegSet rhs
-
-        final_middle = foldl' blockSnoc middle' dropped_last
-
-        sunk' = mapUnion sunk $
-                  mapFromList [ (l, filterAssignments platform (getLive l) assigs'')
-                              | l <- succs ]
-
-    (blockJoin first final_middle final_last :) <$> sink sunk' bs
-
+  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
+  sink _ [] = []
+  sink sunk (b:bs) =
+    -- pprTrace "sink" (ppr lbl) $
+    blockJoin first final_middle final_last : sink sunk' bs
     where
       lbl = entryLabel b
       (first, middle, last) = blockSplit b
@@ -211,6 +178,11 @@ cmmSink cfg graph = ofBlockList (g_entry graph) <$> sink mapEmpty blocks
       live_middle = gen_killL platform last live
       ann_middles = annotate platform live_middle (blockToList middle)
 
+      -- Now sink and inline in this block
+      (middle', assigs) = walk platform ann_middles (mapFindWithDefault [] lbl sunk)
+      fold_last = constantFoldNode platform last
+      (final_last, assigs') = tryToInline platform live fold_last assigs
+
       -- We cannot sink into join points (successors with more than
       -- one predecessor), so identify the join points and the set
       -- of registers live in them.
@@ -228,6 +200,31 @@ cmmSink cfg graph = ofBlockList (g_entry graph) <$> sink mapEmpty blocks
            (_one:_two:_) -> True
            _ -> False
 
+      -- Now, drop any assignments that we will not sink any further.
+      (dropped_last, assigs'') = dropAssignments platform drop_if init_live_sets assigs'
+
+      drop_if :: (LocalReg, CmmExpr, AbsMem)
+                      -> [LRegSet] -> (Bool, [LRegSet])
+      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
+          where
+            should_drop =  conflicts platform a final_last
+                        || not (isTrivial platform rhs) && live_in_multi live_sets r
+                        || r `elemLRegSet` live_in_joins
+
+            live_sets' | should_drop = live_sets
+                       | otherwise   = map upd live_sets
+
+            upd set | r `elemLRegSet` set = set `unionLRegSet` live_rhs
+                    | otherwise          = set
+
+            live_rhs = foldRegsUsed platform (flip insertLRegSet) emptyLRegSet rhs
+
+      final_middle = foldl' blockSnoc middle' dropped_last
+
+      sunk' = mapUnion sunk $
+                 mapFromList [ (l, filterAssignments platform (getLive l) assigs'')
+                             | l <- succs ]
+
 {- TODO: enable this later, when we have some good tests in place to
    measure the effect and tune it.
 
@@ -302,7 +299,7 @@ filterAssignments platform live assigs = reverse (go assigs [])
 --    * a list of assignments that will be placed *after* that block.
 --
 
-walk :: CmmConfig
+walk :: Platform
      -> [(LRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                         -- the set of registers live *after*
                                         -- this node.
@@ -312,39 +309,36 @@ walk :: CmmConfig
                                         -- Earlier assignments may refer
                                         -- to later ones.
 
-     -> UniqDSM ( Block CmmNode O O             -- The new block
-               , Assignments                   -- Assignments to sink further
-               )
+     -> ( Block CmmNode O O             -- The new block
+        , Assignments                   -- Assignments to sink further
+        )
 
-walk cfg nodes assigs = go nodes emptyBlock assigs
+walk platform nodes assigs = go nodes emptyBlock assigs
  where
-   platform = cmmPlatform cfg
-   go []               block as = pure (block, as)
+   go []               block as = (block, as)
    go ((live,node):ns) block as
     -- discard nodes representing dead assignment
     | shouldDiscard node live             = go ns block as
-    | otherwise = do
-      (prepend, node1) <- runOpt cfg $ constantFoldNode node
-      if not (null prepend)
-        then go (annotate platform live (prepend ++ [node1]) ++ ns) block as
-        else do
-          let -- Inline assignments
-              (node2, as1) = tryToInline platform live node1 as
-              -- Drop any earlier assignments conflicting with node2
-              (dropped, as') = dropAssignmentsSimple platform
-                                (\a -> conflicts platform a node2) as1
-              -- Walk over the rest of the block. Includes dropped assignments
-              block' = foldl' blockSnoc block dropped `blockSnoc` node2
-
-          (prepend2, node3) <- runOpt cfg $ constantFoldNode node2
-          if | not (null prepend2)                 -> go (annotate platform live (prepend2 ++ [node3]) ++ ns) block as
-             -- sometimes only after simplification we can tell we can discard the node.
-             -- See Note [Discard simplified nodes]
-             | noOpAssignment node3                -> go ns block as
-             -- Pick up interesting assignments
-             | Just a <- shouldSink platform node3 -> go ns block (a : as1)
-             -- Try inlining, drop assignments and move on
-             | otherwise                           -> go ns block' as'
+    -- sometimes only after simplification we can tell we can discard the node.
+    -- See Note [Discard simplified nodes]
+    | noOpAssignment node2                = go ns block as
+    -- Pick up interesting assignments
+    | Just a <- shouldSink platform node2 = go ns block (a : as1)
+    -- Try inlining, drop assignments and move on
+    | otherwise                           = go ns block' as'
+    where
+      -- Simplify node
+      node1 = constantFoldNode platform node
+
+      -- Inline assignments
+      (node2, as1) = tryToInline platform live node1 as
+
+      -- Drop any earlier assignments conflicting with node2
+      (dropped, as') = dropAssignmentsSimple platform
+                          (\a -> conflicts platform a node2) as1
+
+      -- Walk over the rest of the block. Includes dropped assignments
+      block' = foldl' blockSnoc block dropped `blockSnoc` node2
 
 {- Note [Discard simplified nodes]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


=====================================
compiler/GHC/Driver/Config/Cmm.hs
=====================================
@@ -24,17 +24,5 @@ initCmmConfig dflags = CmmConfig
   , cmmDoCmmSwitchPlans    = not (backendHasNativeSwitch (backend dflags))
   , cmmSplitProcPoints     = not (backendSupportsUnsplitProcPoints (backend dflags))
                              || not (platformTablesNextToCode platform)
-  , cmmAllowMul2           = (ncg && x86ish) || llvm
-  , cmmOptConstDivision    = not llvm
   }
   where platform                = targetPlatform dflags
-        -- Copied from StgToCmm
-        (ncg, llvm) = case backendPrimitiveImplementation (backend dflags) of
-                          GenericPrimitives -> (False, False)
-                          NcgPrimitives -> (True, False)
-                          LlvmPrimitives -> (False, True)
-                          JSPrimitives -> (False, False)
-        x86ish  = case platformArch platform of
-                    ArchX86    -> True
-                    ArchX86_64 -> True
-                    _          -> False


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1571,28 +1571,28 @@ emitPrimOp cfg primop =
   CastDoubleToWord64Op -> translateBitcasts (MO_FW_Bitcast W64)
   CastWord64ToDoubleOp -> translateBitcasts (MO_WF_Bitcast W64)
 
-  IntQuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  IntQuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_S_QuotRem  (wordWidth platform))
     else Right (genericIntQuotRemOp (wordWidth platform))
 
-  Int8QuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  Int8QuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_S_QuotRem W8)
     else Right (genericIntQuotRemOp W8)
 
-  Int16QuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  Int16QuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_S_QuotRem W16)
     else Right (genericIntQuotRemOp W16)
 
-  Int32QuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  Int32QuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_S_QuotRem W32)
     else Right (genericIntQuotRemOp W32)
 
-  WordQuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  WordQuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_U_QuotRem  (wordWidth platform))
     else Right (genericWordQuotRemOp (wordWidth platform))
 
@@ -1601,18 +1601,18 @@ emitPrimOp cfg primop =
     then Left (MO_U_QuotRem2 (wordWidth platform))
     else Right (genericWordQuotRem2Op platform)
 
-  Word8QuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  Word8QuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_U_QuotRem W8)
     else Right (genericWordQuotRemOp W8)
 
-  Word16QuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  Word16QuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_U_QuotRem W16)
     else Right (genericWordQuotRemOp W16)
 
-  Word32QuotRemOp -> opCallishHandledLater $
-    if allowQuotRem
+  Word32QuotRemOp -> \args -> flip opCallishHandledLater args $
+    if allowQuotRem && not (quotRemCanBeOptimized args)
     then Left (MO_U_QuotRem W32)
     else Right (genericWordQuotRemOp W32)
 
@@ -1835,6 +1835,23 @@ emitPrimOp cfg primop =
     pure $ map (CmmReg . CmmLocal) regs
 
   alwaysExternal = \_ -> PrimopCmmEmit_External
+  -- Note [QuotRem optimization]
+  -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  -- `quot` and `rem` with constant divisor can be implemented with fast bit-ops
+  -- (shift, .&.).
+  --
+  -- Currently we only support optimization (performed in GHC.Cmm.Opt) when the
+  -- constant is a power of 2. #9041 tracks the implementation of the general
+  -- optimization.
+  --
+  -- `quotRem` can be optimized in the same way. However as it returns two values,
+  -- it is implemented as a "callish" primop which is harder to match and
+  -- to transform later on. For simplicity, the current implementation detects cases
+  -- that can be optimized (see `quotRemCanBeOptimized`) and converts STG quotRem
+  -- primop into two CMM quot and rem primops.
+  quotRemCanBeOptimized = \case
+    [_, CmmLit (CmmInt n _) ] -> isJust (exactLog2 n)
+    _                         -> False
 
   allowQuotRem  = stgToCmmAllowQuotRemInstr         cfg
   allowQuotRem2 = stgToCmmAllowQuotRem2             cfg


=====================================
testsuite/tests/numeric/should_run/T25653.hs
=====================================
@@ -0,0 +1,10 @@
+module Main (main) where
+
+import Data.Word (Word8)
+
+myMod :: Word8 -> Word8
+myMod i = mod i 7
+{-# NOINLINE myMod #-}
+
+main :: IO ()
+main = print $ myMod 5


=====================================
testsuite/tests/numeric/should_run/T25653.stdout
=====================================
@@ -0,0 +1 @@
+5


=====================================
testsuite/tests/numeric/should_run/all.T
=====================================
@@ -86,3 +86,4 @@ test('foundation', [when(js_arch(), run_timeout_multiplier(2)), js_fragile(24259
 test('T24066', normal, compile_and_run, [''])
 test('div01', normal, compile_and_run, [''])
 test('T24245', normal, compile_and_run, [''])
+test('T25653', normal, compile_and_run, [''])



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/bf4f5ad31ddeb87baad10de47f84081648f808dc...163aa50aad7a24912c5cc0ade3ef80b108e99f1a

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/bf4f5ad31ddeb87baad10de47f84081648f808dc...163aa50aad7a24912c5cc0ade3ef80b108e99f1a
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20250118/4057e4f7/attachment-0001.html>


More information about the ghc-commits mailing list