[Git][ghc/ghc][wip/andreask/deep_discounts] 2 commits: Prototype for deep inlining discounts.

Sun Oct 2 14:24:00 UTC 2022

Andreas Klebinger pushed to branch wip/andreask/deep_discounts at Glasgow Haskell Compiler / GHC


Commits:
9198be09 by Andreas Klebinger at 2022-10-02T15:13:53+02:00
Prototype for deep inlining discounts.

This is very much not finished but the basic idea is the allow for
reasonably accurate estimates how much smaller a inlined function will
be if applied to a certain argument.

See #21938 for a proper description of the idea.

Optimize UnVarSet slightly

min discount of 10 if the arg is a value

Add a depth discount to nested argInfo/argGuidance

Move some functions around to avoid a module loop

- - - - -
4cbe93cf by Andreas Klebinger at 2022-10-02T16:09:37+02:00
Remove one flag

- - - - -


17 changed files:

- compiler/GHC/Core.hs
- compiler/GHC/Core/Opt/Simplify/Env.hs
- + compiler/GHC/Core/Opt/Simplify/Inline.hs
- compiler/GHC/Core/Opt/Simplify/Iteration.hs
- compiler/GHC/Core/Opt/Simplify/Utils.hs
- compiler/GHC/Core/Ppr.hs
- compiler/GHC/Core/Seq.hs
- compiler/GHC/Core/Unfold.hs
- compiler/GHC/Data/Graph/UnVar.hs
- compiler/GHC/Driver/Flags.hs
- compiler/GHC/Driver/Session.hs
- compiler/GHC/Types/Unique/FM.hs
- compiler/GHC/Types/Var/Env.hs
- compiler/GHC/Utils/Outputable.hs
- compiler/ghc.cabal.in
- docs/users_guide/hints.rst
- docs/users_guide/using-optimisation.rst


Changes:

=====================================
compiler/GHC/Core.hs
=====================================
@@ -55,6 +55,7 @@ module GHC.Core (
 
         -- * Unfolding data types
         Unfolding(..),  UnfoldingGuidance(..), UnfoldingSource(..),
+        ArgDiscount(..), ConMap, ConDiscount(..),
 
         -- ** Constructing 'Unfolding's
         noUnfolding, bootUnfolding, evaldUnfolding, mkOtherCon,
@@ -62,12 +63,13 @@ module GHC.Core (
 
         -- ** Predicates and deconstruction on 'Unfolding'
         unfoldingTemplate, expandUnfolding_maybe,
-        maybeUnfoldingTemplate, otherCons,
+        maybeUnfoldingTemplate, maybeUnfoldingGuidance, otherCons,
         isValueUnfolding, isEvaldUnfolding, isCheapUnfolding,
         isExpandableUnfolding, isConLikeUnfolding, isCompulsoryUnfolding,
         isStableUnfolding, isInlineUnfolding, isBootUnfolding,
         hasCoreUnfolding, hasSomeUnfolding,
-        canUnfold, neverUnfoldGuidance, isStableSource,
+        canUnfold, neverUnfoldGuidance, argGuidance, isStableSource,
+        discountDepth,
 
         -- * Annotated expression data types
         AnnExpr, AnnExpr'(..), AnnBind(..), AnnAlt(..),
@@ -120,6 +122,7 @@ import GHC.Utils.Panic.Plain
 import Data.Data hiding (TyCon)
 import Data.Int
 import Data.Word
+import GHC.Types.Unique.FM
 
 infixl 4 `mkApps`, `mkTyApps`, `mkVarApps`, `App`, `mkCoApps`
 -- Left associative, so that we can say (f `mkTyApps` xs `mkVarApps` ys)
@@ -1367,6 +1370,44 @@ data UnfoldingSource
                        -- Inline absolutely always, however boring the context.
 
 
+-- Argument use is modeled as a tree
+type ConMap a = UniqFM DataCon a
+
+data ConDiscount = ConDiscount { cd_con :: !DataCon
+                               , cd_discount :: !Int
+                               , cd_arg_discounts :: [ArgDiscount]
+                               }
+                  deriving (Eq)
+
+instance Outputable ConDiscount where
+  ppr (ConDiscount c d as) = ppr c <> text ":" <> ppr d <> (ppr as)
+
+data ArgDiscount
+            -- Argument is used, either sequed with a single alternative
+            -- used as argument to a interesting function or in other ways
+            -- which could make this argument worthwhile to inline.
+            = SomeArgUse { ad_seq_discount :: !Int }
+            -- Argument is used to discriminate between case alternatives.
+            -- We give specific constructors a discount based on the alternative
+            -- they will select, and provide a generic discount if we know the arg
+            -- is a value but not what value exactly.
+            -- Only of the the two discounts might be applied for the same argument.
+            | DiscSeq { ad_seq_discount :: !Int -- ^ Discount if no specific constructor discount matches
+                      , ad_con_discount :: !(ConMap ConDiscount) -- ^ Discounts for specific constructors
+                      }
+            -- A discount for the use of a function.
+            | FunDisc { ad_seq_discount :: !Int, ad_fun :: !Name}
+            | NoSeqUse
+            deriving Eq
+
+instance Outputable ArgDiscount where
+  ppr (SomeArgUse n)= text "seq:" <> ppr n
+  ppr (NoSeqUse)= text "lazy use"
+  ppr (FunDisc d f ) = text "fun-"<>ppr f<>text ":"<> ppr d
+  ppr (DiscSeq d_seq m)
+    | isNullUFM m = text "disc:"<> ppr d_seq
+    | otherwise = sep (punctuate comma ((text "some_con:"<> ppr d_seq) : map ppr (nonDetEltsUFM m)))
+      -- (text "some_con:"<> ppr d_seq) <> text "||" <> braces (pprUFM m ppr)
 
 -- | 'UnfoldingGuidance' says when unfolding should take place
 data UnfoldingGuidance
@@ -1384,13 +1425,13 @@ data UnfoldingGuidance
   | UnfIfGoodArgs {     -- Arose from a normal Id; the info here is the
                         -- result of a simple analysis of the RHS
 
-      ug_args ::  [Int],  -- Discount if the argument is evaluated.
+      ug_args ::  [ArgDiscount],  -- Discount if the argument is evaluated.
                           -- (i.e., a simplification will definitely
                           -- be possible).  One elt of the list per *value* arg.
 
-      ug_size :: Int,     -- The "size" of the unfolding.
+      ug_size :: !Int,     -- The "size" of the unfolding.
 
-      ug_res :: Int       -- Scrutinee discount: the discount to subtract if the thing is in
+      ug_res :: !Int       -- Scrutinee discount: the discount to subtract if the thing is in
     }                     -- a context (case (thing args) of ...),
                           -- (where there are the right number of arguments.)
 
@@ -1494,6 +1535,10 @@ maybeUnfoldingTemplate (DFunUnfolding { df_bndrs = bndrs, df_con = con, df_args
 maybeUnfoldingTemplate _
   = Nothing
 
+maybeUnfoldingGuidance :: Unfolding -> Maybe UnfoldingGuidance
+maybeUnfoldingGuidance CoreUnfolding { uf_guidance = guidance } = Just guidance
+maybeUnfoldingGuidance _ = Nothing
+
 -- | The constructors that the unfolding could never be:
 -- returns @[]@ if no information is available
 otherCons :: Unfolding -> [AltCon]
@@ -1582,6 +1627,30 @@ neverUnfoldGuidance :: UnfoldingGuidance -> Bool
 neverUnfoldGuidance UnfNever = True
 neverUnfoldGuidance _        = False
 
+-- Returns a list of available argument discounts if any.
+argGuidance :: UnfoldingGuidance -> [ArgDiscount]
+argGuidance UnfIfGoodArgs { ug_args = arg_guides } = arg_guides
+argGuidance _ = []
+
+discountDepth :: ArgDiscount -> Int
+discountDepth dc = case dc of
+    NoSeqUse -> 0
+    FunDisc{} -> 1
+    SomeArgUse{} -> 1
+    DiscSeq { ad_con_discount = con_dc} ->
+      let max_con_depth =
+            nonDetStrictFoldUFM
+              (\(e :: ConDiscount) max_depth ->
+                max max_depth
+                    (maximum (1:(map discountDepth (cd_arg_discounts e)))))
+              0
+              con_dc
+      in max 1 max_con_depth
+
+
+
+
+
 hasCoreUnfolding :: Unfolding -> Bool
 -- An unfolding "has Core" if it contains a Core expression, which
 -- may mention free variables. See Note [Fragile unfoldings]
@@ -1953,6 +2022,10 @@ bindersOf (Rec pairs)       = [binder | (binder, _) <- pairs]
 bindersOfBinds :: [Bind b] -> [b]
 bindersOfBinds binds = foldr ((++) . bindersOf) [] binds
 
+{-# INLINE foldBindersOf #-}
+foldBindersOf :: (a -> b -> a) -> Bind b -> a -> a
+foldBindersOf f b r = foldl' f r (bindersOf b)
+
 rhssOfBind :: Bind b -> [Expr b]
 rhssOfBind (NonRec _ rhs) = [rhs]
 rhssOfBind (Rec pairs)    = [rhs | (_,rhs) <- pairs]


=====================================
compiler/GHC/Core/Opt/Simplify/Env.hs
=====================================
@@ -22,6 +22,7 @@ module GHC.Core.Opt.Simplify.Env (
         zapSubstEnv, setSubstEnv, bumpCaseDepth,
         getInScope, setInScopeFromE, setInScopeFromF,
         setInScopeSet, modifyInScope, addNewInScopeIds,
+        addNewInScopeId, addNewInScopeBndr,
         getSimplRules, enterRecGroupRHSs,
 
         -- * Substitution results
@@ -561,6 +562,20 @@ setInScopeFromE rhs_env here_env = rhs_env { seInScope = seInScope here_env }
 setInScopeFromF :: SimplEnv -> SimplFloats -> SimplEnv
 setInScopeFromF env floats = env { seInScope = sfInScope floats }
 
+addNewInScopeId :: SimplEnv -> CoreBndr -> SimplEnv
+addNewInScopeId env@(SimplEnv { seInScope = in_scope, seIdSubst = id_subst }) v
+-- See Note [Bangs in the Simplifier]
+  = let !in_scope1 = in_scope `extendInScopeSet` v
+        !id_subst1 = id_subst `delVarEnv` v
+    in
+    env { seInScope = in_scope1,
+          seIdSubst = id_subst1 }
+        -- Why delete?  Consider
+        --      let x = a*b in (x, \x -> x+3)
+        -- We add [x |-> a*b] to the substitution, but we must
+        -- _delete_ it from the substitution when going inside
+        -- the (\x -> ...)!
+
 addNewInScopeIds :: SimplEnv -> [CoreBndr] -> SimplEnv
         -- The new Ids are guaranteed to be freshly allocated
 addNewInScopeIds env@(SimplEnv { seInScope = in_scope, seIdSubst = id_subst }) vs
@@ -576,6 +591,21 @@ addNewInScopeIds env@(SimplEnv { seInScope = in_scope, seIdSubst = id_subst }) v
         -- _delete_ it from the substitution when going inside
         -- the (\x -> ...)!
 
+addNewInScopeBndr :: SimplEnv -> CoreBind -> SimplEnv
+addNewInScopeBndr env@(SimplEnv { seInScope = in_scope, seIdSubst = id_subst }) b
+-- See Note [Bangs in the Simplifier]
+  = let vs = bindersOf b
+        !in_scope1 = in_scope `extendInScopeSetList` vs
+        !id_subst1 = id_subst `delVarEnvList` vs
+    in
+    env { seInScope = in_scope1,
+          seIdSubst = id_subst1 }
+        -- Why delete?  Consider
+        --      let x = a*b in (x, \x -> x+3)
+        -- We add [x |-> a*b] to the substitution, but we must
+        -- _delete_ it from the substitution when going inside
+        -- the (\x -> ...)!
+
 modifyInScope :: SimplEnv -> CoreBndr -> SimplEnv
 -- The variable should already be in scope, but
 -- replace the existing version with this new one


=====================================
compiler/GHC/Core/Opt/Simplify/Inline.hs
=====================================
@@ -0,0 +1,690 @@
+-- {-# OPTIONS_GHC -ddump-simpl -ddump-to-file -ddump-stg-final -dsuppress-coercions -dsuppress-coercion-types #-}
+
+module GHC.Core.Opt.Simplify.Inline
+where
+
+import GHC.Prelude
+
+import GHC.Core
+import GHC.Core.Unfold
+import GHC.Core.TyCon (isClassTyCon)
+
+import GHC.Types.Name
+import GHC.Types.Id
+import GHC.Types.Basic
+
+import GHC.Utils.Misc
+import GHC.Utils.Outputable
+
+import Data.List        ( isPrefixOf )
+import GHC.Utils.Logger
+import GHC.Driver.Flags
+import GHC.Types.Unique.FM
+import GHC.Core.Opt.Simplify.Env
+import GHC.Types.Literal
+import GHC.Core.DataCon
+import GHC.Types.Var
+import GHC.Core.Opt.Simplify.Utils
+import GHC.Utils.Panic.Plain (assert)
+import GHC.Utils.Trace
+import GHC.Types.Tickish
+
+callSiteInline :: Logger
+               -> UnfoldingOpts
+               -> Int                   -- Case depth
+               -> Id                    -- The Id
+               -> Bool                  -- True <=> unfolding is active
+               -> Bool                  -- True if there are no arguments at all (incl type args)
+               -> [ArgSummary]          -- One for each value arg; True if it is interesting
+               -> CallCtxt              -- True <=> continuation is interesting
+               -> Maybe CoreExpr        -- Unfolding, if any
+callSiteInline logger !opts !case_depth id active_unfolding lone_variable arg_infos cont_info
+  = case idUnfolding id of
+      -- idUnfolding checks for loop-breakers, returning NoUnfolding
+      -- Things with an INLINE pragma may have an unfolding *and*
+      -- be a loop breaker  (maybe the knot is not yet untied)
+        CoreUnfolding { uf_tmpl = unf_template
+                      , uf_is_work_free = is_wf
+                      , uf_guidance = guidance, uf_expandable = is_exp }
+          | active_unfolding -> tryUnfolding logger opts case_depth id lone_variable
+                                    arg_infos cont_info unf_template
+                                    is_wf is_exp guidance
+          | otherwise -> traceInline logger opts id "Inactive unfolding:" (ppr id) Nothing
+        NoUnfolding      -> Nothing
+        BootUnfolding    -> Nothing
+        OtherCon {}      -> Nothing
+        DFunUnfolding {} -> Nothing     -- Never unfold a DFun
+
+-- | Report the inlining of an identifier's RHS to the user, if requested.
+traceInline :: Logger -> UnfoldingOpts -> Id -> String -> SDoc -> a -> a
+traceInline logger !opts inline_id str doc result
+  -- We take care to ensure that doc is used in only one branch, ensuring that
+  -- the simplifier can push its allocation into the branch. See Note [INLINE
+  -- conditional tracing utilities].
+  | enable    = logTraceMsg logger str doc result
+  | otherwise = result
+  where
+    enable
+      | logHasDumpFlag logger Opt_D_dump_verbose_inlinings
+      = True
+      | Just prefix <- unfoldingReportPrefix opts
+      = prefix `isPrefixOf` occNameString (getOccName inline_id)
+      | otherwise
+      = False
+{-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
+
+{- Note [Avoid inlining into deeply nested cases]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Consider a function f like this:
+
+  f arg1 arg2 =
+    case ...
+      ... -> g arg1
+      ... -> g arg2
+
+This function is small. So should be safe to inline.
+However sometimes this doesn't quite work out like that.
+Consider this code:
+
+f1 arg1 arg2 ... = ...
+    case _foo of
+      alt1 -> ... f2 arg1 ...
+      alt2 -> ... f2 arg2 ...
+
+f2 arg1 arg2 ... = ...
+    case _foo of
+      alt1 -> ... f3 arg1 ...
+      alt2 -> ... f3 arg2 ...
+
+f3 arg1 arg2 ... = ...
+
+... repeats up to n times. And then f1 is
+applied to some arguments:
+
+foo = ... f1 <interestingArgs> ...
+
+Initially f2..fn are not interesting to inline so we don't.
+However we see that f1 is applied to interesting args.
+So it's an obvious choice to inline those:
+
+foo =
+    ...
+      case _foo of
+        alt1 -> ... f2 <interestingArg> ...
+        alt2 -> ... f2 <interestingArg> ...
+
+As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting
+arguments and f2 is small:
+
+foo =
+    ...
+      case _foo of
+        alt1 -> ... case _foo of
+            alt1 -> ... f3 <interestingArg> ...
+            alt2 -> ... f3 <interestingArg> ...
+
+        alt2 -> ... case _foo of
+            alt1 -> ... f3 <interestingArg> ...
+            alt2 -> ... f3 <interestingArg> ...
+
+The same thing happens for each binding up to f_n, duplicating the amount of inlining
+done in each step. Until at some point we are either done or run out of simplifier
+ticks/RAM. This pattern happened #18730.
+
+To combat this we introduce one more heuristic when weighing inlining decision.
+We keep track of a "case-depth". Which increases each time we look inside a case
+expression with more than one alternative.
+
+We then apply a penalty to inlinings based on the case-depth at which they would
+be inlined. Bounding the number of inlinings in such a scenario.
+
+The heuristic can be tuned in two ways:
+
+* We can ignore the first n levels of case nestings for inlining decisions using
+  -funfolding-case-threshold.
+* The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling.
+  Scaling can be set with -funfolding-case-scaling.
+
+Some guidance on setting these defaults:
+
+* A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of
+  control. We picked 2 for no particular reason.
+* Scaling the penalty by any more than 30 means the reproducer from
+  T18730 won't compile even with reasonably small values of n. Instead
+  it will run out of runs/ticks. This means to positively affect the reproducer
+  a scaling <= 30 is required.
+* A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks.
+  (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps)
+* A scaling of >= 25 showed no regressions on nofib. However it showed a number of
+  (small) regression for compiler perf benchmarks.
+
+The end result is that we are settling for a scaling of 30, with a threshold of 2.
+This gives us minimal compiler perf regressions. No nofib runtime regressions and
+will still avoid this pattern sometimes. This is a "safe" default, where we err on
+the side of compiler blowup instead of risking runtime regressions.
+
+For cases where the default falls short the flag can be changed to allow more/less inlining as
+needed on a per-module basis.
+
+-}
+
+tryUnfolding :: Logger -> UnfoldingOpts -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
+             -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
+             -> Maybe CoreExpr
+tryUnfolding logger opts !case_depth id lone_variable
+             arg_infos cont_info unf_template
+             is_wf is_exp guidance
+ = case guidance of
+     UnfNever -> traceInline logger opts id str (text "UnfNever") Nothing
+
+     UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
+        | enough_args && (boring_ok || some_benefit || unfoldingVeryAggressive opts)
+                -- See Note [INLINE for small functions] (3)
+        -> traceInline logger opts id str (mk_doc some_benefit empty True) (Just unf_template)
+        | otherwise
+        -> traceInline logger opts id str (mk_doc some_benefit empty False) Nothing
+        where
+          some_benefit = calc_some_benefit uf_arity
+          enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
+
+     UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
+        | unfoldingVeryAggressive opts
+        -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
+        | is_wf && some_benefit && small_enough
+        -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
+        | otherwise
+        -> traceInline logger opts id str (mk_doc some_benefit extra_doc False) Nothing
+        where
+          !some_benefit = calc_some_benefit (length arg_discounts)
+          extra_doc = vcat [ text "case depth =" <+> int case_depth
+                           , text "depth based penalty =" <+> int depth_penalty
+                           , text "discounted size =" <+> int adjusted_size ]
+          -- See Note [Avoid inlining into deeply nested cases]
+          depth_treshold = unfoldingCaseThreshold opts
+          depth_scaling = unfoldingCaseScaling opts
+          depth_penalty | case_depth <= depth_treshold = 0
+                        | otherwise       = (size * (case_depth - depth_treshold)) `div` depth_scaling
+          !adjusted_size = size + depth_penalty - discount
+          !small_enough = adjusted_size <= unfoldingUseThreshold opts
+          discount = computeDiscount arg_discounts res_discount arg_infos cont_info
+
+  where
+    mk_doc some_benefit extra_doc yes_or_no
+      = vcat [ text "arg infos" <+> ppr arg_infos
+             , text "interesting continuation" <+> ppr cont_info
+             , text "some_benefit" <+> ppr some_benefit
+             , text "is exp:" <+> ppr is_exp
+             , text "is work-free:" <+> ppr is_wf
+             , text "guidance" <+> ppr guidance
+             , extra_doc
+             , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
+
+    ctx = log_default_dump_context (logFlags logger)
+    str = "Considering inlining: " ++ showSDocOneLine ctx (ppr id)
+    n_val_args = length arg_infos
+
+           -- some_benefit is used when the RHS is small enough
+           -- and the call has enough (or too many) value
+           -- arguments (ie n_val_args >= arity). But there must
+           -- be *something* interesting about some argument, or the
+           -- result context, to make it worth inlining
+    calc_some_benefit :: Arity -> Bool   -- The Arity is the number of args
+                                         -- expected by the unfolding
+    calc_some_benefit uf_arity
+       | not saturated = interesting_args       -- Under-saturated
+                                        -- Note [Unsaturated applications]
+       | otherwise = interesting_args   -- Saturated or over-saturated
+                  || interesting_call
+      where
+        saturated      = n_val_args >= uf_arity
+        over_saturated = n_val_args > uf_arity
+        interesting_args = any nonTrivArg arg_infos
+                -- NB: (any nonTriv arg_infos) looks at the
+                -- over-saturated args too which is "wrong";
+                -- but if over-saturated we inline anyway.
+
+        interesting_call
+          | over_saturated
+          = True
+          | otherwise
+          = case cont_info of
+              CaseCtxt   -> not (lone_variable && is_exp)  -- Note [Lone variables]
+              ValAppCtxt -> True                           -- Note [Cast then apply]
+              RuleArgCtxt -> uf_arity > 0  -- See Note [RHS of lets]
+              DiscArgCtxt -> uf_arity > 0  -- Note [Inlining in ArgCtxt]
+              RhsCtxt NonRecursive
+                          -> uf_arity > 0  -- See Note [RHS of lets]
+              _other      -> False         -- See Note [Nested functions]
+
+
+{- Note [RHS of lets]
+~~~~~~~~~~~~~~~~~~~~~
+When the call is the argument of a function with a RULE, or the RHS of a let,
+we are a little bit keener to inline (in tryUnfolding).  For example
+     f y = (y,y,y)
+     g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
+We'd inline 'f' if the call was in a case context, and it kind-of-is,
+only we can't see it.  Also
+     x = f v
+could be expensive whereas
+     x = case v of (a,b) -> a
+is patently cheap and may allow more eta expansion.
+
+So, in `interesting_call` in `tryUnfolding`, we treat the RHS of a
+/non-recursive/ let as not-totally-boring.  A /recursive/ let isn't
+going be inlined so there is much less point.  Hence the (only reason
+for the) RecFlag in RhsCtxt
+
+Note [Unsaturated applications]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When a call is not saturated, we *still* inline if one of the
+arguments has interesting structure.  That's sometimes very important.
+A good example is the Ord instance for Bool in Base:
+
+ Rec {
+    $fOrdBool =GHC.Classes.D:Ord
+                 @ Bool
+                 ...
+                 $cmin_ajX
+
+    $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
+    $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
+  }
+
+But the defn of GHC.Classes.$dmmin is:
+
+  $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
+    {- Arity: 3, HasNoCafRefs, Strictness: SLL,
+       Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
+                   case @ a GHC.Classes.<= @ a $dOrd x y of wild {
+                     GHC.Types.False -> y GHC.Types.True -> x }) -}
+
+We *really* want to inline $dmmin, even though it has arity 3, in
+order to unravel the recursion.
+
+
+Note [Things to watch]
+~~~~~~~~~~~~~~~~~~~~~~
+*   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
+    Assume x is exported, so not inlined unconditionally.
+    Then we want x to inline unconditionally; no reason for it
+    not to, and doing so avoids an indirection.
+
+*   { x = I# 3; ....f x.... }
+    Make sure that x does not inline unconditionally!
+    Lest we get extra allocation.
+
+Note [Inlining an InlineRule]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+An InlineRules is used for
+  (a) programmer INLINE pragmas
+  (b) inlinings from worker/wrapper
+
+For (a) the RHS may be large, and our contract is that we *only* inline
+when the function is applied to all the arguments on the LHS of the
+source-code defn.  (The uf_arity in the rule.)
+
+However for worker/wrapper it may be worth inlining even if the
+arity is not satisfied (as we do in the CoreUnfolding case) so we don't
+require saturation.
+
+Note [Nested functions]
+~~~~~~~~~~~~~~~~~~~~~~~
+At one time we treated a call of a non-top-level function as
+"interesting" (regardless of how boring the context) in the hope
+that inlining it would eliminate the binding, and its allocation.
+Specifically, in the default case of interesting_call we had
+   _other -> not is_top && uf_arity > 0
+
+But actually postInlineUnconditionally does some of this and overall
+it makes virtually no difference to nofib.  So I simplified away this
+special case
+
+Note [Cast then apply]
+~~~~~~~~~~~~~~~~~~~~~~
+Consider
+   myIndex = __inline_me ( (/\a. <blah>) |> co )
+   co :: (forall a. a -> a) ~ (forall a. T a)
+     ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
+
+We need to inline myIndex to unravel this; but the actual call (myIndex a) has
+no value arguments.  The ValAppCtxt gives it enough incentive to inline.
+
+Note [Inlining in ArgCtxt]
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+The condition (arity > 0) here is very important, because otherwise
+we end up inlining top-level stuff into useless places; eg
+   x = I# 3#
+   f = \y.  g x
+This can make a very big difference: it adds 16% to nofib 'integer' allocs,
+and 20% to 'power'.
+
+At one stage I replaced this condition by 'True' (leading to the above
+slow-down).  The motivation was test eyeball/inline1.hs; but that seems
+to work ok now.
+
+NOTE: arguably, we should inline in ArgCtxt only if the result of the
+call is at least CONLIKE.  At least for the cases where we use ArgCtxt
+for the RHS of a 'let', we only profit from the inlining if we get a
+CONLIKE thing (modulo lets).
+
+Note [Lone variables]   See also Note [Interaction of exprIsWorkFree and lone variables]
+~~~~~~~~~~~~~~~~~~~~~   which appears below
+The "lone-variable" case is important.  I spent ages messing about
+with unsatisfactory variants, but this is nice.  The idea is that if a
+variable appears all alone
+
+        as an arg of lazy fn, or rhs    BoringCtxt
+        as scrutinee of a case          CaseCtxt
+        as arg of a fn                  ArgCtxt
+AND
+        it is bound to a cheap expression
+
+then we should not inline it (unless there is some other reason,
+e.g. it is the sole occurrence).  That is what is happening at
+the use of 'lone_variable' in 'interesting_call'.
+
+Why?  At least in the case-scrutinee situation, turning
+        let x = (a,b) in case x of y -> ...
+into
+        let x = (a,b) in case (a,b) of y -> ...
+and thence to
+        let x = (a,b) in let y = (a,b) in ...
+is bad if the binding for x will remain.
+
+Another example: I discovered that strings
+were getting inlined straight back into applications of 'error'
+because the latter is strict.
+        s = "foo"
+        f = \x -> ...(error s)...
+
+Fundamentally such contexts should not encourage inlining because, provided
+the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the
+context can ``see'' the unfolding of the variable (e.g. case or a
+RULE) so there's no gain.
+
+However, watch out:
+
+ * Consider this:
+        foo = _inline_ (\n. [n])
+        bar = _inline_ (foo 20)
+        baz = \n. case bar of { (m:_) -> m + n }
+   Here we really want to inline 'bar' so that we can inline 'foo'
+   and the whole thing unravels as it should obviously do.  This is
+   important: in the NDP project, 'bar' generates a closure data
+   structure rather than a list.
+
+   So the non-inlining of lone_variables should only apply if the
+   unfolding is regarded as cheap; because that is when exprIsConApp_maybe
+   looks through the unfolding.  Hence the "&& is_wf" in the
+   InlineRule branch.
+
+ * Even a type application or coercion isn't a lone variable.
+   Consider
+        case $fMonadST @ RealWorld of { :DMonad a b c -> c }
+   We had better inline that sucker!  The case won't see through it.
+
+   For now, I'm treating treating a variable applied to types
+   in a *lazy* context "lone". The motivating example was
+        f = /\a. \x. BIG
+        g = /\a. \y.  h (f a)
+   There's no advantage in inlining f here, and perhaps
+   a significant disadvantage.  Hence some_val_args in the Stop case
+
+Note [Interaction of exprIsWorkFree and lone variables]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The lone-variable test says "don't inline if a case expression
+scrutinises a lone variable whose unfolding is cheap".  It's very
+important that, under these circumstances, exprIsConApp_maybe
+can spot a constructor application. So, for example, we don't
+consider
+        let x = e in (x,x)
+to be cheap, and that's good because exprIsConApp_maybe doesn't
+think that expression is a constructor application.
+
+In the 'not (lone_variable && is_wf)' test, I used to test is_value
+rather than is_wf, which was utterly wrong, because the above
+expression responds True to exprIsHNF, which is what sets is_value.
+
+This kind of thing can occur if you have
+
+        {-# INLINE foo #-}
+        foo = let x = e in (x,x)
+
+which Roman did.
+
+Note [Minimum value discount]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We always give *some* benefit to value arguments.
+A discount of 10 per arg because we replace the arguments
+and another of 10 if it's some non-trivial value.
+However when computing unfolding guidance we might have come to
+the conclusion that certain argument values deservere little or no
+discount. But we want to chance of inlining to only ever increase as
+more is known about the argument to keep things more predictable. So
+we always give at least 10 discount if the argument is a value. No matter
+what the actual value is.
+-}
+
+
+
+computeDiscount :: [ArgDiscount] -> Int -> [ArgSummary] -> CallCtxt
+                -> Int
+computeDiscount arg_discounts !res_discount arg_infos cont_info
+
+  = 10          -- Discount of 10 because the result replaces the call
+                -- so we count 10 for the function itself
+
+    + 10 * applied_arg_length
+               -- Discount of 10 for each arg supplied,
+               -- because the result replaces the call
+
+    + total_arg_discount + res_discount'
+  where
+    (applied_arg_length,total_arg_discount) = zipWithSumLength arg_discounts arg_infos
+    -- actual_arg_discounts = zipWith mk_arg_discount (arg_discounts) arg_infos
+    -- total_arg_discount   = sum actual_arg_discounts
+
+    -- See Note [Minimum value discount]
+    mk_arg_discount :: ArgDiscount -> ArgSummary -> Int
+    mk_arg_discount _        TrivArg    = 0
+    mk_arg_discount _        NonTrivArg = 10
+    mk_arg_discount NoSeqUse    _       = 10
+    mk_arg_discount discount ValueArg   = max 10 (ad_seq_discount discount)
+    mk_arg_discount (DiscSeq seq_discount con_discounts) (ConArg con args)
+
+      -- There is a discount specific to this constructor, use that.
+      | Just (ConDiscount _ branch_dc arg_discounts) <- lookupUFM con_discounts con
+      = max 10 $ max seq_discount (branch_dc + (sum $ zipWith mk_arg_discount arg_discounts args))
+
+      -- Otherwise give it the generic seq discount
+      | otherwise = max 10 seq_discount
+    mk_arg_discount (SomeArgUse d) ConArg{} = max 10 d
+    mk_arg_discount (FunDisc d _) (ConArg{})
+      -- How can this arise? With dictionary constructors for example.
+      -- We see C:Show foo bar and give it a FunDisc for being applied
+      -- like a function.
+      -- But when constructing ArgSummaries we treat it as constructor
+      -- since well it is one. This is harmless, but a bit odd for sure.
+      -- We just treat it like any other boring ValueArg here.
+      = -- pprTrace "Function discount for con arg" (ppr arg_infos)
+        max 10 d
+
+    -- zipWithSumLength xs ys = (length $ zip xs ys, sum $ zipWith _ xs ys)
+    zipWithSumLength :: [ArgDiscount] -> [ArgSummary] -> (Int, Int)
+    zipWithSumLength dcs args = go 0 0 dcs args
+      where
+        go !length !discount (dc:dcs) (arg:args) =
+          let arg_discount = mk_arg_discount dc arg
+          in go (1+length) (discount + arg_discount) dcs args
+        go l d [] _ = (l,d)
+        go l d _ [] = (l,d)
+
+    res_discount'
+      | LT <- arg_discounts `compareLength` arg_infos
+      = res_discount   -- Over-saturated
+      | otherwise
+      = case cont_info of
+           BoringCtxt  -> 0
+           CaseCtxt    -> res_discount  -- Presumably a constructor
+           ValAppCtxt  -> res_discount  -- Presumably a function
+           _           -> 40 `min` res_discount
+                -- ToDo: this 40 `min` res_discount doesn't seem right
+                --   for DiscArgCtxt it shouldn't matter because the function will
+                --       get the arg discount for any non-triv arg
+                --   for RuleArgCtxt we do want to be keener to inline; but not only
+                --       constructor results
+                --   for RhsCtxt I suppose that exposing a data con is good in general
+                --   And 40 seems very arbitrary
+                --
+                -- res_discount can be very large when a function returns
+                -- constructors; but we only want to invoke that large discount
+                -- when there's a case continuation.
+                -- Otherwise we, rather arbitrarily, threshold it.  Yuk.
+                -- But we want to avoid inlining large functions that return
+                -- constructors into contexts that are simply "interesting"
+
+{- Note [Interesting arguments]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+An argument is interesting if it deserves a discount for unfoldings
+with a discount in that argument position.  The idea is to avoid
+unfolding a function that is applied only to variables that have no
+unfolding (i.e. they are probably lambda bound): f x y z There is
+little point in inlining f here.
+
+Generally, *values* (like (C a b) and (\x.e)) deserve discounts.  But
+we must look through lets, eg (let x = e in C a b), because the let will
+float, exposing the value, if we inline.  That makes it different to
+exprIsHNF.
+
+Before 2009 we said it was interesting if the argument had *any* structure
+at all; i.e. (hasSomeUnfolding v).  But does too much inlining; see #3016.
+
+But we don't regard (f x y) as interesting, unless f is unsaturated.
+If it's saturated and f hasn't inlined, then it's probably not going
+to now!
+
+Note [Conlike is interesting]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider
+        f d = ...((*) d x y)...
+        ... f (df d')...
+where df is con-like. Then we'd really like to inline 'f' so that the
+rule for (*) (df d) can fire.  To do this
+  a) we give a discount for being an argument of a class-op (eg (*) d)
+  b) we say that a con-like argument (eg (df d)) is interesting
+-}
+
+interestingArg :: SimplEnv -> CoreExpr -> ArgSummary
+-- See Note [Interesting arguments]
+interestingArg env e =
+  go env depth_limit 0 e
+    where
+    depth_limit = unfoldingMaxDiscountDepth . sm_uf_opts . seMode $ env
+
+    -- n is # value args to which the expression is applied
+    go :: SimplEnv -> Int -> Int -> CoreExpr -> ArgSummary
+    go !_env max_depth _n !_
+      | max_depth <= 0 = TrivArg
+    go env depth n (Var v)
+       = case substId env v of
+           DoneId v'            -> go_var depth n v'
+           DoneEx e _           -> go (zapSubstEnv env) depth            n e
+           ContEx tvs cvs ids e -> go (setSubstEnv env tvs cvs ids) depth n e
+
+    go _ _depth  _ (Lit l)
+       | isLitRubbish l        = TrivArg -- Leads to unproductive inlining in WWRec, #20035
+       | otherwise             = ValueArg
+    go _ _depth   _ (Type _)          = TrivArg
+    go _ _depth   _ (Coercion _)      = TrivArg
+    go env depth n (App fn (Type _)) = go env depth n fn
+    go env depth n e@(App _fn _arg)
+      | (fn,arg_summaries,_ticks) <- mapArgsTicksVal (go env (depth-1) 0) e
+      = let fn_summary = go env depth (n + length arg_summaries) fn
+        in case fn_summary of
+          ConArg con fn_args
+            | isClassTyCon (dataConTyCon con) -> ValueArg
+            | otherwise ->
+              assert (null fn_args) $
+              ConArg con arg_summaries
+          _ -> fn_summary
+
+    go env depth n (Tick _ a)        = go env depth n a
+    go env depth n (Cast e _)        = go env depth n e
+    go env depth n (Lam v e)
+       | isTyVar v             = go env depth n e
+       | n>0                   = NonTrivArg     -- (\x.b) e   is NonTriv
+       | otherwise             = ValueArg
+    go _ _depth _ (Case {})           = NonTrivArg
+    go env depth n (Let b e)         = case go env' depth n e of
+                                   ValueArg -> ValueArg
+                                   c at ConArg{} -> c
+                                   _        -> NonTrivArg
+                               where
+                                 env' = env `addNewInScopeBndr` b
+
+    go_var depth n v
+      | Just rhs <- maybeUnfoldingTemplate (idUnfolding v)
+      , (f, arg_summaries, _ticks) <- mapArgsTicksVal (go env (depth-1) 0) rhs
+      , Var f' <- varView f
+      , Just con <- isDataConId_maybe f'
+      , not (isClassTyCon $ dataConTyCon con)
+      =
+        -- pprTrace "ConArg1" (ppr $ ConArg con $ map (go env 0) args) $
+        ConArg con arg_summaries
+
+      | Just con <- isDataConId_maybe v
+      = ConArg con []
+      | isConLikeId v     = ValueArg   -- Experimenting with 'conlike' rather that
+                                      --    data constructors here
+      | idArity v > n     = ValueArg   -- Catches (eg) primops with arity but no unfolding
+      | n > 0             = NonTrivArg -- Saturated or unknown call
+      | conlike_unfolding = ValueArg   -- n==0; look for an interesting unfolding
+                                      -- See Note [Conlike is interesting]
+      | otherwise         = TrivArg    -- n==0, no useful unfolding
+      where
+        conlike_unfolding = isConLikeUnfolding (idUnfolding v)
+
+        varView (Cast e _) = e
+        varView (Tick _ e) = e
+        varView e = e
+
+-- | Like @collectArgs@, but maps over the arguments at the same time.
+-- and also looks through casts.
+mapArgsTicksVal :: (Expr b -> c) -> Expr b
+                 -> (Expr b, [c], [CoreTickish])
+mapArgsTicksVal fm expr
+  = go expr [] []
+  where
+    go (App f a)  as ts
+      | isValArg a = go f (fm a:as) ts
+      | otherwise = go f as ts
+    go (Tick t e) as ts = go e as (t:ts)
+    go (Cast e _) as ts = go e as ts
+    go e          as ts = (e, as, reverse ts)
+
+
+contArgs :: SimplCont -> (Bool, [ArgSummary], SimplCont)
+-- Summarises value args, discards type args and coercions
+-- The returned continuation of the call is only used to
+-- answer questions like "are you interesting?"
+contArgs cont
+  | lone cont = (True, [], cont)
+  | otherwise = go [] cont
+  where
+    lone (ApplyToTy  {}) = False  -- See Note [Lone variables] in GHC.Core.Unfold
+    lone (ApplyToVal {}) = False  -- NB: even a type application or cast
+    lone (CastIt {})     = False  --     stops it being "lone"
+    lone _               = True
+
+    go args (ApplyToVal { sc_arg = arg, sc_env = se, sc_cont = k })
+                                                  = go (is_interesting arg se : args) k
+    go args (ApplyToTy { sc_cont = k }) = go args k
+    go args (CastIt _ k)                = go args k
+    go args k                           = (False, reverse args, k)
+
+    -- is_interesting arg _se = arg
+    is_interesting arg se = interestingArg se arg
+                   -- Do *not* use short-cutting substitution here
+                   -- because we want to get as much IdInfo as possible


=====================================
compiler/GHC/Core/Opt/Simplify/Iteration.hs
=====================================
@@ -74,6 +74,7 @@ import GHC.Utils.Logger
 import GHC.Utils.Misc
 
 import Control.Monad
+import GHC.Core.Opt.Simplify.Inline
 
 {-
 The guts of the simplifier is in this module, but the driver loop for
@@ -3568,7 +3569,7 @@ mkDupableContWithDmds env _
     do { let rhs_ty       = contResultType cont
              (m,arg_ty,_) = splitFunTy fun_ty
        ; arg_bndr <- newId (fsLit "arg") m arg_ty
-       ; let env' = env `addNewInScopeIds` [arg_bndr]
+       ; let env' = env `addNewInScopeId` arg_bndr
        ; (floats, join_rhs) <- rebuildCall env' (addValArgTo fun (Var arg_bndr) fun_ty) cont
        ; mkDupableStrictBind env' arg_bndr (wrapFloats floats join_rhs) rhs_ty }
   where
@@ -3670,7 +3671,7 @@ mkDupableStrictBind env arg_bndr join_rhs res_ty
        ; let arg_info = ArgInfo { ai_fun   = join_bndr
                                 , ai_rules = Nothing, ai_args  = []
                                 , ai_encl  = False, ai_dmds  = repeat topDmd
-                                , ai_discs = repeat 0 }
+                                , ai_discs = repeat NoSeqUse }
        ; return ( addJoinFloats (emptyFloats env) $
                   unitJoinFloat                   $
                   NonRec join_bndr                $


=====================================
compiler/GHC/Core/Opt/Simplify/Utils.hs
=====================================
@@ -24,7 +24,7 @@ module GHC.Core.Opt.Simplify.Utils (
         SimplCont(..), DupFlag(..), StaticEnv,
         isSimplified, contIsStop,
         contIsDupable, contResultType, contHoleType, contHoleScaling,
-        contIsTrivial, contArgs, contIsRhs,
+        contIsTrivial, contIsRhs,
         countArgs,
         mkBoringStop, mkRhsStop, mkLazyArgStop,
         interestingCallContext,
@@ -44,7 +44,6 @@ module GHC.Core.Opt.Simplify.Utils (
 import GHC.Prelude
 
 import GHC.Core
-import GHC.Types.Literal ( isLitRubbish )
 import GHC.Core.Opt.Simplify.Env
 import GHC.Core.Opt.Stats ( Tick(..) )
 import qualified GHC.Core.Subst
@@ -320,8 +319,9 @@ data ArgInfo
                                 --   that the function diverges after being given
                                 --   that number of args
 
-        ai_discs :: [Int]       -- Discounts for remaining value arguments (beyong ai_args)
-                                --   non-zero => be keener to inline
+        ai_discs :: [ArgDiscount]
+                                -- Discounts for remaining value arguments (beyong ai_args)
+                                --   non-NoSeqUse => be keener to inline
                                 --   Always infinite
     }
 
@@ -525,29 +525,6 @@ countArgs (ApplyToVal { sc_cont = cont }) = 1 + countArgs cont
 countArgs (CastIt _ cont)                 = countArgs cont
 countArgs _                               = 0
 
-contArgs :: SimplCont -> (Bool, [ArgSummary], SimplCont)

--- Summarises value args, discards type args and coercions
--- The returned continuation of the call is only used to
--- answer questions like "are you interesting?"
-contArgs cont
-  | lone cont = (True, [], cont)
-  | otherwise = go [] cont
-  where
-    lone (ApplyToTy  {}) = False  -- See Note [Lone variables] in GHC.Core.Unfold
-    lone (ApplyToVal {}) = False  -- NB: even a type application or cast
-    lone (CastIt {})     = False  --     stops it being "lone"
-    lone _               = True
-
-    go args (ApplyToVal { sc_arg = arg, sc_env = se, sc_cont = k })
-                                        = go (is_interesting arg se : args) k
-    go args (ApplyToTy { sc_cont = k }) = go args k
-    go args (CastIt _ k)                = go args k
-    go args k                           = (False, reverse args, k)
-
-    is_interesting arg se = interestingArg se arg
-                   -- Do *not* use short-cutting substitution here
-                   -- because we want to get as much IdInfo as possible
-
 -- | Describes how the 'SimplCont' will evaluate the hole as a 'SubDemand'.
 -- This can be more insightful than the limited syntactic context that
 -- 'SimplCont' provides, because the 'Stop' constructor might carry a useful
@@ -597,14 +574,14 @@ mkArgInfo env fun rules n_val_args call_cont
   = ArgInfo { ai_fun   = fun
             , ai_args  = []
             , ai_rules = fun_rules
-            , ai_encl  = interestingArgContext rules call_cont
+            , ai_encl  = computeArgShapeContext rules call_cont
             , ai_dmds  = add_type_strictness (idType fun) arg_dmds
             , ai_discs = arg_discounts }
   where
     fun_rules = mkFunRules rules
 
-    vanilla_discounts, arg_discounts :: [Int]
-    vanilla_discounts = repeat 0
+    vanilla_discounts, arg_discounts :: [ArgDiscount]
+    vanilla_discounts = repeat NoSeqUse
     arg_discounts = case idUnfolding fun of
                         CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_args = discounts}}
                               -> discounts ++ vanilla_discounts
@@ -758,14 +735,14 @@ lazyArgContext :: ArgInfo -> CallCtxt
 -- Use this for lazy arguments
 lazyArgContext (ArgInfo { ai_encl = encl_rules, ai_discs = discs })
   | encl_rules                = RuleArgCtxt
-  | disc:_ <- discs, disc > 0 = DiscArgCtxt  -- Be keener here
+  | disc:_ <- discs, disc /= NoSeqUse = DiscArgCtxt  -- Be keener here
   | otherwise                 = BoringCtxt   -- Nothing interesting
 
 strictArgContext :: ArgInfo -> CallCtxt
 strictArgContext (ArgInfo { ai_encl = encl_rules, ai_discs = discs })
 -- Use this for strict arguments
   | encl_rules                = RuleArgCtxt
-  | disc:_ <- discs, disc > 0 = DiscArgCtxt  -- Be keener here
+  | disc:_ <- discs, disc /= NoSeqUse = DiscArgCtxt  -- Be keener here
   | otherwise                 = RhsCtxt NonRecursive
       -- Why RhsCtxt?  if we see f (g x), and f is strict, we
       -- want to be a bit more eager to inline g, because it may
@@ -814,7 +791,7 @@ interestingCallContext env cont
         -- a build it's *great* to inline it here.  So we must ensure that
         -- the context for (f x) is not totally uninteresting.
 
-interestingArgContext :: [CoreRule] -> SimplCont -> Bool
+computeArgShapeContext :: [CoreRule] -> SimplCont -> Bool
 -- If the argument has form (f x y), where x,y are boring,
 -- and f is marked INLINE, then we don't want to inline f.
 -- But if the context of the argument is
@@ -823,7 +800,7 @@ interestingArgContext :: [CoreRule] -> SimplCont -> Bool
 -- exposes a rule that might fire.  Similarly, if the context is
 --      h (g (f x x))
 -- where h has rules, then we do want to inline f; hence the
--- call_cont argument to interestingArgContext
+-- call_cont argument to computeArgShapeContext
 --
 -- The ai-rules flag makes this happen; if it's
 -- set, the inliner gets just enough keener to inline f
@@ -833,9 +810,9 @@ interestingArgContext :: [CoreRule] -> SimplCont -> Bool
 -- regardless of how boring its context is; but that seems overkill
 -- For example, it'd mean that wrapper functions were always inlined
 --
--- The call_cont passed to interestingArgContext is the context of
+-- The call_cont passed to computeArgShapeContext is the context of
 -- the call itself, e.g. g <hole> in the example above
-interestingArgContext rules call_cont
+computeArgShapeContext rules call_cont
   = notNull rules || enclosing_fn_has_rules
   where
     enclosing_fn_has_rules = go call_cont
@@ -850,78 +827,7 @@ interestingArgContext rules call_cont
     go (Stop _ _ _)                 = False
     go (TickIt _ c)                 = go c
 
-{- Note [Interesting arguments]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-An argument is interesting if it deserves a discount for unfoldings
-with a discount in that argument position.  The idea is to avoid
-unfolding a function that is applied only to variables that have no
-unfolding (i.e. they are probably lambda bound): f x y z There is
-little point in inlining f here.
-
-Generally, *values* (like (C a b) and (\x.e)) deserve discounts.  But
-we must look through lets, eg (let x = e in C a b), because the let will
-float, exposing the value, if we inline.  That makes it different to
-exprIsHNF.
-
-Before 2009 we said it was interesting if the argument had *any* structure
-at all; i.e. (hasSomeUnfolding v).  But does too much inlining; see #3016.
 
-But we don't regard (f x y) as interesting, unless f is unsaturated.
-If it's saturated and f hasn't inlined, then it's probably not going
-to now!
-
-Note [Conlike is interesting]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Consider
-        f d = ...((*) d x y)...
-        ... f (df d')...
-where df is con-like. Then we'd really like to inline 'f' so that the
-rule for (*) (df d) can fire.  To do this
-  a) we give a discount for being an argument of a class-op (eg (*) d)
-  b) we say that a con-like argument (eg (df d)) is interesting
--}
-
-interestingArg :: SimplEnv -> CoreExpr -> ArgSummary
--- See Note [Interesting arguments]
-interestingArg env e = go env 0 e
-  where
-    -- n is # value args to which the expression is applied
-    go env n (Var v)
-       = case substId env v of
-           DoneId v'            -> go_var n v'
-           DoneEx e _           -> go (zapSubstEnv env)             n e
-           ContEx tvs cvs ids e -> go (setSubstEnv env tvs cvs ids) n e
-
-    go _   _ (Lit l)
-       | isLitRubbish l        = TrivArg -- Leads to unproductive inlining in WWRec, #20035
-       | otherwise             = ValueArg
-    go _   _ (Type _)          = TrivArg
-    go _   _ (Coercion _)      = TrivArg
-    go env n (App fn (Type _)) = go env n fn
-    go env n (App fn _)        = go env (n+1) fn
-    go env n (Tick _ a)        = go env n a
-    go env n (Cast e _)        = go env n e
-    go env n (Lam v e)
-       | isTyVar v             = go env n e
-       | n>0                   = NonTrivArg     -- (\x.b) e   is NonTriv
-       | otherwise             = ValueArg
-    go _ _ (Case {})           = NonTrivArg
-    go env n (Let b e)         = case go env' n e of
-                                   ValueArg -> ValueArg
-                                   _        -> NonTrivArg
-                               where
-                                 env' = env `addNewInScopeIds` bindersOf b
-
-    go_var n v
-       | isConLikeId v     = ValueArg   -- Experimenting with 'conlike' rather that
-                                        --    data constructors here
-       | idArity v > n     = ValueArg   -- Catches (eg) primops with arity but no unfolding
-       | n > 0             = NonTrivArg -- Saturated or unknown call
-       | conlike_unfolding = ValueArg   -- n==0; look for an interesting unfolding
-                                        -- See Note [Conlike is interesting]
-       | otherwise         = TrivArg    -- n==0, no useful unfolding
-       where
-         conlike_unfolding = isConLikeUnfolding (idUnfolding v)
 
 {-
 ************************************************************************


=====================================
compiler/GHC/Core/Ppr.hs
=====================================
@@ -614,7 +614,8 @@ instance Outputable UnfoldingGuidance where
                 text "boring_ok=" <> ppr boring_ok)
     ppr (UnfIfGoodArgs { ug_args = cs, ug_size = size, ug_res = discount })
       = hsep [ text "IF_ARGS",
-               brackets (hsep (map int cs)),
+               ppUnlessOption sdocSuppressUnfoldingGuidance
+               (brackets (hsep (map ppr cs))),
                int size,
                int discount ]
 


=====================================
compiler/GHC/Core/Seq.hs
=====================================
@@ -23,6 +23,8 @@ import GHC.Types.Var( varType, tyVarKind )
 import GHC.Core.Type( seqType, isTyVar )
 import GHC.Core.Coercion( seqCo )
 import GHC.Types.Id( idInfo )
+import GHC.Utils.Misc (seqList)
+import GHC.Types.Unique.FM (seqEltsUFM)
 
 -- | Evaluate all the fields of the 'IdInfo' that are generally demanded by the
 -- compiler
@@ -112,5 +114,15 @@ seqUnfolding (CoreUnfolding { uf_tmpl = e, uf_is_top = top,
 seqUnfolding _ = ()
 
 seqGuidance :: UnfoldingGuidance -> ()
-seqGuidance (UnfIfGoodArgs ns n b) = n `seq` sum ns `seq` b `seq` ()
+seqGuidance (UnfIfGoodArgs ns n b) = n `seq` (seqList (map seqArgDiscount ns) ()) `seq` b `seq` ()
 seqGuidance _                      = ()
+
+-- seqTopDiscount :: (a, ArgDiscount) -> ()
+-- seqTopDiscount (!_,dc) = seqArgDiscount dc
+
+seqArgDiscount :: ArgDiscount -> ()
+seqArgDiscount (DiscSeq !_ sub_args) = seqEltsUFM seqConDiscount sub_args
+seqArgDiscount !_ = ()
+
+seqConDiscount :: ConDiscount -> ()
+seqConDiscount (ConDiscount !_ !_ sub_args) = seqList (map seqArgDiscount sub_args) ()
\ No newline at end of file


=====================================
compiler/GHC/Core/Unfold.hs
=====================================
@@ -26,20 +26,19 @@ module GHC.Core.Unfold (
         updateFunAppDiscount, updateDictDiscount,
         updateVeryAggressive, updateCaseScaling,
         updateCaseThreshold, updateReportPrefix,
+        updateMaxDiscountDepth, updateMaxGuideDepth,
 
-        ArgSummary(..),
+        ArgSummary(..), nonTrivArg,
 
         couldBeSmallEnoughToInline, inlineBoringOk,
         smallEnoughToInline,
 
-        callSiteInline, CallCtxt(..),
+        CallCtxt(..),
         calcUnfoldingGuidance
     ) where
 
 import GHC.Prelude
 
-import GHC.Driver.Flags
-
 import GHC.Core
 import GHC.Core.Utils
 import GHC.Types.Id
@@ -51,16 +50,19 @@ import GHC.Types.RepType ( isZeroBitTy )
 import GHC.Types.Basic  ( Arity, RecFlag(..) )
 import GHC.Core.Type
 import GHC.Builtin.Names
-import GHC.Data.Bag
-import GHC.Utils.Logger
 import GHC.Utils.Misc
 import GHC.Utils.Outputable
 import GHC.Types.ForeignCall
-import GHC.Types.Name
 import GHC.Types.Tickish
 
 import qualified Data.ByteString as BS
-import Data.List (isPrefixOf)
+import GHC.Types.Unique.FM
+import Data.Maybe
+import GHC.Types.Var.Env
+import GHC.Utils.Panic.Plain (assert)
+import GHC.Data.Graph.UnVar
+import GHC.Utils.Trace (pprTraceDebug)
+
 
 
 -- | Unfolding options
@@ -88,6 +90,11 @@ data UnfoldingOpts = UnfoldingOpts
 
    , unfoldingReportPrefix :: !(Maybe String)
       -- ^ Only report inlining decisions for names with this prefix
+
+   , unfoldingMaxDiscountDepth :: !Int
+      -- ^ When considering unfolding a definition look this deep
+      -- into the applied arguments and into nested argument use.
+
    }
 
 defaultUnfoldingOpts :: UnfoldingOpts
@@ -122,6 +129,8 @@ defaultUnfoldingOpts = UnfoldingOpts
 
       -- Don't filter inlining decision reports
    , unfoldingReportPrefix = Nothing
+
+   , unfoldingMaxDiscountDepth = 20
    }
 
 -- Helpers for "GHC.Driver.Session"
@@ -151,6 +160,10 @@ updateCaseScaling n opts = opts { unfoldingCaseScaling = n }
 updateReportPrefix :: Maybe String -> UnfoldingOpts -> UnfoldingOpts
 updateReportPrefix n opts = opts { unfoldingReportPrefix = n }
 
+updateMaxDiscountDepth :: Int -> UnfoldingOpts -> UnfoldingOpts
+updateMaxDiscountDepth n opts = opts { unfoldingMaxDiscountDepth = n }
+
+
 {-
 Note [Occurrence analysis of unfoldings]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -258,17 +271,11 @@ calcUnfoldingGuidance opts is_top_bottoming expr
     val_bndrs   = filter isId bndrs
     n_val_bndrs = length val_bndrs
 
-    mk_discount :: Bag (Id,Int) -> Id -> Int
-    mk_discount cbs bndr = foldl' combine 0 cbs
-           where
-             combine acc (bndr', disc)
-               | bndr == bndr' = acc `plus_disc` disc
-               | otherwise     = acc
-
-             plus_disc :: Int -> Int -> Int
-             plus_disc | isFunTy (idType bndr) = max
-                       | otherwise             = (+)
-             -- See Note [Function and non-function discounts]
+    mk_discount :: VarEnv ArgDiscount -> Id -> (ArgDiscount)
+    mk_discount cbs bndr =
+        let !dc = lookupWithDefaultVarEnv cbs NoSeqUse bndr
+            -- !depth = discountDepth dc
+        in (dc)
 
 {- Note [Inline unsafeCoerce]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -402,6 +409,39 @@ the constructor would lead to a separate allocation instead of just
 sharing the wrapper closure.
 
 The solution: don’t ignore coercion arguments after all.
+
+Note [Nested discounts]
+~~~~~~~~~~~~~~~~~~~~~~~
+If we have code like:
+
+f :: Either Bool Int -> Int
+f x = case x of
+    Left l -> case l of
+        True -> 1
+        False -> sum [1..200] :: Int
+    Right r -> case r of
+        r -> r + succ r
+
+What should the discount be? I argue it should be a discount *tree* like this:
+    [some_con:62,
+      Left:43
+        [some_con:30, False:10[], True:49[]],
+      Right:70
+        [some_con:20, GHC.Types.I#:11[disc:30]]
+        ]
+
+How can we compute this? While we traverse the expression if we see a case on a interesting
+binder (e.g. x):
+* We look at all the alternatives(Left,Right), treating the constructor bound vars(l,r) as
+  additional interesting binders for their rhss.
+* We compute a default discount `some_con` which assumes we will statically choose the largest
+  alternative if we inline.
+* We compute specific discounts for each constructor alternative by attributing any
+  discount on alternative binders(l,r) to the constructor this alternative is matching on.
+* The discounts for the whole case are then represented by the combination of the flat default discount
+  as well as a list of constructor specific discounts, which carry inside them the discounts for their
+  arguments.
+
 -}
 
 uncondInline :: CoreExpr -> Arity -> Int -> Bool
@@ -412,6 +452,7 @@ uncondInline rhs arity size
   | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
   | otherwise = exprIsTrivial rhs        -- See Note [INLINE for small functions] (4)
 
+-- {-# NOINLINE sizeExpr #-}
 sizeExpr :: UnfoldingOpts
          -> Int             -- Bomb out if it gets bigger than this
          -> [Id]            -- Arguments; we're interested in which of these
@@ -423,81 +464,107 @@ sizeExpr :: UnfoldingOpts
 
 -- Forcing bOMB_OUT_SIZE early prevents repeated
 -- unboxing of the Int argument.
-sizeExpr opts !bOMB_OUT_SIZE top_args expr
-  = size_up expr
+sizeExpr opts !bOMB_OUT_SIZE top_args' expr
+  = size_up depth_limit (mkUnVarSet top_args') expr
   where
-    size_up (Cast e _) = size_up e
-    size_up (Tick _ e) = size_up e
-    size_up (Type _)   = sizeZero           -- Types cost nothing
-    size_up (Coercion _) = sizeZero
-    size_up (Lit lit)  = sizeN (litSize lit)
-    size_up (Var f) | isZeroBitId f = sizeZero
-                      -- Make sure we get constructor discounts even
-                      -- on nullary constructors
-                    | otherwise       = size_up_call f [] 0
-
-    size_up (App fun arg)
-      | isTyCoArg arg = size_up fun
-      | otherwise     = size_up arg  `addSizeNSD`
-                        size_up_app fun [arg] (if isZeroBitExpr arg then 1 else 0)
-
-    size_up (Lam b e)
-      | isId b && not (isZeroBitId b) = lamScrutDiscount opts (size_up e `addSizeN` 10)
-      | otherwise = size_up e
-
-    size_up (Let (NonRec binder rhs) body)
-      = size_up_rhs (binder, rhs) `addSizeNSD`
-        size_up body              `addSizeN`
+    depth_limit = unfoldingMaxDiscountDepth opts
+    size_up :: Int -> UnVarSet -> Expr Var -> ExprSize
+    size_up !depth !arg_comps (Cast e _) = size_up depth arg_comps e
+    size_up !depth arg_comps (Tick _ e) = size_up depth arg_comps e
+    size_up !_depth _arg_comps (Type _)   = sizeZero           -- Types cost nothing
+    size_up !_depth _arg_comps (Coercion _) = sizeZero
+    size_up !_depth _arg_comps (Lit lit)  = sizeN (litSize lit)
+    size_up !_depth  arg_comps (Var f) | isZeroBitId f = sizeZero
+                                -- Make sure we get constructor discounts even
+                                -- on nullary constructors
+                              | otherwise       = size_up_call arg_comps f [] 0
+
+    size_up !depth arg_comps (App fun arg)
+      | isTyCoArg arg = size_up depth arg_comps fun
+      | otherwise     = size_up depth arg_comps arg  `addSizeNSD`
+                        size_up_app depth arg_comps  fun [arg] (if isZeroBitExpr arg then 1 else 0)
+
+    size_up !depth arg_comps (Lam b e)
+      | isId b && not (isZeroBitId b) = lamScrutDiscount opts (size_up depth (delUnVarSet arg_comps b) e `addSizeN` 10)
+      | otherwise = size_up depth (delUnVarSet arg_comps b) e
+
+    size_up !depth arg_comps (Let (NonRec binder rhs) body)
+      = let arg_comps' = delUnVarSet arg_comps binder
+        in
+        size_up_rhs depth  arg_comps' (binder, rhs) `addSizeNSD`
+        size_up depth arg_comps' body              `addSizeN`
         size_up_alloc binder
 
-    size_up (Let (Rec pairs) body)
-      = foldr (addSizeNSD . size_up_rhs)
-              (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
+    size_up !depth arg_comps (Let (Rec pairs) body)
+      = let lhs_bnds = map fst pairs
+            arg_comps' = delUnVarSetList arg_comps lhs_bnds
+        in
+        foldr (addSizeNSD . (size_up_rhs depth  arg_comps'))
+              (size_up depth arg_comps' body `addSizeN` sum (map (size_up_alloc . fst) pairs))
               pairs
 
-    size_up (Case e _ _ alts)
+    size_up !depth arg_comps (Case e _ _ alts)
         | null alts
-        = size_up e    -- case e of {} never returns, so take size of scrutinee
+        = size_up depth arg_comps e    -- case e of {} never returns, so take size of scrutinee
 
-    size_up (Case e _ _ alts)
+    size_up !depth arg_comps (Case e _ _ alts)
         -- Now alts is non-empty
-        | Just v <- is_top_arg e -- We are scrutinising an argument variable
+        -- We are scrutinising an argument variable or a subcomponent thereof.
+        | Just v <- is_top_arg e
         = let
-            alt_sizes = map size_up_alt alts
-
-                  -- alts_size tries to compute a good discount for
-                  -- the case when we are scrutinising an argument variable
+            -- Compute size of alternatives
+            alt_sizes = map (size_up_alt depth (Just v) arg_comps) alts
+
+            -- Apply a discount for a given constructor that brings the size down to just
+            -- the size of the alternative.
+            alt_size_discount tot_size (Alt (DataAlt con) alt_bndrs _rhs) (SizeIs alt_size _ _) =
+              let trim_discount = max 10 $ tot_size - alt_size
+              in Just (unitUFM con (ConDiscount con trim_discount (map (const NoSeqUse) alt_bndrs)))
+            alt_size_discount _tot_size _ _alt_size = Nothing
+
+            -- Add up discounts from the alternatives
+            added_alt_sizes = (foldr1 addAltSize alt_sizes)
+            -- Compute size of the largest rhs
+            largest_alt_size = (foldr (maxSize bOMB_OUT_SIZE) 0 alt_sizes)
+
+            -- alts_size tries to compute a good discount for
+            -- the case when we are scrutinising an argument variable or subcomponent thereof
             alts_size (SizeIs tot tot_disc tot_scrut)
-                          -- Size of all alternatives
-                      (SizeIs max _        _)
-                          -- Size of biggest alternative
-                  = SizeIs tot (unitBag (v, 20 + tot - max)
-                      `unionBags` tot_disc) tot_scrut
-                          -- If the variable is known, we produce a
-                          -- discount that will take us back to 'max',
-                          -- the size of the largest alternative The
-                          -- 1+ is a little discount for reduced
-                          -- allocation in the caller
+                      largest_alt_size
+
+                  = let default_alt_discount = 20 + tot - largest_alt_size
+                        alt_discounts = unitUFM v $ DiscSeq default_alt_discount $ plusUFMList $ catMaybes $ zipWith (alt_size_discount tot) alts alt_sizes
+                    in
+                    SizeIs tot
+                           (tot_disc
+                            `plusDiscountEnv` (alt_discounts))
+                            tot_scrut
+                          -- If the variable is known but we don't have a
+                          -- specific constructor discount for it, we produce a
+                          -- discount that will take us back to 'largest_alt_size',
+                          -- the size of the largest alternative.
                           --
                           -- Notice though, that we return tot_disc,
                           -- the total discount from all branches.  I
                           -- think that's right.
 
-            alts_size tot_size _ = tot_size
+            alts_size tot_size _max_alt_size = tot_size -- Too Big
           in
-          alts_size (foldr1 addAltSize alt_sizes)  -- alts is non-empty
-                    (foldr1 maxSize    alt_sizes)
+          -- Why foldr1? We might get TooBig already after the first few alternatives
+          -- in which case we don't have to look at the remaining ones.
+          alts_size added_alt_sizes  -- alts is non-empty
+                    largest_alt_size
                 -- Good to inline if an arg is scrutinised, because
                 -- that may eliminate allocation in the caller
                 -- And it eliminates the case itself
         where
-          is_top_arg (Var v) | v `elem` top_args = Just v
+          is_top_arg (Var v) | v `elemUnVarSet` arg_comps = Just v
           is_top_arg (Cast e _) = is_top_arg e
           is_top_arg _ = Nothing
 
 
-    size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
-                                foldr (addAltSize . size_up_alt) case_size alts
+    size_up !depth arg_comps (Case e _ _ alts) = size_up depth arg_comps e  `addSizeNSD`
+                                foldr (addAltSize . (size_up_alt depth Nothing arg_comps) ) case_size alts
       where
           case_size
            | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
@@ -534,42 +601,54 @@ sizeExpr opts !bOMB_OUT_SIZE top_args expr
               | otherwise
                 = False
 
-    size_up_rhs (bndr, rhs)
+    size_up_rhs !depth !arg_comps (bndr, rhs)
       | Just join_arity <- isJoinId_maybe bndr
         -- Skip arguments to join point
-      , (_bndrs, body) <- collectNBinders join_arity rhs
-      = size_up body
+      , (bndrs, body) <- collectNBinders join_arity rhs
+      = size_up depth (delUnVarSetList arg_comps bndrs) body
       | otherwise
-      = size_up rhs
+      = size_up depth arg_comps rhs
 
     ------------
     -- size_up_app is used when there's ONE OR MORE value args
-    size_up_app (App fun arg) args voids
-        | isTyCoArg arg                  = size_up_app fun args voids
-        | isZeroBitExpr arg              = size_up_app fun (arg:args) (voids + 1)
-        | otherwise                      = size_up arg  `addSizeNSD`
-                                           size_up_app fun (arg:args) voids
-    size_up_app (Var fun)     args voids = size_up_call fun args voids
-    size_up_app (Tick _ expr) args voids = size_up_app expr args voids
-    size_up_app (Cast expr _) args voids = size_up_app expr args voids
-    size_up_app other         args voids = size_up other `addSizeN`
+    size_up_app depth !arg_comps  (App fun arg) args voids
+        | isTyCoArg arg                  = size_up_app depth arg_comps  fun args voids
+        | isZeroBitExpr arg              = size_up_app depth arg_comps  fun (arg:args) (voids + 1)
+        | otherwise                      = size_up depth arg_comps arg  `addSizeNSD`
+                                           size_up_app depth arg_comps  fun (arg:args) voids
+    size_up_app _depth arg_comps (Var fun)     args voids = size_up_call arg_comps fun args voids
+    size_up_app depth arg_comps  (Tick _ expr) args voids = size_up_app depth arg_comps  expr args voids
+    size_up_app depth arg_comps  (Cast expr _) args voids = size_up_app depth arg_comps  expr args voids
+    size_up_app depth arg_comps  other         args voids = size_up depth arg_comps other `addSizeN`
                                            callSize (length args) voids
        -- if the lhs is not an App or a Var, or an invisible thing like a
        -- Tick or Cast, then we should charge for a complete call plus the
        -- size of the lhs itself.
 
     ------------
-    size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
-    size_up_call fun val_args voids
+    size_up_call :: UnVarSet -> Id -> [CoreExpr] -> Int -> ExprSize
+    size_up_call !arg_comps fun val_args voids
        = case idDetails fun of
            FCallId _        -> sizeN (callSize (length val_args) voids)
            DataConWorkId dc -> conSize    dc (length val_args)
            PrimOpId op _    -> primOpSize op (length val_args)
-           ClassOpId _      -> classOpSize opts top_args val_args
-           _                -> funSize opts top_args fun (length val_args) voids
+           ClassOpId _      -> classOpSize opts arg_comps val_args
+           _                -> funSize opts arg_comps fun (length val_args) voids
 
     ------------
-    size_up_alt (Alt _con _bndrs rhs) = size_up rhs `addSizeN` 10
+    -- Take into acount the binders of scrutinized argument binders
+    -- But not too deeply! Hence we check if we exhausted depth.
+    -- If so we simply ingore the case binders.
+    size_up_alt depth m_top_arg !arg_comps  (Alt alt_con bndrs rhs)
+      | Just top_arg <- m_top_arg
+      , depth > 0
+      , DataAlt con <- alt_con
+      =
+        let alt_size = size_up depth (extendUnVarSetList bndrs arg_comps) rhs `addSizeN` 10
+        -- let alt_size = size_up (arg_comps) rhs `addSizeN` 10
+
+        in asExprSize top_arg alt_size con bndrs
+    size_up_alt depth _ arg_comps  (Alt _con bndrs rhs) = size_up depth (delUnVarSetList arg_comps bndrs) rhs `addSizeN` 10
         -- Don't charge for args, so that wrappers look cheap
         -- (See comments about wrappers with Case)
         --
@@ -590,6 +669,7 @@ sizeExpr opts !bOMB_OUT_SIZE top_args expr
     ------------
         -- These addSize things have to be here because
         -- I don't want to give them bOMB_OUT_SIZE as an argument
+    addSizeN :: ExprSize -> Int -> ExprSize
     addSizeN TooBig          _  = TooBig
     addSizeN (SizeIs n xs d) m  = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
 
@@ -598,7 +678,7 @@ sizeExpr opts !bOMB_OUT_SIZE top_args expr
     addAltSize _                 TooBig = TooBig
     addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
         = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
-                                 (xs `unionBags` ys)
+                                 (xs `plusDiscountEnv` ys)
                                  (d1 + d2) -- Note [addAltSize result discounts]
 
         -- This variant ignores the result discount from its LEFT argument
@@ -607,7 +687,7 @@ sizeExpr opts !bOMB_OUT_SIZE top_args expr
     addSizeNSD _                 TooBig = TooBig
     addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
         = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
-                                 (xs `unionBags` ys)
+                                 (xs `plusDiscountEnv` ys)
                                  d2  -- Ignore d1
 
     -- don't count expressions such as State# RealWorld
@@ -631,21 +711,23 @@ litSize _other = 0    -- Must match size of nullary constructors
                       -- Key point: if  x |-> 4, then x must inline unconditionally
                       --            (eg via case binding)
 
-classOpSize :: UnfoldingOpts -> [Id] -> [CoreExpr] -> ExprSize
+classOpSize :: UnfoldingOpts -> UnVarSet -> [CoreExpr] -> ExprSize
 -- See Note [Conlike is interesting]
 classOpSize _ _ []
   = sizeZero
-classOpSize opts top_args (arg1 : other_args)
-  = SizeIs size arg_discount 0
+classOpSize opts !top_args (arg1 : other_args)
+  = SizeIs size (arg_discount arg1) 0
   where
     size = 20 + (10 * length other_args)
     -- If the class op is scrutinising a lambda bound dictionary then
     -- give it a discount, to encourage the inlining of this function
     -- The actual discount is rather arbitrarily chosen
-    arg_discount = case arg1 of
-                     Var dict | dict `elem` top_args
-                              -> unitBag (dict, unfoldingDictDiscount opts)
-                     _other   -> emptyBag
+    arg_discount dict_arg = case dict_arg of
+                      Var dict | dict `elemUnVarSet` top_args
+                                -> unitUFM dict (classOpArgDiscount $ unfoldingDictDiscount opts)
+                      Tick _ e -> arg_discount e
+                      Cast e _ -> arg_discount e
+                      _other   -> mempty
 
 -- | The size of a function call
 callSize
@@ -668,10 +750,10 @@ jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
   -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
   -- better solution?
 
-funSize :: UnfoldingOpts -> [Id] -> Id -> Int -> Int -> ExprSize
+funSize :: UnfoldingOpts -> UnVarSet -> Id -> Int -> Int -> ExprSize
 -- Size for functions that are not constructors or primops
 -- Note [Function applications]
-funSize opts top_args fun n_val_args voids
+funSize opts !top_args fun n_val_args voids
   | fun `hasKey` buildIdKey   = buildSize
   | fun `hasKey` augmentIdKey = augmentSize
   | otherwise = SizeIs size arg_discount res_discount
@@ -685,9 +767,10 @@ funSize opts top_args fun n_val_args voids
 
         --                  DISCOUNTS
         --  See Note [Function and non-function discounts]
-    arg_discount | some_val_args && fun `elem` top_args
-                 = unitBag (fun, unfoldingFunAppDiscount opts)
-                 | otherwise = emptyBag
+    arg_discount | some_val_args && fun `elemUnVarSet` top_args
+                 = -- pprTrace "mkFunSize" (ppr fun) $
+                   unitUFM fun (FunDisc (unfoldingFunAppDiscount opts) (idName fun))
+                 | otherwise = mempty
         -- If the function is an argument and is applied
         -- to some values, give it an arg-discount
 
@@ -698,13 +781,13 @@ funSize opts top_args fun n_val_args voids
 
 conSize :: DataCon -> Int -> ExprSize
 conSize dc n_val_args
-  | n_val_args == 0 = SizeIs 0 emptyBag 10    -- Like variables
+  | n_val_args == 0 = SizeIs 0 mempty 10    -- Like variables
 
 -- See Note [Unboxed tuple size and result discount]
-  | isUnboxedTupleDataCon dc = SizeIs 0 emptyBag 10
+  | isUnboxedTupleDataCon dc = SizeIs 0 mempty 10
 
 -- See Note [Constructor size and result discount]
-  | otherwise = SizeIs 10 emptyBag 10
+  | otherwise = SizeIs 10 mempty 10
 
 {- Note [Constructor size and result discount]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -808,7 +891,7 @@ primOpSize op n_val_args
 
 
 buildSize :: ExprSize
-buildSize = SizeIs 0 emptyBag 40
+buildSize = SizeIs 0 mempty 40
         -- We really want to inline applications of build
         -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
         -- Indeed, we should add a result_discount because build is
@@ -817,7 +900,7 @@ buildSize = SizeIs 0 emptyBag 40
         -- The "4" is rather arbitrary.
 
 augmentSize :: ExprSize
-augmentSize = SizeIs 0 emptyBag 40
+augmentSize = SizeIs 0 mempty 40
         -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
         -- e plus ys. The -2 accounts for the \cn
 
@@ -899,37 +982,86 @@ Code for manipulating sizes
 data ExprSize
     = TooBig
     | SizeIs { _es_size_is  :: {-# UNPACK #-} !Int -- ^ Size found
-             , _es_args     :: !(Bag (Id,Int))
+             , _es_args     :: !(VarEnv ArgDiscount)
                -- ^ Arguments cased herein, and discount for each such
              , _es_discount :: {-# UNPACK #-} !Int
                -- ^ Size to subtract if result is scrutinised by a case
                -- expression
              }
 
+plusDiscountEnv :: VarEnv ArgDiscount -> VarEnv ArgDiscount -> VarEnv ArgDiscount
+plusDiscountEnv el er = plusUFM_C combineArgDiscount el er
+
+-- TODO: Might be worth giving this a larger discount if the type class is known.
+-- So that `f @T $d x = opDoStuff @T $d x ` applied to `f @Bool $dC_$Bool` is likely
+-- to inline turning the unknown into a known call.
+classOpArgDiscount :: Int -> ArgDiscount
+classOpArgDiscount n = SomeArgUse n
+
+-- After computing the discounts for an alternatives rhs we transfer discounts from the
+-- alt binders to the constructor specific discount of the scrutinee for the given constructor.
+asExprSize :: Id -> ExprSize -> DataCon -> [Id] -> ExprSize
+asExprSize _ TooBig _ _ = TooBig
+asExprSize scrut (SizeIs n arg_discs s_d) con alt_bndrs =
+    let (alt_discount_bags, top_discounts) = partitionWithKeyUFM (\k _v -> k `elem` map getUnique alt_bndrs) arg_discs
+        alt_discount_map = alt_discount_bags
+        alt_bndr_uses = map (\bndr -> lookupWithDefaultVarEnv alt_discount_map NoSeqUse bndr ) alt_bndrs :: [ArgDiscount]
+
+    in SizeIs n (unitUFM scrut (mkConUse con alt_bndr_uses) `plusUFM` top_discounts) s_d
+
+
+mkConUse :: DataCon -> [ArgDiscount] -> ArgDiscount
+mkConUse con uses =
+  DiscSeq
+    0
+    -- We apply a penalty of 1 per case alternative, so here we apply a discount of 1 per *eliminated*
+    -- case alternative. And then one more because we get rid of a conditional branch which is always good.
+    (unitUFM con (ConDiscount con (length uses) uses))
+
+combineArgDiscount :: ArgDiscount -> ArgDiscount -> ArgDiscount
+combineArgDiscount NoSeqUse u2 = u2
+combineArgDiscount u1 NoSeqUse = u1
+combineArgDiscount (SomeArgUse d1) (SomeArgUse d2) = SomeArgUse $! d1 + d2
+combineArgDiscount (SomeArgUse d1) (DiscSeq d2 m2) = DiscSeq (d1 + d2) m2
+combineArgDiscount (DiscSeq d1 m1) (SomeArgUse d2) = DiscSeq (d1 + d2) m1
+combineArgDiscount (DiscSeq d1 m1) (DiscSeq d2 m2) = DiscSeq (d1 + d2) (plusUFM_C combineMapEntry m1 m2)
+-- See Note [Function and non-function discounts] why we need this
+combineArgDiscount f1@(FunDisc d1 _f1) f2@(FunDisc d2 _f2) = if d1 > d2 then f1 else f2
+-- This can happen either through shadowing or with things like unsafeCoerce. A good idea to warn for debug builds but we don't want to panic here.
+combineArgDiscount f1@(FunDisc _d _n) u2 = pprTraceDebug "Variable seemingly discounted as both function and constructor" (ppr f1 $$ ppr u2) f1
+combineArgDiscount u1 f2@(FunDisc _d _n) = pprTraceDebug "Variable seemingly discounted as both function and constructor" (ppr u1 $$ ppr f2) f2
+
+combineMapEntry :: ConDiscount -> ConDiscount -> ConDiscount
+combineMapEntry (ConDiscount c1 dc1 u1) (ConDiscount c2 dc2 u2) =
+    assert(c1 == c2) $ ConDiscount c1 (dc1+dc2) (combinArgDiscountLists u1 u2)
+
+combinArgDiscountLists :: [ArgDiscount] -> [ArgDiscount] -> [ArgDiscount]
+combinArgDiscountLists uses1 uses2 = zipWith combineArgDiscount uses1 uses2
+
 instance Outputable ExprSize where
   ppr TooBig         = text "TooBig"
-  ppr (SizeIs a _ c) = brackets (int a <+> int c)
+  -- ppr (SizeIs a _ c) = brackets (int a <+> int c)
+  ppr (SizeIs a d c) = brackets (int a <+> int c <+> ppr d)
 
 -- subtract the discount before deciding whether to bale out. eg. we
 -- want to inline a large constructor application into a selector:
 --      tup = (a_1, ..., a_99)
 --      x = case tup of ...
 --
-mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
+mkSizeIs :: Int -> Int -> VarEnv ArgDiscount -> Int -> ExprSize
 mkSizeIs max n xs d | (n - d) > max = TooBig
                     | otherwise     = SizeIs n xs d
 
-maxSize :: ExprSize -> ExprSize -> ExprSize
-maxSize TooBig         _                                  = TooBig
-maxSize _              TooBig                             = TooBig
-maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2   = s1
-                                              | otherwise = s2
+maxSize :: Int -> ExprSize -> Int -> Int
+maxSize !max TooBig _n1 = max
+maxSize _max (SizeIs n1 _ _) n2 | n1 >= n2  = n1
+                                | otherwise = n2
 
 sizeZero :: ExprSize
 sizeN :: Int -> ExprSize
 
-sizeZero = SizeIs 0 emptyBag 0
-sizeN n  = SizeIs n emptyBag 0
+sizeZero = SizeIs 0 mempty 0
+sizeN n  = SizeIs n mempty 0
 
 {-
 ************************************************************************
@@ -990,15 +1122,21 @@ data ArgSummary = TrivArg       -- Nothing interesting
                 | NonTrivArg    -- Arg has structure
                 | ValueArg      -- Arg is a con-app or PAP
                                 -- ..or con-like. Note [Conlike is interesting]
+                | ConArg !DataCon
+                         [ArgSummary] -- It's important that the sub-summaries are
+                                      -- computed lazily. This way they only get forced
+                                      -- if there is a interesting discount which matches
+                                      -- on them. Which improves performance quite a lot.
 
 instance Outputable ArgSummary where
   ppr TrivArg    = text "TrivArg"
   ppr NonTrivArg = text "NonTrivArg"
   ppr ValueArg   = text "ValueArg"
+  ppr (ConArg con _args)  = text "ConArg:" <> ppr con -- <+> ppr args
 
-nonTriv ::  ArgSummary -> Bool
-nonTriv TrivArg = False
-nonTriv _       = True
+nonTrivArg ::  ArgSummary -> Bool
+nonTrivArg TrivArg = False
+nonTrivArg _       = True
 
 data CallCtxt
   = BoringCtxt
@@ -1021,474 +1159,3 @@ instance Outputable CallCtxt where
   ppr DiscArgCtxt = text "DiscArgCtxt"
   ppr RuleArgCtxt = text "RuleArgCtxt"
 
-callSiteInline :: Logger
-               -> UnfoldingOpts
-               -> Int                   -- Case depth
-               -> Id                    -- The Id
-               -> Bool                  -- True <=> unfolding is active
-               -> Bool                  -- True if there are no arguments at all (incl type args)
-               -> [ArgSummary]          -- One for each value arg; True if it is interesting
-               -> CallCtxt              -- True <=> continuation is interesting
-               -> Maybe CoreExpr        -- Unfolding, if any
-callSiteInline logger opts !case_depth id active_unfolding lone_variable arg_infos cont_info
-  = case idUnfolding id of
-      -- idUnfolding checks for loop-breakers, returning NoUnfolding
-      -- Things with an INLINE pragma may have an unfolding *and*
-      -- be a loop breaker  (maybe the knot is not yet untied)
-        CoreUnfolding { uf_tmpl = unf_template
-                      , uf_is_work_free = is_wf
-                      , uf_guidance = guidance, uf_expandable = is_exp }
-          | active_unfolding -> tryUnfolding logger opts case_depth id lone_variable
-                                    arg_infos cont_info unf_template
-                                    is_wf is_exp guidance
-          | otherwise -> traceInline logger opts id "Inactive unfolding:" (ppr id) Nothing
-        NoUnfolding      -> Nothing
-        BootUnfolding    -> Nothing
-        OtherCon {}      -> Nothing
-        DFunUnfolding {} -> Nothing     -- Never unfold a DFun
-
--- | Report the inlining of an identifier's RHS to the user, if requested.
-traceInline :: Logger -> UnfoldingOpts -> Id -> String -> SDoc -> a -> a
-traceInline logger opts inline_id str doc result
-  -- We take care to ensure that doc is used in only one branch, ensuring that
-  -- the simplifier can push its allocation into the branch. See Note [INLINE
-  -- conditional tracing utilities].
-  | enable    = logTraceMsg logger str doc result
-  | otherwise = result
-  where
-    enable
-      | logHasDumpFlag logger Opt_D_dump_verbose_inlinings
-      = True
-      | Just prefix <- unfoldingReportPrefix opts
-      = prefix `isPrefixOf` occNameString (getOccName inline_id)
-      | otherwise
-      = False
-{-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
-
-{- Note [Avoid inlining into deeply nested cases]
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Consider a function f like this:
-
-  f arg1 arg2 =
-    case ...
-      ... -> g arg1
-      ... -> g arg2
-
-This function is small. So should be safe to inline.
-However sometimes this doesn't quite work out like that.
-Consider this code:
-
-f1 arg1 arg2 ... = ...
-    case _foo of
-      alt1 -> ... f2 arg1 ...
-      alt2 -> ... f2 arg2 ...
-
-f2 arg1 arg2 ... = ...
-    case _foo of
-      alt1 -> ... f3 arg1 ...
-      alt2 -> ... f3 arg2 ...
-
-f3 arg1 arg2 ... = ...
-
-... repeats up to n times. And then f1 is
-applied to some arguments:
-
-foo = ... f1 <interestingArgs> ...
-
-Initially f2..fn are not interesting to inline so we don't.
-However we see that f1 is applied to interesting args.
-So it's an obvious choice to inline those:
-
-foo =
-    ...
-      case _foo of
-        alt1 -> ... f2 <interestingArg> ...
-        alt2 -> ... f2 <interestingArg> ...
-
-As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting
-arguments and f2 is small:
-
-foo =
-    ...
-      case _foo of
-        alt1 -> ... case _foo of
-            alt1 -> ... f3 <interestingArg> ...
-            alt2 -> ... f3 <interestingArg> ...
-
-        alt2 -> ... case _foo of
-            alt1 -> ... f3 <interestingArg> ...
-            alt2 -> ... f3 <interestingArg> ...
-
-The same thing happens for each binding up to f_n, duplicating the amount of inlining
-done in each step. Until at some point we are either done or run out of simplifier
-ticks/RAM. This pattern happened #18730.
-
-To combat this we introduce one more heuristic when weighing inlining decision.
-We keep track of a "case-depth". Which increases each time we look inside a case
-expression with more than one alternative.
-
-We then apply a penalty to inlinings based on the case-depth at which they would
-be inlined. Bounding the number of inlinings in such a scenario.
-
-The heuristic can be tuned in two ways:
-
-* We can ignore the first n levels of case nestings for inlining decisions using
-  -funfolding-case-threshold.
-* The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling.
-  Scaling can be set with -funfolding-case-scaling.
-
-Some guidance on setting these defaults:
-
-* A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of
-  control. We picked 2 for no particular reason.
-* Scaling the penalty by any more than 30 means the reproducer from
-  T18730 won't compile even with reasonably small values of n. Instead
-  it will run out of runs/ticks. This means to positively affect the reproducer
-  a scaling <= 30 is required.
-* A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks.
-  (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps)
-* A scaling of >= 25 showed no regressions on nofib. However it showed a number of
-  (small) regression for compiler perf benchmarks.
-
-The end result is that we are settling for a scaling of 30, with a threshold of 2.
-This gives us minimal compiler perf regressions. No nofib runtime regressions and
-will still avoid this pattern sometimes. This is a "safe" default, where we err on
-the side of compiler blowup instead of risking runtime regressions.
-
-For cases where the default falls short the flag can be changed to allow more/less inlining as
-needed on a per-module basis.
-
--}
-
-tryUnfolding :: Logger -> UnfoldingOpts -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
-             -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
-             -> Maybe CoreExpr
-tryUnfolding logger opts !case_depth id lone_variable
-             arg_infos cont_info unf_template
-             is_wf is_exp guidance
- = case guidance of
-     UnfNever -> traceInline logger opts id str (text "UnfNever") Nothing
-
-     UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
-        | enough_args && (boring_ok || some_benefit || unfoldingVeryAggressive opts)
-                -- See Note [INLINE for small functions] (3)
-        -> traceInline logger opts id str (mk_doc some_benefit empty True) (Just unf_template)
-        | otherwise
-        -> traceInline logger opts id str (mk_doc some_benefit empty False) Nothing
-        where
-          some_benefit = calc_some_benefit uf_arity
-          enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
-
-     UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
-        | unfoldingVeryAggressive opts
-        -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
-        | is_wf && some_benefit && small_enough
-        -> traceInline logger opts id str (mk_doc some_benefit extra_doc True) (Just unf_template)
-        | otherwise
-        -> traceInline logger opts id str (mk_doc some_benefit extra_doc False) Nothing
-        where
-          some_benefit = calc_some_benefit (length arg_discounts)
-          extra_doc = vcat [ text "case depth =" <+> int case_depth
-                           , text "depth based penalty =" <+> int depth_penalty
-                           , text "discounted size =" <+> int adjusted_size ]
-          -- See Note [Avoid inlining into deeply nested cases]
-          depth_treshold = unfoldingCaseThreshold opts
-          depth_scaling = unfoldingCaseScaling opts
-          depth_penalty | case_depth <= depth_treshold = 0
-                        | otherwise       = (size * (case_depth - depth_treshold)) `div` depth_scaling
-          adjusted_size = size + depth_penalty - discount
-          small_enough = adjusted_size <= unfoldingUseThreshold opts
-          discount = computeDiscount arg_discounts res_discount arg_infos cont_info
-
-  where
-    mk_doc some_benefit extra_doc yes_or_no
-      = vcat [ text "arg infos" <+> ppr arg_infos
-             , text "interesting continuation" <+> ppr cont_info
-             , text "some_benefit" <+> ppr some_benefit
-             , text "is exp:" <+> ppr is_exp
-             , text "is work-free:" <+> ppr is_wf
-             , text "guidance" <+> ppr guidance
-             , extra_doc
-             , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
-
-    ctx = log_default_dump_context (logFlags logger)
-    str = "Considering inlining: " ++ showSDocOneLine ctx (ppr id)
-    n_val_args = length arg_infos
-
-           -- some_benefit is used when the RHS is small enough
-           -- and the call has enough (or too many) value
-           -- arguments (ie n_val_args >= arity). But there must
-           -- be *something* interesting about some argument, or the
-           -- result context, to make it worth inlining
-    calc_some_benefit :: Arity -> Bool   -- The Arity is the number of args
-                                         -- expected by the unfolding
-    calc_some_benefit uf_arity
-       | not saturated = interesting_args       -- Under-saturated
-                                        -- Note [Unsaturated applications]
-       | otherwise = interesting_args   -- Saturated or over-saturated
-                  || interesting_call
-      where
-        saturated      = n_val_args >= uf_arity
-        over_saturated = n_val_args > uf_arity
-        interesting_args = any nonTriv arg_infos
-                -- NB: (any nonTriv arg_infos) looks at the
-                -- over-saturated args too which is "wrong";
-                -- but if over-saturated we inline anyway.
-
-        interesting_call
-          | over_saturated
-          = True
-          | otherwise
-          = case cont_info of
-              CaseCtxt   -> not (lone_variable && is_exp)  -- Note [Lone variables]
-              ValAppCtxt -> True                           -- Note [Cast then apply]
-              RuleArgCtxt -> uf_arity > 0  -- See Note [RHS of lets]
-              DiscArgCtxt -> uf_arity > 0  -- Note [Inlining in ArgCtxt]
-              RhsCtxt NonRecursive
-                          -> uf_arity > 0  -- See Note [RHS of lets]
-              _other      -> False         -- See Note [Nested functions]
-
-
-{- Note [RHS of lets]
-~~~~~~~~~~~~~~~~~~~~~
-When the call is the argument of a function with a RULE, or the RHS of a let,
-we are a little bit keener to inline (in tryUnfolding).  For example
-     f y = (y,y,y)
-     g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
-We'd inline 'f' if the call was in a case context, and it kind-of-is,
-only we can't see it.  Also
-     x = f v
-could be expensive whereas
-     x = case v of (a,b) -> a
-is patently cheap and may allow more eta expansion.
-
-So, in `interesting_call` in `tryUnfolding`, we treat the RHS of a
-/non-recursive/ let as not-totally-boring.  A /recursive/ let isn't
-going be inlined so there is much less point.  Hence the (only reason
-for the) RecFlag in RhsCtxt
-
-Note [Unsaturated applications]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When a call is not saturated, we *still* inline if one of the
-arguments has interesting structure.  That's sometimes very important.
-A good example is the Ord instance for Bool in Base:
-
- Rec {
-    $fOrdBool =GHC.Classes.D:Ord
-                 @ Bool
-                 ...
-                 $cmin_ajX
-
-    $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
-    $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
-  }
-
-But the defn of GHC.Classes.$dmmin is:
-
-  $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
-    {- Arity: 3, HasNoCafRefs, Strictness: SLL,
-       Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
-                   case @ a GHC.Classes.<= @ a $dOrd x y of wild {
-                     GHC.Types.False -> y GHC.Types.True -> x }) -}
-
-We *really* want to inline $dmmin, even though it has arity 3, in
-order to unravel the recursion.
-
-
-Note [Things to watch]
-~~~~~~~~~~~~~~~~~~~~~~
-*   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
-    Assume x is exported, so not inlined unconditionally.
-    Then we want x to inline unconditionally; no reason for it
-    not to, and doing so avoids an indirection.
-
-*   { x = I# 3; ....f x.... }
-    Make sure that x does not inline unconditionally!
-    Lest we get extra allocation.
-
-Note [Inlining an InlineRule]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-An InlineRules is used for
-  (a) programmer INLINE pragmas
-  (b) inlinings from worker/wrapper
-
-For (a) the RHS may be large, and our contract is that we *only* inline
-when the function is applied to all the arguments on the LHS of the
-source-code defn.  (The uf_arity in the rule.)
-
-However for worker/wrapper it may be worth inlining even if the
-arity is not satisfied (as we do in the CoreUnfolding case) so we don't
-require saturation.
-
-Note [Nested functions]
-~~~~~~~~~~~~~~~~~~~~~~~
-At one time we treated a call of a non-top-level function as
-"interesting" (regardless of how boring the context) in the hope
-that inlining it would eliminate the binding, and its allocation.
-Specifically, in the default case of interesting_call we had
-   _other -> not is_top && uf_arity > 0
-
-But actually postInlineUnconditionally does some of this and overall
-it makes virtually no difference to nofib.  So I simplified away this
-special case
-
-Note [Cast then apply]
-~~~~~~~~~~~~~~~~~~~~~~
-Consider
-   myIndex = __inline_me ( (/\a. <blah>) |> co )
-   co :: (forall a. a -> a) ~ (forall a. T a)
-     ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
-
-We need to inline myIndex to unravel this; but the actual call (myIndex a) has
-no value arguments.  The ValAppCtxt gives it enough incentive to inline.
-
-Note [Inlining in ArgCtxt]
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-The condition (arity > 0) here is very important, because otherwise
-we end up inlining top-level stuff into useless places; eg
-   x = I# 3#
-   f = \y.  g x
-This can make a very big difference: it adds 16% to nofib 'integer' allocs,
-and 20% to 'power'.
-
-At one stage I replaced this condition by 'True' (leading to the above
-slow-down).  The motivation was test eyeball/inline1.hs; but that seems
-to work ok now.
-
-NOTE: arguably, we should inline in ArgCtxt only if the result of the
-call is at least CONLIKE.  At least for the cases where we use ArgCtxt
-for the RHS of a 'let', we only profit from the inlining if we get a
-CONLIKE thing (modulo lets).
-
-Note [Lone variables]   See also Note [Interaction of exprIsWorkFree and lone variables]
-~~~~~~~~~~~~~~~~~~~~~   which appears below
-The "lone-variable" case is important.  I spent ages messing about
-with unsatisfactory variants, but this is nice.  The idea is that if a
-variable appears all alone
-
-        as an arg of lazy fn, or rhs    BoringCtxt
-        as scrutinee of a case          CaseCtxt
-        as arg of a fn                  ArgCtxt
-AND
-        it is bound to a cheap expression
-
-then we should not inline it (unless there is some other reason,
-e.g. it is the sole occurrence).  That is what is happening at
-the use of 'lone_variable' in 'interesting_call'.
-
-Why?  At least in the case-scrutinee situation, turning
-        let x = (a,b) in case x of y -> ...
-into
-        let x = (a,b) in case (a,b) of y -> ...
-and thence to
-        let x = (a,b) in let y = (a,b) in ...
-is bad if the binding for x will remain.
-
-Another example: I discovered that strings
-were getting inlined straight back into applications of 'error'
-because the latter is strict.
-        s = "foo"
-        f = \x -> ...(error s)...
-
-Fundamentally such contexts should not encourage inlining because, provided
-the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the
-context can ``see'' the unfolding of the variable (e.g. case or a
-RULE) so there's no gain.
-
-However, watch out:
-
- * Consider this:
-        foo = _inline_ (\n. [n])
-        bar = _inline_ (foo 20)
-        baz = \n. case bar of { (m:_) -> m + n }
-   Here we really want to inline 'bar' so that we can inline 'foo'
-   and the whole thing unravels as it should obviously do.  This is
-   important: in the NDP project, 'bar' generates a closure data
-   structure rather than a list.
-
-   So the non-inlining of lone_variables should only apply if the
-   unfolding is regarded as cheap; because that is when exprIsConApp_maybe
-   looks through the unfolding.  Hence the "&& is_wf" in the
-   InlineRule branch.
-
- * Even a type application or coercion isn't a lone variable.
-   Consider
-        case $fMonadST @ RealWorld of { :DMonad a b c -> c }
-   We had better inline that sucker!  The case won't see through it.
-
-   For now, I'm treating treating a variable applied to types
-   in a *lazy* context "lone". The motivating example was
-        f = /\a. \x. BIG
-        g = /\a. \y.  h (f a)
-   There's no advantage in inlining f here, and perhaps
-   a significant disadvantage.  Hence some_val_args in the Stop case
-
-Note [Interaction of exprIsWorkFree and lone variables]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The lone-variable test says "don't inline if a case expression
-scrutinises a lone variable whose unfolding is cheap".  It's very
-important that, under these circumstances, exprIsConApp_maybe
-can spot a constructor application. So, for example, we don't
-consider
-        let x = e in (x,x)
-to be cheap, and that's good because exprIsConApp_maybe doesn't
-think that expression is a constructor application.
-
-In the 'not (lone_variable && is_wf)' test, I used to test is_value
-rather than is_wf, which was utterly wrong, because the above
-expression responds True to exprIsHNF, which is what sets is_value.
-
-This kind of thing can occur if you have
-
-        {-# INLINE foo #-}
-        foo = let x = e in (x,x)
-
-which Roman did.
-
-
--}
-
-computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt
-                -> Int
-computeDiscount arg_discounts res_discount arg_infos cont_info
-
-  = 10          -- Discount of 10 because the result replaces the call
-                -- so we count 10 for the function itself
-
-    + 10 * length actual_arg_discounts
-               -- Discount of 10 for each arg supplied,
-               -- because the result replaces the call
-
-    + total_arg_discount + res_discount'
-  where
-    actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
-    total_arg_discount   = sum actual_arg_discounts
-
-    mk_arg_discount _        TrivArg    = 0
-    mk_arg_discount _        NonTrivArg = 10
-    mk_arg_discount discount ValueArg   = discount
-
-    res_discount'
-      | LT <- arg_discounts `compareLength` arg_infos
-      = res_discount   -- Over-saturated
-      | otherwise
-      = case cont_info of
-           BoringCtxt  -> 0
-           CaseCtxt    -> res_discount  -- Presumably a constructor
-           ValAppCtxt  -> res_discount  -- Presumably a function
-           _           -> 40 `min` res_discount
-                -- ToDo: this 40 `min` res_discount doesn't seem right
-                --   for DiscArgCtxt it shouldn't matter because the function will
-                --       get the arg discount for any non-triv arg
-                --   for RuleArgCtxt we do want to be keener to inline; but not only
-                --       constructor results
-                --   for RhsCtxt I suppose that exposing a data con is good in general
-                --   And 40 seems very arbitrary
-                --
-                -- res_discount can be very large when a function returns
-                -- constructors; but we only want to invoke that large discount
-                -- when there's a case continuation.
-                -- Otherwise we, rather arbitrarily, threshold it.  Yuk.
-                -- But we want to avoid inlining large functions that return
-                -- constructors into contexts that are simply "interesting"


=====================================
compiler/GHC/Data/Graph/UnVar.hs
=====================================
@@ -14,6 +14,8 @@ It does not normalize the graphs. This means that g `unionUnVarGraph` g is
 equal to g, but twice as expensive and large.
 
 -}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+
 module GHC.Data.Graph.UnVar
     ( UnVarSet
     , emptyUnVarSet, mkUnVarSet, varEnvDom, unionUnVarSet, unionUnVarSets
@@ -45,8 +47,9 @@ import qualified Data.IntSet as S
 
 -- Set of uniques, i.e. for adjancet nodes
 newtype UnVarSet = UnVarSet (S.IntSet)
-    deriving Eq
+    deriving (Eq,Semigroup,Monoid)
 
+{-# INLINE k #-}
 k :: Var -> Int
 k v = getKey (getUnique v)
 
@@ -64,7 +67,8 @@ delUnVarSet :: UnVarSet -> Var -> UnVarSet
 delUnVarSet (UnVarSet s) v = UnVarSet $ k v `S.delete` s
 
 delUnVarSetList :: UnVarSet -> [Var] -> UnVarSet
-delUnVarSetList s vs = s `minusUnVarSet` mkUnVarSet vs
+delUnVarSetList (UnVarSet s) vs =
+  UnVarSet $ foldl' (\s v -> S.delete (k v) s) s vs
 
 minusUnVarSet :: UnVarSet -> UnVarSet -> UnVarSet
 minusUnVarSet (UnVarSet s) (UnVarSet s') = UnVarSet $ s `S.difference` s'
@@ -73,7 +77,7 @@ sizeUnVarSet :: UnVarSet -> Int
 sizeUnVarSet (UnVarSet s) = S.size s
 
 mkUnVarSet :: [Var] -> UnVarSet
-mkUnVarSet vs = UnVarSet $ S.fromList $ map k vs
+mkUnVarSet vs = UnVarSet $ foldl' (\s v -> S.insert (k v) s) S.empty vs
 
 varEnvDom :: VarEnv a -> UnVarSet
 varEnvDom ae = UnVarSet $ ufmToSet_Directly ae
@@ -82,7 +86,8 @@ extendUnVarSet :: Var -> UnVarSet -> UnVarSet
 extendUnVarSet v (UnVarSet s) = UnVarSet $ S.insert (k v) s
 
 extendUnVarSetList :: [Var] -> UnVarSet -> UnVarSet
-extendUnVarSetList vs s = s `unionUnVarSet` mkUnVarSet vs
+extendUnVarSetList vs (UnVarSet s) =
+    UnVarSet $ foldl' (\s v -> S.insert (k v) s) s vs
 
 unionUnVarSet :: UnVarSet -> UnVarSet -> UnVarSet
 unionUnVarSet (UnVarSet set1) (UnVarSet set2) = UnVarSet (set1 `S.union` set2)


=====================================
compiler/GHC/Driver/Flags.hs
=====================================
@@ -415,6 +415,7 @@ data GeneralFlag
    -- Suppress separate type signatures in core, but leave types on
    -- lambda bound vars
    | Opt_SuppressUnfoldings
+   | Opt_SuppressUnfoldingGuidance
    -- Suppress the details of even stable unfoldings
    | Opt_SuppressTypeSignatures
    -- Suppress unique ids on variables.


=====================================
compiler/GHC/Driver/Session.hs
=====================================
@@ -2825,6 +2825,8 @@ dynamic_flags_deps = [
       (intSuffix   (\n d -> d { unfoldingOpts = updateCaseThreshold n (unfoldingOpts d)}))
   , make_ord_flag defFlag "funfolding-case-scaling"
       (intSuffix   (\n d -> d { unfoldingOpts = updateCaseScaling n (unfoldingOpts d)}))
+  , make_ord_flag defFlag "funfolding-discount-depth"
+      (intSuffix   (\n d -> d { unfoldingOpts = updateMaxDiscountDepth n (unfoldingOpts d)}))
 
   , make_dep_flag defFlag "funfolding-keeness-factor"
       (floatSuffix (\_ d -> d))
@@ -3355,7 +3357,8 @@ dFlagsDeps = [
   flagSpec "suppress-type-signatures"   Opt_SuppressTypeSignatures,
   flagSpec "suppress-uniques"           Opt_SuppressUniques,
   flagSpec "suppress-var-kinds"         Opt_SuppressVarKinds,
-  flagSpec "suppress-core-sizes"        Opt_SuppressCoreSizes
+  flagSpec "suppress-core-sizes"        Opt_SuppressCoreSizes,
+  flagSpec "suppress-guidance"          Opt_SuppressUnfoldingGuidance
   ]
 
 -- | These @-f\<blah\>@ flags can all be reversed with @-fno-\<blah\>@
@@ -5017,6 +5020,7 @@ initSDocContext dflags style = SDC
   , sdocSuppressCoercions           = gopt Opt_SuppressCoercions dflags
   , sdocSuppressCoercionTypes       = gopt Opt_SuppressCoercionTypes dflags
   , sdocSuppressUnfoldings          = gopt Opt_SuppressUnfoldings dflags
+  , sdocSuppressUnfoldingGuidance   = gopt Opt_SuppressUnfoldingGuidance dflags
   , sdocSuppressVarKinds            = gopt Opt_SuppressVarKinds dflags
   , sdocSuppressUniques             = gopt Opt_SuppressUniques dflags
   , sdocSuppressModulePrefixes      = gopt Opt_SuppressModulePrefixes dflags


=====================================
compiler/GHC/Types/Unique/FM.hs
=====================================
@@ -79,12 +79,12 @@ module GHC.Types.Unique.FM (
         ufmToSet_Directly,
         nonDetUFMToList, ufmToIntMap, unsafeIntMapToUFM,
         unsafeCastUFMKey,
-        pprUniqFM, pprUFM, pprUFMWithKeys, pluralUFM
+        pprUniqFM, pprUFM, pprUFMWithKeys, pluralUFM,partitionWithKeyUFM
     ) where
 
 import GHC.Prelude
 
-import GHC.Types.Unique ( Uniquable(..), Unique, getKey )
+import GHC.Types.Unique ( Uniquable(..), Unique, getKey, mkUniqueGrimily )
 import GHC.Utils.Outputable
 import GHC.Utils.Panic.Plain
 import qualified Data.IntMap as M
@@ -361,6 +361,11 @@ partitionUFM p (UFM m) =
   case M.partition p m of
     (left, right) -> (UFM left, UFM right)
 
+partitionWithKeyUFM :: (Unique -> elt -> Bool) -> UniqFM key elt -> (UniqFM key elt, UniqFM key elt)
+partitionWithKeyUFM p (UFM m) =
+  case M.partitionWithKey (\k -> p (mkUniqueGrimily k)) m of
+    (left, right) -> (UFM left, UFM right)
+
 sizeUFM :: UniqFM key elt -> Int
 sizeUFM (UFM m) = M.size m
 


=====================================
compiler/GHC/Types/Var/Env.hs
=====================================
@@ -540,6 +540,7 @@ extendVarEnvList = addListToUFM
 plusVarEnv_C     = plusUFM_C
 plusVarEnv_CD    = plusUFM_CD
 plusMaybeVarEnv_C = plusMaybeUFM_C
+{-# INLINE delVarEnvList #-}
 delVarEnvList    = delListFromUFM
 delVarEnv        = delFromUFM
 minusVarEnv      = minusUFM


=====================================
compiler/GHC/Utils/Outputable.hs
=====================================
@@ -382,7 +382,8 @@ data SDocContext = SDC
   , sdocSuppressIdInfo              :: !Bool
   , sdocSuppressCoercions           :: !Bool
   , sdocSuppressCoercionTypes       :: !Bool
-  , sdocSuppressUnfoldings          :: !Bool
+  , sdocSuppressUnfoldings          :: !Bool -- Maybe it should be a cutoff for the depth instead?
+  , sdocSuppressUnfoldingGuidance   :: !Bool
   , sdocSuppressVarKinds            :: !Bool
   , sdocSuppressUniques             :: !Bool
   , sdocSuppressModulePrefixes      :: !Bool
@@ -443,6 +444,7 @@ defaultSDocContext = SDC
   , sdocSuppressCoercions           = False
   , sdocSuppressCoercionTypes       = False
   , sdocSuppressUnfoldings          = False
+  , sdocSuppressUnfoldingGuidance   = False
   , sdocSuppressVarKinds            = False
   , sdocSuppressUniques             = False
   , sdocSuppressModulePrefixes      = False


=====================================
compiler/ghc.cabal.in
=====================================
@@ -323,6 +323,7 @@ Library
         GHC.Core.Opt.Simplify
         GHC.Core.Opt.Simplify.Env
         GHC.Core.Opt.Simplify.Iteration
+        GHC.Core.Opt.Simplify.Inline
         GHC.Core.Opt.Simplify.Monad
         GHC.Core.Opt.Simplify.Utils
         GHC.Core.Opt.SpecConstr


=====================================
docs/users_guide/hints.rst
=====================================
@@ -404,6 +404,8 @@ decision about inlining a specific binding.
 * :ghc-flag:`-funfolding-case-scaling=⟨n⟩`
 * :ghc-flag:`-funfolding-dict-discount=⟨n⟩`
 * :ghc-flag:`-funfolding-fun-discount=⟨n⟩`
+* :ghc-flag:`-funfolding-max-guide-depth=⟨n⟩`
+* :ghc-flag:`-funfolding-discount-depth=⟨n⟩`
 
 Should the simplifier run out of ticks because of a inlining loop
 users are encouraged to try decreasing :ghc-flag:`-funfolding-case-threshold=⟨n⟩`


=====================================
docs/users_guide/using-optimisation.rst
=====================================
@@ -1651,6 +1651,37 @@ by saying ``-fno-wombat``.
     while still allowing GHC to compile modules containing such inlining loops.
 
 
+.. ghc-flag:: -funfolding-discount-depth=⟨n⟩
+    :shortdesc: *default: 20.* Don't look deeper than `n` levels into function argument use.
+    :type: dynamic
+    :category:
+
+    :default: 20
+
+    .. index::
+       single: inlining, controlling
+       single: unfolding, controlling
+
+    If we have a function application `f (Succ (Succ Zero))` with the function `f`::
+
+        f x =
+            case x of
+                Zero -> 0
+                Succ y -> case y of
+                    Zero -> 1
+                    Succ z -> case z of
+                        Zero -> 2
+                        _ -> error "Large"
+
+    Then GHC can consider the nested structure of the argument as well as how
+    deeply the function looks into the argument to make inlining decisions.
+
+    This allows us to properly estimate the result code size from applying arguments
+    with complex structure to functions taking these arguments appart.
+
+    However inspecting deeply nested arguments can be costly in terms of compile
+    time overhead. So we restrict these considerations to a certain depth.
+
 .. ghc-flag:: -fworker-wrapper
     :shortdesc: Enable the worker/wrapper transformation.
     :type: dynamic



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/bac008ae520815bc7cabd42240ca78565d0a52e0...4cbe93cf04b788857a93676c780b40e48cdbb0dc

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/bac008ae520815bc7cabd42240ca78565d0a52e0...4cbe93cf04b788857a93676c780b40e48cdbb0dc
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20221002/5a695bf6/attachment-0001.html>