[Git][ghc/ghc][wip/simplifier-tweaks] 2 commits: Comments only

Fri Jul 7 15:27:14 UTC 2023

Simon Peyton Jones pushed to branch wip/simplifier-tweaks at Glasgow Haskell Compiler / GHC


Commits:
13033d34 by Simon Peyton Jones at 2023-07-07T13:52:23+01:00
Comments only

- - - - -
4197f388 by Simon Peyton Jones at 2023-07-07T16:26:29+01:00
Fixes for T15630

What a struggle.  Finally understood exponential behaviour

- - - - -


6 changed files:

- compiler/GHC/Core/Coercion.hs
- compiler/GHC/Core/Opt/Simplify/Inline.hs
- compiler/GHC/Core/Opt/Simplify/Iteration.hs
- compiler/GHC/Core/Opt/Simplify/Utils.hs
- compiler/GHC/Core/SimpleOpt.hs
- testsuite/tests/perf/compiler/T15630a.hs


Changes:

=====================================
compiler/GHC/Core/Coercion.hs
=====================================
@@ -1155,6 +1155,7 @@ mkTransCo (GRefl r t1 (MCo co1)) (GRefl _ _ (MCo co2))
   = GRefl r t1 (MCo $ mkTransCo co1 co2)
 mkTransCo co1 co2                = TransCo co1 co2
 
+--------------------
 mkSelCo :: HasDebugCallStack
         => CoSel
         -> Coercion
@@ -1243,6 +1244,24 @@ getNthFun SelMult mult _   _   = mult
 getNthFun SelArg _     arg _   = arg
 getNthFun SelRes _     _   res = res
 
+getNthFromType :: HasDebugCallStack => CoSel -> Type -> Type
+getNthFromType (SelFun fs) ty
+  | Just (_af, mult, arg, res) <- splitFunTy_maybe ty
+  = getNthFun fs mult arg res
+
+getNthFromType (SelTyCon n _) ty
+  | Just args <- tyConAppArgs_maybe ty
+  = assertPpr (args `lengthExceeds` n) (ppr n $$ ppr ty) $
+    args `getNth` n
+
+getNthFromType SelForAll ty       -- Works for both tyvar and covar
+  | Just (tv,_) <- splitForAllTyCoVar_maybe ty
+  = tyVarKind tv
+
+getNthFromType cs ty
+  = pprPanic "getNthFromType" (ppr cs $$ ppr ty)
+
+--------------------
 mkLRCo :: LeftOrRight -> Coercion -> Coercion
 mkLRCo lr co
   | Just (ty, eq) <- isReflCo_maybe co
@@ -2457,23 +2476,6 @@ coercionLKind co
     go_app (InstCo co arg) args = go_app co (go arg:args)
     go_app co              args = piResultTys (go co) args
 
-getNthFromType :: HasDebugCallStack => CoSel -> Type -> Type
-getNthFromType (SelFun fs) ty
-  | Just (_af, mult, arg, res) <- splitFunTy_maybe ty
-  = getNthFun fs mult arg res
-
-getNthFromType (SelTyCon n _) ty
-  | Just args <- tyConAppArgs_maybe ty
-  = assertPpr (args `lengthExceeds` n) (ppr n $$ ppr ty) $
-    args `getNth` n
-
-getNthFromType SelForAll ty       -- Works for both tyvar and covar
-  | Just (tv,_) <- splitForAllTyCoVar_maybe ty
-  = tyVarKind tv
-
-getNthFromType cs ty
-  = pprPanic "getNthFromType" (ppr cs $$ ppr ty)
-
 coercionRKind :: Coercion -> Type
 coercionRKind co
   = go co


=====================================
compiler/GHC/Core/Opt/Simplify/Inline.hs
=====================================
@@ -89,14 +89,14 @@ StrictAnal.addStrictnessInfoToTopId
 
 callSiteInline :: Logger
                -> UnfoldingOpts
-               -> Int                   -- Case depth
+               -> Int -> Int            -- Case depth
                -> Id                    -- The Id
                -> Bool                  -- True <=> unfolding is active
                -> Bool                  -- True if there are no arguments at all (incl type args)
                -> [ArgSummary]          -- One for each value arg; True if it is interesting
                -> CallCtxt              -- True <=> continuation is interesting
                -> Maybe CoreExpr        -- Unfolding, if any
-callSiteInline logger opts !case_depth id active_unfolding lone_variable arg_infos cont_info
+callSiteInline logger opts !case_depth !inline_depth id active_unfolding lone_variable arg_infos cont_info
   = case idUnfolding id of
       -- idUnfolding checks for loop-breakers, returning NoUnfolding
       -- Things with an INLINE pragma may have an unfolding *and*
@@ -104,7 +104,7 @@ callSiteInline logger opts !case_depth id active_unfolding lone_variable arg_inf
         CoreUnfolding { uf_tmpl = unf_template
                       , uf_cache = unf_cache
                       , uf_guidance = guidance }
-          | active_unfolding -> tryUnfolding logger opts case_depth id lone_variable
+          | active_unfolding -> tryUnfolding logger opts case_depth inline_depth id lone_variable
                                     arg_infos cont_info unf_template
                                     unf_cache guidance
           | otherwise -> traceInline logger opts id "Inactive unfolding:" (ppr id) Nothing
@@ -227,10 +227,10 @@ needed on a per-module basis.
 
 -}
 
-tryUnfolding :: Logger -> UnfoldingOpts -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
+tryUnfolding :: Logger -> UnfoldingOpts -> Int -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
              -> CoreExpr -> UnfoldingCache -> UnfoldingGuidance
              -> Maybe CoreExpr
-tryUnfolding logger opts !case_depth id lone_variable arg_infos
+tryUnfolding logger opts !case_depth !inline_depth id lone_variable arg_infos
              cont_info unf_template unf_cache guidance
  = case guidance of
      UnfNever -> traceInline logger opts id str (text "UnfNever") Nothing
@@ -264,6 +264,7 @@ tryUnfolding logger opts !case_depth id lone_variable arg_infos
           discount = computeDiscount arg_discounts res_discount arg_infos cont_info
 
           extra_doc = vcat [ text "case depth =" <+> int case_depth
+                           , text "inline depth =" <+> int inline_depth
                            , text "depth based penalty =" <+> int depth_penalty
                            , text "discounted size =" <+> int adjusted_size ]
   where


=====================================
compiler/GHC/Core/Opt/Simplify/Iteration.hs
=====================================
@@ -420,11 +420,13 @@ simplAuxBind _str env bndr new_rhs
   -- The cases would be inlined unconditionally by completeBind:
   -- but it seems not uncommon, and it turns to be a little more
   -- efficient (in compile time allocations) to do it here.
+  -- Effectively this is just a poor man's postInlineUnconditionally
   -- See Note [Post-inline for single-use things] in GHC.Core.Opt.Simplify.Utils
   -- Note: auxiliary bindings have no NOLINE pragmas, RULEs, or stable unfoldings
   | exprIsTrivial new_rhs  -- Short-cut for let x = y in ...
     || case (idOccInfo bndr) of
-          OneOcc{ occ_n_br = 1, occ_in_lam = NotInsideLam } -> True
+          OneOcc{ occ_n_br = 1, occ_in_lam = NotInsideLam } -> -- pprTrace ("simplAuxBind:"++_str) (ppr bndr $$ ppr new_rhs)
+                                                               True
           _                                                 -> False
   = return ( emptyFloats env
            , case new_rhs of
@@ -1563,7 +1565,7 @@ completeBindX env from_what bndr rhs body cont
                                                bndr2 (emptyFloats env) rhs
               -- NB: it makes a surprisingly big difference (5% in compiler allocation
               -- in T9630) to pass 'env' rather than 'env1'.  It's fine to pass 'env',
-              -- because this is simplNonRecX, so bndr is not in scope in the RHS.
+              -- because this is completeBindX, so bndr is not in scope in the RHS.
 
         ; (bind_float, env2) <- completeBind (env2 `setInScopeFromF` rhs_floats)
                                              (BC_Let NotTopLevel NonRecursive)
@@ -1672,8 +1674,7 @@ simplCast env body co0 cont0
                ; case m_co1 of {
                    MRefl -> return (cont { sc_cont = tail'
                                          , sc_hole_ty = coercionLKind co }) ;
-                      -- Avoid simplifying if possible;
-                      -- See Note [Avoiding exponential behaviour]
+                      -- See Note [Avoiding simplifying repeatedly]
 
                    MCo co1 ->
             do { (dup', arg_se', arg') <- simplArg env dup fun_ty arg_se arg
@@ -1754,7 +1755,7 @@ simpl_lam env bndr body (ApplyToVal { sc_arg = arg, sc_env = arg_se
        ; let arg_ty = funArgTy fun_ty
        ; if | isSimplified dup  -- Don't re-simplify if we've simplified it once
                                 -- Including don't preInlineUnconditionally
-                                -- See Note [Avoiding exponential behaviour]
+                                -- See Note [Avoiding simplifying repeatedly]
             -> completeBindX env (FromBeta arg_ty) bndr arg body cont
 
             | Just env' <- preInlineUnconditionally env NotTopLevel bndr arg arg_se
@@ -1888,27 +1889,35 @@ Simplifier without first calling SimpleOpt, so anything involving
 GHCi or TH and operator sections will fall over if we don't take
 care here.
 
-Note [Avoiding exponential behaviour]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Note [Avoiding simplifying repeatedly]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 One way in which we can get exponential behaviour is if we simplify a
 big expression, and then re-simplify it -- and then this happens in a
 deeply-nested way.  So we must be jolly careful about re-simplifying
-an expression (#13379).  That is why simplNonRecX does not try
-preInlineUnconditionally (unlike simplNonRecE).
+an expression (#13379).
 
 Example:
   f BIG, where f has a RULE
 Then
  * We simplify BIG before trying the rule; but the rule does not fire
- * We inline f = \x. x True
- * So if we did preInlineUnconditionally we'd re-simplify (BIG True)
+   (forcing this simplification is why we have the RULE in this example)
+ * We inline f = \x. g x, in `simpl_lam`
+ * So if `simpl_lam` did preInlineUnconditionally we get (g BIG)
+ * Now if g has a RULE we'll simplify BIG again, and this whole thing can
+   iterate.
+ * However, if `f` did not have a RULE, so that BIG has /not/ already been
+   simplified, we /want/ to do preInlineUnconditionally in simpl_lam.
+
+So we go to some effort to avoid repeatedly simplifying the same thing:
 
-However, if BIG has /not/ already been simplified, we'd /like/ to
-simplify BIG True; maybe good things happen.  That is why
+* ApplyToVal has a (sc_dup :: DupFlag) field which records if the argument
+  has been evaluated.
 
-* simplLam has
-    - a case for (isSimplified dup), which goes via simplNonRecX, and
-    - a case for the un-simplified case, which goes via simplNonRecE
+* simplArg checks this flag to avoid re-simplifying.
+
+* simpl_lam has:
+    - a case for (isSimplified dup), which goes via completeBindX, and
+    - a case for an un-simplified argument, which tries preInlineUnconditionally
 
 * We go to some efforts to avoid unnecessarily simplifying ApplyToVal,
   in at least two places
@@ -1916,6 +1925,11 @@ simplify BIG True; maybe good things happen.  That is why
     - In rebuildCall we avoid simplifying arguments before we have to
       (see Note [Trying rewrite rules])
 
+All that said /postInlineUnconditionally/ (called in `completeBind`) does
+fire in the above (f BIG) situation.  See Note [Post-inline for single-use
+things] in Simplify.Utils.  This certainly risks repeated simplification, but
+in practice seems to be a small win.
+
 
 ************************************************************************
 *                                                                      *
@@ -2321,7 +2335,7 @@ rebuildCall env (ArgInfo { ai_fun = fun, ai_args = rev_args }) cont
 -----------------------------------
 tryInlining :: SimplEnv -> Logger -> OutId -> SimplCont -> SimplM (Maybe OutExpr)
 tryInlining env logger var cont
-  | Just expr <- callSiteInline logger uf_opts case_depth var active_unf
+  | Just expr <- callSiteInline logger uf_opts case_depth inline_depth var active_unf
                                 lone_variable arg_infos interesting_cont
   = do { dump_inline expr cont
        ; return (Just expr) }
@@ -2332,6 +2346,7 @@ tryInlining env logger var cont
   where
     uf_opts    = seUnfoldingOpts env
     case_depth = seCaseDepth env
+    inline_depth = seInlineDepth env
     (lone_variable, arg_infos, call_cont) = contArgs cont
     interesting_cont = interestingCallContext env call_cont
     active_unf       = activeUnfolding (seMode env) var
@@ -3845,7 +3860,7 @@ mkDupableStrictBind env arg_bndr join_rhs res_ty
 mkDupableAlt :: SimplEnv -> OutId
              -> JoinFloats -> OutAlt
              -> SimplM (JoinFloats, OutAlt)
-mkDupableAlt env case_bndr jfloats (Alt con alt_bndrs alt_rhs_in)
+mkDupableAlt _env case_bndr jfloats (Alt con alt_bndrs alt_rhs_in)
   | ok_to_dup_alt case_bndr alt_bndrs alt_rhs_in   -- See point (2) of Note [Duplicating join points]
   = return (jfloats, Alt con alt_bndrs alt_rhs_in)
 
@@ -3895,12 +3910,14 @@ mkDupableAlt env case_bndr jfloats (Alt con alt_bndrs alt_rhs_in)
               join_rhs   = mkLams (map zapIdUnfolding final_bndrs) rhs_with_seqs
 
         ; join_bndr <- newJoinId filtered_binders rhs_ty'
-        ; let join_bndr_w_unf = join_bndr `setIdUnfolding`
-                                mkUnfolding uf_opts VanillaSrc False False join_rhs Nothing
-              uf_opts   = seUnfoldingOpts env
+        ; let -- join_bndr_w_unf = join_bndr `setIdUnfolding`
+              --                  mkUnfolding uf_opts VanillaSrc False False join_rhs Nothing
+              -- uf_opts   = seUnfoldingOpts env
+              join_bndr_w_unf = join_bndr
               join_call = mkApps (Var join_bndr) final_args
               alt'      = Alt con alt_bndrs join_call
 
+--        ; pprTrace "Creating join point" (ppr join_bndr <+> equals <+> ppr join_rhs) $ return ()
         ; return ( jfloats `addJoinFlts` unitJoinFloat (NonRec join_bndr_w_unf join_rhs)
                  , alt') }
                 -- See Note [Duplicated env]
@@ -4312,6 +4329,7 @@ simplLetUnfolding env bind_cxt id new_rhs rhs_ty arity unf
     let !opts = seUnfoldingOpts env
     in mkLetUnfolding opts (bindContextLevel bind_cxt) VanillaSrc id new_rhs
   where
+    -- See Note [Exponential join point inlining]
     too_many_occs (ManyOccs {})             = True
     too_many_occs (OneOcc { occ_n_br = n }) = n > 10
     too_many_occs IAmDead                   = False


=====================================
compiler/GHC/Core/Opt/Simplify/Utils.hs
=====================================
@@ -287,9 +287,9 @@ instance Outputable SimplCont where
     = (text "StrictBind" <+> ppr b) $$ ppr cont
   ppr (StrictArg { sc_fun = ai, sc_cont = cont })
     = (text "StrictArg" <+> ppr (ai_fun ai)) $$ ppr cont
-  ppr (Select { sc_dup = dup, sc_bndr = bndr, sc_alts = alts, sc_env = se, sc_cont = cont })
+  ppr (Select { sc_dup = dup, sc_bndr = bndr, sc_alts = alts, sc_cont = cont })
     = (text "Select" <+> ppr dup <+> ppr bndr) $$
-       whenPprDebug (nest 2 $ vcat [ppr (seTvSubst se), ppr alts]) $$ ppr cont
+      whenPprDebug (nest 2 $ ppr alts) $$ ppr cont
 
 
 {- Note [The hole type in ApplyToTy]
@@ -1542,13 +1542,14 @@ postInlineUnconditionally env bind_cxt old_bndr bndr rhs
       OneOcc { occ_in_lam = in_lam, occ_int_cxt = int_cxt, occ_n_br = n_br }
         -- See Note [Inline small things to avoid creating a thunk]
 
+        | let not_inside_lam = in_lam == NotInsideLam
         -> n_br < 100  -- See Note [Suppress exponential blowup]
 
-           && (  (n_br == 1)   -- See Note [Post-inline for single-use things]
+           && (  (n_br == 1 && not_inside_lam)  -- See Note [Post-inline for single-use things]
               || smallEnoughToInline uf_opts unfolding)  -- Small enough to dup
                  -- ToDo: consider discount on smallEnoughToInline if int_cxt is true
 
-           && (in_lam == NotInsideLam ||
+           && (not_inside_lam ||
                         -- Outside a lambda, we want to be reasonably aggressive
                         -- about inlining into multiple branches of case
                         -- e.g. let x = <non-value>
@@ -1595,7 +1596,6 @@ in allocation if you miss this out.  And bits of GHC itself start
 to allocate more.  An egregious example is test perf/compiler/T14697,
 where GHC.Driver.CmdLine.$wprocessArgs allocated hugely more.
 
-
 Note [Post-inline for single-use things]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 If we have
@@ -1617,27 +1617,29 @@ we risk creating
 which will take another iteration of the Simplifier to eliminate.  We do this in
 two places
 
-1. `simplAuxBind` does a kind of poor-man's `postInlineUnconditionally`.  It
-   does not need to account for many of the cases (e.g. top level) that the
-   full `postInlineUnconditionally` does.  Moreover, we don't have an
-   OutId, which `postInlineUnconditionally` needs.
-
-2. In the full `postInlineUnconditionally` we also look for the special case
-   of "one occurrence, not under a lambda".
-
+1. In the full `postInlineUnconditionally` look for the special case
+   of "one occurrence, not under a lambda", and inline unconditionally then.

--- Here's an example that we don't handle well:
---      let f = if b then Left (\x.BIG) else Right (\y.BIG)
---      in \y. ....case f of {...} ....
--- Here f is used just once, and duplicating the case work is fine (exprIsCheap).
--- But
---  - We can't preInlineUnconditionally because that would invalidate
---    the occ info for b.
---  - We can't postInlineUnconditionally because the RHS is big, and
---    that risks exponential behaviour
---  - We can't call-site inline, because the rhs is big
--- Alas!
+   This is a bit risky: see Note [Avoiding simplifying repeatedly] in
+   Simplify.Iteration.  But in practice it seems to be a small win.
 
+2. `simplAuxBind` does a kind of poor-man's `postInlineUnconditionally`.  It
+   does not need to account for many of the cases (e.g. top level) that the
+   full `postInlineUnconditionally` does.  Moreover, we don't have an
+   OutId, which `postInlineUnconditionally` needs.  I got a slight improvement
+   in compiler performance when I added this test.
+
+Here's an example that we don't currently handle well:
+     let f = if b then Left (\x.BIG) else Right (\y.BIG)
+     in \y. ....case f of {...} ....
+Here f is used just once, and duplicating the case work is fine (exprIsCheap).
+But
+ - We can't preInlineUnconditionally because that would invalidate
+   the occ info for b.
+ - We can't postInlineUnconditionally because the RHS is big, and
+   that risks exponential behaviour
+ - We can't call-site inline, because the rhs is big
+Alas!
 
 
 Note [Suppress exponential blowup]
@@ -1658,6 +1660,16 @@ to j9a and two to j9b.  In pass 2, postInlineUnconditionally inlines
 all four of these calls, leaving four calls to j8a and j8b. Etc.
 Yikes!  This is exponential!
 
+A similar case 
+  let j1 x = ...
+      j2 x = ...jump j1 (x-1).....jump j1 (x-2)
+      ...
+  in case f (y+10) of { True -> jump j10 10; False -> j10 10 }
+
+In the RHS of j1..j10, no inlining happens because the calls don't look
+exciting enough.  But in the "in" part, the call-site inliner may inline
+j10 (since it is applied to 10).  That exposts 
+In each 
 A possible plan: stop doing postInlineUnconditionally
 for some fixed, smallish number of branches, say 4. But that turned
 out to be bad: see Note [Inline small things to avoid creating a thunk].


=====================================
compiler/GHC/Core/SimpleOpt.hs
=====================================
@@ -1296,7 +1296,7 @@ exprIsConApp_maybe ise@(ISE in_scope id_unf) expr
          in go subst' (float:floats) expr cont
 
     go subst floats (Case scrut b _ [Alt con vars expr]) cont
-       | do_case_elim scrut' b vars
+       | do_case_elim scrut' b vars  -- See Note [Case elim in exprIsConApp_maybe]
        = go (extend subst b scrut') floats expr cont
        | otherwise
        = let
@@ -1427,6 +1427,27 @@ dealWithStringLiteral fun str co =
       in pushCoDataCon consDataCon [Type charTy, char_expr, rest] co
 
 {-
+Note [Case elim in exprIsConApp_maybe]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Suppose we have
+   data K a = MkK !a
+
+   $WMkK x = case x of y -> K y   -- Wrapper for MkK
+
+   ...case $WMkK v of K w -> <rhs>
+
+We call `exprIsConApp_maybe` on ($WMkK v); we inline the wrapper
+and beta-reduce, so we get to
+   exprIsConApp_maybe (case v of y -> K y)
+
+So we may float the case, and end up with
+   case v of y -> <rhs>[y/w]
+
+But if `v` is already evaluated, the next run of the Simplifier will
+eliminate the case, and we may then make more progress with <rhs>.
+Better to do it in one iteration.  Hence the `do_case_elim`
+check in `exprIsConApp_maybe`.
+
 Note [Unfolding DFuns]
 ~~~~~~~~~~~~~~~~~~~~~~
 DFuns look like


=====================================
testsuite/tests/perf/compiler/T15630a.hs
=====================================
@@ -1,4 +1,4 @@
-module T15630 where
+module T15630a where
 
 data IValue = IDefault
             | IInt Int



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/9cb540eb7a61621830fda2263fe1f744c5e80002...4197f38879c95b907c800171460dce4b59b10a2b

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/9cb540eb7a61621830fda2263fe1f744c5e80002...4197f38879c95b907c800171460dce4b59b10a2b
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20230707/d50b55d9/attachment-0001.html>