[Git][ghc/ghc][wip/cfuneqcan-refactor] Improve performance

Tue Nov 10 17:57:11 UTC 2020

Richard Eisenberg pushed to branch wip/cfuneqcan-refactor at Glasgow Haskell Compiler / GHC


Commits:
1c2aa0e3 by Richard Eisenberg at 2020-11-10T17:56:29+00:00
Improve performance

- - - - -


2 changed files:

- compiler/GHC/Tc/Solver/Flatten.hs
- compiler/GHC/Tc/Solver/Monad.hs


Changes:

=====================================
compiler/GHC/Tc/Solver/Flatten.hs
=====================================
@@ -364,18 +364,17 @@ faster. This doesn't seem quite worth it, yet.
 
 Note [flatten_exact_fam_app_fully performance]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The refactor of GRefl seems to cause performance trouble for T9872x:
-the allocation of flatten_exact_fam_app_fully_performance
-increased. See note [Generalized reflexive coercion] in
-GHC.Core.TyCo.Rep for more information about GRefl and #15192 for the
-current state.
-
-The explicit pattern match in homogenise_result helps with T9872a, b, c.
-
-Still, it increases the expected allocation of T9872d by ~2%.
-
-TODO: a step-by-step replay of the refactor to analyze the performance.
-
+Once we've got a flat rhs, we extend the flat-cache to record
+the result. Doing so can save lots of work when the same redex shows up more
+than once. Note that we record the link from the redex all the way to its
+*final* value, not just the single step reduction. Interestingly, adding to the
+flat-cache for the first reduction *doubles* the allocations
+for the T9872a test. However, using the flat-cache in
+the later reduction is a similar gain. I (Richard E) don't currently
+(Dec '14 nor Nov '20) have any knowledge as to *why* these facts are true.
+Perhaps the first use of the flat-cache doesn't add much, because we didn't
+need to reduce in the arguments (and instance lookup is similar to cache
+lookup).
 -}
 
 {-# INLINE flatten_args_tc #-}
@@ -766,7 +765,7 @@ flatten_fam_app tc tys  -- Can be over-saturated
          ; flatten_app_ty_args xi1 co1 tys_rest }
 
 -- the [TcType] exactly saturate the TyCon

--- See note [flatten_exact_fam_app_fully performance]
+-- See Note [flatten_exact_fam_app_fully performance]
 flatten_exact_fam_app_fully :: TyCon -> [TcType] -> FlatM (Xi, Coercion)
 flatten_exact_fam_app_fully tc tys
   = do { checkStackDepth (mkTyConApp tc tys)
@@ -774,14 +773,15 @@ flatten_exact_fam_app_fully tc tys
        -- Step 1. Try to reduce without reducing arguments first.
        ; result1 <- try_to_reduce tc tys
        ; case result1 of
-         { Just (co, xi) -> finish (xi, co)
+         { Just (co, xi) -> do { (xi2, co2) <- bumpDepth $ flatten_one xi
+                               ; return (xi2, co2 `mkTcTransCo` co) }
          ; Nothing ->
 
         -- That didn't work. So reduce the arguments.
     do { (xis, cos, kind_co) <- flatten_args_tc tc (repeat Nominal) tys
            -- kind_co :: tcTypeKind(F xis) ~N tcTypeKind(F tys)
 
-       ; eq_rel  <- getEqRel
+       ; eq_rel <- getEqRel
        ; let role    = eqRelRole eq_rel
              args_co = mkTyConAppCo role tc cos
            -- args_co :: F xis ~r F tys
@@ -791,14 +791,7 @@ flatten_exact_fam_app_fully tc tys
                --   assume co :: xi ~r F xis, co is homogeneous
                --   then xi' :: tcTypeKind(F tys)
                --   and co' :: xi' ~r F tys, which is homogeneous
-             homogenise xi co = (casted_xi, final_co)
-               where
-                 casted_xi = xi `mkCastTy` kind_co
-                   -- casted_xi :: tcTypeKind(F tys)
-                 homo_co   = mkTcGReflLeftCo role xi kind_co
-                   -- homo_co :: casted_xi ~r xi
-
-                 final_co  = homo_co `mkTcTransCo` co `mkTcTransCo` args_co
+             homogenise xi co = homogenise_result xi (co `mkTcTransCo` args_co) role kind_co
 
        ; result2 <- liftTcS $ lookupFamAppInert tc xis
        ; flavour <- getFlavour
@@ -807,7 +800,8 @@ flatten_exact_fam_app_fully tc tys
              -- co :: F xis ~ir xi
 
              | fr `eqCanRewriteFR` (flavour, eq_rel) ->
-                 do { traceFlat "flatten/flat-cache hit" (ppr tc <+> ppr xis $$ ppr xi)
+                 do { traceFlat "rewrite family application with inert"
+                                (ppr tc <+> ppr xis $$ ppr xi)
                     ; finish (homogenise xi downgraded_co) }
              where
                inert_role    = eqRelRole inert_eq_rel
@@ -832,7 +826,10 @@ flatten_exact_fam_app_fully tc tys
                          ; let final_co = fully_co `mkTcTransCo` co
                          ; eq_rel <- getEqRel
                          ; flavour <- getFlavour
-                         ; when (eq_rel == NomEq && flavour /= Derived) $ -- the cache only wants Nominal eqs
+                         ; when (eq_rel == NomEq && flavour /= Derived) $
+                             -- the cache only wants Nominal eqs
+                             -- and Wanteds can rewrite Deriveds; the cache
+                             -- has only Givens
                            liftTcS $ extendFamAppCache tc tys (final_co, fully)
                          ; return (fully, final_co) }
 
@@ -846,210 +843,19 @@ try_to_reduce tc tys
   where
     downgrade :: Maybe (TcCoercionN, TcType) -> FlatM (Maybe (TcCoercion, TcType))
     downgrade Nothing = return Nothing
-    downgrade (Just (co, xi))
+    downgrade result@(Just (co, xi))
       = do { traceFlat "Eager T.F. reduction success" $
              vcat [ ppr tc, ppr tys, ppr xi
                   , ppr co <+> dcolon <+> ppr (coercionKind co)
                   ]
-           ; role <- getRole
-           ; return (Just (tcDowngradeRole role Nominal co, xi)) }
-
-{- "RAE"
-  -- See Note [Reduce type family applications eagerly]
-     -- the following tcTypeKind should never be evaluated, as it's just used in
-     -- casting, and casts by refl are dropped
-  = do { mOut <- try_to_reduce_nocache tc tys
-       ; case mOut of
-           Just out -> pure out
-           Nothing -> do
-               { -- First, flatten the arguments
-               ; (xis, cos, kind_co)
-                   <- setEqRel NomEq $  -- just do this once, instead of for
-                                        -- each arg
-                      flatten_args_tc tc (repeat Nominal) tys
-                      -- kind_co :: tcTypeKind(F xis) ~N tcTypeKind(F tys)
-               ; eq_rel   <- getEqRel
-               ; cur_fr <- getFlavourRole
-               ; let role   = eqRelRole eq_rel
-                     ret_co = mkTyConAppCo role tc cos
-                      -- ret_co :: F xis ~ F tys; might be heterogeneous
-
-                -- Now, look in the inerts and the cache
-               ; mb_ct <- liftTcS $ lookupFamApp tc xis
-               ; dflags <- getDynFlags
-               ; loc <- getLoc
-               ; case mb_ct of
-                   Just (co, rhs_ty, inert_fr@(_, inert_eq_rel))  -- co :: F xis ~ rhs_ty
-                     | inert_fr `eqCanRewriteFR` cur_fr
-                        -- See Note [Runaway Derived rewriting]
-                     , let reduction_ok | (Derived, _) <- cur_fr
-                                        = not (subGoalDepthExceeded dflags
-                                                 (bumpSubGoalDepth (ctLocDepth loc)))
-                                        | otherwise
-                                        = True
-                     , reduction_ok
-                     ->  -- Usable hit in the flat-cache
-                        do { traceFlat "flatten/flat-cache hit" $
-                               (ppr tc <+> ppr xis $$ ppr rhs_ty)
-                           ; (rhs_xi, rhs_co) <- flatten_one rhs_ty
-                                  -- There may be more work to do on the rhs:
-                                  -- flatten it
-                                  -- rhs_co :: rhs_xi ~ rhs_ty
-                           ; let xi  = rhs_xi `mkCastTy` kind_co
-                                 co' = mkTcCoherenceLeftCo role rhs_xi kind_co rhs_co
-                                       `mkTransCo`
-                                       tcDowngradeRole role (eqRelRole inert_eq_rel)
-                                                            (mkTcSymCo co)
-                                       `mkTransCo` ret_co
-                           ; return (xi, co')
-                           }
-
-
-                   -- Try to reduce the family application right now
-                   -- See Note [Reduce type family applications eagerly]
-                   _ -> do { mOut <- try_to_reduce tc
-                                                   xis
-                                                   kind_co
-                                                   (`mkTransCo` ret_co)
-                           ; case mOut of
-                               Just out -> pure out
-                               Nothing -> do
-                                 { traceFlat "flatten/flat-cache miss" $
-                                     ppr tc <+> ppr xis
-                                 ; let uncasted_ty = mkTyConApp tc xis
-                                       casted_ty   = uncasted_ty `mkCastTy` kind_co
-                                       final_co    = mkTcCoherenceLeftCo role uncasted_ty
-                                                       kind_co ret_co
-                                 ; return ( casted_ty, final_co )
-                                 }
-                           }
-               }
-        }
-
-  where
-
-    -- try_to_reduce and try_to_reduce_nocache (below) could be unified into
-    -- a more general definition, but it was observed that separating them
-    -- gives better performance (lower allocation numbers in T9872x).
-
-    try_to_reduce :: TyCon   -- F, family tycon
-                  -> [Type]  -- args, not necessarily flattened
-                  -> CoercionN -- kind_co :: tcTypeKind(F args) ~N
-                               --            tcTypeKind(F orig_args)
-                               -- where
-                               -- orig_args is what was passed to the outer
-                               -- function
-                  -> (   Coercion     -- :: (xi |> kind_co) ~ F args
-                      -> Coercion )   -- what to return from outer function
-                  -> FlatM (Maybe (Xi, Coercion))
-    try_to_reduce tc tys kind_co update_co
-      = do { checkStackDepth (mkTyConApp tc tys)
-           ; mb_match <- liftTcS $ matchFam tc tys
-           ; case mb_match of
-                 -- NB: norm_co will always be homogeneous. All type families
-                 -- are homogeneous.
-               Just (norm_co, norm_ty)
-                 -> do { traceFlat "Eager T.F. reduction success" $
-                         vcat [ ppr tc, ppr tys, ppr norm_ty
-                              , ppr norm_co <+> dcolon
-                                            <+> ppr (coercionKind norm_co)
-                              ]
-                       ; (xi, final_co) <- bumpDepth $ flatten_one norm_ty
-                       ; eq_rel <- getEqRel
-                       ; let co = maybeTcSubCo eq_rel norm_co
-                                   `mkTransCo` mkSymCo final_co
-                           -- NB: only extend cache with nominal equalities
-                       ; when (eq_rel == NomEq) $
-                         liftTcS $ extendFamAppCache tc tys (co, xi)
-                       ; let role = eqRelRole eq_rel
-                             xi' = xi `mkCastTy` kind_co
-                             co' = update_co $
-                                   mkTcCoherenceLeftCo role xi kind_co (mkSymCo co)
-                       ; return $ Just (xi', co') }
-               Nothing -> pure Nothing }
-
-    try_to_reduce_nocache :: TyCon   -- F, family tycon
-                          -> [Type]  -- args, not necessarily flattened
-                          -> FlatM (Maybe (Xi, Coercion))
-    try_to_reduce_nocache tc tys
-      = do { checkStackDepth (mkTyConApp tc tys)
-           ; mb_match <- liftTcS $ matchFam tc tys
-           ; case mb_match of
-                 -- NB: norm_co will always be homogeneous. All type families
-                 -- are homogeneous.
-               Just (norm_co, norm_ty)
-                 -> do { (xi, final_co) <- bumpDepth $ flatten_one norm_ty
-                       ; eq_rel <- getEqRel
-                       ; let co  = mkSymCo (maybeTcSubCo eq_rel norm_co
-                                            `mkTransCo` mkSymCo final_co)
-                       ; return $ Just (xi, co) }
-               Nothing -> pure Nothing }
--}
-
-{- Note [Reduce type family applications eagerly]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-"RAE": Update Note.
-
-If we come across a type-family application like (Append (Cons x Nil) t),
-then, rather than flattening to a skolem etc, we may as well just reduce
-it on the spot to (Cons x t).  This saves a lot of intermediate steps.
-Examples that are helped are tests T9872, and T5321Fun.
-
-Performance testing indicates that it's best to try this *twice*, once
-before flattening arguments and once after flattening arguments.
-Adding the extra reduction attempt before flattening arguments cut
-the allocation amounts for the T9872{a,b,c} tests by half.
-
-An example of where the early reduction appears helpful:
-
-  type family Last x where
-    Last '[x]     = x
-    Last (h ': t) = Last t
-
-  workitem: (x ~ Last '[1,2,3,4,5,6])
-
-Flattening the argument never gets us anywhere, but trying to flatten
-it at every step is quadratic in the length of the list. Reducing more
-eagerly makes simplifying the right-hand type linear in its length.
-
-Testing also indicated that the early reduction should *not* use the
-flat-cache, but that the later reduction *should*. (Although the
-effect was not large.)  Hence the Bool argument to try_to_reduce.  To
-me (SLPJ) this seems odd; I get that eager reduction usually succeeds;
-and if don't use the cache for eager reduction, we will miss most of
-the opportunities for using it at all.  More exploration would be good
-here.
-
-At the end, once we've got a flat rhs, we extend the flatten-cache to record
-the result. Doing so can save lots of work when the same redex shows up more
-than once. Note that we record the link from the redex all the way to its
-*final* value, not just the single step reduction. Interestingly, using the
-flat-cache for the first reduction resulted in an increase in allocations
-of about 3% for the four T9872x tests. However, using the flat-cache in
-the later reduction is a similar gain. I (Richard E) don't currently (Dec '14)
-have any knowledge as to *why* these facts are true.
-
-Note [Runaway Derived rewriting]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-"RAE": Remove. We do occurs-checking now.
-
-Suppose we have
-  [WD] F a ~ T (F a)
-We *don't* want to fall into a hole using that to rewrite a Derived
-  [D] F a ~ Int
-because that produces an unhelpful error about a reduction stack overflow.
-Instead, when we've reached the reduction stack limit on a Derived, just stop.
-Either the program will be accepted if all the Wanteds are solved (good),
-or a Wanted will not be solved, and will be reported (also good).
-
-There is a tiny chance that continuing to try to reduce the Derived would
-yield fruit, and there is no way for the user to know this is the case
-(or to increase the reduction stack limit). But I see no easy way of
-communicating this. Since Deriveds are likely going to be short-lived
-at this point, it's not worth thinking about further.
-
-This arose in typecheck/should_fail/T13320.
+           ; eq_rel <- getEqRel
+              -- manually doing it this way avoids allocation in the vastly
+              -- common NomEq case
+           ; case eq_rel of
+               NomEq  -> return result
+               ReprEq -> return (Just (mkSubCo co, xi)) }
 
+{-
 ************************************************************************
 *                                                                      *
              Flattening a type variable


=====================================
compiler/GHC/Tc/Solver/Monad.hs
=====================================
@@ -406,6 +406,9 @@ data InertSet
        , inert_famapp_cache :: FunEqMap (TcCoercion, TcType)
               -- If    F tys :-> (co, rhs, flav),
               -- then  co :: rhs ~ F tys
+              -- all evidence is from instances or Givens
+              -- (We have no way of "kicking out" from the cache, so putting
+              --  wanteds here means we can end up solving a Wanted with itself. Bad)
               --
               -- Some entries in the cache might have arisen from Wanteds, and
               -- so this should be used only for rewriting Wanteds.
@@ -2385,11 +2388,15 @@ lookupFamAppInert fam_tc tys
       | otherwise = Nothing
 
 lookupFamAppCache :: CtFlavour -> TyCon -> [Type] -> TcS (Maybe (TcCoercion, TcType))
-lookupFamAppCache Given _ _ = return Nothing
-  -- the famapp_cache contains some wanteds. Not appropriate to rewrite a Given.
 lookupFamAppCache _ fam_tc tys
   = do { IS { inert_famapp_cache = famapp_cache } <- getTcSInerts
-       ; return (findFunEq famapp_cache fam_tc tys) }
+       ; case findFunEq famapp_cache fam_tc tys of
+           result@(Just (co, ty)) ->
+             do { traceTcS "famapp_cache hit" (vcat [ ppr (mkTyConApp fam_tc tys)
+                                                    , ppr ty
+                                                    , ppr co ])
+                ; return result }
+           Nothing -> return Nothing }
 
 lookupInInerts :: CtLoc -> TcPredType -> TcS (Maybe CtEvidence)
 -- Is this exact predicate type cached in the solved or canonicals of the InertSet?



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1c2aa0e32e96f23caf12e6f19108401c5a6afc8f

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1c2aa0e32e96f23caf12e6f19108401c5a6afc8f
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20201110/ff6c6d82/attachment-0001.html>