[Git][ghc/ghc][wip/fprof-overloaded] add -fprof-late-overloaded and -fprof-late-overloaded-calls

Finley McIlwaine (@FinleyMcIlwaine) gitlab at gitlab.haskell.org
Mon Mar 4 17:02:53 UTC 2024



Finley McIlwaine pushed to branch wip/fprof-overloaded at Glasgow Haskell Compiler / GHC


Commits:
61bb5ff6 by Finley McIlwaine at 2024-03-04T09:01:40-08:00
add -fprof-late-overloaded and -fprof-late-overloaded-calls

* Refactor late cost centre insertion for extensibility
* Add two more late cost centre insertion methods that add SCCs to overloaded
  top level bindings and call sites with dictionary arguments.
* Some tests for the basic functionality of the new insertion methods

Resolves: #24500

- - - - -


26 changed files:

- compiler/GHC/Core/LateCC.hs
- + compiler/GHC/Core/LateCC/OverloadedCalls.hs
- + compiler/GHC/Core/LateCC/TopLevelBinds.hs
- + compiler/GHC/Core/LateCC/Types.hs
- + compiler/GHC/Core/LateCC/Utils.hs
- compiler/GHC/Core/Opt/Pipeline.hs
- compiler/GHC/Driver/Flags.hs
- compiler/GHC/Driver/Main.hs
- compiler/GHC/Driver/Session.hs
- compiler/GHC/Tc/Utils/TcType.hs
- compiler/ghc.cabal.in
- docs/users_guide/9.10.1-notes.rst
- docs/users_guide/profiling.rst
- testsuite/tests/profiling/should_run/all.T
- + testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.hs
- + testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.prof.sample
- + testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.stdout
- + testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.hs
- + testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.prof.sample
- + testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.stdout
- + testsuite/tests/profiling/should_run/scc-prof-overloaded001.hs
- + testsuite/tests/profiling/should_run/scc-prof-overloaded001.prof.sample
- + testsuite/tests/profiling/should_run/scc-prof-overloaded001.stdout
- + testsuite/tests/profiling/should_run/scc-prof-overloaded002.hs
- + testsuite/tests/profiling/should_run/scc-prof-overloaded002.prof.sample
- + testsuite/tests/profiling/should_run/scc-prof-overloaded002.stdout


Changes:

=====================================
compiler/GHC/Core/LateCC.hs
=====================================
@@ -1,164 +1,90 @@
-{-# LANGUAGE DerivingStrategies #-}
-{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE RecordWildCards #-}
 
--- | Adds cost-centers after the core piple has run.
+-- | Adds cost-centers after the core pipline has run.
 module GHC.Core.LateCC
-    ( addLateCostCentresMG
-    , addLateCostCentresPgm
-    , addLateCostCentres -- Might be useful for API users
-    , Env(..)
+    ( -- * Inserting cost centres
+      addLateCostCenters
     ) where
 
-import Control.Applicative
-import Control.Monad
-import qualified Data.Set as S
-
 import GHC.Prelude
-import GHC.Types.CostCentre
-import GHC.Types.CostCentre.State
-import GHC.Types.Name hiding (varName)
-import GHC.Types.Tickish
-import GHC.Unit.Module.ModGuts
-import GHC.Types.Var
-import GHC.Unit.Types
-import GHC.Data.FastString
-import GHC.Core
-import GHC.Core.Opt.Monad
-import GHC.Core.Utils (mkTick)
-import GHC.Types.Id
-import GHC.Driver.DynFlags
 
+import GHC.Core
+import GHC.Core.LateCC.OverloadedCalls
+import GHC.Core.LateCC.TopLevelBinds
+import GHC.Core.LateCC.Types
+import GHC.Core.LateCC.Utils
+import GHC.Core.Seq
+import qualified GHC.Data.Strict as Strict
+import GHC.Core.Utils
+import GHC.Tc.Utils.TcType
+import GHC.Types.SrcLoc
+import GHC.Utils.Error
 import GHC.Utils.Logger
 import GHC.Utils.Outputable
-import GHC.Utils.Misc
-import GHC.Utils.Error (withTiming)
-import GHC.Utils.Monad.State.Strict
-
-
-{- Note [Collecting late cost centres]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Usually cost centres defined by a module are collected
-during tidy by collectCostCentres. However with `-fprof-late`
-we insert cost centres after inlining. So we keep a list of
-all the cost centres we inserted and combine that with the list
-of cost centres found during tidy.
-
-To avoid overhead when using -fprof-inline there is a flag to stop
-us from collecting them here when we run this pass before tidy.
-
-Note [Adding late cost centres]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The basic idea is very simple. For every top level binder
-`f = rhs` we compile it as if the user had written
-`f = {-# SCC f #-} rhs`.
-
-If we do this after unfoldings for `f` have been created this
-doesn't impact core-level optimizations at all. If we do it
-before the cost centre will be included in the unfolding and
-might inhibit optimizations at the call site. For this reason
-we provide flags for both approaches as they have different
-tradeoffs.
-
-We also don't add a cost centre for any binder that is a constructor
-worker or wrapper. These will never meaningfully enrich the resulting
-profile so we improve efficiency by omitting those.
-
--}
-
-addLateCostCentresMG :: ModGuts -> CoreM ModGuts
-addLateCostCentresMG guts = do
-  dflags <- getDynFlags
-  let env :: Env
-      env = Env
-        { thisModule = mg_module guts
-        , countEntries = gopt Opt_ProfCountEntries dflags
-        , collectCCs = False -- See Note [Collecting late cost centres]
-        }
-  let guts' = guts { mg_binds = fstOf3 (addLateCostCentres env (mg_binds guts))
-                   }
-  return guts'
-
-addLateCostCentresPgm :: DynFlags -> Logger -> Module -> CoreProgram -> IO (CoreProgram, S.Set CostCentre, CostCentreState)
-addLateCostCentresPgm dflags logger mod binds =
-  withTiming logger
-               (text "LateCC"<+>brackets (ppr mod))
-               (\(a,b,c) -> a `seqList` (b `seq` (c `seq` ()))) $ do
-  let env = Env
-        { thisModule = mod
-        , countEntries = gopt Opt_ProfCountEntries dflags
-        , collectCCs = True -- See Note [Collecting late cost centres]
-        }
-      (binds', ccs, cc_state) = addLateCostCentres env binds
-  when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $
-    putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr binds'))
-  return (binds', ccs, cc_state)
 
-addLateCostCentres :: Env -> CoreProgram -> (CoreProgram, S.Set CostCentre, CostCentreState)
-addLateCostCentres env binds =
-  let (binds', state) = runState (mapM (doBind env) binds) initLateCCState
-  in (binds', lcs_ccs state, lcs_state state)
-
-
-doBind :: Env -> CoreBind -> M CoreBind
-doBind env (NonRec b rhs) = NonRec b <$> doBndr env b rhs
-doBind env (Rec bs) = Rec <$> mapM doPair bs
+-- | Late cost center insertion logic used by the driver
+addLateCostCenters ::
+     Logger
+  -- ^ Logger
+  -> LateCCConfig
+  -- ^ Late cost center configuration
+  -> CoreProgram
+  -- ^ The program
+  -> IO (CoreProgram, LateCCState (Strict.Maybe SrcSpan))
+addLateCostCenters logger LateCCConfig{..} core_binds = do
+
+    -- If top-level late CCs are enabled via either -fprof-late or
+    -- -fprof-late-overloaded, add them
+    (top_level_cc_binds, top_level_late_cc_state) <-
+      case lateCCConfig_whichBinds of
+        LateCCNone ->
+          return (core_binds, initLateCCState ())
+        _ ->
+          withTiming
+            logger
+            (text "LateTopLevelCCs" <+> brackets (ppr this_mod))
+            (\(binds, late_cc_state) -> seqBinds binds `seq` late_cc_state `seq` ())
+            $ {-# SCC lateTopLevelCCs #-} do
+              pure $
+                doLateCostCenters
+                  lateCCConfig_env
+                  (initLateCCState ())
+                  (topLevelBindsCC top_level_cc_pred)
+                  core_binds
+
+    -- If overloaded call CCs are enabled via -fprof-late-overloaded-calls, add
+    -- them
+    (late_cc_binds, late_cc_state) <-
+      if lateCCConfig_overloadedCalls then
+        withTiming
+            logger
+            (text "LateOverloadedCallsCCs" <+> brackets (ppr this_mod))
+            (\(binds, late_cc_state) -> seqBinds binds `seq` late_cc_state `seq` ())
+            $ {-# SCC lateoverloadedCallsCCs #-} do
+              pure $
+                doLateCostCenters
+                  lateCCConfig_env
+                  (top_level_late_cc_state { lateCCState_extra = Strict.Nothing })
+                  overloadedCallsCC
+                  top_level_cc_binds
+      else
+        return
+          ( top_level_cc_binds
+          , top_level_late_cc_state { lateCCState_extra = Strict.Nothing }
+          )
+
+    return (late_cc_binds, late_cc_state)
   where
-    doPair :: ((Id, CoreExpr) -> M (Id, CoreExpr))
-    doPair (b,rhs) = (b,) <$> doBndr env b rhs
-
-doBndr :: Env -> Id -> CoreExpr -> M CoreExpr
-doBndr env bndr rhs
-  -- Cost centres on constructor workers are pretty much useless
-  -- so we don't emit them if we are looking at the rhs of a constructor
-  -- binding.
-  | Just _ <- isDataConId_maybe bndr = pure rhs
-  | otherwise = doBndr' env bndr rhs
-
-
--- We want to put the cost centre below the lambda as we only care about executions of the RHS.
-doBndr' :: Env -> Id -> CoreExpr -> State LateCCState CoreExpr
-doBndr' env bndr (Lam b rhs) = Lam b <$> doBndr' env bndr rhs
-doBndr' env bndr rhs = do
-    let name = idName bndr
-        name_loc = nameSrcSpan name
-        cc_name = getOccFS name
-        count = countEntries env
-    cc_flavour <- getCCFlavour cc_name
-    let cc_mod = thisModule env
-        bndrCC = NormalCC cc_flavour cc_name cc_mod name_loc
-        note = ProfNote bndrCC count True
-    addCC env bndrCC
-    return $ mkTick note rhs
-
-data LateCCState = LateCCState
-    { lcs_state :: !CostCentreState
-    , lcs_ccs   :: S.Set CostCentre
-    }
-type M = State LateCCState
-
-initLateCCState :: LateCCState
-initLateCCState = LateCCState newCostCentreState mempty
-
-getCCFlavour :: FastString -> M CCFlavour
-getCCFlavour name = mkLateCCFlavour <$> getCCIndex' name
-
-getCCIndex' :: FastString -> M CostCentreIndex
-getCCIndex' name = do
-  state <- get
-  let (index,cc_state') = getCCIndex name (lcs_state state)
-  put (state { lcs_state = cc_state'})
-  return index
-
-addCC :: Env -> CostCentre -> M ()
-addCC !env cc = do
-    state <- get
-    when (collectCCs env) $ do
-        let ccs' = S.insert cc (lcs_ccs state)
-        put (state { lcs_ccs = ccs'})
-
-data Env = Env
-  { thisModule  :: !Module
-  , countEntries:: !Bool
-  , collectCCs  :: !Bool
-  }
-
+    top_level_cc_pred :: CoreExpr -> Bool
+    top_level_cc_pred =
+        case lateCCConfig_whichBinds of
+          LateCCAllBinds ->
+            const True
+          LateCCOverloadedBinds ->
+            isOverloadedTy . exprType
+          LateCCNone ->
+            -- This is here for completeness, we won't actually use this
+            -- predicate in this case since we'll shortcut.
+            const False
+
+    this_mod = lateCCEnv_module lateCCConfig_env


=====================================
compiler/GHC/Core/LateCC/OverloadedCalls.hs
=====================================
@@ -0,0 +1,204 @@
+{-# LANGUAGE LambdaCase    #-}
+{-# LANGUAGE TupleSections #-}
+
+module GHC.Core.LateCC.OverloadedCalls
+  ( overloadedCallsCC
+  ) where
+
+import GHC.Prelude
+
+import Control.Monad.Trans.Class
+import Control.Monad.Trans.Reader
+import Control.Monad.Trans.State.Strict
+import qualified GHC.Data.Strict as Strict
+
+import GHC.Data.FastString
+import GHC.Core
+import GHC.Core.LateCC.Utils
+import GHC.Core.LateCC.Types
+import GHC.Core.Make
+import GHC.Core.Predicate
+import GHC.Core.Type
+import GHC.Core.Utils
+import GHC.Tc.Utils.TcType
+import GHC.Types.Id
+import GHC.Types.Name
+import GHC.Types.SrcLoc
+import GHC.Types.Tickish
+import GHC.Types.Var
+import GHC.Utils.Outputable
+
+type OverloadedCallsCCState = Strict.Maybe SrcSpan
+
+-- | Insert cost centres on function applications with dictionary arguments. The
+-- source locations attached to the cost centres is approximated based on the
+-- "closest" source note encountered in the traversal.
+overloadedCallsCC :: CoreBind -> LateCCM OverloadedCallsCCState CoreBind
+overloadedCallsCC =
+    processBind
+  where
+    processBind :: CoreBind -> LateCCM OverloadedCallsCCState CoreBind
+    processBind core_bind =
+        case core_bind of
+          NonRec b e ->
+            NonRec b <$> wrap_if_join b (processExpr e)
+          Rec es ->
+            Rec <$> mapM (\(b,e) -> (b,) <$> wrap_if_join b (processExpr e)) es
+      where
+        -- If an overloaded function is turned into a join point, we won't add
+        -- SCCs directly to calls since it makes them non-tail calls. Instead,
+        -- we look for join points here and add an SCC to their RHS if they are
+        -- overloaded.
+        wrap_if_join ::
+             CoreBndr
+          -> LateCCM OverloadedCallsCCState CoreExpr
+          -> LateCCM OverloadedCallsCCState CoreExpr
+        wrap_if_join b pexpr = do
+            expr <- pexpr
+            if isJoinId b && isOverloadedTy (exprType expr) then do
+              let
+                cc_name :: FastString
+                cc_name = fsLit "join-rhs-" `appendFS` getOccFS b
+
+              cc_srcspan <-
+                fmap (Strict.fromMaybe (UnhelpfulSpan UnhelpfulNoLocationInfo)) $
+                  lift $ gets lateCCState_extra
+
+              insertCC cc_name cc_srcspan expr
+            else
+              return expr
+
+
+    processExpr :: CoreExpr -> LateCCM OverloadedCallsCCState CoreExpr
+    processExpr expr =
+      case expr of
+        -- The case we care about: Application
+        app at App{} -> do
+          -- Here we have some application like `f v1 ... vN`, where v1 ... vN
+          -- should be the function's type arguments followed by the value
+          -- arguments. To determine if the `f` is an overloaded function, we
+          -- check if any of the arguments v1 ... vN are dictionaries.
+          let
+            (f, xs) = collectArgs app
+            resultTy = applyTypeToArgs empty (exprType f) xs
+
+          -- Recursively process the arguments first for no particular reason
+          args <- mapM processExpr xs
+          let app' = mkCoreApps f args
+
+          if
+              -- Check if any of the arguments are dictionaries
+              any isDictExpr args
+
+              -- Avoid instrumenting dictionary functions, which may be
+              -- overloaded if there are superclasses, by checking if the result
+              -- type of the function is a dictionary type.
+            && not (isDictTy resultTy)
+
+              -- Avoid instrumenting constraint selectors like eq_sel
+            && (typeTypeOrConstraint resultTy /= ConstraintLike)
+
+              -- Avoid instrumenting join points.
+              -- (See comment in processBind above)
+            && not (isJoinVarExpr f)
+          then do
+            -- Extract a name and source location from the function being
+            -- applied
+            let
+              cc_name :: FastString
+              cc_name =
+                fsLit $ maybe "<no name available>" getOccString (exprName app)
+
+            cc_srcspan <-
+              fmap (Strict.fromMaybe (UnhelpfulSpan UnhelpfulNoLocationInfo)) $
+                lift $ gets lateCCState_extra
+
+            insertCC cc_name cc_srcspan app'
+          else
+            return app'
+
+        -- For recursive constructors of Expr, we traverse the nested Exprs
+        Lam b e ->
+          mkCoreLams [b] <$> processExpr e
+        Let b e ->
+          mkCoreLet <$> processBind b <*> processExpr e
+        Case e b t alts ->
+              Case
+          <$> processExpr e
+          <*> pure b
+          <*> pure t
+          <*> mapM processAlt alts
+        Cast e co ->
+          mkCast <$> processExpr e <*> pure co
+        Tick t e -> do
+          trackSourceNote t $
+            mkTick t <$> processExpr e
+
+        -- For non-recursive constructors of Expr, we do nothing
+        x -> return x
+
+    processAlt :: CoreAlt -> LateCCM OverloadedCallsCCState CoreAlt
+    processAlt (Alt c bs e) = Alt c bs <$> processExpr e
+
+    trackSourceNote :: CoreTickish -> LateCCM OverloadedCallsCCState a -> LateCCM OverloadedCallsCCState a
+    trackSourceNote tick act =
+      case tick of
+        SourceNote rss _ -> do
+          -- Prefer source notes from the current file
+          in_current_file <-
+            maybe False ((== EQ) . lexicalCompareFS (srcSpanFile rss)) <$>
+              asks lateCCEnv_file
+          if not in_current_file then
+            act
+          else do
+            loc <- lift $ gets lateCCState_extra
+            lift . modify $ \s ->
+              s { lateCCState_extra =
+                    Strict.Just $ RealSrcSpan rss mempty
+                }
+            x <- act
+            lift . modify $ \s ->
+              s { lateCCState_extra = loc
+                }
+            return x
+        _ ->
+          act
+
+    -- Utility functions
+
+    -- Extract a Name from an expression. If it is an application, attempt to
+    -- extract a name from the applied function. If it is a variable, return the
+    -- Name of the variable. If it is a tick/cast, attempt to extract a Name
+    -- from the expression held in the tick/cast. Otherwise return Nothing.
+    exprName :: CoreExpr -> Maybe Name
+    exprName =
+        \case
+          App f _ ->
+            exprName f
+          Var f ->
+            Just (idName f)
+          Tick _ e ->
+            exprName e
+          Cast e _ ->
+            exprName e
+          _ ->
+            Nothing
+
+    -- Determine whether an expression is a dictionary
+    isDictExpr :: CoreExpr -> Bool
+    isDictExpr =
+        maybe False isDictTy . exprType'
+      where
+        exprType' :: CoreExpr -> Maybe Type
+        exprType' = \case
+            Type{} -> Nothing
+            expr -> Just $ exprType expr
+
+    -- Determine whether an expression is a join variable
+    isJoinVarExpr :: CoreExpr -> Bool
+    isJoinVarExpr =
+        \case
+          Var var -> isJoinId var
+          Tick _ e -> isJoinVarExpr e
+          Cast e _ -> isJoinVarExpr e
+          _ -> False


=====================================
compiler/GHC/Core/LateCC/TopLevelBinds.hs
=====================================
@@ -0,0 +1,106 @@
+{-# LANGUAGE TupleSections #-}
+module GHC.Core.LateCC.TopLevelBinds where
+
+import GHC.Prelude
+
+import GHC.Core
+-- import GHC.Core.LateCC
+import GHC.Core.LateCC.Types
+import GHC.Core.LateCC.Utils
+import GHC.Core.Opt.Monad
+import GHC.Driver.DynFlags
+import GHC.Types.Id
+import GHC.Types.Name
+import GHC.Unit.Module.ModGuts
+
+{- Note [Collecting late cost centres]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Usually cost centres defined by a module are collected
+during tidy by collectCostCentres. However with `-fprof-late`
+we insert cost centres after inlining. So we keep a list of
+all the cost centres we inserted and combine that with the list
+of cost centres found during tidy.
+
+To avoid overhead when using -fprof-inline there is a flag to stop
+us from collecting them here when we run this pass before tidy.
+
+Note [Adding late cost centres to top level bindings]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The basic idea is very simple. For every top level binder
+`f = rhs` we compile it as if the user had written
+`f = {-# SCC f #-} rhs`.
+
+If we do this after unfoldings for `f` have been created this
+doesn't impact core-level optimizations at all. If we do it
+before the cost centre will be included in the unfolding and
+might inhibit optimizations at the call site. For this reason
+we provide flags for both approaches as they have different
+tradeoffs.
+
+We also don't add a cost centre for any binder that is a constructor
+worker or wrapper. These will never meaningfully enrich the resulting
+profile so we improve efficiency by omitting those.
+
+-}
+
+-- | Add late cost centres directly to the 'ModGuts'. This is used inside the
+-- core pipeline with the -fprof-late-inline flag. It should not be used after
+-- tidy, since it does not manually track inserted cost centers. See
+-- Note [Collecting late cost centres].
+topLevelBindsCCMG :: ModGuts -> CoreM ModGuts
+topLevelBindsCCMG guts = do
+    dflags <- getDynFlags
+    let
+      env =
+        LateCCEnv
+          { lateCCEnv_module = mg_module guts
+
+            -- We don't use this for topLevelBindsCC, so Nothing is okay
+          , lateCCEnv_file = Nothing
+
+          , lateCCEnv_countEntries= gopt Opt_ProfCountEntries dflags
+          , lateCCEnv_collectCCs = False
+          }
+      guts' =
+        guts
+          { mg_binds =
+              fst
+                ( doLateCostCenters
+                    env
+                    (initLateCCState ())
+                    (topLevelBindsCC (const True))
+                    (mg_binds guts)
+                )
+          }
+    return guts'
+
+-- | Insert cost centres on top-level bindings in the module, depending on
+-- whether or not they satisfy the given predicate.
+topLevelBindsCC :: (CoreExpr -> Bool) -> CoreBind -> LateCCM s CoreBind
+topLevelBindsCC pred core_bind =
+    case core_bind of
+      NonRec b rhs ->
+        NonRec b <$> doBndr b rhs
+      Rec bs ->
+        Rec <$> mapM doPair bs
+  where
+    doPair :: ((Id, CoreExpr) -> LateCCM s (Id, CoreExpr))
+    doPair (b,rhs) = (b,) <$> doBndr b rhs
+
+    doBndr :: Id -> CoreExpr -> LateCCM s CoreExpr
+    doBndr bndr rhs
+      -- Cost centres on constructor workers are pretty much useless
+      -- so we don't emit them if we are looking at the rhs of a constructor
+      -- binding.
+      | Just _ <- isDataConId_maybe bndr = pure rhs
+      | otherwise = if pred rhs then addCC bndr rhs else pure rhs
+
+    -- We want to put the cost centre below the lambda as we only care about
+    -- executions of the RHS.
+    addCC :: Id -> CoreExpr -> LateCCM s CoreExpr
+    addCC bndr (Lam b rhs) = Lam b <$> addCC bndr rhs
+    addCC bndr rhs = do
+      let name = idName bndr
+          cc_loc = nameSrcSpan name
+          cc_name = getOccFS name
+      insertCC cc_name cc_loc rhs
\ No newline at end of file


=====================================
compiler/GHC/Core/LateCC/Types.hs
=====================================
@@ -0,0 +1,74 @@
+-- | Types related to late cost center insertion
+module GHC.Core.LateCC.Types
+  ( LateCCConfig(..)
+  , LateCCBindSpec(..)
+  , LateCCEnv(..)
+  , LateCCState(..)
+  , initLateCCState
+  , LateCCM
+  ) where
+
+import GHC.Prelude
+
+import Control.Monad.Trans.Reader
+import Control.Monad.Trans.State.Strict
+import qualified Data.Set as S
+
+import GHC.Data.FastString
+import GHC.Types.CostCentre
+import GHC.Types.CostCentre.State
+import GHC.Unit.Types
+
+-- | Late cost center insertion configuration.
+--
+-- Specifies whether cost centers are added to overloaded function call sites
+-- and/or top-level bindings, and which top-level bindings they are added to.
+-- Also holds the cost center insertion environment.
+data LateCCConfig =
+      LateCCConfig
+        { lateCCConfig_whichBinds :: !LateCCBindSpec
+        , lateCCConfig_overloadedCalls :: !Bool
+        , lateCCConfig_env :: !LateCCEnv
+        }
+
+-- | The types of top-level bindings we support adding cost centers to.
+data LateCCBindSpec =
+      LateCCNone
+    | LateCCAllBinds
+    | LateCCOverloadedBinds
+
+-- | Late cost centre insertion environment
+data LateCCEnv = LateCCEnv
+  { lateCCEnv_module :: !Module
+    -- ^ Current module
+  , lateCCEnv_file :: Maybe FastString
+    -- ^ Current file, if we have one
+  , lateCCEnv_countEntries:: !Bool
+    -- ^ Whether the inserted cost centers should count entries
+  , lateCCEnv_collectCCs  :: !Bool
+    -- ^ Whether to collect the cost centres we insert. See
+    -- Note [Collecting late cost centres]
+
+  }
+
+-- | Late cost centre insertion state, indexed by some extra state type that an
+-- insertion method may require.
+data LateCCState s = LateCCState
+    { lateCCState_ccs :: !(S.Set CostCentre)
+      -- ^ Cost centres that have been inserted
+    , lateCCState_ccState :: !CostCentreState
+      -- ^ Per-module state tracking for cost centre indices
+    , lateCCState_extra :: !s
+    }
+
+-- | The empty late cost centre insertion state
+initLateCCState :: s -> LateCCState s
+initLateCCState s =
+    LateCCState
+      { lateCCState_ccState = newCostCentreState
+      , lateCCState_ccs = mempty
+      , lateCCState_extra = s
+      }
+
+-- | Late cost centre insertion monad
+type LateCCM s = ReaderT LateCCEnv (State (LateCCState s))


=====================================
compiler/GHC/Core/LateCC/Utils.hs
=====================================
@@ -0,0 +1,80 @@
+module GHC.Core.LateCC.Utils
+  ( -- * Inserting cost centres
+    doLateCostCenters -- Might be useful for API users
+
+    -- ** Helpers for defining insertion methods
+  , getCCFlavour
+  , insertCC
+  ) where
+
+import GHC.Prelude
+
+import Control.Monad
+import Control.Monad.Trans.Class
+import Control.Monad.Trans.Reader
+import Control.Monad.Trans.State.Strict
+import qualified Data.Set as S
+
+import GHC.Core
+import GHC.Core.LateCC.Types
+import GHC.Core.Utils
+import GHC.Data.FastString
+import GHC.Types.CostCentre
+import GHC.Types.CostCentre.State
+import GHC.Types.SrcLoc
+import GHC.Types.Tickish
+
+-- | Insert cost centres into the 'CoreProgram' using the provided environment,
+-- initial state, and insertion method.
+doLateCostCenters
+  :: LateCCEnv
+  -- ^ Environment to run the insertion in
+  -> LateCCState s
+  -- ^ Initial state to run the insertion with
+  -> (CoreBind -> LateCCM s CoreBind)
+  -- ^ Insertion method
+  -> CoreProgram
+  -- ^ Bindings to consider
+  -> (CoreProgram, LateCCState s)
+doLateCostCenters env state method binds =
+    runLateCC env state $ mapM method binds
+
+-- | Evaluate late cost centre insertion
+runLateCC :: LateCCEnv -> LateCCState s -> LateCCM s a -> (a, LateCCState s)
+runLateCC env state = (`runState` state) . (`runReaderT` env)
+
+-- | Given the name of a cost centre, get its flavour
+getCCFlavour :: FastString -> LateCCM s CCFlavour
+getCCFlavour name = mkLateCCFlavour <$> getCCIndex' name
+  where
+    getCCIndex' :: FastString -> LateCCM s CostCentreIndex
+    getCCIndex' name = do
+      cc_state <- lift $ gets lateCCState_ccState
+      let (index, cc_state') = getCCIndex name cc_state
+      lift . modify $ \s -> s { lateCCState_ccState = cc_state'}
+      return index
+
+-- | Insert a cost centre with the specified name and source span on the given
+-- expression. The inserted cost centre will be appropriately tracked in the
+-- late cost centre state.
+insertCC
+  :: FastString
+  -- ^ Name of the cost centre to insert
+  -> SrcSpan
+  -- ^ Source location to associate with the cost centre
+  -> CoreExpr
+  -- ^ Expression to wrap in the cost centre
+  -> LateCCM s CoreExpr
+insertCC cc_name cc_loc expr = do
+    cc_flavour <- getCCFlavour cc_name
+    env <- ask
+    let
+      cc_mod = lateCCEnv_module env
+      cc = NormalCC cc_flavour cc_name cc_mod cc_loc
+      note = ProfNote cc (lateCCEnv_countEntries env) True
+    when (lateCCEnv_collectCCs env) $ do
+        lift . modify $ \s ->
+          s { lateCCState_ccs = S.insert cc (lateCCState_ccs s)
+            }
+    return $ mkTick note expr
+


=====================================
compiler/GHC/Core/Opt/Pipeline.hs
=====================================
@@ -43,7 +43,7 @@ import GHC.Core.Opt.CallArity    ( callArityAnalProgram )
 import GHC.Core.Opt.Exitify      ( exitifyProgram )
 import GHC.Core.Opt.WorkWrap     ( wwTopBinds )
 import GHC.Core.Opt.CallerCC     ( addCallerCostCentres )
-import GHC.Core.LateCC           (addLateCostCentresMG)
+import GHC.Core.LateCC.TopLevelBinds (topLevelBindsCCMG)
 import GHC.Core.Seq (seqBinds)
 import GHC.Core.FamInstEnv
 
@@ -520,7 +520,7 @@ doCorePass pass guts = do
                                  addCallerCostCentres guts
 
     CoreAddLateCcs            -> {-# SCC "AddLateCcs" #-}
-                                 addLateCostCentresMG guts
+                                 topLevelBindsCCMG guts
 
     CoreDoPrintCore           -> {-# SCC "PrintCore" #-}
                                  liftIO $ printCore logger (mg_binds guts) >> return guts


=====================================
compiler/GHC/Driver/Flags.hs
=====================================
@@ -341,6 +341,8 @@ data GeneralFlag
    | Opt_ProfCountEntries
    | Opt_ProfLateInlineCcs
    | Opt_ProfLateCcs
+   | Opt_ProfLateOverloadedCcs
+   | Opt_ProfLateoverloadedCallsCCs
    | Opt_ProfManualCcs -- ^ Ignore manual SCC annotations
 
    -- misc opts


=====================================
compiler/GHC/Driver/Main.hs
=====================================
@@ -175,7 +175,6 @@ import GHC.Iface.Ext.Debug  ( diffFile, validateScopes )
 import GHC.Core
 import GHC.Core.Lint.Interactive ( interactiveInScope )
 import GHC.Core.Tidy           ( tidyExpr )
-import GHC.Core.Type           ( Type, Kind )
 import GHC.Core.Utils          ( exprType )
 import GHC.Core.ConLike
 import GHC.Core.Opt.Pipeline
@@ -185,7 +184,8 @@ import GHC.Core.InstEnv
 import GHC.Core.FamInstEnv
 import GHC.Core.Rules
 import GHC.Core.Stats
-import GHC.Core.LateCC (addLateCostCentresPgm)
+import GHC.Core.LateCC
+import GHC.Core.LateCC.Types
 
 
 import GHC.CoreToStg.Prep
@@ -197,6 +197,7 @@ import GHC.Parser.Lexer as Lexer
 
 import GHC.Tc.Module
 import GHC.Tc.Utils.Monad
+import GHC.Tc.Utils.TcType
 import GHC.Tc.Zonk.Env ( ZonkFlexi (DefaultFlexi) )
 
 import GHC.Stg.Syntax
@@ -297,7 +298,6 @@ import GHC.StgToCmm.Utils (IPEStats)
 import GHC.Types.Unique.FM
 import GHC.Types.Unique.DFM
 import GHC.Cmm.Config (CmmConfig)
-import GHC.Types.CostCentre.State (newCostCentreState)
 
 
 {- **********************************************************************
@@ -1791,22 +1791,41 @@ hscGenHardCode hsc_env cgguts location output_filename = do
 
 
         -------------------
-        -- Insert late cost centres if enabled.
-        -- If `-fprof-late-inline` is enabled we can skip this, as it will have added
-        -- a superset of cost centres we would add here already.
-
-        (late_cc_binds, late_local_ccs, cc_state) <-
-              if gopt Opt_ProfLateCcs dflags && not (gopt Opt_ProfLateInlineCcs dflags)
-                  then
-                    withTiming
-                      logger
-                      (text "LateCCs"<+>brackets (ppr this_mod))
-                      (const ())
-                      $ {-# SCC lateCC #-} do
-                        (binds, late_ccs, cc_state) <- addLateCostCentresPgm dflags logger this_mod core_binds
-                        return ( binds, (S.toList late_ccs `mappend` local_ccs ), cc_state)
+        -- Insert late cost centres based on the provided flags.
+        --
+        -- If -fprof-late-inline is enabled, we will skip adding CCs on any
+        -- top-level bindings here (via shortcut in `addLateCostCenters`),
+        -- since it will have already added a superset of the CCs we would add
+        -- here.
+        let
+          late_cc_config :: LateCCConfig
+          late_cc_config =
+            LateCCConfig
+              { lateCCConfig_whichBinds =
+                  if gopt Opt_ProfLateInlineCcs dflags then
+                    LateCCNone
+                  else if gopt Opt_ProfLateCcs dflags then
+                    LateCCAllBinds
+                  else if gopt Opt_ProfLateOverloadedCcs dflags then
+                    LateCCOverloadedBinds
                   else
-                    return (core_binds, local_ccs, newCostCentreState)
+                    LateCCNone
+              , lateCCConfig_overloadedCalls =
+                  gopt Opt_ProfLateoverloadedCallsCCs dflags
+              , lateCCConfig_env =
+                  LateCCEnv
+                    { lateCCEnv_module = this_mod
+                    , lateCCEnv_file = fsLit <$> ml_hs_file location
+                    , lateCCEnv_countEntries= gopt Opt_ProfCountEntries dflags
+                    , lateCCEnv_collectCCs = True
+                    }
+              }
+
+        (late_cc_binds, late_cc_state) <-
+          addLateCostCenters logger late_cc_config core_binds
+
+        when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $
+          putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr late_cc_binds))
 
         -------------------
         -- Run late plugins
@@ -1820,7 +1839,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do
               cg_hpc_info      = hpc_info,
               cg_spt_entries   = spt_entries,
               cg_binds         = late_binds,
-              cg_ccs           = late_local_ccs'
+              cg_ccs           = late_local_ccs
             }
           , _
           ) <-
@@ -1833,9 +1852,9 @@ hscGenHardCode hsc_env cgguts location output_filename = do
               (($ hsc_env) . latePlugin)
                 ( cgguts
                     { cg_binds = late_cc_binds
-                    , cg_ccs = late_local_ccs
+                    , cg_ccs = S.toList (lateCCState_ccs late_cc_state) ++ local_ccs
                     }
-                , cc_state
+                , lateCCState_ccState late_cc_state
                 )
 
         let
@@ -1876,7 +1895,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do
         let (stg_binds,_stg_deps) = unzip stg_binds_with_deps
 
         let cost_centre_info =
-              (late_local_ccs' ++ caf_ccs, caf_cc_stacks)
+              (late_local_ccs ++ caf_ccs, caf_cc_stacks)
             platform = targetPlatform dflags
             prof_init
               | sccProfilingEnabled dflags = profilingInitCode platform this_mod cost_centre_info


=====================================
compiler/GHC/Driver/Session.hs
=====================================
@@ -2444,6 +2444,8 @@ fFlagsDeps = [
   flagSpec "prof-cafs"                        Opt_AutoSccsOnIndividualCafs,
   flagSpec "prof-count-entries"               Opt_ProfCountEntries,
   flagSpec "prof-late"                        Opt_ProfLateCcs,
+  flagSpec "prof-late-overloaded"             Opt_ProfLateOverloadedCcs,
+  flagSpec "prof-late-overloaded-calls"       Opt_ProfLateoverloadedCallsCCs,
   flagSpec "prof-manual"                      Opt_ProfManualCcs,
   flagSpec "prof-late-inline"                 Opt_ProfLateInlineCcs,
   flagSpec "regs-graph"                       Opt_RegsGraph,
@@ -3763,6 +3765,10 @@ needSourceNotes :: DynFlags -> Bool
 needSourceNotes dflags = debugLevel dflags > 0
                        || gopt Opt_InfoTableMap dflags
 
+                       -- Source ticks are used to approximate the location of
+                       -- overloaded call cost centers
+                       || gopt Opt_ProfLateoverloadedCallsCCs dflags
+
 -- -----------------------------------------------------------------------------
 -- Linker/compiler information
 


=====================================
compiler/GHC/Tc/Utils/TcType.hs
=====================================
@@ -1907,7 +1907,7 @@ isRhoExpTy (Infer {}) = True
 
 isOverloadedTy :: Type -> Bool
 -- Yes for a type of a function that might require evidence-passing
--- Used only by bindLocalMethods
+-- Used by bindLocalMethods and for -fprof-late-overloaded
 isOverloadedTy ty | Just ty' <- coreView ty = isOverloadedTy ty'
 isOverloadedTy (ForAllTy _  ty)             = isOverloadedTy ty
 isOverloadedTy (FunTy { ft_af = af })       = isInvisibleFunArg af


=====================================
compiler/ghc.cabal.in
=====================================
@@ -336,6 +336,10 @@ Library
         GHC.Core.Lint
         GHC.Core.Lint.Interactive
         GHC.Core.LateCC
+        GHC.Core.LateCC.Types
+        GHC.Core.LateCC.TopLevelBinds
+        GHC.Core.LateCC.Utils
+        GHC.Core.LateCC.OverloadedCalls
         GHC.Core.Make
         GHC.Core.Map.Expr
         GHC.Core.Map.Type


=====================================
docs/users_guide/9.10.1-notes.rst
=====================================
@@ -186,6 +186,15 @@ Compiler
   This means that if you are using ``-fllvm`` you now need ``llc``, ``opt`` and ``clang``
   available.
 
+- The :ghc-flag:`-fprof-late-overloaded` flag has been introduced. It causes
+  cost centres to be added to *overloaded* top level bindings, unlike
+  :ghc-flag:`-fprof-late` which adds cost centres to all top level bindings.
+
+- The :ghc-flag:`-fprof-late-overloaded-calls` flag has been introduced. It
+  causes cost centres to be inserted at call sites including instance dictionary
+  arguments. This may be preferred over :ghc-flag:`-fprof-late-overloaded` since
+  it may reveal whether imported functions are called overloaded.
+
 JavaScript backend
 ~~~~~~~~~~~~~~~~~~
 


=====================================
docs/users_guide/profiling.rst
=====================================
@@ -518,6 +518,49 @@ of your profiled program will be different to that of the unprofiled one.
 
     You can try this mode if :ghc-flag:`-fprof-late` results in a profile that's too hard to interpret.
 
+.. ghc-flag:: -fprof-late-overloaded
+    :shortdesc: Auto-add ``SCC``\\ s to all top level overloaded bindings *after* the core pipeline has run.
+    :type: dynamic
+    :reverse: -fno-prof-late-overloaded
+    :category:
+
+    :since: 9.10.1
+
+    Adds an automatic ``SCC`` annotation to all *overloaded* top level bindings
+    late in the compilation pipeline after the optimizer has run and unfoldings
+    have been created. This means these cost centres will not interfere with
+    core-level optimizations and the resulting profile will be closer to the
+    performance profile of an optimized non-profiled executable.
+
+    This flag can help determine which top level bindings encountered during a
+    program's execution are still overloaded after inlining and specialization.
+
+.. ghc-flag:: -fprof-late-overloaded-calls
+    :shortdesc: Auto-add ``SCC``\\ s to all call sites that include dictionary arguments *after* the core pipeline has run.
+    :type: dynamic
+    :reverse: -fno-prof-late-overloaded-calls
+    :category:
+
+    :since: 9.10.1
+
+    Adds an automatic ``SCC`` annotation to all call sites that include
+    dictionary arguments late in the compilation pipeline after the optimizer
+    has run and unfoldings have been created. This means these cost centres will
+    not interfere with core-level optimizations and the resulting profile will
+    be closer to the performance profile of an optimized non-profiled
+    executable.
+
+    This flag is potentially more useful than :ghc-flag:`-fprof-late-overloaded`
+    since it will also add ``SCC`` annotations to call sites of imported
+    overloaded functions.
+
+    Some overloaded calls may not be annotated, specifically in cases where the
+    optimizer turns an overloaded function into a join point. Calls to such
+    functions will not be wrapped in ``SCC`` annotations, since it would make
+    them non-tail calls, which is a requirement for join points. Instead,
+    ``SCC`` annotations are added around the body of overloaded join variables
+    and given distinct names (``join-rhs-<var>``) to avoid confusion.
+
 .. ghc-flag:: -fprof-cafs
     :shortdesc: Auto-add ``SCC``\\ s to all CAFs
     :type: dynamic


=====================================
testsuite/tests/profiling/should_run/all.T
=====================================
@@ -195,3 +195,30 @@ test('ignore_scc', [], compile_and_run,
      ['-fno-prof-manual'])
 
 test('T21446', [], makefile_test, ['T21446'])
+
+
+test('scc-prof-overloaded001',
+     [],
+     compile_and_run,
+     ['-fno-prof-auto -fno-full-laziness -fprof-late-overloaded'] # See Note [consistent stacks]
+)
+
+test('scc-prof-overloaded002',
+     [],
+     compile_and_run,
+     ['-fno-prof-auto -fno-full-laziness -fprof-late-overloaded'] # See Note [consistent stacks]
+)
+
+test('scc-prof-overloaded-calls001',
+     [],
+     compile_and_run,
+     # Need optimizations to get rid of unwanted overloaded calls
+     ['-O -fno-prof-auto -fno-full-laziness -fprof-late-overloaded-calls'] # See Note [consistent stacks]
+)
+
+test('scc-prof-overloaded-calls002',
+     [],
+     compile_and_run,
+     # Need optimizations to get rid of unwanted overloaded calls
+     ['-O -fno-prof-auto -fprof-late-overloaded-calls']
+)


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.hs
=====================================
@@ -0,0 +1,24 @@
+-- Running this program should result in two calls to overloaded functions: One
+-- with the $fShowX dictionary, the next with the $fShowList dictionary
+-- constructor for X.
+--
+-- Note that although the `$fShowList` dictionary constructor is itself
+-- overloaded, it should not get an SCC since we avoid instrumenting overloaded
+-- calls that result in dictionaries.
+--
+-- With just -fprof-late-overloaded, only `invoke` should get an SCC, since it
+-- is the only overloaded top level binding. With
+-- `-fprof-late-overloaded-calls`, the calls to both `invoke` and `f` (in the
+-- body of invoke) should get SCCs.
+
+module Main where
+
+{-# NOINLINE invoke #-}
+invoke :: Show a => (Show [a] => [a] -> String) -> a -> String
+invoke f x = f [x]
+
+data X = X
+    deriving Show
+
+main :: IO ()
+main = putStrLn (invoke show X)


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.prof.sample
=====================================
@@ -0,0 +1,26 @@
+	Thu Jan  4 11:49 2024 Time and Allocation Profiling Report  (Final)
+
+	   scc-prof-overloaded-calls001 +RTS -hc -p -RTS
+
+	total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
+	total alloc =      48,320 bytes  (excludes profiling overheads)
+
+COST CENTRE MODULE           SRC              %time %alloc
+
+MAIN        MAIN             <built-in>         0.0   20.5
+CAF         GHC.IO.Handle.FD <entire-module>    0.0   71.9
+CAF         GHC.IO.Encoding  <entire-module>    0.0    5.1
+CAF         GHC.Conc.Signal  <entire-module>    0.0    1.3
+
+
+                                                                                              individual      inherited
+COST CENTRE  MODULE                SRC                                     no.     entries  %time %alloc   %time %alloc
+
+MAIN         MAIN                  <built-in>                              128           0    0.0   20.5     0.0  100.0
+ CAF         Main                  <entire-module>                         255           0    0.0    0.0     0.0    0.8
+  invoke     Main                  scc-prof-overloaded-calls001.hs:24:1-31 256           1    0.0    0.3     0.0    0.8
+   f         Main                  scc-prof-overloaded-calls001.hs:18:1-18 257           1    0.0    0.6     0.0    0.6
+ CAF         GHC.Conc.Signal       <entire-module>                         238           0    0.0    1.3     0.0    1.3
+ CAF         GHC.IO.Encoding       <entire-module>                         219           0    0.0    5.1     0.0    5.1
+ CAF         GHC.IO.Encoding.Iconv <entire-module>                         217           0    0.0    0.4     0.0    0.4
+ CAF         GHC.IO.Handle.FD      <entire-module>                         208           0    0.0   71.9     0.0   71.9


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.stdout
=====================================
@@ -0,0 +1 @@
+[X]


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.hs
=====================================
@@ -0,0 +1,65 @@
+-- Running this program should result in seven calls to overloaded functions
+-- with increasing numbers of dictionary arguments.
+--
+-- With just -fprof-late-overloaded, no SCCs should be added, since none of the
+-- overloaded functions are top level. With `-fprof-late-overloaded-calls`, all
+-- seven calls should get *distinct* SCCs with separate source locations even
+-- though the overloaded functions share an OccName (`f`).
+
+module Main where
+
+data X = X
+
+instance Show     X where
+instance Num      X where
+instance Eq       X where
+instance Enum     X where
+instance Ord      X where
+instance Real     X where
+instance Integral X where
+
+-- No overloaded call
+{-# NOINLINE invoke0 #-}
+invoke0 :: (forall a. a -> a -> String) -> X -> String
+invoke0 f val = f val val
+
+{-# NOINLINE invoke1 #-}
+invoke1 :: (forall a. Show a => a -> a -> String) -> X -> String
+invoke1 f val = f val val
+
+{-# NOINLINE invoke2 #-}
+invoke2 :: (forall a. (Show a, Num a) => a -> a -> String) -> X -> String
+invoke2 f val = f val val
+
+{-# NOINLINE invoke3 #-}
+invoke3 :: (forall a. (Show a, Num a, Eq a) => a -> a -> String) -> X -> String
+invoke3 f val = f val val
+
+{-# NOINLINE invoke4 #-}
+invoke4 :: (forall a. (Show a, Num a, Eq a, Enum a) => a -> a -> String) -> X -> String
+invoke4 f val = f val val
+
+{-# NOINLINE invoke5 #-}
+invoke5 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a) => a -> a -> String) -> X -> String
+invoke5 f val = f val val
+
+{-# NOINLINE invoke6 #-}
+invoke6 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a) => a -> a -> String) -> X -> String
+invoke6 f val = f val val
+
+{-# NOINLINE invoke7 #-}
+invoke7 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a, Integral a) => a -> a -> String) -> X -> String
+invoke7 f val = f val val
+
+main :: IO ()
+main = do
+    putStrLn $ invoke0 (\_ _ -> s) X
+    putStrLn $ invoke1 (\_ _ -> s) X
+    putStrLn $ invoke2 (\_ _ -> s) X
+    putStrLn $ invoke3 (\_ _ -> s) X
+    putStrLn $ invoke4 (\_ _ -> s) X
+    putStrLn $ invoke5 (\_ _ -> s) X
+    putStrLn $ invoke6 (\_ _ -> s) X
+    putStrLn $ invoke7 (\_ _ -> s) X
+  where
+    s = "wibbly"


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.prof.sample
=====================================
@@ -0,0 +1,31 @@
+	Fri Jan  5 11:06 2024 Time and Allocation Profiling Report  (Final)
+
+	   scc-prof-overloaded-calls002 +RTS -p -RTS
+
+	total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
+	total alloc =      59,152 bytes  (excludes profiling overheads)
+
+COST CENTRE MODULE           SRC              %time %alloc
+
+MAIN        MAIN             <built-in>         0.0   34.8
+CAF         GHC.IO.Handle.FD <entire-module>    0.0   58.7
+CAF         GHC.IO.Encoding  <entire-module>    0.0    4.1
+CAF         GHC.Conc.Signal  <entire-module>    0.0    1.1
+
+
+                                                                                              individual      inherited
+COST CENTRE  MODULE                SRC                                     no.     entries  %time %alloc   %time %alloc
+
+MAIN         MAIN                  <built-in>                              128           0    0.0   34.8     0.0  100.0
+ CAF         Main                  <entire-module>                         255           0    0.0    0.6     0.0    0.9
+  f          Main                  scc-prof-overloaded-calls002.hs:52:1-25 262           1    0.0    0.1     0.0    0.1
+  f          Main                  scc-prof-overloaded-calls002.hs:48:1-25 261           1    0.0    0.1     0.0    0.1
+  f          Main                  scc-prof-overloaded-calls002.hs:44:1-25 260           1    0.0    0.1     0.0    0.1
+  f          Main                  scc-prof-overloaded-calls002.hs:40:1-25 259           1    0.0    0.0     0.0    0.0
+  f          Main                  scc-prof-overloaded-calls002.hs:36:1-25 258           1    0.0    0.0     0.0    0.0
+  f          Main                  scc-prof-overloaded-calls002.hs:32:1-25 257           1    0.0    0.0     0.0    0.0
+  f          Main                  scc-prof-overloaded-calls002.hs:28:1-25 256           1    0.0    0.0     0.0    0.0
+ CAF         GHC.Conc.Signal       <entire-module>                         238           0    0.0    1.1     0.0    1.1
+ CAF         GHC.IO.Encoding       <entire-module>                         219           0    0.0    4.1     0.0    4.1
+ CAF         GHC.IO.Encoding.Iconv <entire-module>                         217           0    0.0    0.3     0.0    0.3
+ CAF         GHC.IO.Handle.FD      <entire-module>                         208           0    0.0   58.7     0.0   58.7


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.stdout
=====================================
@@ -0,0 +1,8 @@
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded001.hs
=====================================
@@ -0,0 +1,24 @@
+-- Running this program should result in two calls to overloaded functions: One
+-- with the $fShowX dictionary, the next with the $fShowList dictionary
+-- constructor for X.
+--
+-- Note that although the `$fShowList` dictionary constructor is itself
+-- overloaded, it should not get an SCC since we avoid instrumenting overloaded
+-- calls that result in dictionaries.
+--
+-- With just -fprof-late-overloaded, only `invoke` should get an SCC, since it
+-- is the only overloaded top level binding. With
+-- `-fprof-late-overloaded-calls`, the calls to both `invoke` and `f` (in the
+-- body of invoke) should get SCCs.
+
+module Main where
+
+{-# NOINLINE invoke #-}
+invoke :: Show a => (Show [a] => [a] -> String) -> a -> String
+invoke f x = f [x]
+
+data X = X
+    deriving Show
+
+main :: IO ()
+main = putStrLn (invoke show X)


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded001.prof.sample
=====================================
@@ -0,0 +1,25 @@
+	Thu Jan  4 11:26 2024 Time and Allocation Profiling Report  (Final)
+
+	   scc-prof-overloaded001 +RTS -hc -p -RTS
+
+	total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
+	total alloc =      48,304 bytes  (excludes profiling overheads)
+
+COST CENTRE MODULE           SRC              %time %alloc
+
+MAIN        MAIN             <built-in>         0.0   20.5
+CAF         GHC.IO.Handle.FD <entire-module>    0.0   71.9
+CAF         GHC.IO.Encoding  <entire-module>    0.0    5.1
+CAF         GHC.Conc.Signal  <entire-module>    0.0    1.3
+
+
+                                                                                       individual      inherited
+COST CENTRE  MODULE                SRC                              no.     entries  %time %alloc   %time %alloc
+
+MAIN         MAIN                  <built-in>                       128           0    0.0   20.5     0.0  100.0
+ CAF         Main                  <entire-module>                  255           0    0.0    0.0     0.0    0.8
+  invoke     Main                  scc-prof-overloaded001.hs:18:1-6 256           1    0.0    0.8     0.0    0.8
+ CAF         GHC.Conc.Signal       <entire-module>                  238           0    0.0    1.3     0.0    1.3
+ CAF         GHC.IO.Encoding       <entire-module>                  219           0    0.0    5.1     0.0    5.1
+ CAF         GHC.IO.Encoding.Iconv <entire-module>                  217           0    0.0    0.4     0.0    0.4
+ CAF         GHC.IO.Handle.FD      <entire-module>                  208           0    0.0   71.9     0.0   71.9


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded001.stdout
=====================================
@@ -0,0 +1 @@
+[X]


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded002.hs
=====================================
@@ -0,0 +1,65 @@
+-- Running this program should result in seven calls to overloaded functions
+-- with increasing numbers of dictionary arguments.
+--
+-- With just -fprof-late-overloaded, no SCCs should be added, since none of the
+-- overloaded functions are top level. With `-fprof-late-overloaded-calls`, all
+-- seven calls should get *distinct* SCCs with separate source locations even
+-- though the overloaded functions share an OccName (`f`).
+
+module Main where
+
+data X = X
+
+instance Show     X where
+instance Num      X where
+instance Eq       X where
+instance Enum     X where
+instance Ord      X where
+instance Real     X where
+instance Integral X where
+
+-- No overloaded call
+{-# NOINLINE invoke0 #-}
+invoke0 :: (forall a. a -> a -> String) -> X -> String
+invoke0 f val = f val val
+
+{-# NOINLINE invoke1 #-}
+invoke1 :: (forall a. Show a => a -> a -> String) -> X -> String
+invoke1 f val = f val val
+
+{-# NOINLINE invoke2 #-}
+invoke2 :: (forall a. (Show a, Num a) => a -> a -> String) -> X -> String
+invoke2 f val = f val val
+
+{-# NOINLINE invoke3 #-}
+invoke3 :: (forall a. (Show a, Num a, Eq a) => a -> a -> String) -> X -> String
+invoke3 f val = f val val
+
+{-# NOINLINE invoke4 #-}
+invoke4 :: (forall a. (Show a, Num a, Eq a, Enum a) => a -> a -> String) -> X -> String
+invoke4 f val = f val val
+
+{-# NOINLINE invoke5 #-}
+invoke5 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a) => a -> a -> String) -> X -> String
+invoke5 f val = f val val
+
+{-# NOINLINE invoke6 #-}
+invoke6 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a) => a -> a -> String) -> X -> String
+invoke6 f val = f val val
+
+{-# NOINLINE invoke7 #-}
+invoke7 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a, Integral a) => a -> a -> String) -> X -> String
+invoke7 f val = f val val
+
+main :: IO ()
+main = do
+    putStrLn $ invoke0 (\_ _ -> s) X
+    putStrLn $ invoke1 (\_ _ -> s) X
+    putStrLn $ invoke2 (\_ _ -> s) X
+    putStrLn $ invoke3 (\_ _ -> s) X
+    putStrLn $ invoke4 (\_ _ -> s) X
+    putStrLn $ invoke5 (\_ _ -> s) X
+    putStrLn $ invoke6 (\_ _ -> s) X
+    putStrLn $ invoke7 (\_ _ -> s) X
+  where
+    s = "wibbly"


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded002.prof.sample
=====================================
@@ -0,0 +1,23 @@
+	Thu Jan  4 11:55 2024 Time and Allocation Profiling Report  (Final)
+
+	   scc-prof-overloaded002 +RTS -hc -p -RTS
+
+	total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
+	total alloc =      56,472 bytes  (excludes profiling overheads)
+
+COST CENTRE MODULE           SRC              %time %alloc
+
+MAIN        MAIN             <built-in>         0.0   32.7
+CAF         GHC.IO.Handle.FD <entire-module>    0.0   61.5
+CAF         GHC.IO.Encoding  <entire-module>    0.0    4.3
+CAF         GHC.Conc.Signal  <entire-module>    0.0    1.1
+
+
+                                                                      individual      inherited
+COST CENTRE  MODULE                SRC             no.     entries  %time %alloc   %time %alloc
+
+MAIN         MAIN                  <built-in>      128           0    0.0   32.7     0.0  100.0
+ CAF         GHC.Conc.Signal       <entire-module> 238           0    0.0    1.1     0.0    1.1
+ CAF         GHC.IO.Encoding       <entire-module> 219           0    0.0    4.3     0.0    4.3
+ CAF         GHC.IO.Encoding.Iconv <entire-module> 217           0    0.0    0.4     0.0    0.4
+ CAF         GHC.IO.Handle.FD      <entire-module> 208           0    0.0   61.5     0.0   61.5


=====================================
testsuite/tests/profiling/should_run/scc-prof-overloaded002.stdout
=====================================
@@ -0,0 +1,8 @@
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly
+wibbly



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/61bb5ff68630c203eaae4baba554640246df5a63

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/61bb5ff68630c203eaae4baba554640246df5a63
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240304/9e8a919e/attachment-0001.html>


More information about the ghc-commits mailing list