[Git][ghc/ghc][wip/jsem] mp
Matthew Pickering (@mpickering)
gitlab at gitlab.haskell.org
Wed Oct 26 15:32:16 UTC 2022
Matthew Pickering pushed to branch wip/jsem at Glasgow Haskell Compiler / GHC
Commits:
1b3737f0 by Matthew Pickering at 2022-10-26T16:32:04+01:00
mp
- - - - -
2 changed files:
- compiler/GHC/Driver/MakeSem.hs
- utils/haddock
Changes:
=====================================
compiler/GHC/Driver/MakeSem.hs
=====================================
@@ -2,6 +2,7 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE NumericUnderscores #-}
-- | Implementation of a jobserver using system semaphores.
--
@@ -71,8 +72,8 @@ data JobserverOptions
defaultJobserverOptions :: JobserverOptions
defaultJobserverOptions =
JobserverOptions
- { releaseDebounce = 100
- , setNumCapsDebounce = 100
+ { releaseDebounce = 1000 -- 1 second
+ , setNumCapsDebounce = 1000 -- 1 second
}
-- | Resources available for running jobs, i.e.
@@ -350,7 +351,7 @@ tryAcquire opts js@( Jobserver { jobs = jobs_tvar })
return do
action <- acquireThread js
-- Set a debounce after acquiring a token.
- can_release_tvar <- registerDelay $ releaseDebounce opts
+ can_release_tvar <- registerDelay $ (releaseDebounce opts * 1000)
return $ st { jobserverAction = action
, canReleaseToken = can_release_tvar }
tryAcquire _ _ _ = retry
@@ -368,10 +369,8 @@ tryRelease sjs@( Jobserver { jobs = jobs_tvar } )
, canReleaseToken = can_release_tvar } )
= do
jobs <- readTVar jobs_tvar
- pprTraceM "try_release" (ppr jobs)
guard $ guardRelease jobs
can_release <- readTVar can_release_tvar
- pprTraceM "try_release" (ppr can_release)
guard can_release
return do
action <- releaseThread sjs
@@ -398,7 +397,6 @@ tryNoticeIdle opts jobs_tvar jobserver_state
-> STM (IO JobserverState)
sync_num_caps can_change_numcaps_tvar threadFinished_tmvar = do
mb_ex <- takeTMVar threadFinished_tmvar
- pprTraceM "MB_EX" (text $ show mb_ex)
for_ mb_ex MC.throwM
Jobs { tokensOwned } <- readTVar jobs_tvar
can_change_numcaps <- readTVar can_change_numcaps_tvar
@@ -410,7 +408,7 @@ tryNoticeIdle opts jobs_tvar jobserver_state
then return can_change_numcaps_tvar
else do
setNumCapabilities tokensOwned
- registerDelay $ setNumCapsDebounce opts
+ registerDelay $ (setNumCapsDebounce opts * 1000)
return $
jobserver_state
{ jobserverAction = Idle
@@ -422,7 +420,6 @@ tryStopThread :: TVar JobResources
-> JobserverState
-> STM (IO JobserverState)
tryStopThread jobs_tvar jsj = do
- pprTraceM "TRY STOP THREAD" empty
case jobserverAction jsj of
Acquiring { activeWaitId = wait_id } -> do
jobs <- readTVar jobs_tvar
@@ -430,7 +427,10 @@ tryStopThread jobs_tvar jsj = do
return do
interruptWaitOnSemaphore wait_id
return $ jsj { jobserverAction = Idle }
- Idle -> retry
+ _ -> retry
+ where
+ kill_thread_and_idle tid =
+ killThread tid $> jsj { jobserverAction = Idle }
-- | Main jobserver loop: acquire/release resources as
-- needed for the pending jobs and available semaphore tokens.
@@ -468,7 +468,7 @@ makeJobserver logger sem_name = do
}
jobs_tvar <- newTVarIO init_jobs
let
- opts = defaultJobserverOptions -- TODO: allow this to be configure
+ opts = defaultJobserverOptions -- TODO: allow this to be configured
sjs = Jobserver { jSemaphore = semaphore
, jobs = jobs_tvar
, jobsLogger = logger }
@@ -511,3 +511,56 @@ runJSemAbstractSem logger sem action = MC.mask \ unmask -> do
(_ :: Either MC.SomeException ()) <- MC.try cleanup
MC.throwM e1
Right x -> cleanup $> x
+
+{-
+Note [Architecture of the Job Server]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In `-jsem` mode the amount of parallelism that GHC can use is controlled by a
+system semaphore. We take resources from it when we need them and give them back
+if we don't have enough to do.
+
+A naive implementation would just take and release the semaphore around performing
+the action but this leads to two issues.
+
+* When taking a slot in the semaphore we must call `setNumCapabilities` in order
+ to adjust how many capabilities are available for parallel garbage collection. This
+ causes a synchronisation.
+* We want to implement a debounce so that whilst there is pending work in the current
+ process we prefer to keep hold of resources from the semaphore. This reduces
+ overall memory usage as there are fewer live GHC processes at once.
+
+Therefore the acquisition of semaphore resources is separated from the
+request for the resource in the driver.
+
+A slot from the semaphore is requested using `acquireJob`; this creates a pending
+job, which is an MVar which can be filled in to signal that the requested slot is ready.
+
+When the job is finished, the slot is released by calling `releaseJob`, which just
+increases the number of `free` jobs. If there are more pending jobs when the free count
+is increased the slot is immediately reused (see `modifyJobResources`).
+
+The `jobServerLoop` interacts with the system semaphore: when there are still pending
+jobs, `acquireThread` blocks waiting for a slot in the semaphore and increases
+the owned count when the slot is obtained.
+
+When there are free slots, no pending jobs and the debounce has expired
+then `releaseThread` will release slots back to the global semaphore.
+
+`tryStopThread` attempts to kill threads which are waiting to acquire a resource
+when we no longer need it. For example, consider that we attempt to acquire two
+slots of the semaphore but the first job finishes before we acquire the second resource;
+the second slot is no longer needed so we should cancel the wait (as it would not be used to
+do any work and not returned until the debounce). We only need to kill threads in the acquiring
+state because the releasing state can't block.
+
+Note [Eventlog Messages for jsem]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It can be tricky to verify that the work is shared adequately across different
+processes. To help debug this, whenever the global state changes, the values of
+`JobResources` are output to the eventlog. There are some scripts which can be used
+to analyse this output and report statistics about core saturation in this
+GitHub repo (https://github.com/mpickering/ghc-jsem-analyse).
+
+-}
=====================================
utils/haddock
=====================================
@@ -1 +1 @@
-Subproject commit 57b7493ba60bc4f4cf6b57b900b0c46fe8d86669
+Subproject commit e5b41a9f92de608f3605ef54da5709074e189ad9
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1b3737f00937862a5d8c132e3deb8a3272543cd4
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1b3737f00937862a5d8c132e3deb8a3272543cd4
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20221026/40cacc3d/attachment-0001.html>
More information about the ghc-commits
mailing list