[Git][ghc/ghc][wip/jsem] mp

Wed Oct 26 15:32:16 UTC 2022


Matthew Pickering pushed to branch wip/jsem at Glasgow Haskell Compiler / GHC


Commits:
1b3737f0 by Matthew Pickering at 2022-10-26T16:32:04+01:00
mp

- - - - -


2 changed files:

- compiler/GHC/Driver/MakeSem.hs
- utils/haddock


Changes:

=====================================
compiler/GHC/Driver/MakeSem.hs
=====================================
@@ -2,6 +2,7 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE RecordWildCards #-}
 {-# LANGUAGE TupleSections #-}
+{-# LANGUAGE NumericUnderscores #-}
 
 -- | Implementation of a jobserver using system semaphores.
 --
@@ -71,8 +72,8 @@ data JobserverOptions
 defaultJobserverOptions :: JobserverOptions
 defaultJobserverOptions =
   JobserverOptions
-    { releaseDebounce    = 100
-    , setNumCapsDebounce = 100
+    { releaseDebounce    = 1000 -- 1 second
+    , setNumCapsDebounce = 1000 -- 1 second
     }
 
 -- | Resources available for running jobs, i.e.
@@ -350,7 +351,7 @@ tryAcquire opts js@( Jobserver { jobs = jobs_tvar })
     return do
       action           <- acquireThread js
       -- Set a debounce after acquiring a token.
-      can_release_tvar <- registerDelay $ releaseDebounce opts
+      can_release_tvar <- registerDelay $ (releaseDebounce opts * 1000)
       return $ st { jobserverAction = action
                   , canReleaseToken = can_release_tvar }
 tryAcquire _ _ _ = retry
@@ -368,10 +369,8 @@ tryRelease sjs@( Jobserver { jobs = jobs_tvar } )
       , canReleaseToken = can_release_tvar } )
   = do
     jobs <- readTVar jobs_tvar
-    pprTraceM "try_release" (ppr jobs)
     guard  $ guardRelease jobs
     can_release <- readTVar can_release_tvar
-    pprTraceM "try_release" (ppr can_release)
     guard can_release
     return do
       action <- releaseThread sjs
@@ -398,7 +397,6 @@ tryNoticeIdle opts jobs_tvar jobserver_state
                   -> STM (IO JobserverState)
     sync_num_caps can_change_numcaps_tvar threadFinished_tmvar = do
       mb_ex <- takeTMVar threadFinished_tmvar
-      pprTraceM "MB_EX" (text $ show mb_ex)
       for_ mb_ex MC.throwM
       Jobs { tokensOwned } <- readTVar jobs_tvar
       can_change_numcaps <- readTVar can_change_numcaps_tvar
@@ -410,7 +408,7 @@ tryNoticeIdle opts jobs_tvar jobserver_state
           then return can_change_numcaps_tvar
           else do
             setNumCapabilities tokensOwned
-            registerDelay $ setNumCapsDebounce opts
+            registerDelay $ (setNumCapsDebounce opts * 1000)
         return $
           jobserver_state
             { jobserverAction  = Idle
@@ -422,7 +420,6 @@ tryStopThread :: TVar JobResources
               -> JobserverState
               -> STM (IO JobserverState)
 tryStopThread jobs_tvar jsj = do
-  pprTraceM "TRY STOP THREAD" empty
   case jobserverAction jsj of
     Acquiring { activeWaitId = wait_id } -> do
      jobs <- readTVar jobs_tvar
@@ -430,7 +427,10 @@ tryStopThread jobs_tvar jsj = do
      return do
        interruptWaitOnSemaphore wait_id
        return $ jsj { jobserverAction = Idle }
-    Idle -> retry
+    _ -> retry
+    where
+     kill_thread_and_idle tid =
+       killThread tid $> jsj { jobserverAction = Idle }
 
 -- | Main jobserver loop: acquire/release resources as
 -- needed for the pending jobs and available semaphore tokens.
@@ -468,7 +468,7 @@ makeJobserver logger sem_name = do
            }
   jobs_tvar <- newTVarIO init_jobs
   let
-    opts = defaultJobserverOptions -- TODO: allow this to be configure
+    opts = defaultJobserverOptions -- TODO: allow this to be configured
     sjs = Jobserver { jSemaphore = semaphore
                     , jobs       = jobs_tvar
                     , jobsLogger = logger }
@@ -511,3 +511,56 @@ runJSemAbstractSem logger sem action = MC.mask \ unmask -> do
       (_ :: Either MC.SomeException ()) <- MC.try cleanup
       MC.throwM e1
     Right x -> cleanup $> x
+
+{-
+Note [Architecture of the Job Server]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In `-jsem` mode the amount of parrelism that GHC can use is controlled by a
+system semaphore. We take resources from it when we need them and give them back
+if we don't have enought to do.
+
+A naive implementation would just take and release the semaphore around performing
+the action but this leads to two issues.
+
+* When taking a slot in the semaphore we must call `setNumCapabilities` in order
+  to adjust how many capabilities are available for parralel garbage collection. This
+  causes a synchronisation
+* We want to implement a debounce so that whilst there is pending work in the current
+  process we prefer to keep hold of resources from the semaphore. This reduces
+  overall memory usage as there are less live GHC processes at once.
+
+Therefore the obtention of semaphore resources is separated away from the
+request for the resource in the driver.
+
+A slot from the semaphore is requested using `acquireJob`, this creates a pending
+job which is a MVar which can be filling in to signal that the requested slot is ready.
+
+When the job is finished, the slot is released by calling `releaseJob`, which just
+increases the number of `free` jobs. If there are more pending jobs when the free count
+is increased the slot is immediately reused (see `modifyJobResources`).
+
+The `jobServerLoop` interacts with the system semaphore, when there are still pending
+jobs then `acquireThread` blocks waiting for a slot in the semaphore and increases
+the owned count when the slot is obtained.
+
+When there are free slots, no pending jobs and the debounce has expired
+then `releaseThread` will release slots back to the global semaphore.
+
+`tryStopThread` attempts to kill threads which are waiting to acquire a resource
+when we no longer need it. For example, consider that we attempt to acquire two
+slots of the semaphore but the first job finishes before we acquire the second resources,
+the second slot is no longer needed so we should cancel the wait (as it would not be used to
+do any work and not returned until the debounce). We just need to kill in the acquiring
+state because the releading state can't block.
+
+Note [Eventlog Messages for jsem]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It can be tricky to verify that the work is shared adequately across different
+processes. To help debug this whenever the global state changes the values of
+`JobResources` are output to the eventlog. There are some scripts which can be used
+to analyse this output and report statistics about core saturation in this
+github repo (https://github.com/mpickering/ghc-jsem-analyse).
+
+-}


=====================================
utils/haddock
=====================================
@@ -1 +1 @@
-Subproject commit 57b7493ba60bc4f4cf6b57b900b0c46fe8d86669
+Subproject commit e5b41a9f92de608f3605ef54da5709074e189ad9



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1b3737f00937862a5d8c132e3deb8a3272543cd4

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/1b3737f00937862a5d8c132e3deb8a3272543cd4
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20221026/40cacc3d/attachment-0001.html>