[Git][ghc/ghc][wip/andreask/spark-gc] Only gc sparks locally when we can ensure marking is done.

Andreas Klebinger (@AndreasK) gitlab at gitlab.haskell.org
Tue Nov 29 11:59:00 UTC 2022



Andreas Klebinger pushed to branch wip/andreask/spark-gc at Glasgow Haskell Compiler / GHC


Commits:
7f8043aa by Andreas Klebinger at 2022-11-29T12:57:06+01:00
Only gc sparks locally when we can ensure marking is done.

When performing GC without work stealing, there was no guarantee that
spark pruning happened after the sparks had been marked. This could
cause us to GC live sparks under certain circumstances.

Fixes #22528.

- - - - -
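For readers outside the RTS: pruneSparkQueue decides whether a spark is
still alive by looking at its closure's header. During a copying GC,
evacuating a closure overwrites that header with a forwarding pointer to
the new copy, so "has a forwarding pointer" doubles as a liveness test,
but only once all evacuation has finished. The standalone C sketch below
illustrates the idea; the types and tagging scheme are simplified
stand-ins, not the real RTS definitions.

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct Closure {
    uintptr_t header;   /* info pointer, or a forwarding pointer during GC */
} Closure;

/* In this sketch a forwarding pointer is marked by its low bit; the
 * real RTS has its own IS_FORWARDING_PTR/UN_FORWARDING_PTR macros. */
static bool is_forwarding_ptr(uintptr_t hdr) { return (hdr & 1) != 0; }
static Closure *un_forwarding_ptr(uintptr_t hdr) {
    return (Closure *)(hdr & ~(uintptr_t)1);
}

/* Return the new location of a live spark, or NULL for a dead one.
 * This test is only sound once *all* marking and evacuation has
 * finished: run too early, a live but not-yet-evacuated spark still
 * carries a plain info pointer and is wrongly reported dead, which is
 * the premature spark GC described in #22528. */
static Closure *prune_one_spark(Closure *spark) {
    uintptr_t hdr = spark->header;
    if (is_forwarding_ptr(hdr)) {
        return un_forwarding_ptr(hdr);   /* live: follow to the new copy */
    }
    return NULL;                         /* not evacuated: discard */
}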


3 changed files:

- docs/users_guide/9.6.1-notes.rst
- rts/Sparks.c
- rts/sm/GC.c


Changes:

=====================================
docs/users_guide/9.6.1-notes.rst
=====================================
@@ -126,6 +126,9 @@ Runtime system
   Previously only live blocks were taken into account.
   This makes it more likely to trigger promptly when the heap is highly fragmented.
 
+- Fixed a bug that sometimes caused live sparks to be GC'd too early, either during
+  minor GC or during major GC with work stealing disabled. See #22528.
+
 
 ``base`` library
 ~~~~~~~~~~~~~~~~
@@ -146,9 +149,9 @@ Runtime system
 
 - Updated to `Unicode 15.0.0 <https://www.unicode.org/versions/Unicode15.0.0/>`_.
 
-- Add standard Unicode case predicates :base-ref:`Data.Char.isUpperCase` and 
-  :base-ref:`Data.Char.isLowerCase`. These predicates use the standard Unicode 
-  case properties and are more intuitive than :base-ref:`Data.Char.isUpper` and 
+- Add standard Unicode case predicates :base-ref:`Data.Char.isUpperCase` and
+  :base-ref:`Data.Char.isLowerCase`. These predicates use the standard Unicode
+  case properties and are more intuitive than :base-ref:`Data.Char.isUpper` and
   :base-ref:`Data.Char.isLower`.
 
 ``ghc-prim`` library


=====================================
rts/Sparks.c
=====================================
@@ -79,6 +79,28 @@ newSpark (StgRegTable *reg, StgClosure *p)
     return 1;
 }
 
+/* Note [Pruning the spark pool]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+pruneSparkQueue checks whether closures have been evacuated to decide whether
+or not a spark can be GC'd. We make sure this is reliable in multiple ways:
++ If we don't run parallel GC then the GC lead does all the pruning after
+  all marking has been done.
++ If we run without work stealing then GC workers are not synchronized, so
+  we leave it to the GC lead to prune sparks once marking is done.
+  The alternative would be to introduce a new synchronization point or to
+  simply keep all sparks alive, both of which seem more costly on average.
++ If work stealing is enabled, all GC threads will be running
+  scavenge_until_all_done until regular heap marking is done. This means
+  we can perform spark pruning in the GC workers themselves upon return
+  from scavenge_until_all_done.
+
+The only wart is that if we prune sparks locally we might miss sparks that
+are reachable only via weak pointers, as these are marked in the main thread
+concurrently with the calls to pruneSparkQueue. Fixing this would require
+another GC barrier, which seems too high a price to pay.
+*/
+
+
 /* --------------------------------------------------------------------------
  * Remove all sparks from the spark queues which should not spark any
  * more.  Called after GC. We assume exclusive access to the structure
@@ -181,7 +203,7 @@ pruneSparkQueue (bool nonmovingMarkFinished, Capability *cap)
           cap->spark_stats.fizzled++;
           traceEventSparkFizzle(cap);
       } else {
-          info = spark->header.info;
+          info = RELAXED_LOAD(&spark->header.info);
           load_load_barrier();
           if (IS_FORWARDING_PTR(info)) {
               tmp = (StgClosure*)UN_FORWARDING_PTR(info);
@@ -226,6 +248,10 @@ pruneSparkQueue (bool nonmovingMarkFinished, Capability *cap)
                       traceEventSparkFizzle(cap);
                   }
               } else {
+                  // Sanity check: the info pointer loaded above should
+                  // still match the spark's header at this point.
+                  if (info != spark->header.info)
+                      barf("pruneSparkQueue: %p, %p, %p\n", info, spark, spark->header.info);
                   pruned_sparks++; // discard spark
                   cap->spark_stats.gcd++;
                   traceEventSparkGC(cap);
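The RELAXED_LOAD change in the hunk above follows the pattern the RTS
uses elsewhere for racy header reads: load the info pointer with relaxed
ordering, then issue a load-load barrier before any loads that depend on
it. A minimal standalone sketch of the pattern in C11 atomics follows;
the RTS defines its own RELAXED_LOAD and load_load_barrier macros, so
the definitions below are illustrative stand-ins only.

#include <stdatomic.h>

/* Stand-ins for the RTS macros, expressed with C11 atomics. */
#define RELAXED_LOAD(ptr)   atomic_load_explicit((ptr), memory_order_relaxed)
#define load_load_barrier() atomic_thread_fence(memory_order_acquire)

typedef struct Closure {
    _Atomic(const void *) info;   /* may be overwritten by another GC thread */
} Closure;

static const void *read_header(Closure *c) {
    /* Another thread may concurrently replace the header with a
     * forwarding pointer, so read it atomically (without tearing)... */
    const void *info = RELAXED_LOAD(&c->info);
    /* ...and order all subsequent loads (e.g. of the object a
     * forwarding pointer refers to) after this load. */
    load_load_barrier();
    return info;
}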


=====================================
rts/sm/GC.c
=====================================
@@ -292,6 +292,7 @@ GarbageCollect (uint32_t collect_gen,
       any_work, scav_find_work, max_n_todo_overflow;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
+  bool gc_sparks_all_caps;
 #endif
   uint32_t g, n;
   // The time we should report our heap census as occurring at, if necessary.
@@ -559,6 +560,9 @@ GarbageCollect (uint32_t collect_gen,
   StgTSO *resurrected_threads = END_TSO_QUEUE;
   // must be last...  invariant is that everything is fully
   // scavenged at this point.
+#if defined(THREADED_RTS)
+  gc_sparks_all_caps = !work_stealing || !is_par_gc();
+#endif
   work_stealing = false;
   while (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads))
   {
@@ -571,7 +575,8 @@ GarbageCollect (uint32_t collect_gen,
   gcStableNameTable();
 
 #if defined(THREADED_RTS)
-  if (!is_par_gc()) {
+  // See Note [Pruning the spark pool]
+  if (gc_sparks_all_caps) {
       for (n = 0; n < n_capabilities; n++) {
           pruneSparkQueue(false, capabilities[n]);
       }
@@ -1387,7 +1392,6 @@ void
 gcWorkerThread (Capability *cap)
 {
     gc_thread *saved_gct;
-
     // necessary if we stole a callee-saves register for gct:
     saved_gct = gct;
 
@@ -1418,13 +1422,10 @@ gcWorkerThread (Capability *cap)
     scavenge_until_all_done();
 
 #if defined(THREADED_RTS)
-    // Now that the whole heap is marked, we discard any sparks that
-    // were found to be unreachable.  The main GC thread is currently
-    // marking heap reachable via weak pointers, so it is
-    // non-deterministic whether a spark will be retained if it is
-    // only reachable via weak pointers.  To fix this problem would
-    // require another GC barrier, which is too high a price.
-    pruneSparkQueue(false, cap);
+    // See Note [Pruning the spark pool]
+    if (work_stealing && is_par_gc()) {
+        pruneSparkQueue(false, cap);
+    }
 #endif
 
     // Wait until we're told to continue
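Taken together, the two GC.c hunks implement the decision table from
Note [Pruning the spark pool]: the GC lead prunes every capability's
pool itself unless work stealing guarantees that workers only return
from scavenge_until_all_done once the whole heap is marked. A compilable
toy version of that decision follows; work_stealing and is_par_gc mirror
the RTS names, but the function itself is illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* May a GC worker prune its own capability's spark pool, or must the
 * GC lead prune all pools once marking is finished? */
static bool workers_prune_locally(bool par_gc, bool work_stealing) {
    /* Only with work stealing do workers sit in scavenge_until_all_done
     * until regular heap marking is done, so only then is a worker-local
     * pruneSparkQueue sound. */
    return par_gc && work_stealing;
}

int main(void) {
    /* Sequential GC: the lead prunes all pools. */
    printf("seq GC, workers prune locally:       %d\n", workers_prune_locally(false, false));
    /* Parallel GC without work stealing: the lead prunes all pools. */
    printf("par no-steal, workers prune locally: %d\n", workers_prune_locally(true, false));
    /* Parallel GC with work stealing: workers prune their own pools. */
    printf("par + steal, workers prune locally:  %d\n", workers_prune_locally(true, true));
    return 0;
}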



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/7f8043aa0c44027568c9b876534f01084721a983
