[Git][ghc/ghc][wip/gc/optimize] 17 commits: Nonmoving: Allow aging and refactor static objects logic

Ben Gamari gitlab at gitlab.haskell.org
Wed Jun 19 03:37:32 UTC 2019



Ben Gamari pushed to branch wip/gc/optimize at Glasgow Haskell Compiler / GHC


Commits:
88607191 by Ben Gamari at 2019-06-19T03:32:50Z
Nonmoving: Allow aging and refactor static objects logic

This commit does two things:

 * Allow aging of objects during the preparatory minor GC
 * Refactor handling of static objects to avoid the use of a hashtable

- - - - -
94b2eae1 by Ben Gamari at 2019-06-19T03:32:50Z
Disable aging when doing deadlock detection GC

- - - - -
3facbbd9 by Ben Gamari at 2019-06-19T03:32:51Z
More comments for aging

- - - - -
44afa56b by Ben Gamari at 2019-06-19T03:33:51Z
NonMoving: Eliminate integer division in nonmovingBlockCount

Perf showed that this single div was capturing up to 10% of samples
in nonmovingMark. However, the overwhelming majority of cases involve
small block sizes. These cases we can easily compute explicitly,
allowing the compiler to turn the division into a significantly more
efficient division-by-constant.

While the increase in source code looks scary, this all optimises down
to very nice looking assembler. At this point the only remaining
hotspots in nonmovingBlockCount are due to memory access.
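
For illustration, here is a minimal standalone sketch of the technique,
assuming a byte-per-block mark bitmap and an illustrative segment size
(the committed version is nonmovingBlockCountFromSize in the
rts/sm/NonMoving.c hunk further down; the constant below is not the
RTS's actual segment size):

    #include <stdint.h>

    #define SEG_DATA_SIZE 32704u  /* illustrative usable bytes per segment */

    /* Generic form: log_block_size is a run-time value, so the compiler has
     * to emit a real div. The +1 accounts for the block's byte in the mark
     * bitmap. */
    static inline unsigned int block_count_generic(uint8_t log_block_size)
    {
        unsigned int blk_size = 1u << log_block_size;
        return SEG_DATA_SIZE / (blk_size + 1);
    }

    /* Specialised form: for the common small sizes the divisor is a
     * compile-time constant, so each case folds to a constant (or at worst a
     * cheap multiply/shift) instead of a div. */
    unsigned int block_count(uint8_t log_block_size)
    {
        switch (log_block_size) {
          case 3:  return block_count_generic(3);
          case 4:  return block_count_generic(4);
          case 5:  return block_count_generic(5);
          case 6:  return block_count_generic(6);
          case 7:  return block_count_generic(7);
          default: return block_count_generic(log_block_size);
        }
    }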

- - - - -
93145511 by Ben Gamari at 2019-06-19T03:34:14Z
Allocate mark queues in larger block groups

- - - - -
b952914e by Ben Gamari at 2019-06-19T03:34:23Z
NonMovingMark: Optimize representation of mark queue

This shortens MarkQueueEntry by 30% (one word)

- - - - -
4fcb6e9e by Ben Gamari at 2019-06-19T03:34:23Z
NonMoving: Optimize bitmap search during allocation

Use memchr instead of an open-coded loop. This is nearly twice as fast in
a synthetic benchmark.
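
As a rough sketch of the idea, assuming the byte-per-block mark bitmap
used by the nonmoving heap (the committed change is advance_next_free in
the rts/sm/NonMoving.c hunk further down):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /* Find the next unmarked (zero) byte after *next_free; returns true if
     * the segment is full. memchr scans a word (or vector) at a time, whereas
     * the old open-coded loop inspected one bitmap byte per iteration.
     * Precondition: *next_free < blk_count. */
    static bool advance_next_free_sketch(const uint8_t *bitmap,
                                         unsigned int *next_free,
                                         unsigned int blk_count)
    {
        const uint8_t *c = memchr(&bitmap[*next_free + 1], 0,
                                  blk_count - *next_free - 1);
        if (c == NULL) {
            *next_free = blk_count;   /* no free block left */
            return true;
        } else {
            *next_free = (unsigned int)(c - bitmap);
            return false;
        }
    }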

- - - - -
73f21d4d by Ben Gamari at 2019-06-19T03:34:23Z
rts: Add prefetch macros

- - - - -
e5fcd5c9 by Ben Gamari at 2019-06-19T03:34:23Z
NonMoving: Prefetch when clearing bitmaps

Ensure that the bitmap of the segment that we will clear next is in
cache by the time we reach it.
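
A simplified sketch of the pattern, with a stand-in segment type (the
committed loop is in the nonmovingPrepareMark hunk of rts/sm/NonMoving.c
further down; the prefetch macros come from the includes/Rts.h hunk):

    #include <string.h>

    #define prefetchForRead(p)  __builtin_prefetch((p), 0)
    #define prefetchForWrite(p) __builtin_prefetch((p), 1)

    struct Seg {
        struct Seg *link;          /* next filled segment */
        unsigned int n_blocks;     /* bitmap is one byte per block */
        unsigned char bitmap[];
    };

    static void clear_bitmaps_sketch(struct Seg *seg)
    {
        for (; seg != NULL; seg = seg->link) {
            if (seg->link != NULL) {
                /* Start pulling the next segment's header and bitmap into
                 * cache while we clear the current one. Computing the bitmap
                 * address does not dereference the next segment. */
                prefetchForRead(seg->link);
                prefetchForWrite(seg->link->bitmap);
            }
            memset(seg->bitmap, 0, seg->n_blocks);
        }
    }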

- - - - -
3c033190 by Ben Gamari at 2019-06-19T03:34:23Z
NonMoving: Inline nonmovingClearAllBitmaps

- - - - -
3f6009d2 by Ben Gamari at 2019-06-19T03:34:24Z
NonMoving: Fuse sweep preparation into mark prep

- - - - -
c1f16cc0 by Ben Gamari at 2019-06-19T03:34:24Z
NonMoving: Pre-fetch during mark

This improved overall runtime on nofib's constraints test by nearly 10%.
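
Sketched below is the shape of the bounded prefetch window added to
markQueuePop (the real code is in the rts/sm/NonMovingMark.c hunk further
down). The entry type, the window depth, and pop_underlying are simplified
stand-ins, and the real version also drains any entries still buffered in
the window once the underlying queue is empty:

    #include <stddef.h>

    #define PREFETCH_DEPTH 5                /* illustrative window size */

    typedef struct { void *obj; } Entry;    /* stand-in for MarkQueueEnt */

    typedef struct {
        Entry ring[PREFETCH_DEPTH];         /* already-prefetched entries */
        unsigned int head;
        /* underlying block-structured queue elided */
    } Queue;

    extern Entry pop_underlying(Queue *q);  /* returns {NULL} when empty */

    static Entry pop_with_prefetch(Queue *q)
    {
        unsigned int i = q->head;
        /* Refill empty slots, issuing a prefetch for each new entry; by the
         * time the window wraps back to a filled slot, that object's header
         * has had PREFETCH_DEPTH pops' worth of time to arrive in cache. */
        while (q->ring[i].obj == NULL) {
            Entry next = pop_underlying(q);
            if (next.obj == NULL)
                return next;                /* underlying queue drained */
            __builtin_prefetch(next.obj, 0);
            q->ring[i] = next;
            i = (i + 1) % PREFETCH_DEPTH;
        }
        Entry ret = q->ring[i];
        q->ring[i].obj = NULL;
        q->head = i;
        return ret;
    }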

- - - - -
70d5d812 by Ben Gamari at 2019-06-19T03:34:24Z
NonMoving: Prefetch segment header

- - - - -
ddb3648b by Ben Gamari at 2019-06-19T03:34:24Z
NonMoving: Optimise allocator cache behavior

Previously we would look at the segment header to determine the block
size despite the fact that we already had the block size at hand.
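
A compact sketch of the shape of the change, with illustrative names and a
made-up data_off parameter (the committed version is
nonmovingSegmentGetBlock_ in the rts/sm/NonMoving.h hunk further down):

    #include <assert.h>
    #include <stdint.h>

    struct Seg {
        uint8_t log_block_size;   /* stored in the segment header */
        /* mark bitmap and block data follow */
    };

    /* Old shape: recovers the block size from the segment header on every
     * allocation, a likely cache miss. */
    static void *seg_get_block(struct Seg *seg, unsigned int data_off,
                               unsigned int i)
    {
        unsigned int blk_size = 1u << seg->log_block_size;
        return (char *)seg + data_off + i * blk_size;
    }

    /* New shape: the caller passes down the block size it already computed;
     * the header is only touched by the debug-build consistency check. */
    static void *seg_get_block_(struct Seg *seg, uint8_t log_block_size,
                                unsigned int data_off, unsigned int i)
    {
        assert(log_block_size == seg->log_block_size);
        unsigned int blk_size = 1u << log_block_size;
        return (char *)seg + data_off + i * blk_size;
    }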

- - - - -
757621d8 by Ben Gamari at 2019-06-19T03:34:24Z
NonMovingMark: Eliminate redundant check_in_nonmoving_heaps

- - - - -
cbe2959b by Ben Gamari at 2019-06-19T03:34:24Z
NonMoving: Don't do major GC if one is already running

Previously we would perform a preparatory moving collection, resulting
in many things being added to the mark queue. When we finished with this
we would realize in nonmovingCollect that there was already a collection
running, in which case we would simply not run the nonmoving collector.

However, it was very easy to end up in a "treadmilling" situation: all
subsequent GCs following the first failed major GC would be scheduled as
major GCs. Consequently we would continuously feed the concurrent
collector with more mark queue entries and it would never finish.

This patch aborts the major collection far earlier, meaning that we
avoid adding nonmoving objects to the mark queue and allow the
concurrent collector to finish.
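
In outline, the check added to GarbageCollect looks like the following
standalone sketch (simplified names; the real hunk is in rts/sm/GC.c
further down):

    #include <stdbool.h>

    struct GcPlan {
        unsigned int collect_gen;   /* oldest generation to collect */
        bool major_gc;
    };

    /* If the concurrent nonmoving collector is still running, demote the
     * requested major collection to a minor one up front, so the preparatory
     * GC never feeds the concurrent mark with work it cannot catch up on. */
    static void maybe_demote(struct GcPlan *plan, bool use_nonmoving,
                             bool concurrent_coll_running)
    {
        if (plan->major_gc && use_nonmoving && concurrent_coll_running) {
            plan->collect_gen--;
            plan->major_gc = false;
        }
    }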

- - - - -
a5cd845b by Ben Gamari at 2019-06-19T03:34:24Z
Nonmoving: Ensure write barrier vanishes in non-threaded RTS

- - - - -


22 changed files:

- includes/Cmm.h
- includes/Rts.h
- includes/rts/NonMoving.h
- rts/Messages.c
- rts/PrimOps.cmm
- rts/STM.c
- rts/Schedule.c
- rts/ThreadPaused.c
- rts/Threads.c
- rts/Updates.h
- rts/sm/Evac.c
- rts/sm/GC.c
- rts/sm/GC.h
- rts/sm/GCAux.c
- rts/sm/NonMoving.c
- rts/sm/NonMoving.h
- rts/sm/NonMovingMark.c
- rts/sm/NonMovingMark.h
- rts/sm/NonMovingScav.c
- rts/sm/NonMovingSweep.c
- rts/sm/NonMovingSweep.h
- rts/sm/Storage.c


Changes:

=====================================
includes/Cmm.h
=====================================
@@ -940,19 +940,23 @@
     return (dst);
 
 
+//
+// Nonmoving write barrier helpers
+//
+// See Note [Update remembered set] in NonMovingMark.c.
+
 #if defined(THREADED_RTS)
-#define IF_WRITE_BARRIER_ENABLED                               \
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
     if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False)
 #else
 // A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C--
-#define IF_WRITE_BARRIER_ENABLED                               \
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
     if (0)
 #define nonmoving_write_barrier_enabled 0
 #endif
 
 // A useful helper for pushing a pointer to the update remembered set.
-// See Note [Update remembered set] in NonMovingMark.c.
 #define updateRemembSetPushPtr(p)                                    \
-    IF_WRITE_BARRIER_ENABLED {                                       \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                             \
       ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr");     \
     }


=====================================
includes/Rts.h
=====================================
@@ -74,6 +74,10 @@ extern "C" {
 #define RTS_UNREACHABLE abort()
 #endif
 
+/* Prefetch primitives */
+#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
+#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
+
 /* Fix for mingw stat problem (done here so it's early enough) */
 #if defined(mingw32_HOST_OS)
 #define __MSVCRT__ 1


=====================================
includes/rts/NonMoving.h
=====================================
@@ -21,4 +21,7 @@ void updateRemembSetPushClosure(Capability *cap, StgClosure *p);
 
 void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p);
 
+// Note that RTS code should not condition on this directly but rather
+// use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
+// the barrier is eliminated in the non-threaded RTS.
 extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);


=====================================
rts/Messages.c
=====================================
@@ -256,7 +256,7 @@ loop:
 
         // point to the BLOCKING_QUEUE from the BLACKHOLE
         write_barrier(); // make the BQ visible
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure*)p);
         }
         ((StgInd*)bh)->indirectee = (StgClosure *)bq;
@@ -287,7 +287,7 @@ loop:
         }
 #endif
 
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             // We are about to overwrite bq->queue; make sure its current value
             // makes it into the update remembered set
             updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);


=====================================
rts/PrimOps.cmm
=====================================
@@ -474,7 +474,7 @@ stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n)
     end = p + WDS(n);
 
 again:
-    IF_WRITE_BARRIER_ENABLED {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr");
     }
     p = p + WDS(1);
@@ -490,7 +490,7 @@ stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
     W_ dst_p, src_p, bytes;
 
     if (n > 0) {
-        IF_WRITE_BARRIER_ENABLED {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
                                       dst, dst_off, n);
         }
@@ -511,7 +511,7 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
     W_ dst_p, src_p, bytes;
 
     if (n > 0) {
-        IF_WRITE_BARRIER_ENABLED {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
                                       dst, dst_off, n);
         }


=====================================
rts/STM.c
=====================================
@@ -297,8 +297,10 @@ static StgClosure *lock_tvar(Capability *cap,
   } while (cas((void *)&(s -> current_value),
                (StgWord)result, (StgWord)trec) != (StgWord)result);
 
-  if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
-      updateRemembSetPushClosure(cap, result);
+
+  IF_NONMOVING_WRITE_BARRIER_ENABLED {
+      if (result)
+          updateRemembSetPushClosure(cap, result);
   }
   return result;
 }
@@ -323,8 +325,9 @@ static StgBool cond_lock_tvar(Capability *cap,
   TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected);
   w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec);
   result = (StgClosure *)w;
-  if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
-      updateRemembSetPushClosure(cap, expected);
+  IF_NONMOVING_WRITE_BARRIER_ENABLED {
+      if (result)
+          updateRemembSetPushClosure(cap, expected);
   }
   TRACE("%p : %s", trec, result ? "success" : "failure");
   return (result == expected);


=====================================
rts/Schedule.c
=====================================
@@ -164,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t );
 static bool scheduleHandleThreadFinished( Capability *cap, Task *task,
                                           StgTSO *t );
 static bool scheduleNeedHeapProfile(bool ready_to_gc);
-static void scheduleDoGC(Capability **pcap, Task *task, bool force_major);
+static void scheduleDoGC( Capability **pcap, Task *task,
+                          bool force_major, bool deadlock_detect );
 
 static void deleteThread (StgTSO *tso);
 static void deleteAllThreads (void);
@@ -264,7 +265,7 @@ schedule (Capability *initialCapability, Task *task)
     case SCHED_INTERRUPTING:
         debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
         /* scheduleDoGC() deletes all the threads */
-        scheduleDoGC(&cap,task,true);
+        scheduleDoGC(&cap,task,true,false);
 
         // after scheduleDoGC(), we must be shutting down.  Either some
         // other Capability did the final GC, or we did it above,
@@ -561,7 +562,7 @@ run_thread:
     }
 
     if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
-      scheduleDoGC(&cap,task,false);
+      scheduleDoGC(&cap,task,false,false);
     }
   } /* end of while() */
 }
@@ -935,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
         // they are unreachable and will therefore be sent an
         // exception.  Any threads thus released will be immediately
         // runnable.
-        scheduleDoGC (pcap, task, true/*force major GC*/);
+        scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/);
         cap = *pcap;
         // when force_major == true. scheduleDoGC sets
         // recent_activity to ACTIVITY_DONE_GC and turns off the timer
@@ -1005,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
     while (!emptyInbox(cap)) {
         // Executing messages might use heap, so we should check for GC.
         if (doYouWantToGC(cap)) {
-            scheduleDoGC(pcap, cap->running_task, false);
+            scheduleDoGC(pcap, cap->running_task, false, false);
             cap = *pcap;
         }
 
@@ -1552,9 +1553,11 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
  * Perform a garbage collection if necessary
  * -------------------------------------------------------------------------- */
 
+// N.B. See Note [Deadlock detection under nonmoving collector] for rationale
+// behind deadlock_detect argument.
 static void
 scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
-              bool force_major)
+              bool force_major, bool deadlock_detect)
 {
     Capability *cap = *pcap;
     bool heap_census;
@@ -1847,9 +1850,9 @@ delete_threads_and_gc:
     // emerge they don't immediately re-enter the GC.
     pending_sync = 0;
     signalCondition(&sync_finished_cond);
-    GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
+    GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap);
 #else
-    GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
+    GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL);
 #endif
 
     // If we're shutting down, don't leave any idle GC work to do.
@@ -2500,7 +2503,7 @@ resumeThread (void *task_)
     incall->suspended_tso = NULL;
     incall->suspended_cap = NULL;
     // we will modify tso->_link
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushClosure(cap, (StgClosure *)tso->_link);
     }
     tso->_link = END_TSO_QUEUE;
@@ -2717,7 +2720,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
         nonmovingExit();
         Capability *cap = task->cap;
         waitForCapability(&cap,task);
-        scheduleDoGC(&cap,task,true);
+        scheduleDoGC(&cap,task,true,false);
         ASSERT(task->incall->tso == NULL);
         releaseCapability(cap);
     }
@@ -2785,7 +2788,7 @@ performGC_(bool force_major)
     // TODO: do we need to traceTask*() here?
 
     waitForCapability(&cap,task);
-    scheduleDoGC(&cap,task,force_major);
+    scheduleDoGC(&cap,task,force_major,false);
     releaseCapability(cap);
     boundTaskExiting(task);
 }


=====================================
rts/ThreadPaused.c
=====================================
@@ -330,15 +330,16 @@ threadPaused(Capability *cap, StgTSO *tso)
             }
 #endif
 
-            if (RTS_UNLIKELY(nonmoving_write_barrier_enabled
-                             && ip_THUNK(INFO_PTR_TO_STRUCT(bh_info)))) {
-                // We are about to replace a thunk with a blackhole.
-                // Add the free variables of the closure we are about to
-                // overwrite to the update remembered set.
-                // N.B. We caught the WHITEHOLE case above.
-                updateRemembSetPushThunkEager(cap,
-                                             THUNK_INFO_PTR_TO_STRUCT(bh_info),
-                                             (StgThunk *) bh);
+            IF_NONMOVING_WRITE_BARRIER_ENABLED {
+                if (ip_THUNK(INFO_PTR_TO_STRUCT(bh_info))) {
+                    // We are about to replace a thunk with a blackhole.
+                    // Add the free variables of the closure we are about to
+                    // overwrite to the update remembered set.
+                    // N.B. We caught the WHITEHOLE case above.
+                    updateRemembSetPushThunkEager(cap,
+                                                 THUNK_INFO_PTR_TO_STRUCT(bh_info),
+                                                 (StgThunk *) bh);
+                }
             }
 
             // The payload of the BLACKHOLE points to the TSO


=====================================
rts/Threads.c
=====================================
@@ -711,7 +711,7 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
             barf("threadStackUnderflow: not enough space for return values");
         }
 
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             // ensure that values that we copy into the new stack are marked
             // for the nonmoving collector. Note that these values won't
             // necessarily form a full closure so we need to handle them


=====================================
rts/Updates.h
=====================================
@@ -44,7 +44,7 @@
     W_ bd;                                                      \
                                                                 \
     OVERWRITING_CLOSURE(p1);                                    \
-    IF_WRITE_BARRIER_ENABLED {                                  \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                        \
       ccall updateRemembSetPushThunk_(BaseReg, p1 "ptr");       \
     }                                                           \
     StgInd_indirectee(p1) = p2;                                 \
@@ -73,7 +73,7 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,
     /* not necessarily true: ASSERT( !closure_IND(p1) ); */
     /* occurs in RaiseAsync.c:raiseAsync() */
     OVERWRITING_CLOSURE(p1);
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushThunk(cap, (StgThunk*)p1);
     }
     ((StgInd *)p1)->indirectee = p2;


=====================================
rts/sm/Evac.c
=====================================
@@ -69,12 +69,6 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
 {
     ASSERT(gen_no < RtsFlags.GcFlags.generations);
 
-    if (RtsFlags.GcFlags.useNonmoving && major_gc) {
-        // unconditionally promote to non-moving heap in major gc
-        gct->copied += size;
-        return nonmovingAllocate(gct->cap, size);
-    }
-
     StgPtr to;
     gen_workspace *ws;
 
@@ -91,9 +85,34 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
         }
     }
 
-    if (RtsFlags.GcFlags.useNonmoving && gen_no == oldest_gen->no) {
-        gct->copied += size;
-        return nonmovingAllocate(gct->cap, size);
+    if (RtsFlags.GcFlags.useNonmoving) {
+        /* See Note [Deadlock detection under nonmoving collector]. */
+        if (deadlock_detect_gc)
+            gen_no = oldest_gen->no;
+
+        if (gen_no == oldest_gen->no) {
+            gct->copied += size;
+            to = nonmovingAllocate(gct->cap, size);
+
+            // Add segment to the todo list unless it's already there
+            // current->todo_link == NULL means not in todo list
+            struct NonmovingSegment *seg = nonmovingGetSegment(to);
+            if (!seg->todo_link) {
+                gen_workspace *ws = &gct->gens[oldest_gen->no];
+                seg->todo_link = ws->todo_seg;
+                ws->todo_seg = seg;
+            }
+
+            // The object which refers to this closure may have been aged (i.e.
+            // retained in a younger generation). Consequently, we must add the
+            // closure to the mark queue to ensure that it will be marked.
+            //
+            // However, if we are in a deadlock detection GC then we disable aging
+            // so there is no need.
+            if (major_gc && !deadlock_detect_gc)
+                markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to);
+            return to;
+        }
     }
 
     ws = &gct->gens[gen_no];  // zero memory references here
@@ -312,9 +331,10 @@ evacuate_large(StgPtr p)
    */
   new_gen_no = bd->dest_no;
 
-  if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+  if (deadlock_detect_gc) {
+      /* See Note [Deadlock detection under nonmoving collector]. */
       new_gen_no = oldest_gen->no;
-  } else if (new_gen_no < gct->evac_gen_no) {
+  } else  if (new_gen_no < gct->evac_gen_no) {
       if (gct->eager_promotion) {
           new_gen_no = gct->evac_gen_no;
       } else {
@@ -363,6 +383,13 @@ evacuate_large(StgPtr p)
 STATIC_INLINE void
 evacuate_static_object (StgClosure **link_field, StgClosure *q)
 {
+    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) {
+        // See Note [Static objects under the nonmoving collector] in Storage.c.
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+        return;
+    }
+
     StgWord link = (StgWord)*link_field;
 
     // See Note [STATIC_LINK fields] for how the link field bits work
@@ -603,6 +630,8 @@ loop:
           // NOTE: large objects in nonmoving heap are also marked with
           // BF_NONMOVING. Those are moved to scavenged_large_objects list in
           // mark phase.
+          if (major_gc && !deadlock_detect_gc)
+              markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
           return;
       }
 
@@ -629,6 +658,13 @@ loop:
       // they are not)
       if (bd->flags & BF_COMPACT) {
           evacuate_compact((P_)q);
+
+          // We may have evacuated the block to the nonmoving generation. If so
+          // we need to make sure it is added to the mark queue since the only
+          // reference to it may be from the moving heap.
+          if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) {
+              markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+          }
           return;
       }
 
@@ -636,6 +672,13 @@ loop:
        */
       if (bd->flags & BF_LARGE) {
           evacuate_large((P_)q);
+
+          // We may have evacuated the block to the nonmoving generation. If so
+          // we need to make sure it is added to the mark queue since the only
+          // reference to it may be from the moving heap.
+          if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) {
+              markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+          }
           return;
       }
 
@@ -937,6 +980,8 @@ evacuate_BLACKHOLE(StgClosure **p)
     ASSERT((bd->flags & BF_COMPACT) == 0);
 
     if (bd->flags & BF_NONMOVING) {
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
         return;
     }
 


=====================================
rts/sm/GC.c
=====================================
@@ -108,6 +108,7 @@
  */
 uint32_t N;
 bool major_gc;
+bool deadlock_detect_gc;
 
 /* Data used for allocation area sizing.
  */
@@ -198,6 +199,7 @@ StgPtr mark_sp;            // pointer to the next unallocated mark stack entry
 void
 GarbageCollect (uint32_t collect_gen,
                 bool do_heap_census,
+                bool deadlock_detect,
                 uint32_t gc_type USED_IF_THREADS,
                 Capability *cap,
                 bool idle_cap[])
@@ -267,7 +269,25 @@ GarbageCollect (uint32_t collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);
 
-  if (major_gc) {
+  /* See Note [Deadlock detection under nonmoving collector]. */
+  deadlock_detect_gc = deadlock_detect;
+
+#if defined(THREADED_RTS)
+  if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) {
+      /* If there is already a concurrent major collection running then
+       * there is no benefit to starting another.
+       * TODO: Catch heap-size runaway.
+       */
+      N--;
+      collect_gen--;
+      major_gc = false;
+  }
+#endif
+
+  /* N.B. The nonmoving collector works a bit differently. See
+   * Note [Static objects under the nonmoving collector].
+   */
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
       prev_static_flag = static_flag;
       static_flag =
           static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
@@ -740,6 +760,11 @@ GarbageCollect (uint32_t collect_gen,
       // so we need to mark those too.
       // Note that in sequential case these lists will be appended with more
       // weaks and threads found to be dead in mark.
+#if !defined(THREADED_RTS)
+      // In the non-threaded runtime this is the only time we push to the
+      // upd_rem_set
+      nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+#endif
       nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
       ACQUIRE_SM_LOCK;
   }


=====================================
rts/sm/GC.h
=====================================
@@ -17,9 +17,12 @@
 
 #include "HeapAlloc.h"
 
-void GarbageCollect (uint32_t force_major_gc,
+void GarbageCollect (uint32_t collect_gen,
                      bool do_heap_census,
-                     uint32_t gc_type, Capability *cap, bool idle_cap[]);
+                     bool deadlock_detect,
+                     uint32_t gc_type,
+                     Capability *cap,
+                     bool idle_cap[]);
 
 typedef void (*evac_fn)(void *user, StgClosure **root);
 
@@ -30,6 +33,8 @@ bool doIdleGCWork(Capability *cap, bool all);
 
 extern uint32_t N;
 extern bool major_gc;
+/* See Note [Deadlock detection under nonmoving collector]. */
+extern bool deadlock_detect_gc;
 
 extern bdescr *mark_stack_bd;
 extern bdescr *mark_stack_top_bd;


=====================================
rts/sm/GCAux.c
=====================================
@@ -142,14 +142,14 @@ markCAFs (evac_fn evac, void *user)
     StgIndStatic *c;
 
     for (c = dyn_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
     for (c = revertible_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);


=====================================
rts/sm/NonMoving.c
=====================================
@@ -68,6 +68,113 @@ Mutex concurrent_coll_finished_lock;
  *    stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
  *    the final mark has concluded.
  *
+ * Note that the possibility of concurrent minor and non-moving collections
+ * requires that we handle static objects a bit specially. See
+ * Note [Static objects under the nonmoving collector] in Storage.c
+ * for details.
+ *
+ *
+ * Note [Aging under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The initial design of the non-moving collector mandated that all live data
+ * be evacuated to the non-moving heap prior to a major collection. This
+ * simplified certain bits of implementation and eased reasoning. However, it
+ * was (unsurprisingly) also found to result in significant amounts of
+ * unnecessary copying.
+ *
+ * Consequently, we now allow aging. Aging allows the preparatory GC leading up
+ * to a major collection to evacuate some objects into the young generation.
+ * However, this introduces the following tricky case that might arise after
+ * we have finished the preparatory GC:
+ *
+ *       moving heap  ┆  non-moving heap
+ *     ───────────────┆──────────────────
+ *                    ┆
+ *      B ←────────────── A ←─────────────── root
+ *      │             ┆     ↖─────────────── gen1 mut_list
+ *      ╰───────────────→ C
+ *                    ┆
+ *
+ * In this case C is clearly live, but the non-moving collector can only see
+ * this by walking through B, which lives in the moving heap. However, doing so
+ * would require that we synchronize with the mutator/minor GC to ensure that it
+ * isn't in the middle of moving B. What to do?
+ *
+ * The solution we use here is to teach the preparatory moving collector to
+ * "evacuate" objects it encounters in the non-moving heap by adding them to
+ * the mark queue. This is implemented by pushing the object to the update
+ * remembered set of the capability held by the evacuating gc_thread
+ * (implemented by markQueuePushClosureGC)
+ *
+ * Consequently collection of the case above would proceed as follows:
+ *
+ *  1. Initial state:
+ *      * A lives in the non-moving heap and is reachable from the root set
+ *      * A is on the oldest generation's mut_list, since it contains a pointer
+ *        to B, which lives in a younger generation
+ *      * B lives in the moving collector's from space
+ *      * C lives in the non-moving heap
+ *
+ *  2. Preparatory GC: Scavenging mut_lists:
+ *
+ *     The mut_list of the oldest generation is scavenged, resulting in B being
+ *     evacuated (aged) into the moving collector's to-space.
+ *
+ *  3. Preparatory GC: Scavenge B
+ *
+ *     B (now in to-space) is scavenged, resulting in evacuation of C.
+ *     evacuate(C) pushes a reference to C to the mark queue.
+ *
+ *  4. Non-moving GC: C is marked
+ *
+ *     The non-moving collector will come to C in the mark queue and mark it.
+ *
+ *
+ * Note [Deadlock detection under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * In GHC the garbage collector is responsible for identifying deadlocked
+ * programs. Providing for this responsibility is slightly tricky in the
+ * non-moving collector due to the existence of aging. In particular, the
+ * non-moving collector cannot traverse objects living in a young generation
+ * but reachable from the non-moving generation, as described in Note [Aging
+ * under the non-moving collector].
+ *
+ * However, this can pose trouble for deadlock detection since it means that we
+ * may conservatively mark dead closures as live. Consider this case:
+ *
+ *       moving heap  ┆  non-moving heap
+ *     ───────────────┆──────────────────
+ *                    ┆
+ *      MVAR_QUEUE ←───── TSO ←───────────── gen1 mut_list
+ *        ↑  │  ╰────────↗  │
+ *        │  │        ┆     │
+ *        │  │        ┆     ↓
+ *        │  ╰──────────→ MVAR
+ *        ╰─────────────────╯
+ *                    ┆
+ *
+ * In this case we have a TSO blocked on a dead MVar. Because the MVAR_TSO_QUEUE on
+ * which it is blocked lives in the moving heap, the TSO is necessarily on the
+ * oldest generation's mut_list. As in Note [Aging under the non-moving
+ * collector], the MVAR_TSO_QUEUE will be evacuated. If MVAR_TSO_QUEUE is aged
+ * (e.g. evacuated to the young generation) then the MVAR will be added to the
+ * mark queue. Consequently, we will falsely conclude that the MVAR is still
+ * alive and fail to spot the deadlock.
+ *
+ * To avoid this sort of situation we disable aging when we are starting a
+ * major GC specifically for deadlock detection (as done by
+ * scheduleDetectDeadlock). This condition is recorded by the
+ * deadlock_detect_gc global variable declared in GC.h. Setting this has a few
+ * effects on the preparatory GC:
+ *
+ *  - Evac.c:alloc_for_copy forces evacuation to the non-moving generation.
+ *
+ *  - The evacuation logic usually responsible for pushing objects living in
+ *    the non-moving heap to the mark queue is disabled. This is safe because
+ *    we know that all live objects will be in the non-moving heap by the end
+ *    of the preparatory moving collection.
+ *
  *
  * Note [Live data accounting in nonmoving collector]
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -146,10 +253,26 @@ static struct NonmovingSegment *nonmovingPopFreeSegment(void)
     }
 }
 
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size)
+{
+  // We compute the overwhelmingly common size cases directly to avoid a very
+  // expensive integer division.
+  switch (log_block_size) {
+    case 3:  return nonmovingBlockCount(3);
+    case 4:  return nonmovingBlockCount(4);
+    case 5:  return nonmovingBlockCount(5);
+    case 6:  return nonmovingBlockCount(6);
+    case 7:  return nonmovingBlockCount(7);
+    default: return nonmovingBlockCount(log_block_size);
+  }
+}
+
 /*
  * Request a fresh segment from the free segment list or allocate one of the
  * given node.
  *
+ * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock
+ * under the assumption that we are in a GC context).
  */
 static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
 {
@@ -192,10 +315,12 @@ static inline unsigned long log2_ceil(unsigned long x)
 }
 
 // Advance a segment's next_free pointer. Returns true if segment if full.
-static bool advance_next_free(struct NonmovingSegment *seg)
+static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count)
 {
-    uint8_t *bitmap = seg->bitmap;
-    unsigned int blk_count = nonmovingSegmentBlockCount(seg);
+    const uint8_t *bitmap = seg->bitmap;
+    ASSERT(blk_count == nonmovingSegmentBlockCount(seg));
+#if defined(NAIVE_ADVANCE_FREE)
+    // reference implementation
     for (unsigned int i = seg->next_free+1; i < blk_count; i++) {
         if (!bitmap[i]) {
             seg->next_free = i;
@@ -204,6 +329,16 @@ static bool advance_next_free(struct NonmovingSegment *seg)
     }
     seg->next_free = blk_count;
     return true;
+#else
+    const uint8_t *c = memchr(&bitmap[seg->next_free+1], 0, blk_count - seg->next_free - 1);
+    if (c == NULL) {
+        seg->next_free = blk_count;
+        return true;
+    } else {
+        seg->next_free = c - bitmap;
+        return false;
+    }
+#endif
 }
 
 static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca)
@@ -221,34 +356,27 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
     }
 }
 
-/* sz is in words */
+/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */
 GNUC_ATTR_HOT
 void *nonmovingAllocate(Capability *cap, StgWord sz)
 {
-    unsigned int allocator_idx = log2_ceil(sz * sizeof(StgWord)) - NONMOVING_ALLOCA0;
+    unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
+    unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);
 
     // The max we ever allocate is 3276 bytes (anything larger is a large
     // object and not moved) which is covered by allocator 9.
-    ASSERT(allocator_idx < NONMOVING_ALLOCA_CNT);
+    ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT);
 
-    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[allocator_idx];
+    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0];
 
     // Allocate into current segment
     struct NonmovingSegment *current = alloca->current[cap->no];
     ASSERT(current); // current is never NULL
-    void *ret = nonmovingSegmentGetBlock(current, current->next_free);
+    void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free);
     ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment
 
-    // Add segment to the todo list unless it's already there
-    // current->todo_link == NULL means not in todo list
-    if (!current->todo_link) {
-        gen_workspace *ws = &gct->gens[oldest_gen->no];
-        current->todo_link = ws->todo_seg;
-        ws->todo_seg = current;
-    }
-
     // Advance the current segment's next_free or allocate a new segment if full
-    bool full = advance_next_free(current);
+    bool full = advance_next_free(current, block_count);
     if (full) {
         // Current segment is full: update live data estimate link it to
         // filled, take an active segment if one exists, otherwise allocate a
@@ -256,8 +384,9 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
 
         // Update live data estimate.
         // See Note [Live data accounting in nonmoving collector].
-        unsigned int new_blocks =  nonmovingSegmentBlockCount(current) - current->next_free_snap;
-        atomic_inc(&oldest_gen->live_estimate, new_blocks * nonmovingSegmentBlockSize(current) / sizeof(W_));
+        unsigned int new_blocks = block_count - current->next_free_snap;
+        unsigned int block_size = 1 << log_block_size;
+        atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_));
 
         // push the current segment to the filled list
         nonmovingPushFilledSegment(current);
@@ -268,7 +397,7 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
         // there are no active segments, allocate new segment
         if (new_current == NULL) {
             new_current = nonmovingAllocSegment(cap->node);
-            nonmovingInitSegment(new_current, NONMOVING_ALLOCA0 + allocator_idx);
+            nonmovingInitSegment(new_current, log_block_size);
         }
 
         // make it current
@@ -352,37 +481,23 @@ void nonmovingAddCapabilities(uint32_t new_n_caps)
     nonmovingHeap.n_caps = new_n_caps;
 }
 
-static void nonmovingClearBitmap(struct NonmovingSegment *seg)
+static inline void nonmovingClearBitmap(struct NonmovingSegment *seg)
 {
     unsigned int n = nonmovingSegmentBlockCount(seg);
     memset(seg->bitmap, 0, n);
 }
 
-static void nonmovingClearSegmentBitmaps(struct NonmovingSegment *seg)
-{
-    while (seg) {
-        nonmovingClearBitmap(seg);
-        seg = seg->link;
-    }
-}
-
-static void nonmovingClearAllBitmaps(void)
-{
-    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
-        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
-        nonmovingClearSegmentBitmaps(alloca->filled);
-    }
-
-    // Clear large object bits
-    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
-        bd->flags &= ~BF_MARKED;
-    }
-}
-
 /* Prepare the heap bitmaps and snapshot metadata for a mark */
 static void nonmovingPrepareMark(void)
 {
-    nonmovingClearAllBitmaps();
+    // See Note [Static objects under the nonmoving collector].
+    prev_static_flag = static_flag;
+    static_flag =
+        static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
+
+    // Should have been cleared by the last sweep
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+
     nonmovingBumpEpoch();
     for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
         struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
@@ -393,11 +508,28 @@ static void nonmovingPrepareMark(void)
             seg->next_free_snap = seg->next_free;
         }
 
-        // Update filled segments' snapshot pointers
-        struct NonmovingSegment *seg = alloca->filled;
-        while (seg) {
-            seg->next_free_snap = seg->next_free;
-            seg = seg->link;
+        // Update filled segments' snapshot pointers and move to sweep_list
+        uint32_t n_filled = 0;
+        struct NonmovingSegment *const filled = alloca->filled;
+        alloca->filled = NULL;
+        if (filled) {
+            struct NonmovingSegment *seg = filled;
+            while (true) {
+                n_filled++;
+                prefetchForRead(seg->link);
+                // Clear bitmap
+                prefetchForWrite(seg->link->bitmap);
+                nonmovingClearBitmap(seg);
+                // Set snapshot
+                seg->next_free_snap = seg->next_free;
+                if (seg->link)
+                    seg = seg->link;
+                else
+                    break;
+            }
+            // add filled segments to sweep_list
+            seg->link = nonmovingHeap.sweep_list;
+            nonmovingHeap.sweep_list = filled;
         }
 
         // N.B. It's not necessary to update snapshot pointers of active segments;
@@ -418,6 +550,12 @@ static void nonmovingPrepareMark(void)
     oldest_gen->n_large_blocks = 0;
     nonmoving_live_words = 0;
 
+    // Clear large object bits
+    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
+        bd->flags &= ~BF_MARKED;
+    }
+
+
 #if defined(DEBUG)
     debug_caf_list_snapshot = debug_caf_list;
     debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
@@ -468,7 +606,6 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
     resizeGenerations();
 
     nonmovingPrepareMark();
-    nonmovingPrepareSweep();
 
     // N.B. These should have been cleared at the end of the last sweep.
     ASSERT(nonmoving_marked_large_objects == NULL);
@@ -665,7 +802,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
 
 #if defined(DEBUG)
     // Zap CAFs that we will sweep
-    nonmovingGcCafs(mark_queue);
+    nonmovingGcCafs();
 #endif
 
     ASSERT(mark_queue->top->head == 0);


=====================================
rts/sm/NonMoving.h
=====================================
@@ -92,6 +92,9 @@ struct NonmovingHeap {
 extern struct NonmovingHeap nonmovingHeap;
 
 extern uint64_t nonmoving_live_words;
+#if defined(THREADED_RTS)
+extern bool concurrent_coll_running;
+#endif
 
 void nonmovingInit(void);
 void nonmovingExit(void);
@@ -160,22 +163,34 @@ INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *se
     return 1 << seg->block_size;
 }
 
-// How many blocks does the given segment contain? Also the size of the bitmap.
-INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+// How many blocks does a segment with the given block size have?
+INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size)
 {
-  unsigned int sz = nonmovingSegmentBlockSize(seg);
   unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment);
   segment_data_size -= segment_data_size % SIZEOF_VOID_P;
-  return segment_data_size / (sz + 1);
+  unsigned int blk_size = 1 << log_block_size;
+  // N.B. +1 accounts for the byte in the mark bitmap.
+  return segment_data_size / (blk_size + 1);
 }
 
-// Get a pointer to the given block index
-INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size);
+
+// How many blocks does the given segment contain? Also the size of the bitmap.
+INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
 {
+  return nonmovingBlockCountFromSize(seg->block_size);
+}
+
+// Get a pointer to the given block index assuming that the block size is as
+// given (avoiding a potential cache miss when this information is already
+// available). The log_block_size argument must be equal to seg->block_size.
+INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i)
+{
+  ASSERT(log_block_size == seg->block_size);
   // Block size in bytes
-  unsigned int blk_size = nonmovingSegmentBlockSize(seg);
+  unsigned int blk_size = 1 << log_block_size;
   // Bitmap size in bytes
-  W_ bitmap_size = nonmovingSegmentBlockCount(seg) * sizeof(uint8_t);
+  W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t);
   // Where the actual data starts (address of the first block).
   // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that
   // ROUNDUP_BYTES_TO_WDS returns in _words_, not in _bytes_, so convert it back
@@ -184,15 +199,26 @@ INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmo
   return (void*)(data + i*blk_size);
 }
 
+// Get a pointer to the given block index.
+INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+  return nonmovingSegmentGetBlock_(seg, seg->block_size, i);
+}
+
 // Get the segment which a closure resides in. Assumes that pointer points into
 // non-moving heap.
-INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p)
 {
-    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
     const uintptr_t mask = ~NONMOVING_SEGMENT_MASK;
     return (struct NonmovingSegment *) (((uintptr_t) p) & mask);
 }
 
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    return nonmovingGetSegment_unchecked(p);
+}
+
 INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p)
 {
     ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));


=====================================
rts/sm/NonMovingMark.c
=====================================
@@ -205,7 +205,7 @@ static void init_mark_queue_(MarkQueue *queue);
  * Really the argument type should be UpdRemSet* but this would be rather
  * inconvenient without polymorphism.
  */
-static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
+void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
 {
     if (markQueueIsEmpty(rset)) return;
 
@@ -361,7 +361,7 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
         } else {
             // allocate a fresh block.
             ACQUIRE_SM_LOCK;
-            bdescr *bd = allocGroup(1);
+            bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
             bd->link = q->blocks;
             q->blocks = bd;
             q->top = (MarkQueueBlock *) bd->start;
@@ -374,16 +374,49 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
     q->top->head++;
 }
 
+/* A variant of push to be used by the minor GC when it encounters a reference
+ * to an object in the non-moving heap. In contrast to the other push
+ * operations this uses the gc_alloc_block_sync spinlock instead of the
+ * SM_LOCK to allocate new blocks in the event that the mark queue is full.
+ */
+void
+markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
+{
+    /* We should not make it here if we are doing a deadlock detect GC.
+     * See Note [Deadlock detection under nonmoving collector].
+     */
+    ASSERT(!deadlock_detect_gc);
+
+    // Are we at the end of the block?
+    if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
+        // Yes, this block is full.
+        // allocate a fresh block.
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+        bd->link = q->blocks;
+        q->blocks = bd;
+        q->top = (MarkQueueBlock *) bd->start;
+        q->top->head = 0;
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    }
+
+    MarkQueueEnt ent = {
+        .mark_closure = {
+            .p = UNTAG_CLOSURE(p),
+            .origin = NULL,
+        }
+    };
+    q->top->entries[q->top->head] = ent;
+    q->top->head++;
+}
+
 static inline
 void push_closure (MarkQueue *q,
                    StgClosure *p,
                    StgClosure **origin)
 {
-    // TODO: Push this into callers where they already have the Bdescr
-    if (HEAP_ALLOCED_GC(p) && (Bdescr((StgPtr) p)->gen != oldest_gen))
-        return;
-
 #if defined(DEBUG)
+    ASSERT(!HEAP_ALLOCED_GC(p) || (Bdescr((StgPtr) p)->gen == oldest_gen));
     ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
     // Commenting out: too slow
     // if (RtsFlags.DebugFlags.sanity) {
@@ -393,8 +426,12 @@ void push_closure (MarkQueue *q,
     // }
 #endif
 
+    // This must be true as origin points to a pointer and therefore must be
+    // word-aligned. However, we check this as otherwise we would confuse this
+    // with a mark_array entry
+    ASSERT(((uintptr_t) origin & 0x3) == 0);
+
     MarkQueueEnt ent = {
-        .type = MARK_CLOSURE,
         .mark_closure = {
             .p = UNTAG_CLOSURE(p),
             .origin = origin,
@@ -413,10 +450,9 @@ void push_array (MarkQueue *q,
         return;
 
     MarkQueueEnt ent = {
-        .type = MARK_ARRAY,
         .mark_array = {
             .array = array,
-            .start_index = start_index
+            .start_index = (start_index << 16) | 0x3,
         }
     };
     push(q, &ent);
@@ -493,15 +529,11 @@ void updateRemembSetPushThunkEager(Capability *cap,
         MarkQueue *queue = &cap->upd_rem_set.queue;
         push_thunk_srt(queue, &info->i);
 
-        // Don't record the origin of objects living outside of the nonmoving
-        // heap; we can't perform the selector optimisation on them anyways.
-        bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk);
-
         for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
             if (check_in_nonmoving_heap(thunk->payload[i])) {
-                push_closure(queue,
-                             thunk->payload[i],
-                             record_origin ? &thunk->payload[i] : NULL);
+                // Don't bother to push origin; it makes the barrier needlessly
+                // expensive with little benefit.
+                push_closure(queue, thunk->payload[i], NULL);
             }
         }
         break;
@@ -510,7 +542,9 @@ void updateRemembSetPushThunkEager(Capability *cap,
     {
         MarkQueue *queue = &cap->upd_rem_set.queue;
         StgAP *ap = (StgAP *) thunk;
-        push_closure(queue, ap->fun, &ap->fun);
+        if (check_in_nonmoving_heap(ap->fun)) {
+            push_closure(queue, ap->fun, NULL);
+        }
         mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
         break;
     }
@@ -531,9 +565,10 @@ void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p)
 
 inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p)
 {
-    if (!check_in_nonmoving_heap(p)) return;
-    MarkQueue *queue = &cap->upd_rem_set.queue;
-    push_closure(queue, p, NULL);
+    if (check_in_nonmoving_heap(p)) {
+        MarkQueue *queue = &cap->upd_rem_set.queue;
+        push_closure(queue, p, NULL);
+    }
 }
 
 void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p)
@@ -630,7 +665,10 @@ void markQueuePushClosure (MarkQueue *q,
                            StgClosure *p,
                            StgClosure **origin)
 {
-    push_closure(q, p, origin);
+    // TODO: Push this into callers where they already have the Bdescr
+    if (check_in_nonmoving_heap(p)) {
+        push_closure(q, p, origin);
+    }
 }
 
 /* TODO: Do we really never want to specify the origin here? */
@@ -667,7 +705,7 @@ void markQueuePushArray (MarkQueue *q,
  *********************************************************/
 
 // Returns invalid MarkQueueEnt if queue is empty.
-static MarkQueueEnt markQueuePop (MarkQueue *q)
+static MarkQueueEnt markQueuePop_ (MarkQueue *q)
 {
     MarkQueueBlock *top;
 
@@ -679,7 +717,7 @@ again:
         // Is this the first block of the queue?
         if (q->blocks->link == NULL) {
             // Yes, therefore queue is empty...
-            MarkQueueEnt none = { .type = NULL_ENTRY };
+            MarkQueueEnt none = { .null_entry = { .p = NULL } };
             return none;
         } else {
             // No, unwind to the previous block and try popping again...
@@ -698,6 +736,47 @@ again:
     return ent;
 }
 
+static MarkQueueEnt markQueuePop (MarkQueue *q)
+{
+#if MARK_PREFETCH_QUEUE_DEPTH == 0
+    return markQueuePop_(q);
+#else
+    unsigned int i = q->prefetch_head;
+    while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) {
+        MarkQueueEnt new = markQueuePop_(q);
+        if (nonmovingMarkQueueEntryType(&new) == NULL_ENTRY) {
+            // Mark queue is empty; look for any valid entries in the prefetch
+            // queue
+            for (unsigned int j = (i+1) % MARK_PREFETCH_QUEUE_DEPTH;
+                 j != i;
+                 j = (j+1) % MARK_PREFETCH_QUEUE_DEPTH)
+            {
+                if (nonmovingMarkQueueEntryType(&q->prefetch_queue[j]) != NULL_ENTRY) {
+                    i = j;
+                    goto done;
+                }
+            }
+            return new;
+        }
+
+        // The entry may not be a MARK_CLOSURE but it doesn't matter, our
+        // MarkQueueEnt encoding always places the pointer to the object to be
+        // marked first.
+        prefetchForRead(&new.mark_closure.p->header.info);
+        prefetchForRead(&nonmovingGetSegment_unchecked((StgPtr) new.mark_closure.p)->block_size);
+        q->prefetch_queue[i] = new;
+        i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH;
+    }
+
+  done:
+    ;
+    MarkQueueEnt ret = q->prefetch_queue[i];
+    q->prefetch_queue[i].null_entry.p = NULL;
+    q->prefetch_head = i;
+    return ret;
+#endif
+}
+
 /*********************************************************
  * Creating and destroying MarkQueues and UpdRemSets
  *********************************************************/
@@ -705,17 +784,20 @@ again:
 /* Must hold sm_mutex. */
 static void init_mark_queue_ (MarkQueue *queue)
 {
-    bdescr *bd = allocGroup(1);
+    bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
     queue->blocks = bd;
     queue->top = (MarkQueueBlock *) bd->start;
     queue->top->head = 0;
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    memset(&queue->prefetch_queue, 0, sizeof(queue->prefetch_queue));
+    queue->prefetch_head = 0;
+#endif
 }
 
 /* Must hold sm_mutex. */
 void initMarkQueue (MarkQueue *queue)
 {
     init_mark_queue_(queue);
-    queue->marked_objects = allocHashTable();
     queue->is_upd_rem_set = false;
 }
 
@@ -723,8 +805,6 @@ void initMarkQueue (MarkQueue *queue)
 void init_upd_rem_set (UpdRemSet *rset)
 {
     init_mark_queue_(&rset->queue);
-    // Update remembered sets don't have to worry about static objects
-    rset->queue.marked_objects = NULL;
     rset->queue.is_upd_rem_set = true;
 }
 
@@ -739,7 +819,6 @@ void reset_upd_rem_set (UpdRemSet *rset)
 void freeMarkQueue (MarkQueue *queue)
 {
     freeChain_lock(queue->blocks);
-    freeHashTable(queue->marked_objects, NULL);
 }
 
 #if defined(THREADED_RTS) && defined(DEBUG)
@@ -986,12 +1065,32 @@ mark_stack (MarkQueue *queue, StgStack *stack)
     mark_stack_(queue, stack->sp, stack->stack + stack->stack_size);
 }
 
+/* See Note [Static objects under the nonmoving collector].
+ *
+ * Returns true if the object needs to be marked.
+ */
+static bool
+bump_static_flag(StgClosure **link_field, StgClosure *q STG_UNUSED)
+{
+    while (1) {
+        StgWord link = (StgWord) *link_field;
+        StgWord new = (link & ~STATIC_BITS) | static_flag;
+        if ((link & STATIC_BITS) == static_flag)
+            return false;
+        else if (cas((StgVolatilePtr) link_field, link, new) == link) {
+            return true;
+        }
+    }
+}
+
 static GNUC_ATTR_HOT void
 mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
 {
     (void)origin; // TODO: should be used for selector/thunk optimisations
 
  try_again:
+    ;
+    bdescr *bd = NULL;
     p = UNTAG_CLOSURE(p);
 
 #   define PUSH_FIELD(obj, field)                                \
@@ -1009,45 +1108,46 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
             return;
         }
 
-        if (lookupHashTable(queue->marked_objects, (W_)p)) {
-            // already marked
-            return;
-        }
-
-        insertHashTable(queue->marked_objects, (W_)p, (P_)1);
-
         switch (type) {
 
         case THUNK_STATIC:
             if (info->srt != 0) {
-                markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above
+                if (bump_static_flag(THUNK_STATIC_LINK((StgClosure *)p), p)) {
+                    markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above
+                }
             }
             return;
 
         case FUN_STATIC:
             if (info->srt != 0 || info->layout.payload.ptrs != 0) {
-                markQueuePushFunSrt(queue, info); // TODO this function repeats the check above
-
-                // a FUN_STATIC can also be an SRT, so it may have pointer
-                // fields.  See Note [SRTs] in CmmBuildInfoTables, specifically
-                // the [FUN] optimisation.
-                // TODO (osa) I don't understand this comment
-                for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
-                    PUSH_FIELD(p, payload[i]);
+                if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) {
+                    markQueuePushFunSrt(queue, info); // TODO this function repeats the check above
+
+                    // a FUN_STATIC can also be an SRT, so it may have pointer
+                    // fields.  See Note [SRTs] in CmmBuildInfoTables, specifically
+                    // the [FUN] optimisation.
+                    // TODO (osa) I don't understand this comment
+                    for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                        PUSH_FIELD(p, payload[i]);
+                    }
                 }
             }
             return;
 
         case IND_STATIC:
-            PUSH_FIELD((StgInd *) p, indirectee);
+            if (bump_static_flag(IND_STATIC_LINK((StgClosure *)p), p)) {
+                PUSH_FIELD((StgInd *) p, indirectee);
+            }
             return;
 
         case CONSTR:
         case CONSTR_1_0:
         case CONSTR_2_0:
         case CONSTR_1_1:
-            for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
-                PUSH_FIELD(p, payload[i]);
+            if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) {
+                for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                    PUSH_FIELD(p, payload[i]);
+                }
             }
             return;
 
@@ -1061,19 +1161,17 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
         }
     }
 
-    bdescr *bd = Bdescr((StgPtr) p);
+    bd = Bdescr((StgPtr) p);
 
     if (bd->gen != oldest_gen) {
-        // Here we have an object living outside of the non-moving heap. Since
-        // we moved everything to the non-moving heap before starting the major
-        // collection, we know that we don't need to trace it: it was allocated
-        // after we took our snapshot.
-#if !defined(THREADED_RTS)
-        // This should never happen in the non-concurrent case
-        barf("Closure outside of non-moving heap: %p", p);
-#else
+        // Here we have an object living outside of the non-moving heap. While
+        // we likely evacuated nearly everything to the non-moving heap during
+        // preparation, there are nevertheless a few ways in which we might
+        // still trace a reference into a younger generation:
+        //
+        //  * a mutable object might have been updated
+        //  * we might have aged an object
         return;
-#endif
     }
 
     ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
@@ -1440,13 +1538,13 @@ nonmovingMark (MarkQueue *queue)
         count++;
         MarkQueueEnt ent = markQueuePop(queue);
 
-        switch (ent.type) {
+        switch (nonmovingMarkQueueEntryType(&ent)) {
         case MARK_CLOSURE:
             mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin);
             break;
         case MARK_ARRAY: {
             const StgMutArrPtrs *arr = ent.mark_array.array;
-            StgWord start = ent.mark_array.start_index;
+            StgWord start = ent.mark_array.start_index >> 16;
             StgWord end = start + MARK_ARRAY_CHUNK_LENGTH;
             if (end < arr->ptrs) {
                 markQueuePushArray(queue, ent.mark_array.array, end);
@@ -1691,13 +1789,17 @@ void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_t
 
 void printMarkQueueEntry (MarkQueueEnt *ent)
 {
-    if (ent->type == MARK_CLOSURE) {
+    switch(nonmovingMarkQueueEntryType(ent)) {
+      case MARK_CLOSURE:
         debugBelch("Closure: ");
         printClosure(ent->mark_closure.p);
-    } else if (ent->type == MARK_ARRAY) {
+        break;
+      case MARK_ARRAY:
         debugBelch("Array\n");
-    } else {
+        break;
+      case NULL_ENTRY:
         debugBelch("End of mark\n");
+        break;
     }
 }
 


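The bump_static_flag calls above are what replace the old marked_objects hash table: a static object's mark state now lives in the low bits of its STATIC_LINK field. Below is a rough, stand-alone sketch of that idea, using the STATIC_BITS/static_flag conventions visible elsewhere in this patch; the helper name is illustrative and the real implementation in NonMovingMark.c may differ in detail.

    // Illustrative sketch only -- not the actual RTS code.
    static bool sketch_bump_static_flag (StgClosure **link_field, StgClosure *q)
    {
        (void) q; // only the static link field is needed here
        while (true) {
            StgWord link = (StgWord) *link_field;
            if ((link & STATIC_BITS) == static_flag) {
                return false;  // low bits already carry the current flag: already marked
            }
            // Install the current flag while preserving the link pointer itself.
            StgWord new_link = (link & ~STATIC_BITS) | static_flag;
            if (cas((StgVolatilePtr) link_field, link, new_link) == link) {
                return true;   // we set the flag first: the caller should trace the fields
            }
            // Another capability raced us; reload and retry.
        }
    }

Compared with the hash-table approach this needs no extra allocation and no global table, and the cas keeps two markers from both tracing the same static object.
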
=====================================
rts/sm/NonMovingMark.h
=====================================
@@ -43,9 +43,16 @@ enum EntryType {
  */
 
 typedef struct {
-    enum EntryType type;
-    // All pointers should be untagged
+    // Which kind of mark queue entry we have is determined by the low bits of
+    // the second word: they must be zero in the case of a mark_closure entry,
+    // since the second word points to a word-aligned closure field.  In the
+    // case of a mark_array we set them to 0x3 (the value of start_index is
+    // shifted to the left to accommodate this).  A null_entry, where p ==
+    // NULL, indicates the end of the queue.
     union {
+        struct {
+            void *p;              // must be NULL
+        } null_entry;
         struct {
             StgClosure *p;        // the object to be marked
             StgClosure **origin;  // field where this reference was found.
@@ -53,11 +60,23 @@ typedef struct {
         } mark_closure;
         struct {
             const StgMutArrPtrs *array;
-            StgWord start_index;
+            StgWord start_index;  // start index is shifted to the left by 16 bits
         } mark_array;
     };
 } MarkQueueEnt;
 
+INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent)
+{
+    if (ent->null_entry.p == NULL) {
+        return NULL_ENTRY;
+    } else if (((uintptr_t) ent->mark_closure.origin & TAG_BITS) == 0) {
+        return MARK_CLOSURE;
+    } else {
+        ASSERT((ent->mark_array.start_index & TAG_BITS) == 0x3);
+        return MARK_ARRAY;
+    }
+}
+
 typedef struct {
     // index of first *unused* queue entry
     uint32_t head;
@@ -65,6 +84,9 @@ typedef struct {
     MarkQueueEnt entries[];
 } MarkQueueBlock;
 
+// How far ahead in the mark queue to prefetch?
+#define MARK_PREFETCH_QUEUE_DEPTH 5
+
 /* The mark queue is not capable of concurrent read or write.
  *
  * invariants:
@@ -83,9 +105,12 @@ typedef struct MarkQueue_ {
     // Is this a mark queue or a capability-local update remembered set?
     bool is_upd_rem_set;
 
-    // Marked objects outside of nonmoving heap, namely large and static
-    // objects.
-    HashTable *marked_objects;
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    // A ring-buffer of entries which we will mark next
+    MarkQueueEnt prefetch_queue[MARK_PREFETCH_QUEUE_DEPTH];
+    // The first free slot in prefetch_queue.
+    uint8_t prefetch_head;
+#endif
 } MarkQueue;
 
 /* While it shares its representation with MarkQueue, UpdRemSet differs in
@@ -97,8 +122,11 @@ typedef struct {
     MarkQueue queue;
 } UpdRemSet;
 
+// Number of blocks to allocate for a mark queue
+#define MARK_QUEUE_BLOCKS 16
+
 // The length of MarkQueueBlock.entries
-#define MARK_QUEUE_BLOCK_ENTRIES ((BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt))
+#define MARK_QUEUE_BLOCK_ENTRIES ((MARK_QUEUE_BLOCKS * BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt))
 
 extern bdescr *nonmoving_large_objects, *nonmoving_marked_large_objects;
 extern memcount n_nonmoving_large_blocks, n_nonmoving_marked_large_blocks;
@@ -115,6 +143,15 @@ extern StgIndStatic *debug_caf_list_snapshot;
 extern MarkQueue *current_mark_queue;
 extern bdescr *upd_rem_set_block_list;
 
+// A similar macro is defined in includes/Cmm.h for C-- code.
+#if defined(THREADED_RTS)
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+#else
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (0)
+#endif
+
 void nonmovingMarkInitUpdRemSet(void);
 
 void init_upd_rem_set(UpdRemSet *rset);
@@ -143,8 +180,10 @@ void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_th
 bool nonmovingIsAlive(StgClosure *p);
 void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w);
 void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w);
+void nonmovingAddUpdRemSetBlocks(struct MarkQueue_ *rset);
 
 void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent);
+void markQueuePushClosureGC(MarkQueue *q, StgClosure *p);
 void markQueuePushClosure(MarkQueue *q,
                              StgClosure *p,
                              StgClosure **origin);

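To make the tagging above concrete: a mark_array entry stores its start index shifted left by 16 bits with the low bits set to 0x3, which is why nonmovingMark decodes it with a right shift by 16. The helpers below are hypothetical and only illustrate the encoding; in the RTS it is done by markQueuePushArray and consumed by markQueuePop/nonmovingMark in NonMovingMark.c.

    // Hypothetical helpers; assume the usual RTS headers and types.
    static MarkQueueEnt encode_array_entry (const StgMutArrPtrs *array, StgWord start_index)
    {
        MarkQueueEnt ent;
        ent.mark_array.array = array;
        // Shift the index up and set the low tag bits so the entry cannot be
        // mistaken for a mark_closure entry (whose second word is word-aligned).
        ent.mark_array.start_index = (start_index << 16) | 0x3;
        return ent;
    }

    static StgWord decode_start_index (MarkQueueEnt *ent)
    {
        ASSERT(nonmovingMarkQueueEntryType(ent) == MARK_ARRAY);
        return ent->mark_array.start_index >> 16;   // matches the shift in nonmovingMark
    }
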

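The prefetch_queue/prefetch_head fields support a small ring buffer: newly popped work is buffered and its target prefetched, while marking proceeds on the entry that was buffered MARK_PREFETCH_QUEUE_DEPTH steps earlier, by which time its memory should already be in cache. The sketch below is only a hedged illustration of that pattern; the actual logic lives in NonMovingMark.c and may differ, and __builtin_prefetch stands in for the RTS prefetch macros added on this branch.

    #if MARK_PREFETCH_QUEUE_DEPTH > 0
    // Illustrative only: swap the incoming entry into the ring and return the
    // oldest buffered entry, which should be cache-resident by now.
    static MarkQueueEnt dequeue_with_prefetch (MarkQueue *q, MarkQueueEnt incoming)
    {
        MarkQueueEnt out = q->prefetch_queue[q->prefetch_head];
        q->prefetch_queue[q->prefetch_head] = incoming;
        // The first word of every entry variant is a pointer (or NULL), so
        // prefetching it is always safe; a NULL prefetch is a harmless hint.
        __builtin_prefetch(incoming.mark_closure.p);
        q->prefetch_head = (q->prefetch_head + 1) % MARK_PREFETCH_QUEUE_DEPTH;
        return out;
    }
    #endif
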
=====================================
rts/sm/NonMovingScav.c
=====================================
@@ -16,6 +16,7 @@ nonmovingScavengeOne (StgClosure *q)
     ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
     StgPtr p = (StgPtr)q;
     const StgInfoTable *info = get_itbl(q);
+    const bool saved_eager_promotion = gct->eager_promotion;
 
     switch (info->type) {
 
@@ -23,9 +24,11 @@ nonmovingScavengeOne (StgClosure *q)
     case MVAR_DIRTY:
     {
         StgMVar *mvar = ((StgMVar *)p);
+        gct->eager_promotion = false;
         evacuate((StgClosure **)&mvar->head);
         evacuate((StgClosure **)&mvar->tail);
         evacuate((StgClosure **)&mvar->value);
+        gct->eager_promotion = saved_eager_promotion;
         if (gct->failed_to_evac) {
             mvar->header.info = &stg_MVAR_DIRTY_info;
         } else {
@@ -37,8 +40,10 @@ nonmovingScavengeOne (StgClosure *q)
     case TVAR:
     {
         StgTVar *tvar = ((StgTVar *)p);
+        gct->eager_promotion = false;
         evacuate((StgClosure **)&tvar->current_value);
         evacuate((StgClosure **)&tvar->first_watch_queue_entry);
+        gct->eager_promotion = saved_eager_promotion;
         if (gct->failed_to_evac) {
             tvar->header.info = &stg_TVAR_DIRTY_info;
         } else {
@@ -122,10 +127,21 @@ nonmovingScavengeOne (StgClosure *q)
         break;
     }
 
+    case WEAK:
+    {
+        // We must evacuate the key since it may refer to an object in the
+        // moving heap which may be long gone by the time we call
+        // nonmovingTidyWeaks.
+        StgWeak *weak = (StgWeak *) p;
+        gct->eager_promotion = true;
+        evacuate(&weak->key);
+        gct->eager_promotion = saved_eager_promotion;
+        goto gen_obj;
+    }
+
     gen_obj:
     case CONSTR:
     case CONSTR_NOCAF:
-    case WEAK:
     case PRIM:
     {
         StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
@@ -145,7 +161,9 @@ nonmovingScavengeOne (StgClosure *q)
 
     case MUT_VAR_CLEAN:
     case MUT_VAR_DIRTY:
+        gct->eager_promotion = false;
         evacuate(&((StgMutVar *)p)->var);
+        gct->eager_promotion = saved_eager_promotion;
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
         } else {
@@ -157,10 +175,12 @@ nonmovingScavengeOne (StgClosure *q)
     {
         StgBlockingQueue *bq = (StgBlockingQueue *)p;
 
+        gct->eager_promotion = false;
         evacuate(&bq->bh);
         evacuate((StgClosure**)&bq->owner);
         evacuate((StgClosure**)&bq->queue);
         evacuate((StgClosure**)&bq->link);
+        gct->eager_promotion = saved_eager_promotion;
 
         if (gct->failed_to_evac) {
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
@@ -202,11 +222,9 @@ nonmovingScavengeOne (StgClosure *q)
     case MUT_ARR_PTRS_CLEAN:
     case MUT_ARR_PTRS_DIRTY:
     {
-        // We don't eagerly promote objects pointed to by a mutable
-        // array, but if we find the array only points to objects in
-        // the same or an older generation, we mark it "clean" and
-        // avoid traversing it during minor GCs.
+        gct->eager_promotion = false;
         scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
+        gct->eager_promotion = saved_eager_promotion;
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -234,14 +252,13 @@ nonmovingScavengeOne (StgClosure *q)
     case SMALL_MUT_ARR_PTRS_DIRTY:
         // follow everything
     {
-        // We don't eagerly promote objects pointed to by a mutable
-        // array, but if we find the array only points to objects in
-        // the same or an older generation, we mark it "clean" and
-        // avoid traversing it during minor GCs.
         StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p);
+        gct->eager_promotion = false;
         for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) {
             evacuate((StgClosure **)p);
         }
+        gct->eager_promotion = saved_eager_promotion;
+
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -278,21 +295,21 @@ nonmovingScavengeOne (StgClosure *q)
     {
         StgStack *stack = (StgStack*)p;
 
+        gct->eager_promotion = false;
         scavenge_stack(stack->sp, stack->stack + stack->stack_size);
+        gct->eager_promotion = saved_eager_promotion;
         stack->dirty = gct->failed_to_evac;
-        // TODO (osa): There may be something special about stacks that we're
-        // missing. All other mut objects are marked by using a different info
-        // table except stacks.
-
         break;
     }
 
     case MUT_PRIM:
     {
         StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        gct->eager_promotion = false;
         for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
             evacuate((StgClosure **)p);
         }
+        gct->eager_promotion = saved_eager_promotion;
         gct->failed_to_evac = true; // mutable
         break;
     }
@@ -302,12 +319,14 @@ nonmovingScavengeOne (StgClosure *q)
         StgWord i;
         StgTRecChunk *tc = ((StgTRecChunk *) p);
         TRecEntry *e = &(tc -> entries[0]);
+        gct->eager_promotion = false;
         evacuate((StgClosure **)&tc->prev_chunk);
         for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
           evacuate((StgClosure **)&e->tvar);
           evacuate((StgClosure **)&e->expected_value);
           evacuate((StgClosure **)&e->new_value);
         }
+        gct->eager_promotion = saved_eager_promotion;
         gct->failed_to_evac = true; // mutable
         break;
       }


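The recurring change in this file is the save/disable/restore dance around gct->eager_promotion: mutable fields are scavenged without forcing their referents into the oldest generation, and the object is left dirty when a referent could not be promoted. The fragment below isolates the pattern with the MUT_VAR case above as the template; the helper name is illustrative and it assumes the usual scavenger environment (gct, evacuate, the info tables).

    // Sketch of the pattern, not an addition to the RTS.
    static void sketch_scavenge_mut_var (StgMutVar *mv)
    {
        const bool saved_eager_promotion = gct->eager_promotion;

        gct->eager_promotion = false;              // don't eagerly promote the referent
        evacuate(&mv->var);
        gct->eager_promotion = saved_eager_promotion;

        if (gct->failed_to_evac) {
            // the referent still lives in a younger generation: keep the var dirty
            mv->header.info = &stg_MUT_VAR_DIRTY_info;
        } else {
            mv->header.info = &stg_MUT_VAR_CLEAN_info;
        }
    }
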
=====================================
rts/sm/NonMovingSweep.c
=====================================
@@ -17,38 +17,6 @@
 #include "Trace.h"
 #include "StableName.h"
 
-static struct NonmovingSegment *pop_all_filled_segments(struct NonmovingAllocator *alloc)
-{
-    while (true) {
-        struct NonmovingSegment *head = alloc->filled;
-        if (cas((StgVolatilePtr) &alloc->filled, (StgWord) head, (StgWord) NULL) == (StgWord) head)
-            return head;
-    }
-}
-
-void nonmovingPrepareSweep()
-{
-    ASSERT(nonmovingHeap.sweep_list == NULL);
-
-    // Move blocks in the allocators' filled lists into sweep_list
-    for (unsigned int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++)
-    {
-        struct NonmovingAllocator *alloc = nonmovingHeap.allocators[alloc_idx];
-        struct NonmovingSegment *filled = pop_all_filled_segments(alloc);
-
-        // Link filled to sweep_list
-        if (filled) {
-            struct NonmovingSegment *filled_head = filled;
-            // Find end of filled list
-            while (filled->link) {
-                filled = filled->link;
-            }
-            filled->link = nonmovingHeap.sweep_list;
-            nonmovingHeap.sweep_list = filled_head;
-        }
-    }
-}
-
 // On which list should a particular segment be placed?
 enum SweepResult {
     SEGMENT_FREE,     // segment is empty: place on free list
@@ -102,7 +70,7 @@ nonmovingSweepSegment(struct NonmovingSegment *seg)
 
 #if defined(DEBUG)
 
-void nonmovingGcCafs(struct MarkQueue_ *queue)
+void nonmovingGcCafs()
 {
     uint32_t i = 0;
     StgIndStatic *next;
@@ -116,7 +84,8 @@ void nonmovingGcCafs(struct MarkQueue_ *queue)
         const StgInfoTable *info = get_itbl((StgClosure*)caf);
         ASSERT(info->type == IND_STATIC);
 
-        if (lookupHashTable(queue->marked_objects, (StgWord) caf) == NULL) {
+        StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS;
+        if (flag != 0 && flag != static_flag) {
             debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf);
             SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it
         } else {


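With the hash table gone, nonmovingGcCafs above decides CAF liveness purely from the STATIC_LINK flag bits. Stated as a predicate (hypothetical helper name; the condition is exactly the one in the hunk above, inverted):

    // Illustrative: a CAF survives if its flag bits are still zero or carry
    // the current static_flag; anything else was not marked in this cycle.
    static bool sketch_caf_survives (StgIndStatic *caf)
    {
        StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS;
        return flag == 0 || flag == static_flag;
    }
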
=====================================
rts/sm/NonMovingSweep.h
=====================================
@@ -22,11 +22,7 @@ void nonmovingSweepLargeObjects(void);
 // Remove dead entries in the stable name table
 void nonmovingSweepStableNameTable(void);
 
-// Collect the set of segments to be collected during a major GC into
-// nonmovingHeap.sweep_list.
-void nonmovingPrepareSweep(void);
-
 #if defined(DEBUG)
 // The non-moving equivalent of the moving collector's gcCAFs.
-void nonmovingGcCafs(struct MarkQueue_ *queue);
+void nonmovingGcCafs(void);
 #endif


=====================================
rts/sm/Storage.c
=====================================
@@ -321,7 +321,8 @@ freeStorage (bool free_heap)
 }
 
 /* -----------------------------------------------------------------------------
-   Note [CAF management].
+   Note [CAF management]
+   ~~~~~~~~~~~~~~~~~~~~~
 
    The entry code for every CAF does the following:
 
@@ -356,6 +357,7 @@ freeStorage (bool free_heap)
 
    ------------------
    Note [atomic CAF entry]
+   ~~~~~~~~~~~~~~~~~~~~~~~
 
    With THREADED_RTS, newCAF() is required to be atomic (see
    #5558). This is because if two threads happened to enter the same
@@ -369,6 +371,7 @@ freeStorage (bool free_heap)
 
    ------------------
    Note [GHCi CAFs]
+   ~~~~~~~~~~~~~~~~
 
    For GHCI, we have additional requirements when dealing with CAFs:
 
@@ -388,6 +391,51 @@ freeStorage (bool free_heap)
 
       -- SDM 29/1/01
 
+   ------------------
+   Note [Static objects under the nonmoving collector]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Static object management is a bit tricky under the nonmoving collector as we
+   need to maintain a bit more state than in the moving collector. In
+   particular, the moving collector uses the low bits of the STATIC_LINK field
+   to determine whether the object has been moved to the scavenger's work list
+   (see Note [STATIC_LINK fields] in Storage.h).
+
+   However, the nonmoving collector also needs a place to keep its mark bit.
+   This is problematic as we therefore need at least three bits of state
+   but can assume only two bits are available in STATIC_LINK (due to 32-bit
+   systems).
+
+   To accommodate this we move handling of static objects entirely to the
+   oldest generation when the nonmoving collector is in use. To do this safely
+   and efficiently we allocate the blackhole created by lockCAF() directly in
+   the non-moving heap. This means that the moving collector can completely
+   ignore static objects in minor collections since they are guaranteed not to
+   have any references into the moving heap. Of course, the blackhole itself
+   will likely contain a reference into the moving heap, but this is
+   significantly easier to handle, being a heap-allocated object (see Note
+   [Aging under the non-moving collector] in NonMoving.c for details).
+
+   During the moving phase of a major collection we treat static objects
+   as we do any other reference into the non-moving heap by pushing them
+   to the non-moving mark queue (see Note [Aging under the non-moving
+   collector]).
+
+   This allows the non-moving collector to have full control over the flags
+   in STATIC_LINK, which it uses as described in Note [STATIC_LINK fields].
+   This is implemented by NonMovingMark.c:bump_static_flag.
+
+   In short, the plan is:
+
+     - lockCAF allocates its blackhole in the nonmoving heap. This is important
+       to ensure that we do not need to place the static object on the mut_list,
+       lest we would need some way to ensure that it is evacuated only once
+       during a moving collection.
+
+     - evacuate_static_object merely pushes objects to the mark queue.
+
+     - the nonmoving collector uses the flags in STATIC_LINK as its mark bit.
+
    -------------------------------------------------------------------------- */
 
 STATIC_INLINE StgInd *
@@ -430,7 +478,7 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     // reference should be in SRTs
     ASSERT(orig_info_tbl->layout.payload.ptrs == 0);
     // Because the payload is empty we just push the SRT
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(orig_info_tbl);
         if (thunk_info->i.srt) {
             updateRemembSetPushClosure(cap, GET_SRT(thunk_info));
@@ -441,7 +489,16 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     caf->saved_info = orig_info;
 
     // Allocate the blackhole indirection closure
-    bh = (StgInd *)allocate(cap, sizeofW(*bh));
+    if (RtsFlags.GcFlags.useNonmoving) {
+        // See Note [Static objects under the nonmoving collector].
+        ACQUIRE_SM_LOCK;
+        bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh));
+        RELEASE_SM_LOCK;
+        recordMutableCap((StgClosure*)bh,
+                         regTableToCapability(reg), oldest_gen->no);
+    } else {
+        bh = (StgInd *)allocate(cap, sizeofW(*bh));
+    }
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
 
@@ -481,7 +538,9 @@ newCAF(StgRegTable *reg, StgIndStatic *caf)
     else
     {
         // Put this CAF on the mutable list for the old generation.
-        if (oldest_gen->no != 0) {
+        // N.B. the nonmoving collector works a bit differently: see
+        // Note [Static objects under the nonmoving collector].
+        if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) {
             recordMutableCap((StgClosure*)caf,
                              regTableToCapability(reg), oldest_gen->no);
         }
@@ -512,6 +571,10 @@ setKeepCAFs (void)
     keepCAFs = 1;
 }
 
+// A list of CAFs linked together via STATIC_LINK where new CAFs are placed
+// until the next GC.
+StgClosure *nonmoving_todo_caf_list;
+
 // An alternate version of newCAF which is used for dynamically loaded
 // object code in GHCi.  In this case we want to retain *all* CAFs in
 // the object code, because they might be demanded at any time from an
@@ -558,7 +621,9 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf)
     if (!bh) return NULL;
 
     // Put this CAF on the mutable list for the old generation.
-    if (oldest_gen->no != 0) {
+    // N.B. the nonmoving collector works a bit differently:
+    // see Note [Static objects under the nonmoving collector].
+    if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) {
         recordMutableCap((StgClosure*)caf,
                          regTableToCapability(reg), oldest_gen->no);
     }
@@ -1140,7 +1205,7 @@ dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
     if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) {
         mvar->header.info = &stg_MUT_VAR_DIRTY_info;
         recordClosureMutated(cap, (StgClosure *) mvar);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure_(reg, old);
         }
     }
@@ -1159,7 +1224,7 @@ dirty_TVAR(Capability *cap, StgTVar *p,
     if (p->header.info == &stg_TVAR_CLEAN_info) {
         p->header.info = &stg_TVAR_DIRTY_info;
         recordClosureMutated(cap,(StgClosure*)p);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, old);
         }
     }
@@ -1176,8 +1241,9 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
     if (tso->dirty == 0) {
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure *) tso->_link);
+        }
     }
     tso->_link = target;
 }
@@ -1188,8 +1254,9 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
     if (tso->dirty == 0) {
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev);
+        }
     }
     tso->block_info.prev = target;
 }
@@ -1202,8 +1269,9 @@ dirty_TSO (Capability *cap, StgTSO *tso)
         recordClosureMutated(cap,(StgClosure*)tso);
     }
 
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushTSO(cap, tso);
+    }
 }
 
 void
@@ -1211,8 +1279,9 @@ dirty_STACK (Capability *cap, StgStack *stack)
 {
     // First push to upd_rem_set before we set stack->dirty since the
     // nonmoving collector may already be marking the stack.
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushStack(cap, stack);
+    }
 
     if (! (stack->dirty & STACK_DIRTY)) {
         stack->dirty = STACK_DIRTY;
@@ -1236,7 +1305,7 @@ void
 update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val)
 {
     Capability *cap = regTableToCapability(reg);
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         StgMVar *mvar = (StgMVar *) p;
         updateRemembSetPushClosure(cap, old_val);
         updateRemembSetPushClosure(cap, (StgClosure *) mvar->head);


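The repeated mechanical change in this file replaces open-coded checks of nonmoving_write_barrier_enabled with the IF_NONMOVING_WRITE_BARRIER_ENABLED macro from NonMovingMark.h. A barrier site now reads, for example:

    IF_NONMOVING_WRITE_BARRIER_ENABLED {
        updateRemembSetPushClosure(cap, old);
    }

which, per the macro definition above, expands to if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { ... } under THREADED_RTS and to if (0) { ... } otherwise, so the remembered-set push is compiled away entirely in the non-threaded RTS.
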

View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/7f6da43cd1666c2efd032557458f1b1e9fb8bdf1...a5cd845b0e7aaa67ef7321846e07bdfc4e266206
