[Git][ghc/ghc][wip/T22264] 35 commits: rts: Drop SM spinlock

Ben Gamari (@bgamari) gitlab at gitlab.haskell.org
Sun Dec 4 19:06:58 UTC 2022



Ben Gamari pushed to branch wip/T22264 at Glasgow Haskell Compiler / GHC


Commits:
be9950df by Ben Gamari at 2022-11-19T20:24:58-05:00
rts: Drop SM spinlock

- - - - -
8b8cd0fe by Ben Gamari at 2022-11-23T19:41:35-05:00
Various

- - - - -
562dd80c by Ben Gamari at 2022-11-23T19:41:43-05:00
Take SM mutex in setNumCapabilities

- - - - -
4b023123 by Ben Gamari at 2022-11-23T19:42:02-05:00
setRecentActivities

- - - - -
68b4a6ab by Ben Gamari at 2022-11-23T19:42:13-05:00
Fix type issues in Sparks.h

- - - - -
f205a5d3 by Ben Gamari at 2022-11-23T19:42:56-05:00
locking

- - - - -
07134f76 by Ben Gamari at 2022-11-23T19:43:20-05:00
Various

- - - - -
2b6f9bd4 by Ben Gamari at 2022-11-23T19:43:48-05:00
C++ typing issues

- - - - -
dfa7f23d by Ben Gamari at 2022-11-23T19:44:18-05:00
Sanity: Look at nonmoving saved_filled lists

- - - - -
6b911ba7 by Ben Gamari at 2022-11-23T19:44:39-05:00
CheckGC

- - - - -
a0111bbd by Ben Gamari at 2022-11-23T21:25:32-05:00
CheckGc: Refactor

- - - - -
da615ef3 by Ben Gamari at 2022-11-23T22:36:16-05:00
Fix it

- - - - -
7850a894 by Ben Gamari at 2022-11-25T14:00:28-05:00
check-gc

- - - - -
be510990 by Ben Gamari at 2022-11-25T17:48:25-05:00
Check gc

- - - - -
cff00edb by Ben Gamari at 2022-11-25T18:52:46-05:00
Check Gc

- - - - -
c25eb12f by Ben Gamari at 2022-11-25T18:52:56-05:00
locking

- - - - -
0ad4d2fb by Ben Gamari at 2022-11-25T20:09:30-05:00
Locking

- - - - -
230d8760 by Ben Gamari at 2022-11-26T21:27:12-05:00
locking

- - - - -
da5bdfb6 by Ben Gamari at 2022-11-26T21:27:29-05:00
Check GC

- - - - -
4dcea2c5 by Ben Gamari at 2022-11-26T21:27:46-05:00
Locking

- - - - -
6ed7e21a by Ben Gamari at 2022-11-26T21:28:08-05:00
Locking

- - - - -
0c097193 by Ben Gamari at 2022-11-26T21:28:20-05:00
FIX THE DAMN THING

- - - - -
cac0caeb by Ben Gamari at 2022-11-29T23:09:48-05:00
Evac: Squash data race in eval_selector_chain

- - - - -
9fe222a6 by Ben Gamari at 2022-11-29T23:11:01-05:00
nonmoving: Paranoia

- - - - -
a2daf6bf by Ben Gamari at 2022-11-29T23:11:26-05:00
nonmoving: Fix weaks

- - - - -
5fef104b by Ben Gamari at 2022-11-30T15:43:05-05:00
nonmoving: Fix selectors

- - - - -
f0161e29 by Ben Gamari at 2022-11-30T15:43:36-05:00
nonmoving: Avoid n_caps race

- - - - -
3f4f06e4 by Ben Gamari at 2022-11-30T15:44:16-05:00
nonmoving: Paranoia

- - - - -
571ac83f by Ben Gamari at 2022-11-30T15:44:41-05:00
nonmoving: Assert weak pointer consistency

- - - - -
b56776c3 by Ben Gamari at 2022-12-02T17:03:56-05:00
CheckGc: Distinguish tagged pointers

- - - - -
4b2dfabe by Ben Gamari at 2022-12-02T17:04:19-05:00
CheckGc: Flush

- - - - -
d382a58c by Ben Gamari at 2022-12-02T17:04:48-05:00
rts/gc: Add missing write barriers in selector optimisation

- - - - -
4f45b245 by Ben Gamari at 2022-12-02T17:06:42-05:00
CheckGc: Add missing SRT

- - - - -
fb45fba5 by Ben Gamari at 2022-12-02T17:21:32-05:00
CheckGc: Factor out constructor, fun, thunk traversal

- - - - -
f095454b by Ben Gamari at 2022-12-02T17:50:04-05:00
CheckGc: More sanity checking

- - - - -


26 changed files:

- rts/Capability.h
- rts/CheckUnload.c
- rts/Proftimer.c
- rts/Schedule.h
- rts/Sparks.h
- rts/Stats.c
- rts/Trace.h
- rts/include/rts/storage/MBlock.h
- rts/rts.cabal.in
- rts/sm/BlockAlloc.c
- + rts/sm/CheckGc.cpp
- rts/sm/Evac.c
- rts/sm/GC.c
- rts/sm/GC.h
- rts/sm/GCUtils.c
- rts/sm/GCUtils.h
- rts/sm/HeapAlloc.h
- rts/sm/MarkStack.h
- rts/sm/NonMoving.c
- rts/sm/NonMovingMark.c
- rts/sm/NonMovingMark.h
- rts/sm/NonMovingShortcut.c
- rts/sm/Sanity.c
- rts/sm/Sanity.h
- rts/sm/Scav.c
- rts/sm/Storage.c


Changes:

=====================================
rts/Capability.h
=====================================
@@ -20,10 +20,10 @@
 
 #pragma once
 
-#include "sm/GC.h" // for evac_fn
 #include "Task.h"
 #include "Sparks.h"
-#include "sm/NonMovingMark.h" // for MarkQueue
+#include "sm/GC.h" // for evac_fn
+#include "sm/NonMovingMark.h" // for UpdRemSet
 
 #include "BeginPrivate.h"
 


=====================================
rts/CheckUnload.c
=====================================
@@ -15,6 +15,7 @@
 #include "Hash.h"
 #include "LinkerInternals.h"
 #include "CheckUnload.h"
+#include "sm/HeapAlloc.h"
 #include "sm/Storage.h"
 #include "sm/GCThread.h"
 #include "sm/HeapUtils.h"


=====================================
rts/Proftimer.c
=====================================
@@ -123,7 +123,7 @@ handleProfTick(void)
     if (RELAXED_LOAD_ALWAYS(&do_prof_ticks)) {
         uint32_t n;
         for (n=0; n < getNumCapabilities(); n++) {
-            Capability *cap = getCapbility(n);
+            Capability *cap = getCapability(n);
             cap->r.rCCCS->time_ticks++;
             traceProfSampleCostCentre(cap, cap->r.rCCCS, total_ticks);
         }


=====================================
rts/Schedule.h
=====================================
@@ -132,7 +132,7 @@ setRecentActivity(enum RecentActivity new_value)
 INLINE_HEADER enum RecentActivity
 getRecentActivity(void)
 {
-    return RELAXED_LOAD_ALWAYS(&recent_activity);
+    return (enum RecentActivity) RELAXED_LOAD_ALWAYS(&recent_activity);
 }
 
 /* Thread queues.


=====================================
rts/Sparks.h
=====================================
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "sm/GC.h" // for evac_fn
 #include "WSDeque.h"
 
 #include "BeginPrivate.h"
@@ -56,7 +57,7 @@ INLINE_HEADER long sparkPoolSize  (SparkPool *pool);
 
 INLINE_HEADER StgClosure* reclaimSpark(SparkPool *pool)
 {
-    return popWSDeque(pool);
+    return (StgClosure*) popWSDeque(pool);
 }
 
 INLINE_HEADER bool looksEmpty(SparkPool* deque)
@@ -89,7 +90,7 @@ INLINE_HEADER void discardSparks (SparkPool *pool)
 
 INLINE_HEADER StgClosure * tryStealSpark (SparkPool *pool)
 {
-    return stealWSDeque_(pool);
+    return (StgClosure *) stealWSDeque_(pool);
     // use the no-loopy version, stealWSDeque_(), since if we get a
     // spurious NULL here the caller may want to try stealing from
     // other pools before trying again.


=====================================
rts/Stats.c
=====================================
@@ -963,11 +963,6 @@ static void report_summary(const RTSSummaryStats* sum)
                     , col_width[1], "SpinLock"
                     , col_width[2], "Spins"
                     , col_width[3], "Yields");
-        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                    , col_width[0], ""
-                    , col_width[1], "gc_alloc_block_sync"
-                    , col_width[2], gc_alloc_block_sync.spin
-                    , col_width[3], gc_alloc_block_sync.yield);
         statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                     , col_width[0], ""
                     , col_width[1], "whitehole_gc"
@@ -1142,10 +1137,6 @@ static void report_machine_readable (const RTSSummaryStats * sum)
 
     // next, internal counters
 #if defined(PROF_SPIN)
-    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
-    MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
-            gc_alloc_block_sync.yield);
-    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
     MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
     MR_STAT("waitForGcThreads_yield", FMT_Word64,
             waitForGcThreads_yield);
@@ -1572,9 +1563,6 @@ SpinLock:
 Not all of these are actual SpinLocks, see the details below.
 
 Actual SpinLocks:
-* gc_alloc_block:
-    This SpinLock protects the block allocator and free list manager. See
-    BlockAlloc.c.
 * gen[g].sync:
     These SpinLocks, one per generation, protect the generations[g] data
     structure during garbage collection.


=====================================
rts/Trace.h
=====================================
@@ -33,7 +33,9 @@ void tracingAddCapapilities (uint32_t from, uint32_t to);
 #endif /* TRACING */
 
 typedef StgWord32 CapsetID;
+#if !defined(__cplusplus)
 typedef StgWord16 CapsetType;
+#endif
 enum CapsetType { CapsetTypeCustom = CAPSET_TYPE_CUSTOM,
                   CapsetTypeOsProcess = CAPSET_TYPE_OSPROCESS,
                   CapsetTypeClockdomain = CAPSET_TYPE_CLOCKDOMAIN };


=====================================
rts/include/rts/storage/MBlock.h
=====================================
@@ -26,7 +26,3 @@ extern void freeAllMBlocks(void);
 extern void *getFirstMBlock(void **state);
 extern void *getNextMBlock(void **state, void *mblock);
 
-#if defined(THREADED_RTS)
-// needed for HEAP_ALLOCED below
-extern SpinLock gc_alloc_block_sync;
-#endif


=====================================
rts/rts.cabal.in
=====================================
@@ -448,6 +448,9 @@ library
                  -- AutoApply is generated
                  AutoApply.cmm
 
+    -- Debugging
+    c-sources: sm/CheckGc.cpp
+
     -- Adjustor stuff
     if flag(libffi-adjustors)
       c-sources: adjustor/LibffiAdjustor.c


=====================================
rts/sm/BlockAlloc.c
=====================================
@@ -438,6 +438,7 @@ alloc_mega_group (uint32_t node, StgWord mblocks)
     bdescr *best, *bd;
     StgWord n;
 
+    ASSERT_SM_LOCK();
     n = MBLOCK_GROUP_BLOCKS(mblocks);
 
     if(defer_mblock_frees)
@@ -496,6 +497,7 @@ allocGroupOnNode (uint32_t node, W_ n)
     bdescr *bd, *rem;
     StgWord ln;
 
+    ASSERT_SM_LOCK();
     if (n == 0) barf("allocGroup: requested zero blocks");
 
     if (n >= BLOCKS_PER_MBLOCK)
@@ -709,6 +711,7 @@ bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
 {
     bdescr *bd;
     StgWord ln, lnmax;
+    ASSERT_SM_LOCK();
 
     if (min >= BLOCKS_PER_MBLOCK) {
         return allocGroupOnNode(node,max);
@@ -933,8 +936,7 @@ freeGroup(bdescr *p)
   StgWord ln;
   uint32_t node;
 
-  // not true in multithreaded GC:
-  // ASSERT_SM_LOCK();
+  ASSERT_SM_LOCK();
 
   ASSERT(RELAXED_LOAD(&p->free) != (P_)-1);
 


=====================================
rts/sm/CheckGc.cpp
=====================================
@@ -0,0 +1,973 @@
+extern "C" {
+#include "Rts.h"
+#include "StableName.h" /* for FOR_EACH_STABLE_NAME */
+#include "StablePtr.h" /* for markStablePtrTable */
+#include "Schedule.h" /* for markScheduler */
+#include "Capability.h"
+#include "HeapAlloc.h"
+#include "STM.h"
+}
+
+#include <iostream>
+#include <fstream>
+#include <set>
+#include <vector>
+#include <queue>
+#include <unordered_set>
+
+class TaggedClosurePtr {
+    StgClosure *ptr;
+public:
+    TaggedClosurePtr(StgClosure* ptr) : ptr(ptr) {}
+    TaggedClosurePtr(StgClosure* ptr, uint8_t tag) : TaggedClosurePtr(TAG_CLOSURE(tag, ptr)) {}
+
+    StgClosure *get_tagged() const {
+        return ptr;
+    }
+    StgClosure *untag() const {
+        return UNTAG_CLOSURE(ptr);
+    }
+    uint8_t get_tag() const {
+        return (StgWord) ptr & TAG_MASK;
+    }
+
+    //inline StgClosure& operator->() { return *untag(); }
+
+    friend inline bool operator==(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) {
+        return lhs.ptr == rhs.ptr;
+    }
+    friend inline bool operator!=(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return !(lhs == rhs); }
+    friend inline bool operator< (const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) {
+        return lhs.ptr < rhs.ptr;
+    }
+    friend inline bool operator> (const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return rhs < lhs; }
+    friend inline bool operator<=(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return !(lhs > rhs); }
+    friend inline bool operator>=(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return !(lhs < rhs); }
+};
+
+template<>
+struct std::hash<TaggedClosurePtr> {
+    std::size_t operator()(TaggedClosurePtr const& p) const noexcept {
+        return std::hash<StgClosure*>{}(p.get_tagged());
+    }
+};
+
+class HeapVisitor {
+public:
+    // Visit an SRT
+    virtual void visit_srt(StgClosure* c);
+
+    // Visit a normal closure
+    virtual void visit_closure(TaggedClosurePtr c);
+
+    virtual void visit_thunk(StgThunk *thunk, size_t n_ptrs);
+    virtual void visit_fun(StgClosure *constr, size_t n_ptrs);
+    virtual void visit_constr(StgClosure *constr, size_t n_ptrs);
+    virtual void visit_array(StgMutArrPtrs *arr);
+    virtual void visit_small_array(StgSmallMutArrPtrs *arr);
+    virtual void visit_bytearray(StgArrBytes* arr);
+
+    virtual void visit_stack(StgPtr sp, StgPtr end);
+    virtual void visit_tso(StgTSO* tso);
+    virtual void visit_weak(StgWeak* w);
+    virtual void visit_mvar(StgMVar* mvar);
+    virtual void visit_tvar(StgTVar* tvar);
+    virtual void visit_trec_header(StgTRecHeader *trec);
+    virtual void visit_trec_chunk(StgTRecChunk* tc);
+    virtual void visit_continuation(StgContinuation* tc);
+
+    virtual void visit_small_bitmap(StgClosure *const *payload, StgWord bitmap, StgWord size);
+    virtual void visit_large_bitmap(StgClosure *const *payload, const StgLargeBitmap *bitmap, StgWord size);
+    void visit_pap_payload(StgClosure *fun, StgClosure **payload, StgWord n_args);
+
+    virtual void visit_invalid(StgClosure *const c);
+};
+
+void HeapVisitor::visit_thunk(StgThunk *thunk, size_t n_ptrs)
+{
+    const StgInfoTable *info = get_itbl((StgClosure *) thunk);
+    const StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(info);
+    if (thunk_info->i.srt) {
+        StgClosure *srt = (StgClosure*) GET_SRT(thunk_info);
+        visit_srt(srt);
+    };
+    for (size_t i=0; i < n_ptrs; i++) {
+        visit_closure(thunk->payload[i]);
+    }
+}
+
+void HeapVisitor::visit_fun(StgClosure *fun, size_t n_ptrs)
+{
+    const StgInfoTable *info = get_itbl(fun);
+    const StgFunInfoTable *fun_info = itbl_to_fun_itbl(info);
+    if (fun_info->i.srt) {
+        StgClosure *srt = (StgClosure*) GET_SRT(fun_info);
+        visit_srt(srt);
+    };
+    for (size_t i=0; i < n_ptrs; i++) {
+        visit_closure(fun->payload[i]);
+    }
+}
+
+void HeapVisitor::visit_constr(StgClosure *constr, size_t n_ptrs)
+{
+    for (size_t i=0; i < n_ptrs; i++) {
+        visit_closure(constr->payload[i]);
+    }
+}
+
+void HeapVisitor::visit_srt(StgClosure* c)
+{
+    visit_closure(c);
+}
+
+void HeapVisitor::visit_invalid(StgClosure *const _c)
+{
+    abort();
+}
+
+void HeapVisitor::visit_weak(StgWeak* w)
+{
+    visit_closure(w->key);
+    visit_closure(w->value);
+    visit_closure(w->finalizer);
+    visit_closure(w->cfinalizers);
+}
+
+void HeapVisitor::visit_mvar(StgMVar* mvar)
+{
+    visit_closure((StgClosure*) mvar->head);
+    visit_closure((StgClosure*) mvar->tail);
+    visit_closure(mvar->value);
+}
+
+void HeapVisitor::visit_small_array(StgSmallMutArrPtrs *arr)
+{
+    for (StgWord i=0; i < arr->ptrs; i++) {
+        visit_closure(arr->payload[i]);
+    }
+}
+
+void HeapVisitor::visit_array(StgMutArrPtrs *arr)
+{
+    for (StgWord i=0; i < arr->ptrs; i++) {
+        visit_closure(arr->payload[i]);
+    }
+}
+
+void HeapVisitor::visit_bytearray(StgArrBytes* _arr) { }
+
+void HeapVisitor::visit_tso(StgTSO *tso)
+{
+    if (tso->bound != NULL) {
+
+        visit_closure((StgClosure*) tso->bound->tso);
+    }
+    if (tso->label != NULL) {
+        visit_closure({(StgClosure*) tso->label});
+    }
+    visit_closure((StgClosure*) tso->blocked_exceptions);
+    visit_closure((StgClosure*) tso->bq);
+    visit_closure((StgClosure*) tso->stackobj);
+    visit_closure((StgClosure*) tso->_link);
+    visit_trec_header(tso->trec);
+
+    switch (tso->why_blocked) {
+    case BlockedOnMVar:
+    case BlockedOnMVarRead:
+    case BlockedOnBlackHole:
+    case BlockedOnMsgThrowTo:
+    case NotBlocked:
+        visit_closure(tso->block_info.closure);
+        break;
+    default:
+        break;
+    }
+}
+
+void HeapVisitor::visit_continuation(StgContinuation *cont)
+{
+    visit_stack(cont->stack, cont->stack + cont->stack_size);
+}
+
+void HeapVisitor::visit_tvar(StgTVar *tvar)
+{
+    visit_closure(tvar->current_value);
+    visit_closure((StgClosure*) tvar->first_watch_queue_entry);
+}
+
+void HeapVisitor::visit_trec_header(StgTRecHeader *trec)
+{
+    if (trec == NO_TREC) {
+        return;
+    }
+    visit_trec_chunk(trec->current_chunk);
+    visit_closure((StgClosure*) trec->enclosing_trec);
+}
+
+void HeapVisitor::visit_trec_chunk(StgTRecChunk *tc)
+{
+    if (tc->prev_chunk != END_STM_CHUNK_LIST) {
+        visit_closure((StgClosure*) tc->prev_chunk);
+    }
+
+    for (uint32_t i = 0; i < tc->next_entry_idx; i++) {
+        TRecEntry *e = &tc->entries[i];
+        visit_closure((StgClosure*)e->tvar);
+        visit_closure(e->expected_value);
+        visit_closure(e->new_value);
+    }
+}
+
+void HeapVisitor::visit_stack(StgPtr p, StgPtr stack_end)
+{
+    while (p < stack_end) {
+        const StgRetInfoTable* info = get_ret_itbl((StgClosure *) p);
+
+        auto add_srt_ptrs = [&] () {
+            if (info->i.srt) {
+                StgClosure *srt = (StgClosure*)GET_SRT(info);
+                visit_srt(srt);
+            }
+        };
+
+        switch (info->i.type) {
+  
+        case UPDATE_FRAME:
+        {
+            StgUpdateFrame *frame = (StgUpdateFrame *)p;
+            visit_closure(frame->updatee);
+            p += sizeofW(StgUpdateFrame);
+            continue;
+        }
+  
+        case CATCH_STM_FRAME:
+        case CATCH_RETRY_FRAME:
+        case ATOMICALLY_FRAME:
+        case UNDERFLOW_FRAME:
+        case STOP_FRAME:
+        case CATCH_FRAME:
+        case RET_SMALL:
+        {
+            StgWord bitmap = BITMAP_BITS(info->i.layout.bitmap);
+            StgWord size   = BITMAP_SIZE(info->i.layout.bitmap);
+            // NOTE: the payload starts immediately after the info-ptr, we
+            // don't have an StgHeader in the same sense as a heap closure.
+            p++;
+            visit_small_bitmap((StgClosure**) p, bitmap, size);
+            p += size;
+            add_srt_ptrs();
+            continue;
+        }
+
+        case RET_BCO:
+        {
+            p++;
+            StgBCO *bco = (StgBCO *)*p;
+            visit_closure((StgClosure *) bco);
+            p++;
+            StgWord size = BCO_BITMAP_SIZE(bco);
+            visit_large_bitmap((StgClosure**) p, BCO_BITMAP(bco), size);
+            p += size;
+            continue;
+        }
+
+        case RET_BIG:
+        {
+            StgWord size = GET_LARGE_BITMAP(&info->i)->size;
+            p++;
+            visit_large_bitmap((StgClosure**) p, GET_LARGE_BITMAP(&info->i), size);
+            p += size;
+            // and don't forget to follow the SRT
+            add_srt_ptrs();
+            break;
+        }
+
+        case RET_FUN:
+        {
+            StgRetFun *ret_fun = (StgRetFun *)p;
+            visit_closure(ret_fun->fun);
+
+            const StgFunInfoTable *fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun));
+            switch (fun_info->f.fun_type) {
+                case ARG_GEN:
+                {
+                    StgWord bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+                    StgWord size = BITMAP_SIZE(fun_info->f.b.bitmap);
+                    visit_small_bitmap(ret_fun->payload, bitmap, size);
+                    p = (StgPtr) ((StgClosure **) &ret_fun->payload + size);
+                    break;
+                }
+                case ARG_GEN_BIG:
+                {
+                    StgWord size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+                    visit_large_bitmap(ret_fun->payload, GET_FUN_LARGE_BITMAP(fun_info), size);
+                    p = (StgPtr) ((StgClosure **) &ret_fun->payload + size);
+                    break;
+                }
+                default:
+                {
+                    StgWord bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+                    StgWord size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+                    visit_small_bitmap(ret_fun->payload, bitmap, size);
+                    p = (StgPtr) ((StgClosure **) &ret_fun->payload + size);
+                    break;
+                }
+            }
+            add_srt_ptrs();
+            break;
+        }
+        default:
+            abort();
+        }
+    }
+}
+
+void HeapVisitor::visit_small_bitmap(
+        StgClosure *const *payload,
+        StgWord bitmap,
+        StgWord size)
+{
+    while (size > 0) {
+        if ((bitmap & 1) == 0) {
+            visit_closure(*payload);
+        }
+        payload++;
+        bitmap = bitmap >> 1;
+        size--;
+    }
+}
+
+void HeapVisitor::visit_large_bitmap(
+        StgClosure *const * payload,
+        const StgLargeBitmap *large_bitmap,
+        StgWord size)
+{
+    // Bitmap may have more bits than `size` when scavenging PAP payloads. See
+    // comments around StgPAP.
+    ASSERT(large_bitmap->size >= size);
+
+    uint32_t b = 0;
+    for (uint32_t i = 0; i < size; b++) {
+        StgWord bitmap = large_bitmap->bitmap[b];
+        uint32_t j = stg_min(size-i, BITS_IN(W_));
+        i += j;
+        for (; j > 0; j--, payload++) {
+            if ((bitmap & 1) == 0) {
+                visit_closure(*payload);
+            }
+            bitmap = bitmap >> 1;
+        }
+    }
+}
+
+void HeapVisitor::visit_pap_payload(
+        StgClosure *fun,
+        StgClosure **payload,
+        StgWord n_args)
+{
+    fun = UNTAG_CLOSURE(fun);
+    const StgFunInfoTable *fun_info = get_fun_itbl(fun);
+    ASSERT(fun_info->i.type != PAP);
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        visit_small_bitmap(payload, BITMAP_BITS(fun_info->f.b.bitmap), n_args);
+        break;
+    case ARG_GEN_BIG:
+        visit_large_bitmap(payload, GET_FUN_LARGE_BITMAP(fun_info), n_args);
+        break;
+    case ARG_BCO:
+        visit_large_bitmap(payload, BCO_BITMAP(fun), n_args);
+        break;
+    default:
+    {
+        StgWord bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+        visit_small_bitmap(payload, bitmap, n_args);
+    }
+    }
+}
+
+void HeapVisitor::visit_closure(TaggedClosurePtr tagged)
+{
+    StgClosure *c = tagged.untag();
+    if (c->header.info == (StgInfoTable *) 0xaaaaaaaaaaaaaaaa || !LOOKS_LIKE_CLOSURE_PTR(c)) {
+        visit_invalid(c);
+        return;
+    }
+
+    const StgInfoTable *info = get_itbl(c);
+    auto generic_closure = [&] () {
+        for (StgClosure **p = &c->payload[0]; p < &c->payload[info->layout.payload.ptrs]; p++) {
+            visit_closure(*p);
+        }
+    };
+
+    switch (info->type) {
+
+    case MVAR_CLEAN:
+    case MVAR_DIRTY:
+        visit_mvar((StgMVar *) c);
+        break;
+    case TVAR:
+        visit_tvar((StgTVar *) c);
+        break;
+
+    case IND:
+    case IND_STATIC:
+        visit_closure(((StgInd *) c)->indirectee);
+        break;
+
+    case THUNK_0_1:
+    case THUNK_0_2:
+        visit_thunk((StgThunk*) c, 0);
+        break;
+    case THUNK_1_1:
+    case THUNK_1_0:
+        visit_thunk((StgThunk*) c, 1);
+        break;
+    case THUNK_2_0:
+        visit_thunk((StgThunk*) c, 2);
+        break;
+    case THUNK:
+        visit_thunk((StgThunk*) c, info->layout.payload.ptrs);
+        break;
+    case THUNK_STATIC:
+        visit_thunk((StgThunk*) c, 0);
+        break;
+
+    case FUN_1_0:
+        visit_fun(c, 1);
+        break;
+    case FUN_0_1:
+    case FUN_0_2:
+        visit_fun(c, 0);
+        break;
+    case FUN_1_1:
+        visit_fun(c, 1);
+        break;
+    case FUN_2_0:
+        visit_fun(c, 2);
+        break;
+    case FUN:
+    case FUN_STATIC:
+        visit_fun(c, info->layout.payload.ptrs);
+        break;
+
+    case CONSTR_0_1:
+    case CONSTR_0_2:
+        visit_constr(c, 0);
+        break;
+    case CONSTR_1_0:
+    case CONSTR_1_1:
+        visit_constr(c, 1);
+        break;
+    case CONSTR_2_0:
+        visit_constr(c, 2);
+        break;
+    case CONSTR:
+    case CONSTR_NOCAF:
+        visit_constr(c, info->layout.payload.ptrs);
+        break;
+
+    case PRIM:
+        generic_closure();
+        break;
+    case WEAK:
+        visit_weak((StgWeak*) c);
+        break;
+    case BCO:
+    {
+        StgBCO *bco = (StgBCO *)c;
+        visit_closure((StgClosure*) bco->instrs);
+        visit_closure((StgClosure*) bco->literals);
+        visit_closure((StgClosure*) bco->ptrs);
+        break;
+    }
+    case BLACKHOLE:
+    {
+        StgInd *ind = (StgInd*) c;
+        visit_closure(ind->indirectee);
+        break;
+    }
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+    {
+        StgMutVar *mv = (StgMutVar*) c;
+        visit_closure(mv->var);
+        break;
+    }
+    case BLOCKING_QUEUE:
+    {
+        StgBlockingQueue *bq = (StgBlockingQueue *)c;
+        visit_closure((StgClosure*) bq->bh);
+        visit_closure((StgClosure*) bq->owner);
+        visit_closure((StgClosure*) bq->queue);
+        visit_closure((StgClosure*) bq->link);
+        break;
+    }
+    case THUNK_SELECTOR:
+    {
+        StgSelector *s = (StgSelector *)c;
+        visit_closure(s->selectee);
+        break;
+    }
+    case AP_STACK:
+    {
+        StgAP_STACK *ap = (StgAP_STACK *)c;
+        visit_closure(ap->fun);
+        visit_stack((StgPtr) ap->payload, (StgPtr) ap->payload + ap->size);
+        break;
+    }
+    case PAP:
+    {
+        StgPAP *pap = (StgPAP*) c;
+        visit_closure(pap->fun);
+        visit_pap_payload(pap->fun, (StgClosure**) pap->payload, pap->n_args);
+        break;
+    }
+    case AP:
+    {
+        StgAP *ap = (StgAP*) c;
+        visit_closure(ap->fun);
+        visit_pap_payload(ap->fun, (StgClosure**) ap->payload, ap->n_args);
+        break;
+    }
+    case ARR_WORDS:
+        visit_bytearray((StgArrBytes *) c);
+        break;
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN_DIRTY:
+    case MUT_ARR_PTRS_FROZEN_CLEAN:
+        visit_array((StgMutArrPtrs *) c);
+        break;
+    case SMALL_MUT_ARR_PTRS_CLEAN:
+    case SMALL_MUT_ARR_PTRS_DIRTY:
+    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+        visit_small_array((StgSmallMutArrPtrs *) c);
+        break;
+    case TSO:
+        visit_tso((StgTSO *) c);
+        break;
+    case STACK:
+    {
+        StgStack *stack = (StgStack *) c;
+        visit_stack(stack->sp, stack->stack + stack->stack_size);
+        break;
+    }
+    case MUT_PRIM:
+        generic_closure();
+        break;
+    case TREC_CHUNK:
+        visit_trec_chunk((StgTRecChunk *) c);
+        break;
+    case CONTINUATION:
+        visit_continuation((StgContinuation *) c);
+        break;
+    default:
+        visit_invalid(c);
+        break;
+    }
+}
+
+class PredicatedHeapVisitor : HeapVisitor {
+    bool should_visit(StgClosure *);
+
+    virtual void visit_srt(StgClosure* c) {
+        if (should_visit(c)) { HeapVisitor::visit_srt(c); }
+    }
+
+    virtual void visit_closure(TaggedClosurePtr c) {
+        if (should_visit(c.untag())) { HeapVisitor::visit_closure(c); }
+    }
+};
+
+// Collect direct pointers
+struct CollectPointers : HeapVisitor {
+    std::set<TaggedClosurePtr> accum;
+    bool invalid;
+    CollectPointers() : accum(), invalid(false) {}
+    void visit_root(StgClosure *c) {
+        HeapVisitor::visit_closure(c);
+    }
+    void visit_closure(TaggedClosurePtr c) {
+        accum.insert(c);
+    }
+    void visit_invalid(StgClosure *const _c) { invalid = true; }
+};
+
+static std::set<TaggedClosurePtr> collect_pointers(StgClosure* c)
+{
+    CollectPointers v;
+    v.visit_root(c);
+    return v.accum;
+}
+
+
+
+struct Error {
+    StgClosure *closure;
+    std::string what;
+    Error(StgClosure *closure, std::string what) : closure(closure), what(what) {}
+};
+
+static std::ostream& operator<<(std::ostream& os, const Error& err) {
+    os << std::hex << "0x" << (StgWord) err.closure << ": " << err.what << "\n";
+    return os;
+}
+
+class CheckVisitor : HeapVisitor {
+    std::vector<Error> errors;
+    uint8_t tag;
+    void visit_constr(StgClosure* c) {
+        const StgInfoTable *info = get_itbl(c);
+        if (tag != 0) {
+            uint8_t constr_tag = info->srt;  // zero-based
+            if (tag != std::min(TAG_MASK, constr_tag+1)) {
+                errors.push_back(Error(c, "invalid tag"));
+            }
+        }
+    }
+
+    void visit_closure(TaggedClosurePtr c) { }
+public:
+    const std::vector<Error>& get_errors() const { return errors; }
+
+    void check_closure(TaggedClosurePtr c) {
+        tag = c.get_tag();
+        HeapVisitor::visit_closure(c);
+    }
+};
+
+struct CheckGc {
+    std::queue<TaggedClosurePtr> queue;
+    std::unordered_set<TaggedClosurePtr> enqueued;
+
+    void enqueue(TaggedClosurePtr ptr) {
+        ASSERT(ptr != NULL);
+        if (!is_enqueued(ptr)) {
+            queue.push(ptr);
+            enqueued.insert(ptr);
+        }
+    }
+
+    bool finished() {
+        return queue.empty();
+    }
+
+    TaggedClosurePtr pop() {
+        TaggedClosurePtr p = queue.front();
+        queue.pop();
+        return p;
+    }
+
+    bool is_enqueued(TaggedClosurePtr ptr) {
+        return enqueued.find(ptr) != enqueued.end();
+    }
+};
+
+static void enqueue_root(void *user_data, StgClosure **root)
+{
+    CheckGc* env = (CheckGc*) user_data;
+    env->enqueue(*root);
+}
+
+static void enqueue_roots(CheckGc& env)
+{
+    FOR_EACH_STABLE_NAME(p, if (p->sn_obj) env.enqueue(p->sn_obj););
+    markStablePtrTable(enqueue_root, &env);
+    for (uint32_t n = 0; n < getNumCapabilities(); n++) {
+        markCapability(enqueue_root, (void*) &env, getCapability(n), false/*mark sparks*/);
+    }
+    markCAFs(enqueue_root, &env);
+    markScheduler(enqueue_root, &env);
+
+    for (StgWeak *w = nonmoving_weak_ptr_list; w != NULL; w = w->link) {
+        env.enqueue((StgClosure *) w);
+    }
+
+    for (uint32_t g = 0; g <= N; g++) {
+        generation *gen = &generations[g];
+        for (StgWeak *w = gen->weak_ptr_list; w != NULL; w = RELAXED_LOAD(&w->link)) {
+            env.enqueue((StgClosure *) w);
+        }
+    }
+}
+
+extern "C" {
+void check_gc();
+}
+
+struct NodeName {
+    const StgClosure *c;
+    NodeName(const StgClosure *c) : c(c) {}
+};
+
+static std::ostream& operator<<(std::ostream& os, const NodeName& n) {
+    os << std::hex << "\"" << n.c << "\"" << std::dec;
+    return os;
+}
+
+static void dump_heap(std::ofstream& of)
+{
+    of << "digraph {\n";
+    CheckGc env;
+    enqueue_roots(env);
+    while (!env.finished()) {
+        TaggedClosurePtr tagged = env.pop();
+        StgClosure* c = tagged.untag();
+        NodeName n(c);
+        if (c->header.info == (StgInfoTable *) 0xaaaaaaaaaaaaaaaa) {
+            of << n << " [type=invalid];\n";
+            continue;
+        }
+
+        const StgInfoTable *info = get_itbl(c);
+        switch (info->type) {
+            case CONSTR:
+            case CONSTR_1_0:
+            case CONSTR_0_1:
+            case CONSTR_2_0:
+            case CONSTR_1_1:
+            case CONSTR_0_2:
+            case CONSTR_NOCAF:
+            {
+                const StgConInfoTable *con_info = get_con_itbl(c);
+                of << n << " [type=CONSTR constr=\"" << GET_CON_DESC(con_info) << "\"];\n";
+                break;
+            }
+            case FUN:
+            case FUN_1_0:
+            case FUN_0_1:
+            case FUN_2_0:
+            case FUN_1_1:
+            case FUN_0_2:
+                of << n << " [type=FUN];\n";
+                break;
+            case FUN_STATIC:
+                of << n << " [type=FUN_STATIC];\n";
+                break;
+            case THUNK:
+            case THUNK_1_0:
+            case THUNK_0_1:
+            case THUNK_1_1:
+            case THUNK_0_2:
+            case THUNK_2_0:
+                of << n << " [type=THUNK];\n";
+                break;
+            case THUNK_STATIC:
+                of << n << " [type=THUNK_STATIC];\n";
+                break;
+            case THUNK_SELECTOR:
+                of << n << " [type=THUNK_SEL];\n";
+                break;
+            case BCO:
+                of << n << " [type=BCO];\n";
+                break;
+            case AP:
+                of << n << " [type=AP];\n";
+                break;
+            case PAP:
+                of << n << " [type=PAP];\n";
+                break;
+            case AP_STACK:
+                of << n << " [type=AP_STACK];\n";
+                break;
+            case IND:
+                of << n << " [type=IND];\n";
+                break;
+            case IND_STATIC:
+                of << n << " [type=IND_STATIC];\n";
+                break;
+            case BLOCKING_QUEUE:
+                of << n << " [type=BLOCKING_QUEUE];\n";
+                break;
+            case BLACKHOLE:
+                of << n << " [type=BLACKHOLE];\n";
+                break;
+            case MVAR_CLEAN:
+            case MVAR_DIRTY:
+                of << n << " [type=MVAR];\n";
+                break;
+            case TVAR:
+                of << n << " [type=TVAR];\n";
+                break;
+            case ARR_WORDS:
+                of << n << " [type=ARR_WORDS];\n";
+                break;
+            case MUT_ARR_PTRS_CLEAN:
+            case MUT_ARR_PTRS_DIRTY:
+            case MUT_ARR_PTRS_FROZEN_CLEAN:
+            case MUT_ARR_PTRS_FROZEN_DIRTY:
+                of << n << " [type=MUT_ARR_PTRS];\n";
+                break;
+            case SMALL_MUT_ARR_PTRS_CLEAN:
+            case SMALL_MUT_ARR_PTRS_DIRTY:
+            case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+            case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+                of << n << " [type=SMALL_MUT_ARR_PTRS];\n";
+                break;
+            case MUT_VAR_CLEAN:
+            case MUT_VAR_DIRTY:
+                of << n << " [type=MUT_VAR];\n";
+                break;
+            case WEAK:
+                of << n << " [type=WEAK];\n";
+                break;
+            case PRIM:
+                of << n << " [type=PRIM];\n";
+                break;
+            case MUT_PRIM:
+                of << n << " [type=MUT_PRIM];\n";
+                break;
+            case TSO:
+                of << n << " [type=TSO];\n";
+                break;
+            case STACK:
+                of << n << " [type=STACK];\n";
+                break;
+            case TREC_CHUNK:
+                of << n << " [type=TREC_CHUNK];\n";
+                break;
+            case WHITEHOLE:
+                of << n << " [type=WHITEHOLE];\n";
+                break;
+            case COMPACT_NFDATA:
+                of << n << " [type=COMPACT_NFDATA];\n";
+                break;
+            case CONTINUATION:
+                of << n << " [type=CONTINUATION];\n";
+                break;
+            default:
+                of << n << " [type=unknown];\n";
+                break;
+        }
+
+        if (!HEAP_ALLOCED((StgPtr) c)) {
+            of << n << " [static=yes];\n";
+        } else {
+            bdescr *bd = Bdescr((StgPtr) c);
+            of << n << " [gen=" << bd->gen_no << "];\n";
+            if (bd->flags & BF_EVACUATED) {
+                of << n << " [evacuated=yes];\n";
+            }
+            if (bd->flags & BF_PINNED) {
+                of << n << " [pinned=yes];\n";
+            }
+            if (bd->flags & BF_LARGE) {
+                of << n << " [large=yes];\n";
+            } else if (bd->flags & BF_NONMOVING) {
+                struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) c);
+                nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) c);
+                uint8_t mark = nonmovingGetMark(seg, block_idx);
+                StgClosure *snapshot_loc =
+                  (StgClosure *) nonmovingSegmentGetBlock(seg, nonmovingSegmentInfo(seg)->next_free_snap);
+                if (c > snapshot_loc) {
+                    of << n << " [nonmoving=yes new=yes mark=" << (StgWord) mark << "];\n";
+                } else {
+                    of << n << " [nonmoving=yes mark=" << (StgWord) mark << "];\n";
+                }
+            } else {
+                of << n << " [moving=yes];\n";
+            }
+        }
+        for (TaggedClosurePtr p : collect_pointers(c)) {
+            of << n << " -> " << NodeName(p.untag()) << ";\n";
+            env.enqueue(p);
+        }
+    }
+    of << "}\n";
+}
+
+void dump_heap_to(const char *fname);
+void dump_heap_to(const char *fname)
+{
+    std::ofstream out(fname);
+    dump_heap(out);
+    out.flush();
+}
+
+void check_gc()
+{
+    CheckGc env;
+    enqueue_roots(env);
+    std::vector<Error> errors;
+
+    while (!env.finished()) {
+        TaggedClosurePtr tagged = env.pop();
+        StgClosure* c = tagged.untag();
+
+        {
+            CheckVisitor check;
+            check.check_closure(tagged);
+            for (const Error& e : check.get_errors()) {
+                errors.push_back(e);
+            }
+        }
+
+        for (TaggedClosurePtr p : collect_pointers(c)) {
+            env.enqueue(p);
+        }
+
+        if (c->header.info == (StgInfoTable *) 0xaaaaaaaaaaaaaaaa) {
+            errors.push_back(Error(c, "is invalid closure"));
+            continue;
+        }
+
+        const StgInfoTable *info = get_itbl(c);
+        if (!HEAP_ALLOCED((StgPtr) c)) {
+            switch (info->type) {
+            case THUNK_STATIC:
+                if (info->srt != 0) {
+            
+                }
+            }
+        } else {
+            bdescr *bd = Bdescr((StgPtr) c);
+            if (bd->gen_no < 1) {
+                /* nothing to check as we are focused on post nonmoving-GC checking */
+            } else if (bd->flags & BF_NONMOVING && bd->flags & BF_LARGE) {
+                if (bd->flags & BF_NONMOVING_SWEEPING && !(bd->flags & BF_MARKED)) {
+                    errors.push_back(Error(c, "is not marked yet being swept"));
+                }
+            } else if (bd->flags & BF_NONMOVING) {
+                struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) c);
+                nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) c);
+                uint8_t mark = nonmovingGetMark(seg, block_idx);
+                StgClosure *snapshot_loc =
+                  (StgClosure *) nonmovingSegmentGetBlock(seg, nonmovingSegmentInfo(seg)->next_free_snap);
+                if (bd->flags & BF_NONMOVING_SWEEPING) {
+                    /* in a swept segment */
+                    if (mark != nonmovingMarkEpoch) {
+                        errors.push_back(Error(c, "is unmarked nonmoving object being swept"));
+                    }
+                } else if (c < snapshot_loc) {
+                    /* not in a swept segment but in the snapshot */
+                    if (mark != nonmovingMarkEpoch) {
+                        errors.push_back(Error(c, "is unmarked nonmoving object in the snapshot"));
+                    }
+                } else {
+                    /* not in the snapshot; nothing to assert */
+                }
+            } else if (bd->flags & BF_LARGE) {
+                if (! (bd->flags & BF_MARKED)) {
+                    errors.push_back(Error(c, "is unmarked large object"));
+                }
+            } else {
+                if (!(bd->flags & BF_EVACUATED)) {
+                    //errors.push_back(Error(c, "is in from-space block"));
+                }
+            }
+        }
+    }
+
+    if (!errors.empty()) {
+        for (auto err : errors) {
+            std::cerr << err << "\n";
+        }
+        dump_heap_to("heap.dot");
+        abort();
+    }
+}
+


=====================================
rts/sm/Evac.c
=====================================
@@ -1232,7 +1232,7 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val)
    -------------------------------------------------------------------------- */
 
 static void
-eval_thunk_selector (StgClosure **q, StgSelector *p, bool evac)
+eval_thunk_selector (StgClosure **const q, StgSelector *p, bool evac)
                  // NB. for legacy reasons, p & q are swapped around :(
 {
     uint32_t field;
@@ -1252,13 +1252,22 @@ selector_chain:
 
     bd = Bdescr((StgPtr)p);
     if (HEAP_ALLOCED_GC(p)) {
+        const uint16_t flags = RELAXED_LOAD(&bd->flags);
+
+        // We should never see large objects here
+        ASSERT(!(flags & BF_LARGE));
+
         // If the THUNK_SELECTOR is in to-space or in a generation that we
         // are not collecting, then bale out early.  We won't be able to
         // save any space in any case, and updating with an indirection is
         // trickier in a non-collected gen: we would have to update the
         // mutable list.
-        if (RELAXED_LOAD(&bd->flags) & (BF_EVACUATED | BF_NONMOVING)) {
+        if (flags & (BF_EVACUATED | BF_NONMOVING)) {
             unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);
+            if (flags & BF_NONMOVING) {
+                // See Note [Non-moving GC: Marking evacuated objects].
+                markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure*) p);
+            }
             *q = (StgClosure *)p;
             // shortcut, behave as for:  if (evac) evacuate(q);
             if (evac && bd->gen_no < gct->evac_gen_no) {
@@ -1273,7 +1282,7 @@ selector_chain:
         // (scavenge_mark_stack doesn't deal with IND).  BEWARE!  This
         // bit is very tricky to get right.  If you make changes
         // around here, test by compiling stage 3 with +RTS -c -RTS.
-        if (bd->flags & BF_MARKED) {
+        if (flags & BF_MARKED) {
             // must call evacuate() to mark this closure if evac==true
             *q = (StgClosure *)p;
             if (evac) evacuate(q);
@@ -1313,6 +1322,12 @@ selector_chain:
             //   - undo the chain we've built to point to p.
             SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr);
             RELEASE_STORE(q, (StgClosure *) p);
+            if (Bdescr((StgPtr)p)->flags & BF_NONMOVING) {
+                // See Note [Non-moving GC: Marking evacuated objects].
+                // TODO: This really shouldn't be necessary since whoever won
+                // the race should have pushed this closure already.
+                markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure*) p);
+            }
             if (evac) evacuate(q);
             unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);
             return;
@@ -1403,6 +1418,11 @@ selector_loop:
                   case THUNK_SELECTOR:
                       // Use payload to make a list of thunk selectors, to be
                       // used in unchain_thunk_selectors
+                      //
+                      // FIXME: This seems racy; should we lock this selector to
+                      // ensure that another thread doesn't clobber this node
+                      // of the chain? Such clobbering would result in some previous
+                      // selectors not being updated when we unchain.
                       RELAXED_STORE(&((StgClosure*)p)->payload[0], (StgClosure *)prev_thunk_selector);
                       prev_thunk_selector = p;
                       p = (StgSelector*)val;
@@ -1427,6 +1447,12 @@ selector_loop:
               // eval_thunk_selector(), because we know val is not
               // a THUNK_SELECTOR.
               if (evac) evacuate(q);
+
+              if (isNonmovingClosure(*q)) {
+                  // See Note [Non-moving GC: Marking evacuated objects].
+                  markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure*) *q);
+              }
+
               return;
           }
 
@@ -1471,6 +1497,10 @@ selector_loop:
           // recurse indefinitely, so we impose a depth bound.
           // See Note [Selector optimisation depth limit].
           if (gct->thunk_selector_depth >= MAX_THUNK_SELECTOR_DEPTH) {
+              if (isNonmovingClosure((StgClosure *) p)) {
+                  // See Note [Non-moving GC: Marking evacuated objects].
+                  markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure*) p);
+              }
               goto bale_out;
           }
 
@@ -1517,5 +1547,9 @@ bale_out:
     if (evac) {
         copy(q,(const StgInfoTable *)info_ptr,(StgClosure *)p,THUNK_SELECTOR_sizeW(),bd->dest_no);
     }
+    if (isNonmovingClosure(*q)) {
+        // See Note [Non-moving GC: Marking evacuated objects].
+        markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, *q);
+    }
     unchain_thunk_selectors(prev_thunk_selector, *q);
 }


=====================================
rts/sm/GC.c
=====================================
@@ -312,8 +312,6 @@ GarbageCollect (uint32_t collect_gen,
   CostCentreStack *save_CCS[getNumCapabilities()];
 #endif
 
-  ACQUIRE_SM_LOCK;
-
 #if defined(RTS_USER_SIGNALS)
   if (RtsFlags.MiscFlags.install_signal_handlers) {
     // block signals
@@ -591,9 +589,7 @@ GarbageCollect (uint32_t collect_gen,
   // the current garbage collection, so we invoke LdvCensusForDead().
   if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
       || RtsFlags.ProfFlags.bioSelector != NULL) {
-      RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
       LdvCensusForDead(N);
-      ACQUIRE_SM_LOCK;
   }
 #endif
 
@@ -762,7 +758,7 @@ GarbageCollect (uint32_t collect_gen,
         }
         else // not compacted
         {
-            freeChain(gen->old_blocks);
+            freeChain_lock(gen->old_blocks);
         }
 
         gen->old_blocks = NULL;
@@ -773,7 +769,7 @@ GarbageCollect (uint32_t collect_gen,
          * collection from large_objects.  Any objects left on the
          * large_objects list are therefore dead, so we free them here.
          */
-        freeChain(gen->large_objects);
+        freeChain_lock(gen->large_objects);
         gen->large_objects  = gen->scavenged_large_objects;
         gen->n_large_blocks = gen->n_scavenged_large_blocks;
         gen->n_large_words  = countOccupied(gen->large_objects);
@@ -892,7 +888,7 @@ GarbageCollect (uint32_t collect_gen,
   if (mark_stack_top_bd != NULL) {
       debugTrace(DEBUG_gc, "mark stack: %d blocks",
                  countBlocks(mark_stack_top_bd));
-      freeChain(mark_stack_top_bd);
+      freeChain_lock(mark_stack_top_bd);
   }
 
   // Free any bitmaps.
@@ -944,9 +940,7 @@ GarbageCollect (uint32_t collect_gen,
 
   // Start any pending finalizers.  Must be after
   // updateStableTables() and stableUnlock() (see #4221).
-  RELEASE_SM_LOCK;
   scheduleFinalizers(cap, dead_weak_ptr_list);
-  ACQUIRE_SM_LOCK;
 
   // check sanity after GC
   // before resurrectThreads(), because that might overwrite some
@@ -961,9 +955,7 @@ GarbageCollect (uint32_t collect_gen,
   // behind.
   if (do_heap_census) {
       debugTrace(DEBUG_sched, "performing heap census");
-      RELEASE_SM_LOCK;
       heapCensus(mut_time);
-      ACQUIRE_SM_LOCK;
   }
 
 #if defined(TICKY_TICKY)
@@ -977,14 +969,14 @@ GarbageCollect (uint32_t collect_gen,
 #endif
 
   // send exceptions to any threads which were about to die
-  RELEASE_SM_LOCK;
   resurrectThreads(resurrected_threads);
-  ACQUIRE_SM_LOCK;
 
   // Finally free the deferred mblocks by sorting the deferred free list and
   // merging it into the actual sorted free list. This needs to happen here so
   // that the `returnMemoryToOS` call down below can successfully free memory.
+  ACQUIRE_SM_LOCK;
   commitMBlockFreeing();
+  RELEASE_SM_LOCK;
 
   if (major_gc) {
       W_ need_prealloc, need_live, need, got;
@@ -1097,8 +1089,6 @@ GarbageCollect (uint32_t collect_gen,
   }
 #endif
 
-  RELEASE_SM_LOCK;
-
   SET_GCT(saved_gct);
 }
 
@@ -1148,7 +1138,7 @@ new_gc_thread (uint32_t n, gc_thread *t)
         // but can't, because it uses gct which isn't set up at this point.
         // Hence, allocate a block for todo_bd manually:
         {
-            bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
+            bdescr *bd = allocBlockOnNode_lock(capNoToNumaNode(n));
                 // no lock, locks aren't initialised yet
             initBdescr(bd, ws->gen, ws->gen->to);
             bd->flags = BF_EVACUATED;
@@ -1607,7 +1597,7 @@ static void
 stash_mut_list (Capability *cap, uint32_t gen_no)
 {
     cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
-    RELEASE_STORE(&cap->mut_lists[gen_no], allocBlockOnNode_sync(cap->node));
+    RELEASE_STORE(&cap->mut_lists[gen_no], allocBlockOnNode_lock(cap->node));
 }
 
 /* ----------------------------------------------------------------------------
@@ -1634,9 +1624,9 @@ prepare_collected_gen (generation *gen)
         // a check for NULL in recordMutable().
         for (i = 0; i < getNumCapabilities(); i++) {
             bdescr *old = RELAXED_LOAD(&getCapability(i)->mut_lists[g]);
-            freeChain(old);
+            freeChain_lock(old);
 
-            bdescr *new = allocBlockOnNode(capNoToNumaNode(i));
+            bdescr *new = allocBlockOnNode_lock(capNoToNumaNode(i));
             RELAXED_STORE(&getCapability(i)->mut_lists[g], new);
         }
     }
@@ -1719,7 +1709,7 @@ prepare_collected_gen (generation *gen)
         bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
 
         if (bitmap_size > 0) {
-            bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
+            bitmap_bdescr = allocGroup_lock((StgWord)BLOCK_ROUND_UP(bitmap_size)
                                        / BLOCK_SIZE);
             gen->bitmap = bitmap_bdescr;
             bitmap = bitmap_bdescr->start;


=====================================
rts/sm/GC.h
=====================================
@@ -13,8 +13,6 @@
 
 #pragma once
 
-#include "HeapAlloc.h"
-
 #include "BeginPrivate.h"
 
 void GarbageCollect (uint32_t collect_gen,


=====================================
rts/sm/GCUtils.c
=====================================
@@ -26,38 +26,15 @@
 #include "WSDeque.h"
 #endif
 
-#if defined(THREADED_RTS)
-SpinLock gc_alloc_block_sync;
-#endif
-
 static void push_todo_block(bdescr *bd, gen_workspace *ws);
 
-bdescr* allocGroup_sync(uint32_t n)
-{
-    bdescr *bd;
-    uint32_t node = capNoToNumaNode(gct->thread_index);
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    bd = allocGroupOnNode(node,n);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-    return bd;
-}
-
-bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n)
-{
-    bdescr *bd;
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    bd = allocGroupOnNode(node,n);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-    return bd;
-}
-
 static uint32_t
-allocBlocks_sync(uint32_t n, bdescr **hd)
+allocBlocks_lock(uint32_t n, bdescr **hd)
 {
     bdescr *bd;
     uint32_t i;
     uint32_t node = capNoToNumaNode(gct->thread_index);
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+    ACQUIRE_SM_LOCK;
     bd = allocLargeChunkOnNode(node,1,n);
     // NB. allocLargeChunk, rather than allocGroup(n), to allocate in a
     // fragmentation-friendly way.
@@ -70,27 +47,11 @@ allocBlocks_sync(uint32_t n, bdescr **hd)
     bd[n-1].link = NULL;
     // We have to hold the lock until we've finished fiddling with the metadata,
     // otherwise the block allocator can get confused.
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    RELEASE_SM_LOCK;
     *hd = bd;
     return n;
 }
 
-void
-freeChain_sync(bdescr *bd)
-{
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    freeChain(bd);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-}
-
-void
-freeGroup_sync(bdescr *bd)
-{
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    freeGroup(bd);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-}
-
 /* -----------------------------------------------------------------------------
    Workspace utilities
    -------------------------------------------------------------------------- */
@@ -303,7 +264,7 @@ todo_block_full (uint32_t size, gen_workspace *ws)
                 // object.  However, if the object we're copying is
                 // larger than a block, then we might have an empty
                 // block here.
-                freeGroup_sync(bd);
+                freeGroup_lock(bd);
             } else {
                 push_scanned_block(bd, ws);
             }
@@ -343,14 +304,14 @@ alloc_todo_block (gen_workspace *ws, uint32_t size)
     else
     {
         if (size > BLOCK_SIZE_W) {
-            bd = allocGroup_sync((W_)BLOCK_ROUND_UP(size*sizeof(W_))
+            bd = allocGroup_lock((W_)BLOCK_ROUND_UP(size*sizeof(W_))
                                  / BLOCK_SIZE);
         } else {
             if (gct->free_blocks) {
                 bd = gct->free_blocks;
                 gct->free_blocks = bd->link;
             } else {
-                allocBlocks_sync(16, &bd);
+                allocBlocks_lock(16, &bd);
                 gct->free_blocks = bd->link;
             }
         }


=====================================
rts/sm/GCUtils.h
=====================================
@@ -17,22 +17,6 @@
 
 #include "BeginPrivate.h"
 
-bdescr* allocGroup_sync(uint32_t n);
-bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n);
-
-INLINE_HEADER bdescr *allocBlock_sync(void)
-{
-    return allocGroup_sync(1);
-}
-
-INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node)
-{
-    return allocGroupOnNode_sync(node,1);
-}
-
-void    freeChain_sync(bdescr *bd);
-void    freeGroup_sync(bdescr *bd);
-
 void    push_scanned_block   (bdescr *bd, gen_workspace *ws);
 StgPtr  todo_block_full      (uint32_t size, gen_workspace *ws);
 StgPtr  alloc_todo_block     (gen_workspace *ws, uint32_t size);
@@ -62,7 +46,7 @@ recordMutableGen_GC (StgClosure *p, uint32_t gen_no)
     bd = gct->mut_lists[gen_no];
     if (bd->free >= bd->start + BLOCK_SIZE_W) {
         bdescr *new_bd;
-        new_bd = allocBlock_sync();
+        new_bd = allocBlock_lock();
         new_bd->link = bd;
         bd = new_bd;
         gct->mut_lists[gen_no] = bd;


=====================================
rts/sm/HeapAlloc.h
=====================================
@@ -10,6 +10,8 @@
 
 #include "BeginPrivate.h"
 
+#include "Storage.h"
+
 /* -----------------------------------------------------------------------------
    The HEAP_ALLOCED() test.
 
@@ -210,9 +212,9 @@ StgBool HEAP_ALLOCED_GC(const void *p)
     } else {
         // putting the rest out of line turned out to be a slight
         // performance improvement:
-        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        ACQUIRE_SM_LOCK; // TODO: this may be too expensive
         b = HEAP_ALLOCED_miss(mblock,p);
-        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        RELEASE_SM_LOCK;
         return b;
     }
 }


=====================================
rts/sm/MarkStack.h
=====================================
@@ -32,7 +32,7 @@ push_mark_stack(StgPtr p)
         }
         else
         {
-            bd = allocBlock_sync();
+            bd = allocBlock_lock();
             bd->link = mark_stack_bd;
             bd->u.back = NULL;
             mark_stack_bd->u.back = bd; // double-link the new block on


=====================================
rts/sm/NonMoving.c
=====================================
@@ -598,14 +598,10 @@ static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
 
     // Nothing in the free list, allocate a new segment...
     if (ret == NULL) {
-        // Take gc spinlock: another thread may be scavenging a moving
-        // generation and call `todo_block_full`
-        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
         bdescr *bd = allocAlignedGroupOnNode(node, NONMOVING_SEGMENT_BLOCKS);
         // See Note [Live data accounting in nonmoving collector].
         oldest_gen->n_blocks += bd->blocks;
         oldest_gen->n_words  += BLOCK_SIZE_W * bd->blocks;
-        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
 
         for (StgWord32 i = 0; i < bd->blocks; ++i) {
             initBdescr(&bd[i], oldest_gen, oldest_gen);
@@ -668,7 +664,7 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
     }
 }
 
-/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */
+/* Allocate a block in the nonmoving heap. sz is in words */
 GNUC_ATTR_HOT
 void *nonmovingAllocate(Capability *cap, StgWord sz)
 {
@@ -708,7 +704,9 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
 
         // there are no active segments, allocate new segment
         if (new_current == NULL) {
+            ACQUIRE_SM_LOCK;
             new_current = nonmovingAllocSegment(cap->node);
+            RELEASE_SM_LOCK;
             nonmovingInitSegment(new_current, log_block_size);
         }
 
@@ -791,14 +789,13 @@ void nonmovingExit(void)
 /*
  * Assumes that no garbage collector or mutator threads are running to safely
  * resize the nonmoving_allocators.
- *
- * Must hold sm_mutex.
  */
 void nonmovingAddCapabilities(uint32_t new_n_caps)
 {
     unsigned int old_n_caps = nonmovingHeap.n_caps;
     struct NonmovingAllocator **allocs = nonmovingHeap.allocators;
 
+    ACQUIRE_SM_LOCK;
     for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
         struct NonmovingAllocator *old = allocs[i];
         allocs[i] = alloc_nonmoving_allocator(new_n_caps);
@@ -820,6 +817,7 @@ void nonmovingAddCapabilities(uint32_t new_n_caps)
         }
     }
     nonmovingHeap.n_caps = new_n_caps;
+    RELEASE_SM_LOCK;
 }
 
 void nonmovingClearBitmap(struct NonmovingSegment *seg)
@@ -844,13 +842,14 @@ static void nonmovingPrepareMark(void)
         struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
 
         // Update current segments' snapshot pointers
-        for (uint32_t cap_n = 0; cap_n < getNumCapabilities(); ++cap_n) {
+        for (uint32_t cap_n = 0; cap_n < nonmovingHeap.n_caps; ++cap_n) {
             struct NonmovingSegment *seg = alloca->current[cap_n];
             nonmovingSegmentInfo(seg)->next_free_snap = seg->next_free;
         }
 
         // Save the filled segments for later processing during the concurrent
         // mark phase.
+        ASSERT(alloca->saved_filled == NULL);
         alloca->saved_filled = alloca->filled;
         alloca->filled = NULL;
 
@@ -926,6 +925,7 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
     ASSERT(n_nonmoving_marked_compact_blocks == 0);
 
     MarkQueue *mark_queue = stgMallocBytes(sizeof(MarkQueue), "mark queue");
+    mark_queue->blocks = NULL;
     initMarkQueue(mark_queue);
     current_mark_queue = mark_queue;
 
@@ -1090,6 +1090,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
             seg->link = nonmovingHeap.sweep_list;
             nonmovingHeap.sweep_list = filled;
         }
+        nonmovingHeap.allocators[alloca_idx]->saved_filled = NULL;
     }
 
     // Mark Weak#s
@@ -1183,7 +1184,8 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
         nonmoving_old_threads = END_TSO_QUEUE;
     }
 
-    nonmoving_weak_ptr_list = nonmoving_old_weak_ptr_list;
+    // At this point, any weak that remains on nonmoving_old_weak_ptr_list
+    // has a dead key.
     nonmoving_old_weak_ptr_list = NULL;
 
     // Prune spark lists
@@ -1236,6 +1238,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
 
 #if defined(THREADED_RTS) && defined(NONCONCURRENT_SWEEP)
 #if defined(DEBUG)
+    check_gc();
     checkNonmovingHeap(&nonmovingHeap);
     checkSanity(true, true);
 #endif
@@ -1293,7 +1296,7 @@ void assert_in_nonmoving_heap(StgPtr p)
     for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
         struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
         // Search current segments
-        for (uint32_t cap_idx = 0; cap_idx < getNumCapabilities(); ++cap_idx) {
+        for (uint32_t cap_idx = 0; cap_idx < nonmovingHeap.n_caps; ++cap_idx) {
             struct NonmovingSegment *seg = alloca->current[cap_idx];
             if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
                 return;
@@ -1365,7 +1368,7 @@ void nonmovingPrintAllocator(struct NonmovingAllocator *alloc)
         debugBelch("%p ", (void*)seg);
     }
     debugBelch("\nCurrent segments:\n");
-    for (uint32_t i = 0; i < getNumCapabilities(); ++i) {
+    for (uint32_t i = 0; i < nonmovingHeap.n_caps; ++i) {
         debugBelch("%p ", alloc->current[i]);
     }
     debugBelch("\n");
@@ -1376,7 +1379,7 @@ void locate_object(P_ obj)
     // Search allocators
     for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
         struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
-        for (uint32_t cap = 0; cap < getNumCapabilities(); ++cap) {
+        for (uint32_t cap = 0; cap < nonmovingHeap.n_caps; ++cap) {
             struct NonmovingSegment *seg = alloca->current[cap];
             if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
                 debugBelch("%p is in current segment of capability %d of allocator %d at %p\n", obj, cap, alloca_idx, (void*)seg);


=====================================
rts/sm/NonMovingMark.c
=====================================
@@ -37,6 +37,7 @@ static void trace_PAP_payload (MarkQueue *queue,
                                StgClosure *fun,
                                StgClosure **payload,
                                StgWord size);
+static bool isNonmovingWeak(StgWeak *weak);
 
 // How many Array# entries to add to the mark queue at once?
 #define MARK_ARRAY_CHUNK_LENGTH 128
@@ -270,6 +271,7 @@ static void nonmovingAddUpdRemSetBlocks_(MarkQueue *rset)
     bdescr *end = start;
     while (end->link != NULL)
         end = end->link;
+    rset->blocks = NULL;
 
     // add the blocks to the global remembered set
     ACQUIRE_LOCK(&upd_rem_set_lock);
@@ -291,10 +293,8 @@ static void nonmovingAddUpdRemSetBlocks_lock(MarkQueue *rset)
 
     nonmovingAddUpdRemSetBlocks_(rset);
     // Reset the state of the remembered set.
-    ACQUIRE_SM_LOCK;
     init_mark_queue_(rset);
     rset->is_upd_rem_set = true;
-    RELEASE_SM_LOCK;
 }
 
 /*
@@ -468,9 +468,7 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
 }
 
 /* A variant of push to be used by the minor GC when it encounters a reference
- * to an object in the non-moving heap. In contrast to the other push
- * operations this uses the gc_alloc_block_sync spinlock instead of the
- * SM_LOCK to allocate new blocks in the event that the mark queue is full.
+ * to an object in the non-moving heap.
  */
 void
 markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
@@ -491,13 +489,13 @@ markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
     if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
         // Yes, this block is full.
         // allocate a fresh block.
-        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        ACQUIRE_SM_LOCK;
         bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
         bd->link = q->blocks;
         q->blocks = bd;
         q->top = (MarkQueueBlock *) bd->start;
         q->top->head = 0;
-        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        RELEASE_SM_LOCK;
     }
 
     MarkQueueEnt ent = {
@@ -647,6 +645,16 @@ void updateRemembSetPushThunkEager(Capability *cap,
         }
         break;
     }
+    case THUNK_SELECTOR:
+    {
+        StgSelector *sel = (StgSelector *) thunk;
+        if (check_in_nonmoving_heap(sel->selectee)) {
+            // Don't bother to push origin; it makes the barrier needlessly
+            // expensive with little benefit.
+            push_closure(queue, sel->selectee, NULL);
+        }
+        break;
+    }
     case AP:
     {
         StgAP *ap = (StgAP *) thunk;
@@ -656,9 +664,11 @@ void updateRemembSetPushThunkEager(Capability *cap,
         trace_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
         break;
     }
-    case THUNK_SELECTOR:
+    // We may end up here if a thunk update races with another update.
+    // In this case there is nothing to do as the other thread will have
+    // already pushed the updated thunk's free variables to the update
+    // remembered set.
     case BLACKHOLE:
-        // TODO: This is right, right?
         break;
     // The selector optimization performed by the nonmoving mark may have
     // overwritten a thunk which we are updating with an indirection.
@@ -916,7 +926,8 @@ static MarkQueueEnt markQueuePop (MarkQueue *q)
 /* Must hold sm_mutex. */
 static void init_mark_queue_ (MarkQueue *queue)
 {
-    bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+    bdescr *bd = allocGroup_lock(MARK_QUEUE_BLOCKS);
+    ASSERT(queue->blocks == NULL);
     queue->blocks = bd;
     queue->top = (MarkQueueBlock *) bd->start;
     queue->top->head = 0;
@@ -926,14 +937,12 @@ static void init_mark_queue_ (MarkQueue *queue)
 #endif
 }
 
-/* Must hold sm_mutex. */
 void initMarkQueue (MarkQueue *queue)
 {
     init_mark_queue_(queue);
     queue->is_upd_rem_set = false;
 }
 
-/* Must hold sm_mutex. */
 void nonmovingInitUpdRemSet (UpdRemSet *rset)
 {
     init_mark_queue_(&rset->queue);
@@ -1522,10 +1531,12 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
         break;
     }
 
+    case WEAK:
+        ASSERT(isNonmovingWeak((StgWeak*) p));
+        // fallthrough
     gen_obj:
     case CONSTR:
     case CONSTR_NOCAF:
-    case WEAK:
     case PRIM:
     {
         for (StgWord i = 0; i < info->layout.payload.ptrs; i++) {
@@ -1582,8 +1593,15 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
     }
 
     case THUNK_SELECTOR:
-        nonmoving_eval_thunk_selector(queue, (StgSelector*)p, origin);
+    {
+        StgSelector *sel = (StgSelector *) p;
+        // We may be able to evaluate this selector which may render the
+        // selectee unreachable. However, we must mark the selectee regardless
+        // to satisfy the snapshot invariant.
+        PUSH_FIELD(sel, selectee);
+        nonmoving_eval_thunk_selector(queue, sel, origin);
         break;
+    }
 
     case AP_STACK: {
         StgAP_STACK *ap = (StgAP_STACK *)p;
@@ -1915,6 +1933,17 @@ void nonmovingMarkWeakPtrList (struct MarkQueue_ *queue)
     }
 }
 
+static bool isNonmovingWeak(StgWeak *weak)
+{
+    for (StgWeak *w = nonmoving_old_weak_ptr_list; w != NULL; w = w->link) {
+        if (w == weak) return true;
+    }
+    for (StgWeak *w = nonmoving_weak_ptr_list; w != NULL; w = w->link) {
+        if (w == weak) return true;
+    }
+    return false;
+}
+
 // Non-moving heap variant of `tidyWeakList`
 bool nonmovingTidyWeaks (struct MarkQueue_ *queue)
 {
@@ -1947,7 +1976,7 @@ bool nonmovingTidyWeaks (struct MarkQueue_ *queue)
             *last_w = w->link;
             next_w = w->link;
 
-            // and put it on the weak ptr list
+            // and put it on nonmoving_weak_ptr_list
             w->link = nonmoving_weak_ptr_list;
             nonmoving_weak_ptr_list = w;
         } else {
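
The weak-pointer changes in this file work together: the unconditional copy of nonmoving_old_weak_ptr_list into nonmoving_weak_ptr_list is gone, nonmovingTidyWeaks moves a weak onto nonmoving_weak_ptr_list only once its key is found reachable, and the new isNonmovingWeak() is a debug-only linear check that a WEAK closure reached by the mark really is on one of those lists. A stand-alone model follows; struct weak, key_reachable, tidy_weaks and is_known_weak are invented for illustration, not the RTS representation.

    #include <stdbool.h>
    #include <stddef.h>

    struct weak { struct weak *link; bool key_reachable; };

    static struct weak *old_weak_list;   /* snapshot taken at the start of mark */
    static struct weak *live_weak_list;  /* weaks whose keys are known reachable */

    bool tidy_weaks(void)
    {
        bool progress = false;
        struct weak **last_w = &old_weak_list;
        for (struct weak *w = old_weak_list, *next; w != NULL; w = next) {
            next = w->link;
            if (w->key_reachable) {
                *last_w = next;              /* unlink from the old list ... */
                w->link = live_weak_list;    /* ... and push onto the live list */
                live_weak_list = w;
                progress = true;
            } else {
                last_w = &w->link;           /* keep it on the old list for now */
            }
        }
        return progress;                     /* caller loops until a fixed point */
    }

    /* Linear membership check in the spirit of the new isNonmovingWeak(). */
    bool is_known_weak(const struct weak *weak)
    {
        for (const struct weak *w = old_weak_list; w; w = w->link)
            if (w == weak) return true;
        for (const struct weak *w = live_weak_list; w; w = w->link)
            if (w == weak) return true;
        return false;
    }

Once tidying reaches a fixed point, anything still on the old list has a dead key, which is what the replacement comment in NonMoving.c above states.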


=====================================
rts/sm/NonMovingMark.h
=====================================
@@ -9,10 +9,11 @@
 #pragma once
 
 #include "Task.h"
-#include "NonMoving.h"
 
 #include "BeginPrivate.h"
 
+struct NonMovingHeap;
+
 enum EntryType {
     NULL_ENTRY = 0,
     MARK_CLOSURE = 1,
@@ -63,7 +64,7 @@ INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent)
 {
     uintptr_t tag = (uintptr_t) ent->null_entry.p & TAG_MASK;
     ASSERT(tag <= MARK_ARRAY);
-    return tag;
+    return (enum EntryType) tag;
 }
 
 typedef struct {
@@ -155,7 +156,7 @@ void markQueueAddRoot(MarkQueue* q, StgClosure** root);
 
 void initMarkQueue(MarkQueue *queue);
 void freeMarkQueue(MarkQueue *queue);
-void nonmovingMark(struct MarkQueue_ *restrict queue);
+void nonmovingMark(struct MarkQueue_ *__restrict__ queue);
 
 void nonmovingMarkWeakPtrList(struct MarkQueue_ *queue);
 bool nonmovingTidyWeaks(struct MarkQueue_ *queue);
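
The small header changes here read like C++-compatibility fixes, presumably because this header is now also included from C++ code: C silently converts the integer tag back to enum EntryType, but C++ rejects that without an explicit cast; likewise restrict is a keyword only in C, while __restrict__ is a GNU extension accepted by GCC and Clang in both languages; and the forward declaration of struct NonMovingHeap avoids pulling NonMoving.h into every includer. A minimal illustration; entry_type and entry_type_of are invented names.

    /* Minimal illustration (not RTS code) of why the cast is needed once a
     * header is shared with C++: the implicit uintptr_t -> enum conversion
     * that C accepts is an error under C++. */
    #include <stdint.h>

    enum entry_type { ILLUSTRATIVE_NULL_ENTRY = 0, ILLUSTRATIVE_MARK_CLOSURE = 1 };

    static inline enum entry_type entry_type_of(uintptr_t tagged)
    {
    #ifdef __cplusplus
        return (enum entry_type)(tagged & 1);   /* explicit cast required by C++ */
    #else
        return tagged & 1;                      /* C converts implicitly */
    #endif
    }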


=====================================
rts/sm/NonMovingShortcut.c
=====================================
@@ -10,6 +10,7 @@
 #include "Rts.h"
 #include "GC.h"
 #include "SMPClosureOps.h"
+#include "NonMoving.h"
 #include "NonMovingMark.h"
 #include "NonMovingShortcut.h"
 #include "Printer.h"


=====================================
rts/sm/Sanity.c
=====================================
@@ -639,6 +639,7 @@ void checkNonmovingHeap (const struct NonmovingHeap *heap)
     for (unsigned int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
         const struct NonmovingAllocator *alloc = heap->allocators[i];
         checkNonmovingSegments(alloc->filled);
+        checkNonmovingSegments(alloc->saved_filled);
         checkNonmovingSegments(alloc->active);
         for (unsigned int cap=0; cap < getNumCapabilities(); cap++) {
             checkNonmovingSegments(alloc->current[cap]);
@@ -1071,6 +1072,7 @@ findMemoryLeak (void)
         for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
             struct NonmovingAllocator *alloc = nonmovingHeap.allocators[i];
             markNonMovingSegments(alloc->filled);
+            markNonMovingSegments(alloc->saved_filled);
             markNonMovingSegments(alloc->active);
             for (j = 0; j < getNumCapabilities(); j++) {
                 markNonMovingSegments(alloc->current[j]);


=====================================
rts/sm/Sanity.h
=====================================
@@ -19,6 +19,8 @@
 # define MAX_SLOTS      100000
 # endif
 
+struct NonmovingHeap;
+
 /* debugging routines */
 void checkSanity        ( bool after_gc, bool major_gc );
 void checkNurserySanity ( nursery *nursery );


=====================================
rts/sm/Scav.c
=====================================
@@ -1730,7 +1730,7 @@ scavenge_capability_mut_lists (Capability *cap)
     if (RtsFlags.GcFlags.useNonmoving && major_gc) {
         uint32_t g = oldest_gen->no;
         scavenge_mutable_list(cap->saved_mut_lists[g], oldest_gen);
-        freeChain_sync(cap->saved_mut_lists[g]);
+        freeChain_lock(cap->saved_mut_lists[g]);
         cap->saved_mut_lists[g] = NULL;
         return;
     }
@@ -1743,7 +1743,7 @@ scavenge_capability_mut_lists (Capability *cap)
      */
     for (uint32_t g = RtsFlags.GcFlags.generations-1; g > N; g--) {
         scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
-        freeChain_sync(cap->saved_mut_lists[g]);
+        freeChain_lock(cap->saved_mut_lists[g]);
         cap->saved_mut_lists[g] = NULL;
     }
 }


=====================================
rts/sm/Storage.c
=====================================
@@ -193,14 +193,13 @@ initStorage (void)
   initMutex(&sm_mutex);
 #endif
 
-  ACQUIRE_SM_LOCK;
-
   /* allocate generation info array */
   generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
                                              * sizeof(struct generation_),
                                              "initStorage: gens");
 
   /* Initialise all generations */
+  ACQUIRE_SM_LOCK;
   for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
       initGeneration(&generations[g], g);
   }
@@ -214,16 +213,11 @@ initStorage (void)
       generations[g].to = &generations[g+1];
   }
   oldest_gen->to = oldest_gen;
+  RELEASE_SM_LOCK;
 
   // Nonmoving heap uses oldest_gen so initialize it after initializing oldest_gen
   nonmovingInit();
 
-#if defined(THREADED_RTS)
-  // nonmovingAddCapabilities allocates segments, which requires taking the gc
-  // sync lock, so initialize it before nonmovingAddCapabilities
-  initSpinLock(&gc_alloc_block_sync);
-#endif
-
   if (RtsFlags.GcFlags.useNonmoving)
       nonmovingAddCapabilities(getNumCapabilities());
 
@@ -261,8 +255,6 @@ initStorage (void)
 
   IF_DEBUG(gc, statDescribeGens());
 
-  RELEASE_SM_LOCK;
-
   traceInitEvent(traceHeapInfo);
 
 }
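
The reshuffling in initStorage narrows the storage-manager lock to the generation setup and drops it before nonmovingInit and nonmovingAddCapabilities run, which fits the pattern elsewhere in this series: callees such as nonmovingAddCapabilities and allocNursery now take the lock themselves, so, assuming sm_mutex is an ordinary non-recursive mutex, a caller that still held it across those calls would self-deadlock. A stand-alone sketch of that hazard; lock, callee and caller_ok are invented names.

    /* Sketch of the hazard (invented names), assuming a non-recursive mutex. */
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void callee(void)                /* e.g. an "AddCapabilities"-style helper */
    {
        pthread_mutex_lock(&lock);          /* blocks forever if the caller holds it */
        /* ... allocate under the lock ... */
        pthread_mutex_unlock(&lock);
    }

    void caller_ok(void)
    {
        pthread_mutex_lock(&lock);
        /* ... only the setup that really needs the lock ... */
        pthread_mutex_unlock(&lock);        /* drop it before calling the helper */
        callee();
    }
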
@@ -314,17 +306,20 @@ void storageAddCapabilities (uint32_t from, uint32_t to)
     assignNurseriesToCapabilities(from,to);
 
     // allocate a block for each mut list
+    ACQUIRE_SM_LOCK;
     for (n = from; n < to; n++) {
         for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
             getCapability(n)->mut_lists[g] =
                 allocBlockOnNode(capNoToNumaNode(n));
         }
     }
+    RELEASE_SM_LOCK;
 
     // Initialize NonmovingAllocators and UpdRemSets
     if (RtsFlags.GcFlags.useNonmoving) {
         nonmovingAddCapabilities(to);
-        for (i = 0; i < to; ++i) {
+        for (i = from; i < to; i++) {
+            getCapability(i)->upd_rem_set.queue.blocks = NULL;
             nonmovingInitUpdRemSet(&getCapability(i)->upd_rem_set);
         }
     }
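
This hunk pairs with the ASSERT(queue->blocks == NULL) added to init_mark_queue_ in NonMovingMark.c: the update remembered set's block pointer is cleared before initialisation, and only the newly added capabilities (from..to) are initialised, so already-running capabilities do not have their remembered sets re-initialised underneath them. A stand-alone sketch of the defensive-initialisation idea; struct queue, init_queue and add_capabilities are invented names.

    /* Stand-alone sketch (invented names) of the defensive initialisation above. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdlib.h>

    struct queue { void *blocks; int head; };

    static void *alloc_group(void)          /* stand-in for the real block allocator */
    {
        void *p = malloc(64);
        if (p == NULL) abort();
        return p;
    }

    void init_queue(struct queue *q)
    {
        assert(q->blocks == NULL);          /* catch re-initialising a live queue */
        q->blocks = alloc_group();
        q->head = 0;
    }

    void add_capabilities(struct queue *queues, unsigned from, unsigned to)
    {
        /* Touch only the newly added entries, zeroing each before init so the
         * assertion above holds. */
        for (unsigned i = from; i < to; i++) {
            queues[i].blocks = NULL;
            init_queue(&queues[i]);
        }
    }
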
@@ -564,9 +559,7 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     // Allocate the blackhole indirection closure
     if (RtsFlags.GcFlags.useNonmoving) {
         // See Note [Static objects under the nonmoving collector].
-        ACQUIRE_SM_LOCK;
         bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh));
-        RELEASE_SM_LOCK;
         recordMutableCap((StgClosure*)bh,
                          regTableToCapability(reg), oldest_gen->no);
     } else {
@@ -724,6 +717,7 @@ allocNursery (uint32_t node, bdescr *tail, W_ blocks)
     // automatic prefetching works across nursery blocks.  This is a
     // tiny optimisation (~0.5%), but it's free.
 
+    ACQUIRE_SM_LOCK;
     while (blocks > 0) {
         n = stg_min(BLOCKS_PER_MBLOCK, blocks);
         // allocLargeChunk will prefer large chunks, but will pick up
@@ -759,6 +753,7 @@ allocNursery (uint32_t node, bdescr *tail, W_ blocks)
 
         tail = &bd[0];
     }
+    RELEASE_SM_LOCK;
 
     return &bd[0];
 }
@@ -878,7 +873,7 @@ resizeNurseriesEach (W_ blocks)
                 next_bd = bd->link;
                 next_bd->u.back = NULL;
                 nursery_blocks -= bd->blocks; // might be a large block
-                freeGroup(bd);
+                freeGroup_lock(bd);
                 bd = next_bd;
             }
             nursery->blocks = bd;
@@ -1299,9 +1294,7 @@ allocatePinned (Capability *cap, W_ n /*words*/, W_ alignment /*bytes*/, W_ alig
         if (bd == NULL) {
             // The pinned block list is empty: allocate a fresh block (we can't fail
             // here).
-            ACQUIRE_SM_LOCK;
             bd = allocNursery(cap->node, NULL, PINNED_EMPTY_SIZE);
-            RELEASE_SM_LOCK;
         }
 
         // Bump up the nursery pointer to avoid the pathological situation



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/1fdffe157170bb42741c44bb03d44f61ed5f8822...f095454b5bcf0d604e62c784931f80b5d7f0fb88
