[Git][ghc/ghc][wip/T22264] 6 commits: rts: Drop racy assertion
Ben Gamari (@bgamari)
gitlab at gitlab.haskell.org
Tue Dec 6 12:29:19 UTC 2022
Ben Gamari pushed to branch wip/T22264 at Glasgow Haskell Compiler / GHC
Commits:
b57fd39a by Ben Gamari at 2022-12-06T07:29:00-05:00
rts: Drop racy assertion
0e274c39bf836d5bb846f5fa08649c75f85326ac added an assertion in
`dirty_MUT_VAR` checking that the MUT_VAR being dirtied was clean.
However, this isn't necessarily the case since another thread may have
raced us to dirty the object.
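
A standalone sketch of the race in question (illustrative only, not RTS code; all names here are invented for the example). Two threads can both observe the object as clean before either one records it as dirty, so an assertion that the object "must still be clean" inside the dirty path can fire spuriously:

    // Not RTS code: a minimal model of two mutator threads racing to dirty
    // the same object. Both may load State::Clean before either store lands.
    #include <atomic>
    #include <cassert>
    #include <thread>

    enum class State { Clean, Dirty };
    static std::atomic<State> header{State::Clean};

    static void dirty_object() {
        // An assert(header.load() == State::Clean) at this point could fail:
        // the other thread may have stored Dirty between our caller's check
        // and this store. Dirtying is idempotent, so we just store.
        header.store(State::Dirty, std::memory_order_relaxed);
    }

    int main() {
        std::thread t1(dirty_object);
        std::thread t2(dirty_object);
        t1.join();
        t2.join();
        assert(header.load() == State::Dirty);
        return 0;
    }
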
- - - - -
4fc3d778 by Ben Gamari at 2022-12-06T07:29:00-05:00
rts: Drop SM spinlock
- - - - -
8334376c by Ben Gamari at 2022-12-06T07:29:00-05:00
rts: C++ typing issues
Make the RTS compilable with a C++ compiler by inserting necessary
casts.
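
For context, a minimal standalone sketch (not part of the patch) of why such casts are needed: C permits implicit conversion from an integer type to an enum, whereas C++ rejects it, which is what the getRecentActivity() hunk below works around. The enum values mirror the RTS's RecentActivity, and load_word stands in for RELAXED_LOAD_ALWAYS; both are assumptions made for this example.

    #include <stdint.h>

    // Shaped after the RTS's RecentActivity enum; values are illustrative.
    enum RecentActivity { ACTIVITY_YES, ACTIVITY_MAYBE_NO, ACTIVITY_INACTIVE, ACTIVITY_DONE_GC };

    // Stand-in for RELAXED_LOAD_ALWAYS, which yields an integer-typed value.
    static uint32_t load_word(const uint32_t *p) { return *p; }

    static enum RecentActivity get_recent_activity(const uint32_t *slot)
    {
        // return load_word(slot);                    // accepted by C, rejected by a C++ compiler
        return (enum RecentActivity) load_word(slot); // explicit cast compiles either way
    }

    int main(void)
    {
        uint32_t slot = ACTIVITY_DONE_GC;
        return get_recent_activity(&slot) == ACTIVITY_DONE_GC ? 0 : 1;
    }
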
- - - - -
d4209888 by Ben Gamari at 2022-12-06T07:29:00-05:00
CheckGC
- - - - -
9828dd44 by Ben Gamari at 2022-12-06T07:29:00-05:00
setNumCapabilities
- - - - -
14f1045d by Ben Gamari at 2022-12-06T07:29:00-05:00
nonmoving: Disable shortcutting
- - - - -
26 changed files:
- libraries/base/GHC/Conc/Sync.hs
- rts/Capability.h
- rts/CheckUnload.c
- rts/Schedule.c
- rts/Schedule.h
- rts/Sparks.h
- rts/Stats.c
- rts/Trace.h
- rts/include/rts/Threads.h
- rts/include/rts/storage/MBlock.h
- rts/rts.cabal.in
- rts/sm/BlockAlloc.c
- + rts/sm/CheckGc.cpp
- rts/sm/GC.c
- rts/sm/GC.h
- rts/sm/GCUtils.c
- rts/sm/GCUtils.h
- rts/sm/HeapAlloc.h
- rts/sm/MarkStack.h
- rts/sm/NonMoving.c
- rts/sm/NonMovingMark.c
- rts/sm/NonMovingMark.h
- rts/sm/NonMovingShortcut.c
- rts/sm/Sanity.h
- rts/sm/Scav.c
- rts/sm/Storage.c
Changes:
=====================================
libraries/base/GHC/Conc/Sync.hs
=====================================
@@ -380,10 +380,15 @@ to avoid contention with other processes in the machine.
setNumCapabilities :: Int -> IO ()
setNumCapabilities i
| i <= 0 = failIO $ "setNumCapabilities: Capability count ("++show i++") must be positive"
- | otherwise = c_setNumCapabilities (fromIntegral i)
+ | otherwise = do
+ ret <- c_setNumCapabilities (fromIntegral i)
+ case ret of
+ 0 -> return ()
+ 1 -> yield >> setNumCapabilities i
+ _ -> failIO $ "setNumCapabilities: Unknown result"
foreign import ccall safe "setNumCapabilities"
- c_setNumCapabilities :: CUInt -> IO ()
+ c_setNumCapabilities :: CUInt -> IO CInt
-- | Returns the number of CPUs that the machine has
--
=====================================
rts/Capability.h
=====================================
@@ -20,10 +20,10 @@
#pragma once
-#include "sm/GC.h" // for evac_fn
#include "Task.h"
#include "Sparks.h"
-#include "sm/NonMovingMark.h" // for MarkQueue
+#include "sm/GC.h" // for evac_fn
+#include "sm/NonMovingMark.h" // for UpdRemSet
#include "IOManager.h" // for CapIOManager
#include "BeginPrivate.h"
=====================================
rts/CheckUnload.c
=====================================
@@ -15,6 +15,7 @@
#include "Hash.h"
#include "LinkerInternals.h"
#include "CheckUnload.h"
+#include "sm/HeapAlloc.h"
#include "sm/Storage.h"
#include "sm/GCThread.h"
#include "sm/HeapUtils.h"
=====================================
rts/Schedule.c
=====================================
@@ -2225,9 +2225,12 @@ forkProcess(HsStablePtr *entry
* Finally we release the Capabilities we are holding, and start
* worker Tasks on the new Capabilities we created.
*
+ * One wrinkle here is that we must also ensure that we don't change the
+ * capability count while the nonmoving mark thread is active.
+ *
* ------------------------------------------------------------------------- */
-void
+int
setNumCapabilities (uint32_t new_n_capabilities USED_IF_THREADS)
{
#if !defined(THREADED_RTS)
@@ -2247,11 +2250,15 @@ setNumCapabilities (uint32_t new_n_capabilities USED_IF_THREADS)
Capability *old_capabilities = NULL;
uint32_t old_n_capabilities = n_capabilities;
+ if (RELAXED_LOAD(&concurrent_coll_running)) {
+ return 1;
+ }
+
if (new_n_capabilities == enabled_capabilities) {
- return;
+ return 0;
} else if (new_n_capabilities <= 0) {
errorBelch("setNumCapabilities: Capability count must be positive");
- return;
+ return 1;
}
@@ -2353,6 +2360,7 @@ setNumCapabilities (uint32_t new_n_capabilities USED_IF_THREADS)
rts_unlock(cap);
+ return 0;
#endif // THREADED_RTS
}
=====================================
rts/Schedule.h
=====================================
@@ -131,7 +131,7 @@ setRecentActivity(enum RecentActivity new_value)
INLINE_HEADER enum RecentActivity
getRecentActivity(void)
{
- return RELAXED_LOAD_ALWAYS(&recent_activity);
+ return (enum RecentActivity) RELAXED_LOAD_ALWAYS(&recent_activity);
}
extern bool heap_overflow;
=====================================
rts/Sparks.h
=====================================
@@ -8,6 +8,7 @@
#pragma once
+#include "sm/GC.h" // for evac_fn
#include "WSDeque.h"
#include "BeginPrivate.h"
=====================================
rts/Stats.c
=====================================
@@ -963,11 +963,6 @@ static void report_summary(const RTSSummaryStats* sum)
, col_width[1], "SpinLock"
, col_width[2], "Spins"
, col_width[3], "Yields");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "gc_alloc_block_sync"
- , col_width[2], gc_alloc_block_sync.spin
- , col_width[3], gc_alloc_block_sync.yield);
statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
, col_width[0], ""
, col_width[1], "whitehole_gc"
@@ -1142,10 +1137,6 @@ static void report_machine_readable (const RTSSummaryStats * sum)
// next, internal counters
#if defined(PROF_SPIN)
- MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
- MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
- gc_alloc_block_sync.yield);
- MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
MR_STAT("waitForGcThreads_yield", FMT_Word64,
waitForGcThreads_yield);
@@ -1572,9 +1563,6 @@ SpinLock:
Not all of these are actual SpinLocks, see the details below.
Actual SpinLocks:
-* gc_alloc_block:
- This SpinLock protects the block allocator and free list manager. See
- BlockAlloc.c.
* gen[g].sync:
These SpinLocks, one per generation, protect the generations[g] data
structure during garbage collection.
=====================================
rts/Trace.h
=====================================
@@ -33,7 +33,9 @@ void tracingAddCapapilities (uint32_t from, uint32_t to);
#endif /* TRACING */
typedef StgWord32 CapsetID;
+#if !defined(__cplusplus)
typedef StgWord16 CapsetType;
+#endif
enum CapsetType { CapsetTypeCustom = CAPSET_TYPE_CUSTOM,
CapsetTypeOsProcess = CAPSET_TYPE_OSPROCESS,
CapsetTypeClockdomain = CAPSET_TYPE_CLOCKDOMAIN };
=====================================
rts/include/rts/Threads.h
=====================================
@@ -85,4 +85,4 @@ extern Capability MainCapability;
// Change the number of capabilities (only supports increasing the
// current value at the moment).
//
-extern void setNumCapabilities (uint32_t new_);
+extern int setNumCapabilities (uint32_t new_);
=====================================
rts/include/rts/storage/MBlock.h
=====================================
@@ -26,7 +26,3 @@ extern void freeAllMBlocks(void);
extern void *getFirstMBlock(void **state);
extern void *getNextMBlock(void **state, void *mblock);
-#if defined(THREADED_RTS)
-// needed for HEAP_ALLOCED below
-extern SpinLock gc_alloc_block_sync;
-#endif
=====================================
rts/rts.cabal.in
=====================================
@@ -619,6 +619,7 @@ library
linker/elf_tlsgd.c
linker/elf_util.c
sm/BlockAlloc.c
+ sm/CheckGc.cpp
sm/CNF.c
sm/Compact.c
sm/Evac.c
=====================================
rts/sm/BlockAlloc.c
=====================================
@@ -438,6 +438,7 @@ alloc_mega_group (uint32_t node, StgWord mblocks)
bdescr *best, *bd;
StgWord n;
+ ASSERT_SM_LOCK();
n = MBLOCK_GROUP_BLOCKS(mblocks);
if(defer_mblock_frees)
@@ -496,6 +497,7 @@ allocGroupOnNode (uint32_t node, W_ n)
bdescr *bd, *rem;
StgWord ln;
+ ASSERT_SM_LOCK();
if (n == 0) barf("allocGroup: requested zero blocks");
if (n >= BLOCKS_PER_MBLOCK)
@@ -709,6 +711,7 @@ bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
{
bdescr *bd;
StgWord ln, lnmax;
+ ASSERT_SM_LOCK();
if (min >= BLOCKS_PER_MBLOCK) {
return allocGroupOnNode(node,max);
@@ -933,8 +936,7 @@ freeGroup(bdescr *p)
StgWord ln;
uint32_t node;
- // not true in multithreaded GC:
- // ASSERT_SM_LOCK();
+ ASSERT_SM_LOCK();
ASSERT(RELAXED_LOAD(&p->free) != (P_)-1);
=====================================
rts/sm/CheckGc.cpp
=====================================
@@ -0,0 +1,973 @@
+extern "C" {
+#include "Rts.h"
+#include "StableName.h" /* for FOR_EACH_STABLE_NAME */
+#include "StablePtr.h" /* for markStablePtrTable */
+#include "Schedule.h" /* for markScheduler */
+#include "Capability.h"
+#include "HeapAlloc.h"
+#include "STM.h"
+}
+
+#include <iostream>
+#include <fstream>
+#include <set>
+#include <vector>
+#include <queue>
+#include <unordered_set>
+
+class TaggedClosurePtr {
+ StgClosure *ptr;
+public:
+ TaggedClosurePtr(StgClosure* ptr) : ptr(ptr) {}
+ TaggedClosurePtr(StgClosure* ptr, uint8_t tag) : TaggedClosurePtr(TAG_CLOSURE(tag, ptr)) {}
+
+ StgClosure *get_tagged() const {
+ return ptr;
+ }
+ StgClosure *untag() const {
+ return UNTAG_CLOSURE(ptr);
+ }
+ uint8_t get_tag() const {
+ return (StgWord) ptr & TAG_MASK;
+ }
+
+ //inline StgClosure& operator->() { return *untag(); }
+
+ friend inline bool operator==(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) {
+ return lhs.ptr == rhs.ptr;
+ }
+ friend inline bool operator!=(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return !(lhs == rhs); }
+ friend inline bool operator< (const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) {
+ return lhs.ptr < rhs.ptr;
+ }
+ friend inline bool operator> (const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return rhs < lhs; }
+ friend inline bool operator<=(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return !(lhs > rhs); }
+ friend inline bool operator>=(const TaggedClosurePtr& lhs, const TaggedClosurePtr& rhs) { return !(lhs < rhs); }
+};
+
+template<>
+struct std::hash<TaggedClosurePtr> {
+ std::size_t operator()(TaggedClosurePtr const& p) const noexcept {
+ return std::hash<StgClosure*>{}(p.get_tagged());
+ }
+};
+
+class HeapVisitor {
+public:
+ // Visit an SRT
+ virtual void visit_srt(StgClosure* c);
+
+ // Visit a normal closure
+ virtual void visit_closure(TaggedClosurePtr c);
+
+ virtual void visit_thunk(StgThunk *thunk, size_t n_ptrs);
+ virtual void visit_fun(StgClosure *constr, size_t n_ptrs);
+ virtual void visit_constr(StgClosure *constr, size_t n_ptrs);
+ virtual void visit_array(StgMutArrPtrs *arr);
+ virtual void visit_small_array(StgSmallMutArrPtrs *arr);
+ virtual void visit_bytearray(StgArrBytes* arr);
+
+ virtual void visit_stack(StgPtr sp, StgPtr end);
+ virtual void visit_tso(StgTSO* tso);
+ virtual void visit_weak(StgWeak* w);
+ virtual void visit_mvar(StgMVar* mvar);
+ virtual void visit_tvar(StgTVar* tvar);
+ virtual void visit_trec_header(StgTRecHeader *trec);
+ virtual void visit_trec_chunk(StgTRecChunk* tc);
+ virtual void visit_continuation(StgContinuation* tc);
+
+ virtual void visit_small_bitmap(StgClosure *const *payload, StgWord bitmap, StgWord size);
+ virtual void visit_large_bitmap(StgClosure *const *payload, const StgLargeBitmap *bitmap, StgWord size);
+ void visit_pap_payload(StgClosure *fun, StgClosure **payload, StgWord n_args);
+
+ virtual void visit_invalid(StgClosure *const c);
+};
+
+void HeapVisitor::visit_thunk(StgThunk *thunk, size_t n_ptrs)
+{
+ const StgInfoTable *info = get_itbl((StgClosure *) thunk);
+ const StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(info);
+ if (thunk_info->i.srt) {
+ StgClosure *srt = (StgClosure*) GET_SRT(thunk_info);
+ visit_srt(srt);
+ };
+ for (size_t i=0; i < n_ptrs; i++) {
+ visit_closure(thunk->payload[i]);
+ }
+}
+
+void HeapVisitor::visit_fun(StgClosure *fun, size_t n_ptrs)
+{
+ const StgInfoTable *info = get_itbl(fun);
+ const StgFunInfoTable *fun_info = itbl_to_fun_itbl(info);
+ if (fun_info->i.srt) {
+ StgClosure *srt = (StgClosure*) GET_SRT(fun_info);
+ visit_srt(srt);
+ };
+ for (size_t i=0; i < n_ptrs; i++) {
+ visit_closure(fun->payload[i]);
+ }
+}
+
+void HeapVisitor::visit_constr(StgClosure *constr, size_t n_ptrs)
+{
+ for (size_t i=0; i < n_ptrs; i++) {
+ visit_closure(constr->payload[i]);
+ }
+}
+
+void HeapVisitor::visit_srt(StgClosure* c)
+{
+ visit_closure(c);
+}
+
+void HeapVisitor::visit_invalid(StgClosure *const _c)
+{
+ abort();
+}
+
+void HeapVisitor::visit_weak(StgWeak* w)
+{
+ visit_closure(w->key);
+ visit_closure(w->value);
+ visit_closure(w->finalizer);
+ visit_closure(w->cfinalizers);
+}
+
+void HeapVisitor::visit_mvar(StgMVar* mvar)
+{
+ visit_closure((StgClosure*) mvar->head);
+ visit_closure((StgClosure*) mvar->tail);
+ visit_closure(mvar->value);
+}
+
+void HeapVisitor::visit_small_array(StgSmallMutArrPtrs *arr)
+{
+ for (StgWord i=0; i < arr->ptrs; i++) {
+ visit_closure(arr->payload[i]);
+ }
+}
+
+void HeapVisitor::visit_array(StgMutArrPtrs *arr)
+{
+ for (StgWord i=0; i < arr->ptrs; i++) {
+ visit_closure(arr->payload[i]);
+ }
+}
+
+void HeapVisitor::visit_bytearray(StgArrBytes* _arr) { }
+
+void HeapVisitor::visit_tso(StgTSO *tso)
+{
+ if (tso->bound != NULL) {
+
+ visit_closure((StgClosure*) tso->bound->tso);
+ }
+ if (tso->label != NULL) {
+ visit_closure({(StgClosure*) tso->label});
+ }
+ visit_closure((StgClosure*) tso->blocked_exceptions);
+ visit_closure((StgClosure*) tso->bq);
+ visit_closure((StgClosure*) tso->stackobj);
+ visit_closure((StgClosure*) tso->_link);
+ visit_trec_header(tso->trec);
+
+ switch (tso->why_blocked) {
+ case BlockedOnMVar:
+ case BlockedOnMVarRead:
+ case BlockedOnBlackHole:
+ case BlockedOnMsgThrowTo:
+ case NotBlocked:
+ visit_closure(tso->block_info.closure);
+ break;
+ default:
+ break;
+ }
+}
+
+void HeapVisitor::visit_continuation(StgContinuation *cont)
+{
+ visit_stack(cont->stack, cont->stack + cont->stack_size);
+}
+
+void HeapVisitor::visit_tvar(StgTVar *tvar)
+{
+ visit_closure(tvar->current_value);
+ visit_closure((StgClosure*) tvar->first_watch_queue_entry);
+}
+
+void HeapVisitor::visit_trec_header(StgTRecHeader *trec)
+{
+ if (trec == NO_TREC) {
+ return;
+ }
+ visit_trec_chunk(trec->current_chunk);
+ visit_closure((StgClosure*) trec->enclosing_trec);
+}
+
+void HeapVisitor::visit_trec_chunk(StgTRecChunk *tc)
+{
+ if (tc->prev_chunk != END_STM_CHUNK_LIST) {
+ visit_closure((StgClosure*) tc->prev_chunk);
+ }
+
+ for (uint32_t i = 0; i < tc->next_entry_idx; i++) {
+ TRecEntry *e = &tc->entries[i];
+ visit_closure((StgClosure*)e->tvar);
+ visit_closure(e->expected_value);
+ visit_closure(e->new_value);
+ }
+}
+
+void HeapVisitor::visit_stack(StgPtr p, StgPtr stack_end)
+{
+ while (p < stack_end) {
+ const StgRetInfoTable* info = get_ret_itbl((StgClosure *) p);
+
+ auto add_srt_ptrs = [&] () {
+ if (info->i.srt) {
+ StgClosure *srt = (StgClosure*)GET_SRT(info);
+ visit_srt(srt);
+ }
+ };
+
+ switch (info->i.type) {
+
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame *frame = (StgUpdateFrame *)p;
+ visit_closure(frame->updatee);
+ p += sizeofW(StgUpdateFrame);
+ continue;
+ }
+
+ case CATCH_STM_FRAME:
+ case CATCH_RETRY_FRAME:
+ case ATOMICALLY_FRAME:
+ case UNDERFLOW_FRAME:
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case RET_SMALL:
+ {
+ StgWord bitmap = BITMAP_BITS(info->i.layout.bitmap);
+ StgWord size = BITMAP_SIZE(info->i.layout.bitmap);
+ // NOTE: the payload starts immediately after the info-ptr, we
+ // don't have an StgHeader in the same sense as a heap closure.
+ p++;
+ visit_small_bitmap((StgClosure**) p, bitmap, size);
+ p += size;
+ add_srt_ptrs();
+ continue;
+ }
+
+ case RET_BCO:
+ {
+ p++;
+ StgBCO *bco = (StgBCO *)*p;
+ visit_closure((StgClosure *) bco);
+ p++;
+ StgWord size = BCO_BITMAP_SIZE(bco);
+ visit_large_bitmap((StgClosure**) p, BCO_BITMAP(bco), size);
+ p += size;
+ continue;
+ }
+
+ case RET_BIG:
+ {
+ StgWord size = GET_LARGE_BITMAP(&info->i)->size;
+ p++;
+ visit_large_bitmap((StgClosure**) p, GET_LARGE_BITMAP(&info->i), size);
+ p += size;
+ // and don't forget to follow the SRT
+ add_srt_ptrs();
+ break;
+ }
+
+ case RET_FUN:
+ {
+ StgRetFun *ret_fun = (StgRetFun *)p;
+ visit_closure(ret_fun->fun);
+
+ const StgFunInfoTable *fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun));
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ {
+ StgWord bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ StgWord size = BITMAP_SIZE(fun_info->f.b.bitmap);
+ visit_small_bitmap(ret_fun->payload, bitmap, size);
+ p = (StgPtr) ((StgClosure **) &ret_fun->payload + size);
+ break;
+ }
+ case ARG_GEN_BIG:
+ {
+ StgWord size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+ visit_large_bitmap(ret_fun->payload, GET_FUN_LARGE_BITMAP(fun_info), size);
+ p = (StgPtr) ((StgClosure **) &ret_fun->payload + size);
+ break;
+ }
+ default:
+ {
+ StgWord bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ StgWord size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+ visit_small_bitmap(ret_fun->payload, bitmap, size);
+ p = (StgPtr) ((StgClosure **) &ret_fun->payload + size);
+ break;
+ }
+ }
+ add_srt_ptrs();
+ break;
+ }
+ default:
+ abort();
+ }
+ }
+}
+
+void HeapVisitor::visit_small_bitmap(
+ StgClosure *const *payload,
+ StgWord bitmap,
+ StgWord size)
+{
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ visit_closure(*payload);
+ }
+ payload++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+}
+
+void HeapVisitor::visit_large_bitmap(
+ StgClosure *const * payload,
+ const StgLargeBitmap *large_bitmap,
+ StgWord size)
+{
+ // Bitmap may have more bits than `size` when scavenging PAP payloads. See
+ // comments around StgPAP.
+ ASSERT(large_bitmap->size >= size);
+
+ uint32_t b = 0;
+ for (uint32_t i = 0; i < size; b++) {
+ StgWord bitmap = large_bitmap->bitmap[b];
+ uint32_t j = stg_min(size-i, BITS_IN(W_));
+ i += j;
+ for (; j > 0; j--, payload++) {
+ if ((bitmap & 1) == 0) {
+ visit_closure(*payload);
+ }
+ bitmap = bitmap >> 1;
+ }
+ }
+}
+
+void HeapVisitor::visit_pap_payload(
+ StgClosure *fun,
+ StgClosure **payload,
+ StgWord n_args)
+{
+ fun = UNTAG_CLOSURE(fun);
+ const StgFunInfoTable *fun_info = get_fun_itbl(fun);
+ ASSERT(fun_info->i.type != PAP);
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ visit_small_bitmap(payload, BITMAP_BITS(fun_info->f.b.bitmap), n_args);
+ break;
+ case ARG_GEN_BIG:
+ visit_large_bitmap(payload, GET_FUN_LARGE_BITMAP(fun_info), n_args);
+ break;
+ case ARG_BCO:
+ visit_large_bitmap(payload, BCO_BITMAP(fun), n_args);
+ break;
+ default:
+ {
+ StgWord bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ visit_small_bitmap(payload, bitmap, n_args);
+ }
+ }
+}
+
+void HeapVisitor::visit_closure(TaggedClosurePtr tagged)
+{
+ StgClosure *c = tagged.untag();
+ if (c->header.info == (StgInfoTable *) 0xaaaaaaaaaaaaaaaa || !LOOKS_LIKE_CLOSURE_PTR(c)) {
+ visit_invalid(c);
+ return;
+ }
+
+ const StgInfoTable *info = get_itbl(c);
+ auto generic_closure = [&] () {
+ for (StgClosure **p = &c->payload[0]; p < &c->payload[info->layout.payload.ptrs]; p++) {
+ visit_closure(*p);
+ }
+ };
+
+ switch (info->type) {
+
+ case MVAR_CLEAN:
+ case MVAR_DIRTY:
+ visit_mvar((StgMVar *) c);
+ break;
+ case TVAR:
+ visit_tvar((StgTVar *) c);
+ break;
+
+ case IND:
+ case IND_STATIC:
+ visit_closure(((StgInd *) c)->indirectee);
+ break;
+
+ case THUNK_0_1:
+ case THUNK_0_2:
+ visit_thunk((StgThunk*) c, 0);
+ break;
+ case THUNK_1_1:
+ case THUNK_1_0:
+ visit_thunk((StgThunk*) c, 1);
+ break;
+ case THUNK_2_0:
+ visit_thunk((StgThunk*) c, 2);
+ break;
+ case THUNK:
+ visit_thunk((StgThunk*) c, info->layout.payload.ptrs);
+ break;
+ case THUNK_STATIC:
+ visit_thunk((StgThunk*) c, 0);
+ break;
+
+ case FUN_1_0:
+ visit_fun(c, 1);
+ break;
+ case FUN_0_1:
+ case FUN_0_2:
+ visit_fun(c, 0);
+ break;
+ case FUN_1_1:
+ visit_fun(c, 1);
+ break;
+ case FUN_2_0:
+ visit_fun(c, 2);
+ break;
+ case FUN:
+ case FUN_STATIC:
+ visit_fun(c, info->layout.payload.ptrs);
+ break;
+
+ case CONSTR_0_1:
+ case CONSTR_0_2:
+ visit_constr(c, 0);
+ break;
+ case CONSTR_1_0:
+ case CONSTR_1_1:
+ visit_constr(c, 1);
+ break;
+ case CONSTR_2_0:
+ visit_constr(c, 2);
+ break;
+ case CONSTR:
+ case CONSTR_NOCAF:
+ visit_constr(c, info->layout.payload.ptrs);
+ break;
+
+ case PRIM:
+ generic_closure();
+ break;
+ case WEAK:
+ visit_weak((StgWeak*) c);
+ break;
+ case BCO:
+ {
+ StgBCO *bco = (StgBCO *)c;
+ visit_closure((StgClosure*) bco->instrs);
+ visit_closure((StgClosure*) bco->literals);
+ visit_closure((StgClosure*) bco->ptrs);
+ break;
+ }
+ case BLACKHOLE:
+ {
+ StgInd *ind = (StgInd*) c;
+ visit_closure(ind->indirectee);
+ break;
+ }
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ {
+ StgMutVar *mv = (StgMutVar*) c;
+ visit_closure(mv->var);
+ break;
+ }
+ case BLOCKING_QUEUE:
+ {
+ StgBlockingQueue *bq = (StgBlockingQueue *)c;
+ visit_closure((StgClosure*) bq->bh);
+ visit_closure((StgClosure*) bq->owner);
+ visit_closure((StgClosure*) bq->queue);
+ visit_closure((StgClosure*) bq->link);
+ break;
+ }
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)c;
+ visit_closure(s->selectee);
+ break;
+ }
+ case AP_STACK:
+ {
+ StgAP_STACK *ap = (StgAP_STACK *)c;
+ visit_closure(ap->fun);
+ visit_stack((StgPtr) ap->payload, (StgPtr) ap->payload + ap->size);
+ break;
+ }
+ case PAP:
+ {
+ StgPAP *pap = (StgPAP*) c;
+ visit_closure(pap->fun);
+ visit_pap_payload(pap->fun, (StgClosure**) pap->payload, pap->n_args);
+ break;
+ }
+ case AP:
+ {
+ StgAP *ap = (StgAP*) c;
+ visit_closure(ap->fun);
+ visit_pap_payload(ap->fun, (StgClosure**) ap->payload, ap->n_args);
+ break;
+ }
+ case ARR_WORDS:
+ visit_bytearray((StgArrBytes *) c);
+ break;
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN_DIRTY:
+ case MUT_ARR_PTRS_FROZEN_CLEAN:
+ visit_array((StgMutArrPtrs *) c);
+ break;
+ case SMALL_MUT_ARR_PTRS_CLEAN:
+ case SMALL_MUT_ARR_PTRS_DIRTY:
+ case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+ case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+ visit_small_array((StgSmallMutArrPtrs *) c);
+ break;
+ case TSO:
+ visit_tso((StgTSO *) c);
+ break;
+ case STACK:
+ {
+ StgStack *stack = (StgStack *) c;
+ visit_stack(stack->sp, stack->stack + stack->stack_size);
+ break;
+ }
+ case MUT_PRIM:
+ generic_closure();
+ break;
+ case TREC_CHUNK:
+ visit_trec_chunk((StgTRecChunk *) c);
+ break;
+ case CONTINUATION:
+ visit_continuation((StgContinuation *) c);
+ break;
+ default:
+ visit_invalid(c);
+ break;
+ }
+}
+
+class PredicatedHeapVisitor : HeapVisitor {
+ bool should_visit(StgClosure *);
+
+ virtual void visit_srt(StgClosure* c) {
+ if (should_visit(c)) { HeapVisitor::visit_srt(c); }
+ }
+
+ virtual void visit_closure(TaggedClosurePtr c) {
+ if (should_visit(c.untag())) { HeapVisitor::visit_closure(c); }
+ }
+};
+
+// Collect direct pointers
+struct CollectPointers : HeapVisitor {
+ std::set<TaggedClosurePtr> accum;
+ bool invalid;
+ CollectPointers() : accum(), invalid(false) {}
+ void visit_root(StgClosure *c) {
+ HeapVisitor::visit_closure(c);
+ }
+ void visit_closure(TaggedClosurePtr c) {
+ accum.insert(c);
+ }
+ void visit_invalid(StgClosure *const _c) { invalid = true; }
+};
+
+static std::set<TaggedClosurePtr> collect_pointers(StgClosure* c)
+{
+ CollectPointers v;
+ v.visit_root(c);
+ return v.accum;
+}
+
+
+
+struct Error {
+ StgClosure *closure;
+ std::string what;
+ Error(StgClosure *closure, std::string what) : closure(closure), what(what) {}
+};
+
+static std::ostream& operator<<(std::ostream& os, const Error& err) {
+ os << std::hex << "0x" << (StgWord) err.closure << ": " << err.what << "\n";
+ return os;
+}
+
+class CheckVisitor : HeapVisitor {
+ std::vector<Error> errors;
+ uint8_t tag;
+ void visit_constr(StgClosure* c) {
+ const StgInfoTable *info = get_itbl(c);
+ if (tag != 0) {
+ uint8_t constr_tag = info->srt; // zero-based
+ if (tag != std::min(TAG_MASK, constr_tag+1)) {
+ errors.push_back(Error(c, "invalid tag"));
+ }
+ }
+ }
+
+ void visit_closure(TaggedClosurePtr c) { }
+public:
+ const std::vector<Error>& get_errors() const { return errors; }
+
+ void check_closure(TaggedClosurePtr c) {
+ tag = c.get_tag();
+ HeapVisitor::visit_closure(c);
+ }
+};
+
+struct CheckGc {
+ std::queue<TaggedClosurePtr> queue;
+ std::unordered_set<TaggedClosurePtr> enqueued;
+
+ void enqueue(TaggedClosurePtr ptr) {
+ ASSERT(ptr != NULL);
+ if (!is_enqueued(ptr)) {
+ queue.push(ptr);
+ enqueued.insert(ptr);
+ }
+ }
+
+ bool finished() {
+ return queue.empty();
+ }
+
+ TaggedClosurePtr pop() {
+ TaggedClosurePtr p = queue.front();
+ queue.pop();
+ return p;
+ }
+
+ bool is_enqueued(TaggedClosurePtr ptr) {
+ return enqueued.find(ptr) != enqueued.end();
+ }
+};
+
+static void enqueue_root(void *user_data, StgClosure **root)
+{
+ CheckGc* env = (CheckGc*) user_data;
+ env->enqueue(*root);
+}
+
+static void enqueue_roots(CheckGc& env)
+{
+ FOR_EACH_STABLE_NAME(p, if (p->sn_obj) env.enqueue(p->sn_obj););
+ markStablePtrTable(enqueue_root, &env);
+ for (uint32_t n = 0; n < getNumCapabilities(); n++) {
+ markCapability(enqueue_root, (void*) &env, getCapability(n), false/*mark sparks*/);
+ }
+ markCAFs(enqueue_root, &env);
+ markScheduler(enqueue_root, &env);
+
+ for (StgWeak *w = nonmoving_weak_ptr_list; w != NULL; w = w->link) {
+ env.enqueue((StgClosure *) w);
+ }
+
+ for (uint32_t g = 0; g <= N; g++) {
+ generation *gen = &generations[g];
+ for (StgWeak *w = gen->weak_ptr_list; w != NULL; w = RELAXED_LOAD(&w->link)) {
+ env.enqueue((StgClosure *) w);
+ }
+ }
+}
+
+extern "C" {
+void check_gc();
+}
+
+struct NodeName {
+ const StgClosure *c;
+ NodeName(const StgClosure *c) : c(c) {}
+};
+
+static std::ostream& operator<<(std::ostream& os, const NodeName& n) {
+ os << std::hex << "\"" << n.c << "\"" << std::dec;
+ return os;
+}
+
+static void dump_heap(std::ofstream& of)
+{
+ of << "digraph {\n";
+ CheckGc env;
+ enqueue_roots(env);
+ while (!env.finished()) {
+ TaggedClosurePtr tagged = env.pop();
+ StgClosure* c = tagged.untag();
+ NodeName n(c);
+ if (c->header.info == (StgInfoTable *) 0xaaaaaaaaaaaaaaaa) {
+ of << n << " [type=invalid];\n";
+ continue;
+ }
+
+ const StgInfoTable *info = get_itbl(c);
+ switch (info->type) {
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_NOCAF:
+ {
+ const StgConInfoTable *con_info = get_con_itbl(c);
+ of << n << " [type=CONSTR constr=\"" << GET_CON_DESC(con_info) << "\"];\n";
+ break;
+ }
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ of << n << " [type=FUN];\n";
+ break;
+ case FUN_STATIC:
+ of << n << " [type=FUN_STATIC];\n";
+ break;
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_2_0:
+ of << n << " [type=THUNK];\n";
+ break;
+ case THUNK_STATIC:
+ of << n << " [type=THUNK_STATIC];\n";
+ break;
+ case THUNK_SELECTOR:
+ of << n << " [type=THUNK_SEL];\n";
+ break;
+ case BCO:
+ of << n << " [type=BCO];\n";
+ break;
+ case AP:
+ of << n << " [type=AP];\n";
+ break;
+ case PAP:
+ of << n << " [type=PAP];\n";
+ break;
+ case AP_STACK:
+ of << n << " [type=AP_STACK];\n";
+ break;
+ case IND:
+ of << n << " [type=IND];\n";
+ break;
+ case IND_STATIC:
+ of << n << " [type=IND_STATIC];\n";
+ break;
+ case BLOCKING_QUEUE:
+ of << n << " [type=BLOCKING_QUEUE];\n";
+ break;
+ case BLACKHOLE:
+ of << n << " [type=BLACKHOLE];\n";
+ break;
+ case MVAR_CLEAN:
+ case MVAR_DIRTY:
+ of << n << " [type=MVAR];\n";
+ break;
+ case TVAR:
+ of << n << " [type=TVAR];\n";
+ break;
+ case ARR_WORDS:
+ of << n << " [type=ARR_WORDS];\n";
+ break;
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN_CLEAN:
+ case MUT_ARR_PTRS_FROZEN_DIRTY:
+ of << n << " [type=MUT_ARR_PTRS];\n";
+ break;
+ case SMALL_MUT_ARR_PTRS_CLEAN:
+ case SMALL_MUT_ARR_PTRS_DIRTY:
+ case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+ case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+ of << n << " [type=SMALL_MUT_ARR_PTRS];\n";
+ break;
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ of << n << " [type=MUT_VAR];\n";
+ break;
+ case WEAK:
+ of << n << " [type=WEAK];\n";
+ break;
+ case PRIM:
+ of << n << " [type=PRIM];\n";
+ break;
+ case MUT_PRIM:
+ of << n << " [type=MUT_PRIM];\n";
+ break;
+ case TSO:
+ of << n << " [type=TSO];\n";
+ break;
+ case STACK:
+ of << n << " [type=STACK];\n";
+ break;
+ case TREC_CHUNK:
+ of << n << " [type=TREC_CHUNK];\n";
+ break;
+ case WHITEHOLE:
+ of << n << " [type=WHITEHOLE];\n";
+ break;
+ case COMPACT_NFDATA:
+ of << n << " [type=COMPACT_NFDATA];\n";
+ break;
+ case CONTINUATION:
+ of << n << " [type=CONTINUATION];\n";
+ break;
+ default:
+ of << n << " [type=unknown];\n";
+ break;
+ }
+
+ if (!HEAP_ALLOCED((StgPtr) c)) {
+ of << n << " [static=yes];\n";
+ } else {
+ bdescr *bd = Bdescr((StgPtr) c);
+ of << n << " [gen=" << bd->gen_no << "];\n";
+ if (bd->flags & BF_EVACUATED) {
+ of << n << " [evacuated=yes];\n";
+ }
+ if (bd->flags & BF_PINNED) {
+ of << n << " [pinned=yes];\n";
+ }
+ if (bd->flags & BF_LARGE) {
+ of << n << " [large=yes];\n";
+ } else if (bd->flags & BF_NONMOVING) {
+ struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) c);
+ nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) c);
+ uint8_t mark = nonmovingGetMark(seg, block_idx);
+ StgClosure *snapshot_loc =
+ (StgClosure *) nonmovingSegmentGetBlock(seg, nonmovingSegmentInfo(seg)->next_free_snap);
+ if (c > snapshot_loc) {
+ of << n << " [nonmoving=yes new=yes mark=" << (StgWord) mark << "];\n";
+ } else {
+ of << n << " [nonmoving=yes mark=" << (StgWord) mark << "];\n";
+ }
+ } else {
+ of << n << " [moving=yes];\n";
+ }
+ }
+ for (TaggedClosurePtr p : collect_pointers(c)) {
+ of << n << " -> " << NodeName(p.untag()) << ";\n";
+ env.enqueue(p);
+ }
+ }
+ of << "}\n";
+}
+
+void dump_heap_to(const char *fname);
+void dump_heap_to(const char *fname)
+{
+ std::ofstream out(fname);
+ dump_heap(out);
+ out.flush();
+}
+
+void check_gc()
+{
+ CheckGc env;
+ enqueue_roots(env);
+ std::vector<Error> errors;
+
+ while (!env.finished()) {
+ TaggedClosurePtr tagged = env.pop();
+ StgClosure* c = tagged.untag();
+
+ {
+ CheckVisitor check;
+ check.check_closure(tagged);
+ for (const Error& e : check.get_errors()) {
+ errors.push_back(e);
+ }
+ }
+
+ for (TaggedClosurePtr p : collect_pointers(c)) {
+ env.enqueue(p);
+ }
+
+ if (c->header.info == (StgInfoTable *) 0xaaaaaaaaaaaaaaaa) {
+ errors.push_back(Error(c, "is invalid closure"));
+ continue;
+ }
+
+ const StgInfoTable *info = get_itbl(c);
+ if (!HEAP_ALLOCED((StgPtr) c)) {
+ switch (info->type) {
+ case THUNK_STATIC:
+ if (info->srt != 0) {
+
+ }
+ }
+ } else {
+ bdescr *bd = Bdescr((StgPtr) c);
+ if (bd->gen_no < 1) {
+ /* nothing to check as we are focused on post nonmoving-GC checking */
+ } else if (bd->flags & BF_NONMOVING && bd->flags & BF_LARGE) {
+ if (bd->flags & BF_NONMOVING_SWEEPING && !(bd->flags & BF_MARKED)) {
+ errors.push_back(Error(c, "is not marked yet being swept"));
+ }
+ } else if (bd->flags & BF_NONMOVING) {
+ struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) c);
+ nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) c);
+ uint8_t mark = nonmovingGetMark(seg, block_idx);
+ StgClosure *snapshot_loc =
+ (StgClosure *) nonmovingSegmentGetBlock(seg, nonmovingSegmentInfo(seg)->next_free_snap);
+ if (bd->flags & BF_NONMOVING_SWEEPING) {
+ /* in a swept segment */
+ if (mark != nonmovingMarkEpoch) {
+ errors.push_back(Error(c, "is unmarked nonmoving object being swept"));
+ }
+ } else if (c < snapshot_loc) {
+ /* not in a swept segment but in the snapshot */
+ if (mark != nonmovingMarkEpoch) {
+ errors.push_back(Error(c, "is unmarked nonmoving object in the snapshot"));
+ }
+ } else {
+ /* not in the snapshot; nothing to assert */
+ }
+ } else if (bd->flags & BF_LARGE) {
+ if (! (bd->flags & BF_MARKED)) {
+ errors.push_back(Error(c, "is unmarked large object"));
+ }
+ } else {
+ if (!(bd->flags & BF_EVACUATED)) {
+ //errors.push_back(Error(c, "is in from-space block"));
+ }
+ }
+ }
+ }
+
+ if (!errors.empty()) {
+ for (auto err : errors) {
+ std::cerr << err << "\n";
+ }
+ dump_heap_to("heap.dot");
+ abort();
+ }
+}
+
=====================================
rts/sm/GC.c
=====================================
@@ -312,8 +312,6 @@ GarbageCollect (uint32_t collect_gen,
CostCentreStack *save_CCS[getNumCapabilities()];
#endif
- ACQUIRE_SM_LOCK;
-
#if defined(RTS_USER_SIGNALS)
if (RtsFlags.MiscFlags.install_signal_handlers) {
// block signals
@@ -589,9 +587,7 @@ GarbageCollect (uint32_t collect_gen,
// the current garbage collection, so we invoke LdvCensusForDead().
if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
|| RtsFlags.ProfFlags.bioSelector != NULL) {
- RELEASE_SM_LOCK; // LdvCensusForDead may need to take the lock
LdvCensusForDead(N);
- ACQUIRE_SM_LOCK;
}
#endif
@@ -760,7 +756,7 @@ GarbageCollect (uint32_t collect_gen,
}
else // not compacted
{
- freeChain(gen->old_blocks);
+ freeChain_lock(gen->old_blocks);
}
gen->old_blocks = NULL;
@@ -771,7 +767,7 @@ GarbageCollect (uint32_t collect_gen,
* collection from large_objects. Any objects left on the
* large_objects list are therefore dead, so we free them here.
*/
- freeChain(gen->large_objects);
+ freeChain_lock(gen->large_objects);
gen->large_objects = gen->scavenged_large_objects;
gen->n_large_blocks = gen->n_scavenged_large_blocks;
gen->n_large_words = countOccupied(gen->large_objects);
@@ -890,7 +886,7 @@ GarbageCollect (uint32_t collect_gen,
if (mark_stack_top_bd != NULL) {
debugTrace(DEBUG_gc, "mark stack: %d blocks",
countBlocks(mark_stack_top_bd));
- freeChain(mark_stack_top_bd);
+ freeChain_lock(mark_stack_top_bd);
}
// Free any bitmaps.
@@ -942,9 +938,7 @@ GarbageCollect (uint32_t collect_gen,
// Start any pending finalizers. Must be after
// updateStableTables() and stableUnlock() (see #4221).
- RELEASE_SM_LOCK;
scheduleFinalizers(cap, dead_weak_ptr_list);
- ACQUIRE_SM_LOCK;
// check sanity after GC
// before resurrectThreads(), because that might overwrite some
@@ -959,9 +953,7 @@ GarbageCollect (uint32_t collect_gen,
// behind.
if (do_heap_census) {
debugTrace(DEBUG_sched, "performing heap census");
- RELEASE_SM_LOCK;
heapCensus(mut_time);
- ACQUIRE_SM_LOCK;
}
#if defined(TICKY_TICKY)
@@ -975,14 +967,14 @@ GarbageCollect (uint32_t collect_gen,
#endif
// send exceptions to any threads which were about to die
- RELEASE_SM_LOCK;
resurrectThreads(resurrected_threads);
- ACQUIRE_SM_LOCK;
// Finally free the deferred mblocks by sorting the deferred free list and
// merging it into the actual sorted free list. This needs to happen here so
// that the `returnMemoryToOS` call down below can successfully free memory.
+ ACQUIRE_SM_LOCK;
commitMBlockFreeing();
+ RELEASE_SM_LOCK;
if (major_gc) {
W_ need_prealloc, need_live, need, got;
@@ -1095,8 +1087,6 @@ GarbageCollect (uint32_t collect_gen,
}
#endif
- RELEASE_SM_LOCK;
-
SET_GCT(saved_gct);
}
@@ -1146,7 +1136,7 @@ new_gc_thread (uint32_t n, gc_thread *t)
// but can't, because it uses gct which isn't set up at this point.
// Hence, allocate a block for todo_bd manually:
{
- bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
+ bdescr *bd = allocBlockOnNode_lock(capNoToNumaNode(n));
// no lock, locks aren't initialised yet
initBdescr(bd, ws->gen, ws->gen->to);
bd->flags = BF_EVACUATED;
@@ -1605,7 +1595,7 @@ static void
stash_mut_list (Capability *cap, uint32_t gen_no)
{
cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
- RELEASE_STORE(&cap->mut_lists[gen_no], allocBlockOnNode_sync(cap->node));
+ RELEASE_STORE(&cap->mut_lists[gen_no], allocBlockOnNode_lock(cap->node));
}
/* ----------------------------------------------------------------------------
@@ -1632,9 +1622,9 @@ prepare_collected_gen (generation *gen)
// a check for NULL in recordMutable().
for (i = 0; i < getNumCapabilities(); i++) {
bdescr *old = RELAXED_LOAD(&getCapability(i)->mut_lists[g]);
- freeChain(old);
+ freeChain_lock(old);
- bdescr *new = allocBlockOnNode(capNoToNumaNode(i));
+ bdescr *new = allocBlockOnNode_lock(capNoToNumaNode(i));
RELAXED_STORE(&getCapability(i)->mut_lists[g], new);
}
}
@@ -1717,7 +1707,7 @@ prepare_collected_gen (generation *gen)
bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_);
if (bitmap_size > 0) {
- bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size)
+ bitmap_bdescr = allocGroup_lock((StgWord)BLOCK_ROUND_UP(bitmap_size)
/ BLOCK_SIZE);
gen->bitmap = bitmap_bdescr;
bitmap = bitmap_bdescr->start;
=====================================
rts/sm/GC.h
=====================================
@@ -13,8 +13,6 @@
#pragma once
-#include "HeapAlloc.h"
-
#include "BeginPrivate.h"
void GarbageCollect (uint32_t collect_gen,
=====================================
rts/sm/GCUtils.c
=====================================
@@ -26,38 +26,15 @@
#include "WSDeque.h"
#endif
-#if defined(THREADED_RTS)
-SpinLock gc_alloc_block_sync;
-#endif
-
static void push_todo_block(bdescr *bd, gen_workspace *ws);
-bdescr* allocGroup_sync(uint32_t n)
-{
- bdescr *bd;
- uint32_t node = capNoToNumaNode(gct->thread_index);
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- bd = allocGroupOnNode(node,n);
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
- return bd;
-}
-
-bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n)
-{
- bdescr *bd;
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- bd = allocGroupOnNode(node,n);
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
- return bd;
-}
-
static uint32_t
-allocBlocks_sync(uint32_t n, bdescr **hd)
+allocBlocks_lock(uint32_t n, bdescr **hd)
{
bdescr *bd;
uint32_t i;
uint32_t node = capNoToNumaNode(gct->thread_index);
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+ ACQUIRE_SM_LOCK;
bd = allocLargeChunkOnNode(node,1,n);
// NB. allocLargeChunk, rather than allocGroup(n), to allocate in a
// fragmentation-friendly way.
@@ -70,27 +47,11 @@ allocBlocks_sync(uint32_t n, bdescr **hd)
bd[n-1].link = NULL;
// We have to hold the lock until we've finished fiddling with the metadata,
// otherwise the block allocator can get confused.
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+ RELEASE_SM_LOCK;
*hd = bd;
return n;
}
-void
-freeChain_sync(bdescr *bd)
-{
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- freeChain(bd);
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-}
-
-void
-freeGroup_sync(bdescr *bd)
-{
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- freeGroup(bd);
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-}
-
/* -----------------------------------------------------------------------------
Workspace utilities
-------------------------------------------------------------------------- */
@@ -303,7 +264,7 @@ todo_block_full (uint32_t size, gen_workspace *ws)
// object. However, if the object we're copying is
// larger than a block, then we might have an empty
// block here.
- freeGroup_sync(bd);
+ freeGroup_lock(bd);
} else {
push_scanned_block(bd, ws);
}
@@ -343,14 +304,14 @@ alloc_todo_block (gen_workspace *ws, uint32_t size)
else
{
if (size > BLOCK_SIZE_W) {
- bd = allocGroup_sync((W_)BLOCK_ROUND_UP(size*sizeof(W_))
+ bd = allocGroup_lock((W_)BLOCK_ROUND_UP(size*sizeof(W_))
/ BLOCK_SIZE);
} else {
if (gct->free_blocks) {
bd = gct->free_blocks;
gct->free_blocks = bd->link;
} else {
- allocBlocks_sync(16, &bd);
+ allocBlocks_lock(16, &bd);
gct->free_blocks = bd->link;
}
}
=====================================
rts/sm/GCUtils.h
=====================================
@@ -17,22 +17,6 @@
#include "BeginPrivate.h"
-bdescr* allocGroup_sync(uint32_t n);
-bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n);
-
-INLINE_HEADER bdescr *allocBlock_sync(void)
-{
- return allocGroup_sync(1);
-}
-
-INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node)
-{
- return allocGroupOnNode_sync(node,1);
-}
-
-void freeChain_sync(bdescr *bd);
-void freeGroup_sync(bdescr *bd);
-
void push_scanned_block (bdescr *bd, gen_workspace *ws);
StgPtr todo_block_full (uint32_t size, gen_workspace *ws);
StgPtr alloc_todo_block (gen_workspace *ws, uint32_t size);
@@ -62,7 +46,7 @@ recordMutableGen_GC (StgClosure *p, uint32_t gen_no)
bd = gct->mut_lists[gen_no];
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
- new_bd = allocBlock_sync();
+ new_bd = allocBlock_lock();
new_bd->link = bd;
bd = new_bd;
gct->mut_lists[gen_no] = bd;
=====================================
rts/sm/HeapAlloc.h
=====================================
@@ -10,6 +10,8 @@
#include "BeginPrivate.h"
+#include "Storage.h"
+
/* -----------------------------------------------------------------------------
The HEAP_ALLOCED() test.
@@ -210,9 +212,9 @@ StgBool HEAP_ALLOCED_GC(const void *p)
} else {
// putting the rest out of line turned out to be a slight
// performance improvement:
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+ ACQUIRE_SM_LOCK; // TODO: this may be too expensive
b = HEAP_ALLOCED_miss(mblock,p);
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+ RELEASE_SM_LOCK;
return b;
}
}
=====================================
rts/sm/MarkStack.h
=====================================
@@ -32,7 +32,7 @@ push_mark_stack(StgPtr p)
}
else
{
- bd = allocBlock_sync();
+ bd = allocBlock_lock();
bd->link = mark_stack_bd;
bd->u.back = NULL;
mark_stack_bd->u.back = bd; // double-link the new block on
=====================================
rts/sm/NonMoving.c
=====================================
@@ -597,14 +597,10 @@ static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
// Nothing in the free list, allocate a new segment...
if (ret == NULL) {
- // Take gc spinlock: another thread may be scavenging a moving
- // generation and call `todo_block_full`
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
bdescr *bd = allocAlignedGroupOnNode(node, NONMOVING_SEGMENT_BLOCKS);
// See Note [Live data accounting in nonmoving collector].
oldest_gen->n_blocks += bd->blocks;
oldest_gen->n_words += BLOCK_SIZE_W * bd->blocks;
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
for (StgWord32 i = 0; i < bd->blocks; ++i) {
initBdescr(&bd[i], oldest_gen, oldest_gen);
@@ -667,7 +663,7 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
}
}
-/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */
+/* Allocate a block in the nonmoving heap. sz is in words */
GNUC_ATTR_HOT
void *nonmovingAllocate(Capability *cap, StgWord sz)
{
@@ -707,7 +703,9 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
// there are no active segments, allocate new segment
if (new_current == NULL) {
+ ACQUIRE_SM_LOCK;
new_current = nonmovingAllocSegment(cap->node);
+ RELEASE_SM_LOCK;
nonmovingInitSegment(new_current, log_block_size);
}
@@ -790,14 +788,13 @@ void nonmovingExit(void)
/*
* Assumes that no garbage collector or mutator threads are running to safely
* resize the nonmoving_allocators.
- *
- * Must hold sm_mutex.
*/
void nonmovingAddCapabilities(uint32_t new_n_caps)
{
unsigned int old_n_caps = nonmovingHeap.n_caps;
struct NonmovingAllocator **allocs = nonmovingHeap.allocators;
+ ACQUIRE_SM_LOCK;
for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
struct NonmovingAllocator *old = allocs[i];
allocs[i] = alloc_nonmoving_allocator(new_n_caps);
@@ -819,6 +816,7 @@ void nonmovingAddCapabilities(uint32_t new_n_caps)
}
}
nonmovingHeap.n_caps = new_n_caps;
+ RELEASE_SM_LOCK;
}
void nonmovingClearBitmap(struct NonmovingSegment *seg)
@@ -1238,6 +1236,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
#if defined(THREADED_RTS) && defined(NONCONCURRENT_SWEEP)
#if defined(DEBUG)
+ check_gc();
checkNonmovingHeap(&nonmovingHeap);
checkSanity(true, true);
#endif
=====================================
rts/sm/NonMovingMark.c
=====================================
@@ -295,10 +295,8 @@ static void nonmovingAddUpdRemSetBlocks_lock(MarkQueue *rset)
nonmovingAddUpdRemSetBlocks_(rset);
// Reset the state of the remembered set.
- ACQUIRE_SM_LOCK;
init_mark_queue_(rset);
rset->is_upd_rem_set = true;
- RELEASE_SM_LOCK;
}
/*
@@ -472,9 +470,7 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
}
/* A variant of push to be used by the minor GC when it encounters a reference
- * to an object in the non-moving heap. In contrast to the other push
- * operations this uses the gc_alloc_block_sync spinlock instead of the
- * SM_LOCK to allocate new blocks in the event that the mark queue is full.
+ * to an object in the non-moving heap.
*/
void
markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
@@ -495,13 +491,13 @@ markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
// Yes, this block is full.
// allocate a fresh block.
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+ ACQUIRE_SM_LOCK;
bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
bd->link = q->blocks;
q->blocks = bd;
q->top = (MarkQueueBlock *) bd->start;
q->top->head = 0;
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+ RELEASE_SM_LOCK;
}
MarkQueueEnt ent = {
@@ -932,7 +928,7 @@ static MarkQueueEnt markQueuePop (MarkQueue *q)
/* Must hold sm_mutex. */
static void init_mark_queue_ (MarkQueue *queue)
{
- bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+ bdescr *bd = allocGroup_lock(MARK_QUEUE_BLOCKS);
ASSERT(queue->blocks == NULL);
queue->blocks = bd;
queue->top = (MarkQueueBlock *) bd->start;
@@ -943,14 +939,12 @@ static void init_mark_queue_ (MarkQueue *queue)
#endif
}
-/* Must hold sm_mutex. */
void initMarkQueue (MarkQueue *queue)
{
init_mark_queue_(queue);
queue->is_upd_rem_set = false;
}
-/* Must hold sm_mutex. */
void nonmovingInitUpdRemSet (UpdRemSet *rset)
{
init_mark_queue_(&rset->queue);
@@ -1610,7 +1604,7 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
// selectee unreachable. However, we must mark the selectee regardless
// to satisfy the snapshot invariant.
PUSH_FIELD(sel, selectee);
- nonmoving_eval_thunk_selector(queue, sel, origin);
+ //nonmoving_eval_thunk_selector(queue, sel, origin);
break;
}
=====================================
rts/sm/NonMovingMark.h
=====================================
@@ -9,10 +9,11 @@
#pragma once
#include "Task.h"
-#include "NonMoving.h"
#include "BeginPrivate.h"
+struct NonMovingHeap;
+
enum EntryType {
NULL_ENTRY = 0,
MARK_CLOSURE = 1,
@@ -63,7 +64,7 @@ INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent)
{
uintptr_t tag = (uintptr_t) ent->null_entry.p & TAG_MASK;
ASSERT(tag <= MARK_ARRAY);
- return tag;
+ return (enum EntryType) tag;
}
typedef struct {
@@ -155,7 +156,7 @@ void markQueueAddRoot(MarkQueue* q, StgClosure** root);
void initMarkQueue(MarkQueue *queue);
void freeMarkQueue(MarkQueue *queue);
-void nonmovingMark(struct MarkQueue_ *restrict queue);
+void nonmovingMark(struct MarkQueue_ *__restrict__ queue);
void nonmovingMarkWeakPtrList(struct MarkQueue_ *queue);
bool nonmovingTidyWeaks(struct MarkQueue_ *queue);
=====================================
rts/sm/NonMovingShortcut.c
=====================================
@@ -10,6 +10,7 @@
#include "Rts.h"
#include "GC.h"
#include "SMPClosureOps.h"
+#include "NonMoving.h"
#include "NonMovingMark.h"
#include "NonMovingShortcut.h"
#include "Printer.h"
=====================================
rts/sm/Sanity.h
=====================================
@@ -19,6 +19,8 @@
# define MAX_SLOTS 100000
# endif
+struct NonmovingHeap;
+
/* debugging routines */
void checkSanity ( bool after_gc, bool major_gc );
void checkNurserySanity ( nursery *nursery );
=====================================
rts/sm/Scav.c
=====================================
@@ -1730,7 +1730,7 @@ scavenge_capability_mut_lists (Capability *cap)
if (RtsFlags.GcFlags.useNonmoving && major_gc) {
uint32_t g = oldest_gen->no;
scavenge_mutable_list(cap->saved_mut_lists[g], oldest_gen);
- freeChain_sync(cap->saved_mut_lists[g]);
+ freeChain_lock(cap->saved_mut_lists[g]);
cap->saved_mut_lists[g] = NULL;
return;
}
@@ -1743,7 +1743,7 @@ scavenge_capability_mut_lists (Capability *cap)
*/
for (uint32_t g = RtsFlags.GcFlags.generations-1; g > N; g--) {
scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
- freeChain_sync(cap->saved_mut_lists[g]);
+ freeChain_lock(cap->saved_mut_lists[g]);
cap->saved_mut_lists[g] = NULL;
}
}
=====================================
rts/sm/Storage.c
=====================================
@@ -193,14 +193,13 @@ initStorage (void)
initMutex(&sm_mutex);
#endif
- ACQUIRE_SM_LOCK;
-
/* allocate generation info array */
generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
* sizeof(struct generation_),
"initStorage: gens");
/* Initialise all generations */
+ ACQUIRE_SM_LOCK;
for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
initGeneration(&generations[g], g);
}
@@ -214,16 +213,11 @@ initStorage (void)
generations[g].to = &generations[g+1];
}
oldest_gen->to = oldest_gen;
+ RELEASE_SM_LOCK;
// Nonmoving heap uses oldest_gen so initialize it after initializing oldest_gen
nonmovingInit();
-#if defined(THREADED_RTS)
- // nonmovingAddCapabilities allocates segments, which requires taking the gc
- // sync lock, so initialize it before nonmovingAddCapabilities
- initSpinLock(&gc_alloc_block_sync);
-#endif
-
if (RtsFlags.GcFlags.useNonmoving)
nonmovingAddCapabilities(getNumCapabilities());
@@ -261,8 +255,6 @@ initStorage (void)
IF_DEBUG(gc, statDescribeGens());
- RELEASE_SM_LOCK;
-
traceInitEvent(traceHeapInfo);
}
@@ -314,12 +306,14 @@ void storageAddCapabilities (uint32_t from, uint32_t to)
assignNurseriesToCapabilities(from,to);
// allocate a block for each mut list
+ ACQUIRE_SM_LOCK;
for (n = from; n < to; n++) {
for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
getCapability(n)->mut_lists[g] =
allocBlockOnNode(capNoToNumaNode(n));
}
}
+ RELEASE_SM_LOCK;
// Initialize NonmovingAllocators and UpdRemSets
if (RtsFlags.GcFlags.useNonmoving) {
@@ -565,9 +559,7 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
// Allocate the blackhole indirection closure
if (RtsFlags.GcFlags.useNonmoving) {
// See Note [Static objects under the nonmoving collector].
- ACQUIRE_SM_LOCK;
bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh));
- RELEASE_SM_LOCK;
recordMutableCap((StgClosure*)bh,
regTableToCapability(reg), oldest_gen->no);
} else {
@@ -725,6 +717,7 @@ allocNursery (uint32_t node, bdescr *tail, W_ blocks)
// automatic prefetching works across nursery blocks. This is a
// tiny optimisation (~0.5%), but it's free.
+ ACQUIRE_SM_LOCK;
while (blocks > 0) {
n = stg_min(BLOCKS_PER_MBLOCK, blocks);
// allocLargeChunk will prefer large chunks, but will pick up
@@ -760,6 +753,7 @@ allocNursery (uint32_t node, bdescr *tail, W_ blocks)
tail = &bd[0];
}
+ RELEASE_SM_LOCK;
return &bd[0];
}
@@ -879,7 +873,7 @@ resizeNurseriesEach (W_ blocks)
next_bd = bd->link;
next_bd->u.back = NULL;
nursery_blocks -= bd->blocks; // might be a large block
- freeGroup(bd);
+ freeGroup_lock(bd);
bd = next_bd;
}
nursery->blocks = bd;
@@ -1300,9 +1294,7 @@ allocatePinned (Capability *cap, W_ n /*words*/, W_ alignment /*bytes*/, W_ alig
if (bd == NULL) {
// The pinned block list is empty: allocate a fresh block (we can't fail
// here).
- ACQUIRE_SM_LOCK;
bd = allocNursery(cap->node, NULL, PINNED_EMPTY_SIZE);
- RELEASE_SM_LOCK;
}
// Bump up the nursery pointer to avoid the pathological situation
@@ -1405,8 +1397,6 @@ allocatePinned (Capability *cap, W_ n /*words*/, W_ alignment /*bytes*/, W_ alig
void
dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
{
- ASSERT(RELAXED_LOAD(&mvar->header.info) == &stg_MUT_VAR_CLEAN_info);
-
Capability *cap = regTableToCapability(reg);
// No barrier required here as no other heap object fields are read. See
// Note [Heap memory barriers] in SMP.h.
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/4b4207f32fd4c0a6995e2be1a59bf2b3eff53d51...14f1045d178417cd7d508c3b72cd70ce5b681d1a