[Git][ghc/ghc][wip/gc/nonmoving-nonconcurrent] 2 commits: rts: Non-concurrent mark and sweep

Ben Gamari gitlab at gitlab.haskell.org
Wed Jun 19 00:52:56 UTC 2019



Ben Gamari pushed to branch wip/gc/nonmoving-nonconcurrent at Glasgow Haskell Compiler / GHC


Commits:
b03ec7ea by Ömer Sinan Ağacan at 2019-06-19T00:52:36Z
rts: Non-concurrent mark and sweep

This implements the core heap structure and a serial mark/sweep
collector which can be used to manage the oldest-generation heap.
This is the first step towards a concurrent mark-and-sweep collector
aimed at low-latency applications.

The full design of the collector implemented here is described in detail
in a technical note

    B. Gamari. "A Concurrent Garbage Collector For the Glasgow Haskell
    Compiler" (2018)

The basic heap structure used in this design is heavily inspired by

    K. Ueno & A. Ohori. "A fully concurrent garbage collector for
    functional programs on multicore processors." /ACM SIGPLAN Notices/
    Vol. 51. No. 9 (presented at ICFP 2016)

This design is intended to allow both marking and sweeping to proceed
concurrently with the execution of a multi-core mutator. Unlike the Ueno
design, which requires no global synchronization pauses, the collector
introduced here requires a stop-the-world pause at the beginning and end
of the mark phase.

To avoid heap fragmentation, the allocator consists of a number of
fixed-size /sub-allocators/. Each of these sub-allocators allocates into
its own set of /segments/, themselves allocated from the block
allocator. Each segment is broken into a set of fixed-size allocation
blocks (which back allocations) in addition to a bitmap (used to track
the liveness of blocks) and some additional metadata (also used to
track liveness).
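
For orientation, here is a rough, hedged sketch of what one such segment
might look like. The field names mirror those used in the diff below
(NonMoving.c), but the index width and exact layout are simplifications
rather than the patch's actual NonMoving.h:

    #include <stdint.h>

    /* Sketch only: a simplified picture of one non-moving segment.  Field
     * names follow the diff below (NonMoving.c); the index width and the
     * exact layout are assumptions, not the patch's NonMoving.h. */
    typedef uint16_t nonmoving_block_idx;      /* assumed index width */

    struct NonmovingSegment {
        struct NonmovingSegment *link;         /* filled/active/free list link */
        struct NonmovingSegment *todo_link;    /* scavenge to-do list link */
        nonmoving_block_idx next_free;         /* index of next unallocated block */
        nonmoving_block_idx next_free_snap;    /* value of next_free at the snapshot */
        uint8_t block_size;                    /* log2 of the block size in bytes */
        uint8_t bitmap[];                      /* one mark byte per block; the
                                                * fixed-size blocks themselves
                                                * follow the bitmap */
    };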

This heap structure enables collection via mark-and-sweep, which can be
performed concurrently via a snapshot-at-the-beginning scheme (although
concurrent collection is not implemented in this patch).
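
To make the sweep side concrete, below is a much-simplified, hedged
sketch (not the patch's nonmovingSweep; NonMovingSweep.c is not shown in
this excerpt): after marking, the bitmap alone decides which blocks of a
segment survive, nothing is moved, and allocation resumes at the first
unmarked block.

    #include <stdint.h>

    /* Sketch only: sweep one segment by consulting its mark bitmap.
     * blk_count would normally be derived from the segment's block_size;
     * it is passed in here to keep the sketch self-contained. */
    static void sweep_segment_sketch(uint8_t *bitmap,
                                     unsigned int blk_count,
                                     unsigned int *next_free,
                                     unsigned int *next_free_snap)
    {
        unsigned int first_free = blk_count;
        for (unsigned int i = 0; i < blk_count; i++) {
            if (!bitmap[i] && first_free == blk_count) {
                first_free = i;   /* unmarked block: free for reuse */
            }
            /* marked blocks are simply retained in place (nothing moves) */
        }
        *next_free = first_free;       /* allocation resumes here */
        *next_free_snap = first_free;  /* the next snapshot starts here too */
    }

The real sweep also has to recognise segments that turn out to be
entirely free (so they can be returned to the free-segment list, cf.
nonmovingPushFreeSegment below) or entirely full; that bookkeeping is
omitted from the sketch.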

The mark queue is a fairly straightforward chunked-array structure.
The representation is a bit more verbose than a typical mark queue to
accommodate a combination of two features:

 * a mark FIFO, which improves the locality of marking, reducing one of
   the major overheads seen in mark/sweep collectors (see [1] for
   details)

 * the selector optimization and indirection shortcutting, which
   require that we track where we found each reference to an object
   in case we need to update the reference at a later point (e.g. when
   we find that it is an indirection). See Note [Origin references in
   the nonmoving collector] (in `NonMovingMark.h`) for details; a
   sketch of such an entry follows this list.
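
As promised above, an illustrative (and purely hypothetical) picture of
what one entry in such a chunked mark queue might carry; the patch
defines its own mark-queue types in NonMovingMark.h/.c (not reproduced
in this excerpt), and the names and chunk capacity below are
assumptions:

    #include <stddef.h>

    /* Sketch only: a mark-queue entry records both the closure to mark and
     * the address of the reference through which it was found, so that
     * indirection shortcutting / the selector optimisation can rewrite that
     * reference later. */
    typedef struct {
        void  *p;        /* closure to be marked */
        void **origin;   /* where the reference to p was found, if any */
    } MarkQueueEntSketch;

    /* Sketch only: entries live in fixed-size chunks chained together. */
    typedef struct MarkQueueChunkSketch_ {
        struct MarkQueueChunkSketch_ *link;   /* previously filled chunk */
        size_t head;                          /* next free slot in entries */
        MarkQueueEntSketch entries[256];      /* capacity is an assumption */
    } MarkQueueChunkSketch;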

Beyond this the mark/sweep is fairly run-of-the-mill.

[1] R. Garner, S.M. Blackburn, D. Frampton. "Effective Prefetch for
    Mark-Sweep Garbage Collection." ISMM 2007.

Co-Authored-By: Ben Gamari <ben at well-typed.com>

- - - - -
9cd98caa by Ben Gamari at 2019-06-19T00:52:36Z
testsuite: Add nonmoving WAY

This simply runs the compile_and_run tests with `-xn`, enabling the
nonmoving oldest generation.

- - - - -


24 changed files:

- includes/rts/storage/Block.h
- rts/Capability.c
- rts/Capability.h
- rts/RtsStartup.c
- rts/Weak.c
- rts/sm/Evac.c
- rts/sm/GC.c
- rts/sm/GC.h
- rts/sm/GCAux.c
- rts/sm/GCThread.h
- + rts/sm/NonMoving.c
- + rts/sm/NonMoving.h
- + rts/sm/NonMovingMark.c
- + rts/sm/NonMovingMark.h
- + rts/sm/NonMovingScav.c
- + rts/sm/NonMovingScav.h
- + rts/sm/NonMovingSweep.c
- + rts/sm/NonMovingSweep.h
- rts/sm/Sanity.c
- rts/sm/Sanity.h
- rts/sm/Scav.c
- rts/sm/Storage.c
- rts/sm/Storage.h
- testsuite/config/ghc


Changes:

=====================================
includes/rts/storage/Block.h
=====================================
@@ -97,6 +97,8 @@ typedef struct bdescr_ {
                                // block allocator.  In particular, the
                                // value (StgPtr)(-1) is used to
                                // indicate that a block is unallocated.
+                               //
+                               // Unused by the non-moving allocator.
 
     struct bdescr_ *link;      // used for chaining blocks together
 
@@ -141,7 +143,8 @@ typedef struct bdescr_ {
 #define BF_LARGE     2
 /* Block is pinned */
 #define BF_PINNED    4
-/* Block is to be marked, not copied */
+/* Block is to be marked, not copied. Also used for marked large objects in
+ * non-moving heap. */
 #define BF_MARKED    8
 /* Block is executable */
 #define BF_EXEC      32
@@ -153,6 +156,12 @@ typedef struct bdescr_ {
 #define BF_SWEPT     256
 /* Block is part of a Compact */
 #define BF_COMPACT   512
+/* A non-moving allocator segment (see NonMoving.c) */
+#define BF_NONMOVING 1024
+/* A large object which has been moved off of oldest_gen->large_objects and
+ * onto nonmoving_large_objects. The mark phase ignores objects which aren't
+ * so-flagged */
+#define BF_NONMOVING_SWEEPING 2048
 /* Maximum flag value (do not define anything higher than this!) */
 #define BF_FLAG_MAX  (1 << 15)
 


=====================================
rts/Capability.c
=====================================
@@ -27,6 +27,7 @@
 #include "STM.h"
 #include "RtsUtils.h"
 #include "sm/OSMem.h"
+#include "sm/BlockAlloc.h" // for countBlocks()
 
 #if !defined(mingw32_HOST_OS)
 #include "rts/IOManager.h" // for setIOManagerControlFd()


=====================================
rts/Capability.h
=====================================
@@ -23,6 +23,7 @@
 #include "sm/GC.h" // for evac_fn
 #include "Task.h"
 #include "Sparks.h"
+#include "sm/NonMovingMark.h" // for MarkQueue
 
 #include "BeginPrivate.h"
 


=====================================
rts/RtsStartup.c
=====================================
@@ -432,6 +432,9 @@ hs_exit_(bool wait_foreign)
     /* shutdown the hpc support (if needed) */
     exitHpc();
 
+    /* wait for any on-going concurrent GC to finish */
+    nonmovingExit();
+
     // clean up things from the storage manager's point of view.
     // also outputs the stats (+RTS -s) info.
     exitStorage();


=====================================
rts/Weak.c
=====================================
@@ -91,9 +91,19 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
     StgWord size;
     uint32_t n, i;
 
-    ASSERT(n_finalizers == 0);
-
-    finalizer_list = list;
+    // This assertion does not hold with non-moving collection because the
+    // non-moving collector does not wait for the list to be consumed (by
+    // doIdleGcWork()) before appending more finalizers to it.
+    ASSERT(RtsFlags.GcFlags.useNonmoving || n_finalizers == 0);
+
+    // Append the new list to finalizer_list. TODO: perhaps cache the tail of
+    // the list for faster appends. NOTE: we must append `list` to the end of
+    // finalizer_list, not vice versa, or we'd revisit weaks in the loops below.
+    StgWeak **tl = &finalizer_list;
+    while (*tl) {
+        tl = &(*tl)->link;
+    }
+    *tl = list;
 
     // Traverse the list and
     //  * count the number of Haskell finalizers
@@ -128,7 +138,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
         SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
     }
 
-    n_finalizers = i;
+    n_finalizers += i;
 
     // No Haskell finalizers to run?
     if (n == 0) return;


=====================================
rts/sm/Evac.c
=====================================
@@ -27,6 +27,7 @@
 #include "LdvProfile.h"
 #include "CNF.h"
 #include "Scav.h"
+#include "NonMoving.h"
 
 #if defined(THREADED_RTS) && !defined(PARALLEL_GC)
 #define evacuate(p) evacuate1(p)
@@ -62,9 +63,18 @@ STATIC_INLINE void evacuate_large(StgPtr p);
    Allocate some space in which to copy an object.
    -------------------------------------------------------------------------- */
 
+/* size is in words */
 STATIC_INLINE StgPtr
 alloc_for_copy (uint32_t size, uint32_t gen_no)
 {
+    ASSERT(gen_no < RtsFlags.GcFlags.generations);
+
+    if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+        // unconditionally promote to non-moving heap in major gc
+        gct->copied += size;
+        return nonmovingAllocate(gct->cap, size);
+    }
+
     StgPtr to;
     gen_workspace *ws;
 
@@ -81,6 +91,11 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
         }
     }
 
+    if (RtsFlags.GcFlags.useNonmoving && gen_no == oldest_gen->no) {
+        gct->copied += size;
+        return nonmovingAllocate(gct->cap, size);
+    }
+
     ws = &gct->gens[gen_no];  // zero memory references here
 
     /* chain a new block onto the to-space for the destination gen if
@@ -100,6 +115,7 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
    The evacuate() code
    -------------------------------------------------------------------------- */
 
+/* size is in words */
 STATIC_INLINE GNUC_ATTR_HOT void
 copy_tag(StgClosure **p, const StgInfoTable *info,
          StgClosure *src, uint32_t size, uint32_t gen_no, StgWord tag)
@@ -296,7 +312,9 @@ evacuate_large(StgPtr p)
    */
   new_gen_no = bd->dest_no;
 
-  if (new_gen_no < gct->evac_gen_no) {
+  if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+      new_gen_no = oldest_gen->no;
+  } else if (new_gen_no < gct->evac_gen_no) {
       if (gct->eager_promotion) {
           new_gen_no = gct->evac_gen_no;
       } else {
@@ -308,6 +326,9 @@ evacuate_large(StgPtr p)
   new_gen = &generations[new_gen_no];
 
   bd->flags |= BF_EVACUATED;
+  if (RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen) {
+      bd->flags |= BF_NONMOVING;
+  }
   initBdescr(bd, new_gen, new_gen->to);
 
   // If this is a block of pinned or compact objects, we don't have to scan
@@ -575,7 +596,16 @@ loop:
 
   bd = Bdescr((P_)q);
 
-  if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT)) != 0) {
+  if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT | BF_NONMOVING)) != 0) {
+      // Pointer into the non-moving heap. The non-moving heap is collected by
+      // mark-sweep, so this object will be marked and then retained by sweep.
+      if (bd->flags & BF_NONMOVING) {
+          // NOTE: large objects in the nonmoving heap are also flagged with
+          // BF_NONMOVING. Those are moved to the scavenged_large_objects list
+          // in the mark phase.
+          return;
+      }
+
       // pointer into to-space: just return it.  It might be a pointer
       // into a generation that we aren't collecting (> N), or it
       // might just be a pointer into to-space.  The latter doesn't
@@ -906,6 +936,10 @@ evacuate_BLACKHOLE(StgClosure **p)
     // blackholes can't be in a compact
     ASSERT((bd->flags & BF_COMPACT) == 0);
 
+    if (bd->flags & BF_NONMOVING) {
+        return;
+    }
+
     // blackholes *can* be in a large object: when raiseAsync() creates an
     // AP_STACK the payload might be large enough to create a large object.
     // See #14497.
@@ -1056,7 +1090,7 @@ selector_chain:
         // save any space in any case, and updating with an indirection is
         // trickier in a non-collected gen: we would have to update the
         // mutable list.
-        if (bd->flags & BF_EVACUATED) {
+        if ((bd->flags & BF_EVACUATED) || (bd->flags & BF_NONMOVING)) {
             unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);
             *q = (StgClosure *)p;
             // shortcut, behave as for:  if (evac) evacuate(q);


=====================================
rts/sm/GC.c
=====================================
@@ -51,6 +51,7 @@
 #include "CheckUnload.h"
 #include "CNF.h"
 #include "RtsFlags.h"
+#include "NonMoving.h"
 
 #if defined(PROFILING)
 #include "RetainerProfile.h"
@@ -163,7 +164,6 @@ static void mark_root               (void *user, StgClosure **root);
 static void prepare_collected_gen   (generation *gen);
 static void prepare_uncollected_gen (generation *gen);
 static void init_gc_thread          (gc_thread *t);
-static void resize_generations      (void);
 static void resize_nursery          (void);
 static void start_gc_threads        (void);
 static void scavenge_until_all_done (void);
@@ -576,7 +576,7 @@ GarbageCollect (uint32_t collect_gen,
     gen = &generations[g];
 
     // for generations we collected...
-    if (g <= N) {
+    if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
 
         /* free old memory and shift to-space into from-space for all
          * the collected generations (except the allocation area).  These
@@ -714,8 +714,42 @@ GarbageCollect (uint32_t collect_gen,
     }
   } // for all generations
 
-  // update the max size of older generations after a major GC
-  resize_generations();
+  // Mark and sweep the oldest generation.
+  // N.B. This can only happen after we've moved
+  // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects.
+  ASSERT(oldest_gen->scavenged_large_objects == NULL);
+  if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+      // All threads in the non-moving heap should be found to be alive, because
+      // threads in the non-moving generation's list should live in the
+      // non-moving heap, and we consider non-moving objects alive during
+      // preparation.
+      ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
+      // For weaks, remember that we evacuated all weaks to the non-moving heap
+      // in markWeakPtrList(), and then moved the weak_ptr_list list to
+      // old_weak_ptr_list. We then moved weaks with live keys to the
+      // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in
+      // old_weak_ptr_list to dead_weak_ptr_list. So at this point
+      // old_weak_ptr_list should be empty.
+      ASSERT(oldest_gen->old_weak_ptr_list == NULL);
+
+      // we may need to take the lock to allocate mark queue blocks
+      RELEASE_SM_LOCK;
+      // dead_weak_ptr_list contains weak pointers with dead keys. Those need to
+      // be kept alive because we'll use them in scheduleFinalizers(). Similarly,
+      // resurrected_threads is going to be used in resurrectThreads(), so we
+      // need to mark those too.
+      // Note that in sequential case these lists will be appended with more
+      // weaks and threads found to be dead in mark.
+      nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
+      ACQUIRE_SM_LOCK;
+  }
+
+  // Update the max size of older generations after a major GC:
+  // We can't resize here in the case of the concurrent collector since we
+  // don't yet know how much live data we have. This will be instead done
+  // once we finish marking.
+  if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving)
+      resizeGenerations();
 
   // Free the mark stack.
   if (mark_stack_top_bd != NULL) {
@@ -739,7 +773,7 @@ GarbageCollect (uint32_t collect_gen,
 
  // mark the garbage collected CAFs as dead
 #if defined(DEBUG)
-  if (major_gc) { gcCAFs(); }
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); }
 #endif
 
   // Update the stable name hash table
@@ -772,8 +806,9 @@ GarbageCollect (uint32_t collect_gen,
   // check sanity after GC
   // before resurrectThreads(), because that might overwrite some
   // closures, which will cause problems with THREADED where we don't
-  // fill slop.
-  IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc));
+  // fill slop. If we are using the nonmoving collector then we can't claim to
+  // be *after* the major GC; it's now running concurrently.
+  IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving));
 
   // If a heap census is due, we need to do it before
   // resurrectThreads(), for the same reason as checkSanity above:
@@ -946,6 +981,7 @@ new_gc_thread (uint32_t n, gc_thread *t)
         ws->todo_overflow = NULL;
         ws->n_todo_overflow = 0;
         ws->todo_large_objects = NULL;
+        ws->todo_seg = END_NONMOVING_TODO_LIST;
 
         ws->part_list = NULL;
         ws->n_part_blocks = 0;
@@ -1322,6 +1358,18 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
 }
 #endif
 
+/* ----------------------------------------------------------------------------
+   Save the mutable lists in saved_mut_lists, where they will be scavenged
+   during GC
+   ------------------------------------------------------------------------- */
+
+static void
+stash_mut_list (Capability *cap, uint32_t gen_no)
+{
+    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
+    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
+}
+
 /* ----------------------------------------------------------------------------
    Initialise a generation that is to be collected
    ------------------------------------------------------------------------- */
@@ -1333,11 +1381,17 @@ prepare_collected_gen (generation *gen)
     gen_workspace *ws;
     bdescr *bd, *next;
 
-    // Throw away the current mutable list.  Invariant: the mutable
-    // list always has at least one block; this means we can avoid a
-    // check for NULL in recordMutable().
     g = gen->no;
-    if (g != 0) {
+
+    if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) {
+        // Nonmoving heap's mutable list is always a root.
+        for (i = 0; i < n_capabilities; i++) {
+            stash_mut_list(capabilities[i], g);
+        }
+    } else if (g != 0) {
+        // Otherwise throw away the current mutable list. Invariant: the
+        // mutable list always has at least one block; this means we can avoid
+        // a check for NULL in recordMutable().
         for (i = 0; i < n_capabilities; i++) {
             freeChain(capabilities[i]->mut_lists[g]);
             capabilities[i]->mut_lists[g] =
@@ -1353,13 +1407,17 @@ prepare_collected_gen (generation *gen)
     gen->old_threads = gen->threads;
     gen->threads = END_TSO_QUEUE;
 
-    // deprecate the existing blocks
-    gen->old_blocks   = gen->blocks;
-    gen->n_old_blocks = gen->n_blocks;
-    gen->blocks       = NULL;
-    gen->n_blocks     = 0;
-    gen->n_words      = 0;
-    gen->live_estimate = 0;
+    // deprecate the existing blocks (except in the case of the nonmoving
+    // collector since these will be preserved in nonmovingCollect for the
+    // concurrent GC).
+    if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) {
+        gen->old_blocks   = gen->blocks;
+        gen->n_old_blocks = gen->n_blocks;
+        gen->blocks       = NULL;
+        gen->n_blocks     = 0;
+        gen->n_words      = 0;
+        gen->live_estimate = 0;
+    }
 
     // initialise the large object queues.
     ASSERT(gen->scavenged_large_objects == NULL);
@@ -1453,18 +1511,6 @@ prepare_collected_gen (generation *gen)
     }
 }
 
-
-/* ----------------------------------------------------------------------------
-   Save the mutable lists in saved_mut_lists
-   ------------------------------------------------------------------------- */
-
-static void
-stash_mut_list (Capability *cap, uint32_t gen_no)
-{
-    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
-    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
-}
-
 /* ----------------------------------------------------------------------------
    Initialise a generation that is *not* to be collected
    ------------------------------------------------------------------------- */
@@ -1533,31 +1579,57 @@ collect_gct_blocks (void)
 }
 
 /* -----------------------------------------------------------------------------
-   During mutation, any blocks that are filled by allocatePinned() are
-   stashed on the local pinned_object_blocks list, to avoid needing to
-   take a global lock.  Here we collect those blocks from the
-   cap->pinned_object_blocks lists and put them on the
-   main g0->large_object list.
+   During mutation, any blocks that are filled by allocatePinned() are stashed
+   on the local pinned_object_blocks list, to avoid needing to take a global
+   lock.  Here we collect those blocks from the cap->pinned_object_blocks lists
+   and put them on g0->large_objects or oldest_gen->large_objects.
+
+   How to decide which list to put them on?
+
+   - When non-moving heap is enabled and this is a major GC, we put them on
+     oldest_gen. This is because after preparation we really want no
+     old-to-young references, and we want to be able to reset mut_lists. For
+     this we need to promote every potentially live object to the oldest gen.
+
+   - Otherwise we put them on g0.
    -------------------------------------------------------------------------- */
 
 static void
 collect_pinned_object_blocks (void)
 {
-    uint32_t n;
-    bdescr *bd, *prev;
+    generation *gen;
+    const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving;
+    if (use_nonmoving && major_gc) {
+        gen = oldest_gen;
+    } else {
+        gen = g0;
+    }
 
-    for (n = 0; n < n_capabilities; n++) {
-        prev = NULL;
-        for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
-            prev = bd;
+    for (uint32_t n = 0; n < n_capabilities; n++) {
+        bdescr *last = NULL;
+        if (use_nonmoving && gen == oldest_gen) {
+            // Mark objects as belonging to the nonmoving heap
+            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+                bd->flags |= BF_NONMOVING;
+                bd->gen = oldest_gen;
+                bd->gen_no = oldest_gen->no;
+                oldest_gen->n_large_words += bd->free - bd->start;
+                oldest_gen->n_large_blocks += bd->blocks;
+                last = bd;
+            }
+        } else {
+            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+                last = bd;
+            }
         }
-        if (prev != NULL) {
-            prev->link = g0->large_objects;
-            if (g0->large_objects != NULL) {
-                g0->large_objects->u.back = prev;
+
+        if (last != NULL) {
+            last->link = gen->large_objects;
+            if (gen->large_objects != NULL) {
+                gen->large_objects->u.back = last;
             }
-            g0->large_objects = capabilities[n]->pinned_object_blocks;
-            capabilities[n]->pinned_object_blocks = 0;
+            gen->large_objects = capabilities[n]->pinned_object_blocks;
+            capabilities[n]->pinned_object_blocks = NULL;
         }
     }
 }
@@ -1616,98 +1688,100 @@ mark_root(void *user USED_IF_THREADS, StgClosure **root)
    percentage of the maximum heap size available to allocate into.
    ------------------------------------------------------------------------- */
 
-static void
-resize_generations (void)
+void
+resizeGenerations (void)
 {
     uint32_t g;
+    W_ live, size, min_alloc, words;
+    const W_ max  = RtsFlags.GcFlags.maxHeapSize;
+    const W_ gens = RtsFlags.GcFlags.generations;
 
-    if (major_gc && RtsFlags.GcFlags.generations > 1) {
-        W_ live, size, min_alloc, words;
-        const W_ max  = RtsFlags.GcFlags.maxHeapSize;
-        const W_ gens = RtsFlags.GcFlags.generations;
-
-        // live in the oldest generations
-        if (oldest_gen->live_estimate != 0) {
-            words = oldest_gen->live_estimate;
-        } else {
-            words = oldest_gen->n_words;
-        }
-        live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
-            oldest_gen->n_large_blocks +
-            oldest_gen->n_compact_blocks;
+    // live in the oldest generations
+    if (oldest_gen->live_estimate != 0) {
+        words = oldest_gen->live_estimate;
+    } else {
+        words = oldest_gen->n_words;
+    }
+    live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
+        oldest_gen->n_large_blocks +
+        oldest_gen->n_compact_blocks;
 
-        // default max size for all generations except zero
-        size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
-                       RtsFlags.GcFlags.minOldGenSize);
+    // default max size for all generations except zero
+    size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
+                   RtsFlags.GcFlags.minOldGenSize);
 
-        if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
-            if (max > 0) {
-                RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
-            } else {
-                RtsFlags.GcFlags.heapSizeSuggestion = size;
-            }
+    if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
+        if (max > 0) {
+            RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
+        } else {
+            RtsFlags.GcFlags.heapSizeSuggestion = size;
         }
+    }
 
-        // minimum size for generation zero
-        min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
-                            RtsFlags.GcFlags.minAllocAreaSize
-                            * (W_)n_capabilities);
-
-        // Auto-enable compaction when the residency reaches a
-        // certain percentage of the maximum heap size (default: 30%).
-        if (RtsFlags.GcFlags.compact ||
-            (max > 0 &&
-             oldest_gen->n_blocks >
-             (RtsFlags.GcFlags.compactThreshold * max) / 100)) {
-            oldest_gen->mark = 1;
-            oldest_gen->compact = 1;
+    // minimum size for generation zero
+    min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
+                        RtsFlags.GcFlags.minAllocAreaSize
+                        * (W_)n_capabilities);
+
+    // Auto-enable compaction when the residency reaches a
+    // certain percentage of the maximum heap size (default: 30%).
+    // Except when non-moving GC is enabled.
+    if (!RtsFlags.GcFlags.useNonmoving &&
+        (RtsFlags.GcFlags.compact ||
+         (max > 0 &&
+          oldest_gen->n_blocks >
+          (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
+        oldest_gen->mark = 1;
+        oldest_gen->compact = 1;
 //        debugBelch("compaction: on\n", live);
-        } else {
-            oldest_gen->mark = 0;
-            oldest_gen->compact = 0;
+    } else {
+        oldest_gen->mark = 0;
+        oldest_gen->compact = 0;
 //        debugBelch("compaction: off\n", live);
-        }
+    }
 
-        if (RtsFlags.GcFlags.sweep) {
-            oldest_gen->mark = 1;
-        }
+    if (RtsFlags.GcFlags.sweep) {
+        oldest_gen->mark = 1;
+    }
 
-        // if we're going to go over the maximum heap size, reduce the
-        // size of the generations accordingly.  The calculation is
-        // different if compaction is turned on, because we don't need
-        // to double the space required to collect the old generation.
-        if (max != 0) {
+    // if we're going to go over the maximum heap size, reduce the
+    // size of the generations accordingly.  The calculation is
+    // different if compaction is turned on, because we don't need
+    // to double the space required to collect the old generation.
+    if (max != 0) {
+
+        // this test is necessary to ensure that the calculations
+        // below don't have any negative results - we're working
+        // with unsigned values here.
+        if (max < min_alloc) {
+            heapOverflow();
+        }
 
-            // this test is necessary to ensure that the calculations
-            // below don't have any negative results - we're working
-            // with unsigned values here.
-            if (max < min_alloc) {
-                heapOverflow();
+        if (oldest_gen->compact) {
+            if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
+                size = (max - min_alloc) / ((gens - 1) * 2 - 1);
             }
-
-            if (oldest_gen->compact) {
-                if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
-                    size = (max - min_alloc) / ((gens - 1) * 2 - 1);
-                }
-            } else {
-                if ( (size * (gens - 1) * 2) + min_alloc > max ) {
-                    size = (max - min_alloc) / ((gens - 1) * 2);
-                }
+        } else {
+            if ( (size * (gens - 1) * 2) + min_alloc > max ) {
+                size = (max - min_alloc) / ((gens - 1) * 2);
             }
+        }
 
-            if (size < live) {
-                heapOverflow();
-            }
+        if (size < live) {
+            heapOverflow();
         }
+    }
 
 #if 0
-        debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
-                   min_alloc, size, max);
+    debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
+               min_alloc, size, max);
+    debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n",
+               oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks);
+    debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks);
 #endif
 
-        for (g = 0; g < gens; g++) {
-            generations[g].max_blocks = size;
-        }
+    for (g = 0; g < gens; g++) {
+        generations[g].max_blocks = size;
     }
 }
 


=====================================
rts/sm/GC.h
=====================================
@@ -55,6 +55,8 @@ void gcWorkerThread (Capability *cap);
 void initGcThreads (uint32_t from, uint32_t to);
 void freeGcThreads (void);
 
+void resizeGenerations (void);
+
 #if defined(THREADED_RTS)
 void waitForGcThreads (Capability *cap, bool idle_cap[]);
 void releaseGCThreads (Capability *cap, bool idle_cap[]);


=====================================
rts/sm/GCAux.c
=====================================
@@ -60,6 +60,14 @@ isAlive(StgClosure *p)
     // ignore closures in generations that we're not collecting.
     bd = Bdescr((P_)q);
 
+    // isAlive is used when scavenging moving generations, before the mark
+    // phase. Because we don't know alive-ness of objects before the mark phase
+    // we have to conservatively treat objects in the non-moving generation as
+    // alive here.
+    if (bd->flags & BF_NONMOVING) {
+        return p;
+    }
+
     // if it's a pointer into to-space, then we're done
     if (bd->flags & BF_EVACUATED) {
         return p;


=====================================
rts/sm/GCThread.h
=====================================
@@ -83,6 +83,7 @@ typedef struct gen_workspace_ {
     bdescr *     todo_bd;
     StgPtr       todo_free;            // free ptr for todo_bd
     StgPtr       todo_lim;             // lim for todo_bd
+    struct NonmovingSegment *todo_seg; // only available for oldest gen workspace
 
     WSDeque *    todo_q;
     bdescr *     todo_overflow;
@@ -100,9 +101,6 @@ typedef struct gen_workspace_ {
     bdescr *     part_list;
     StgWord      n_part_blocks;      // count of above
     StgWord      n_part_words;
-
-    StgWord pad[1];
-
 } gen_workspace ATTRIBUTE_ALIGNED(64);
 // align so that computing gct->gens[n] is a shift, not a multiply
 // fails if the size is <64, which is why we need the pad above


=====================================
rts/sm/NonMoving.c
=====================================
@@ -0,0 +1,860 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "Capability.h"
+#include "Printer.h"
+#include "Storage.h"
+// We call evacuate, which expects the thread-local gc_thread to be valid.
+// This is sometimes declared as a register variable, so it is necessary to
+// include the declaration here so that the compiler doesn't clobber the register.
+#include "GCThread.h"
+#include "GCTDecl.h"
+#include "Schedule.h"
+
+#include "NonMoving.h"
+#include "NonMovingMark.h"
+#include "NonMovingSweep.h"
+#include "StablePtr.h" // markStablePtrTable
+#include "Schedule.h" // markScheduler
+#include "Weak.h" // dead_weak_ptr_list
+
+struct NonmovingHeap nonmovingHeap;
+
+uint8_t nonmovingMarkEpoch = 1;
+
+static void nonmovingBumpEpoch(void) {
+    nonmovingMarkEpoch = nonmovingMarkEpoch == 1 ? 2 : 1;
+}
+
+/*
+ * Note [Non-moving garbage collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * TODO
+ *
+ * Note [Concurrent non-moving collection]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Concurrency-control of non-moving garbage collection is a bit tricky. There
+ * are a few things to keep in mind:
+ *
+ *  - Only one non-moving collection may be active at a time. This is enforced by the
+ *    concurrent_coll_running flag, which is set when a collection is on-going. If
+ *    we attempt to initiate a new collection while this is set we wait on the
+ *    concurrent_coll_finished condition variable, which signals when the
+ *    active collection finishes.
+ *
+ *  - In between the mark and sweep phases the non-moving collector must synchronize
+ *    with mutator threads to collect and mark their final update remembered
+ *    sets. This is accomplished using
+ *    stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
+ *    until the final mark has concluded.
+ *
+ *
+ * Note [Live data accounting in nonmoving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The nonmoving collector uses an approximate heuristic for reporting live
+ * data quantity. Specifically, during mark we record how much live data we
+ * find in nonmoving_live_words. At the end of mark we declare this amount to
+ * be how much live data we have in the nonmoving heap (by setting
+ * oldest_gen->live_estimate).
+ *
+ * In addition, we update oldest_gen->live_estimate every time we fill a
+ * segment. This, as well, is quite approximate: we assume that all blocks
+ * above next_free_snap are newly-allocated. In principle we could refer to the
+ * bitmap to count how many blocks we actually allocated but this too would be
+ * approximate due to concurrent collection and ultimately seems more costly
+ * than the problem demands.
+ *
+ */
+
+W_ nonmoving_live_words = 0;
+
+static void nonmovingClearBitmap(struct NonmovingSegment *seg);
+static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads);
+
+/* Signals to mutators that they should stop to synchronize with the nonmoving
+ * collector so it can proceed to sweep phase. */
+bool nonmoving_syncing = false;
+
+static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t block_size)
+{
+    seg->link = NULL;
+    seg->todo_link = NULL;
+    seg->next_free = 0;
+    seg->next_free_snap = 0;
+    seg->block_size = block_size;
+    nonmovingClearBitmap(seg);
+    Bdescr((P_)seg)->u.scan = nonmovingSegmentGetBlock(seg, 0);
+}
+
+// Add a segment to the free list.
+void nonmovingPushFreeSegment(struct NonmovingSegment *seg)
+{
+    // See Note [Live data accounting in nonmoving collector].
+    if (nonmovingHeap.n_free > NONMOVING_MAX_FREE) {
+        bdescr *bd = Bdescr((StgPtr) seg);
+        ACQUIRE_SM_LOCK;
+        ASSERT(oldest_gen->n_blocks >= bd->blocks);
+        ASSERT(oldest_gen->n_words >= BLOCK_SIZE_W * bd->blocks);
+        oldest_gen->n_blocks -= bd->blocks;
+        oldest_gen->n_words  -= BLOCK_SIZE_W * bd->blocks;
+        freeGroup(bd);
+        RELEASE_SM_LOCK;
+        return;
+    }
+
+    while (true) {
+        struct NonmovingSegment *old = nonmovingHeap.free;
+        seg->link = old;
+        if (cas((StgVolatilePtr) &nonmovingHeap.free, (StgWord) old, (StgWord) seg) == (StgWord) old)
+            break;
+    }
+    __sync_add_and_fetch(&nonmovingHeap.n_free, 1);
+}
+
+static struct NonmovingSegment *nonmovingPopFreeSegment(void)
+{
+    while (true) {
+        struct NonmovingSegment *seg = nonmovingHeap.free;
+        if (seg == NULL) {
+            return NULL;
+        }
+        if (cas((StgVolatilePtr) &nonmovingHeap.free,
+                (StgWord) seg,
+                (StgWord) seg->link) == (StgWord) seg) {
+            __sync_sub_and_fetch(&nonmovingHeap.n_free, 1);
+            return seg;
+        }
+    }
+}
+
+/*
+ * Request a fresh segment from the free segment list, or allocate one on the
+ * given node.
+ *
+ */
+static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
+{
+    // First try taking something off of the free list
+    struct NonmovingSegment *ret;
+    ret = nonmovingPopFreeSegment();
+
+    // Nothing in the free list, allocate a new segment...
+    if (ret == NULL) {
+        // Take gc spinlock: another thread may be scavenging a moving
+        // generation and call `todo_block_full`
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        bdescr *bd = allocAlignedGroupOnNode(node, NONMOVING_SEGMENT_BLOCKS);
+        // See Note [Live data accounting in nonmoving collector].
+        oldest_gen->n_blocks += bd->blocks;
+        oldest_gen->n_words  += BLOCK_SIZE_W * bd->blocks;
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+
+        for (StgWord32 i = 0; i < bd->blocks; ++i) {
+            initBdescr(&bd[i], oldest_gen, oldest_gen);
+            bd[i].flags = BF_NONMOVING;
+        }
+        ret = (struct NonmovingSegment *)bd->start;
+    }
+
+    // Check alignment
+    ASSERT(((uintptr_t)ret % NONMOVING_SEGMENT_SIZE) == 0);
+    return ret;
+}
+
+static inline unsigned long log2_floor(unsigned long x)
+{
+    return sizeof(unsigned long)*8 - 1 - __builtin_clzl(x);
+}
+
+static inline unsigned long log2_ceil(unsigned long x)
+{
+    unsigned long log = log2_floor(x);
+    return (x - (1 << log)) ? log + 1 : log;
+}
+
+// Advance a segment's next_free pointer. Returns true if the segment is full.
+static bool advance_next_free(struct NonmovingSegment *seg)
+{
+    uint8_t *bitmap = seg->bitmap;
+    unsigned int blk_count = nonmovingSegmentBlockCount(seg);
+    for (unsigned int i = seg->next_free+1; i < blk_count; i++) {
+        if (!bitmap[i]) {
+            seg->next_free = i;
+            return false;
+        }
+    }
+    seg->next_free = blk_count;
+    return true;
+}
+
+static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca)
+{
+    while (true) {
+        struct NonmovingSegment *seg = alloca->active;
+        if (seg == NULL) {
+            return NULL;
+        }
+        if (cas((StgVolatilePtr) &alloca->active,
+                (StgWord) seg,
+                (StgWord) seg->link) == (StgWord) seg) {
+            return seg;
+        }
+    }
+}
+
+/* sz is in words */
+GNUC_ATTR_HOT
+void *nonmovingAllocate(Capability *cap, StgWord sz)
+{
+    unsigned int allocator_idx = log2_ceil(sz * sizeof(StgWord)) - NONMOVING_ALLOCA0;
+
+    // The max we ever allocate is 3276 bytes (anything larger is a large
+    // object and not moved) which is covered by allocator 9.
+    ASSERT(allocator_idx < NONMOVING_ALLOCA_CNT);
+
+    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[allocator_idx];
+
+    // Allocate into current segment
+    struct NonmovingSegment *current = alloca->current[cap->no];
+    ASSERT(current); // current is never NULL
+    void *ret = nonmovingSegmentGetBlock(current, current->next_free);
+    ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment
+
+    // Add segment to the todo list unless it's already there
+    // current->todo_link == NULL means not in todo list
+    if (!current->todo_link) {
+        gen_workspace *ws = &gct->gens[oldest_gen->no];
+        current->todo_link = ws->todo_seg;
+        ws->todo_seg = current;
+    }
+
+    // Advance the current segment's next_free or allocate a new segment if full
+    bool full = advance_next_free(current);
+    if (full) {
+        // Current segment is full: update the live data estimate, link it onto
+        // the filled list, then take an active segment if one exists, otherwise
+        // allocate a new segment.
+
+        // Update live data estimate.
+        // See Note [Live data accounting in nonmoving collector].
+        unsigned int new_blocks =  nonmovingSegmentBlockCount(current) - current->next_free_snap;
+        atomic_inc(&oldest_gen->live_estimate, new_blocks * nonmovingSegmentBlockSize(current) / sizeof(W_));
+
+        // push the current segment to the filled list
+        nonmovingPushFilledSegment(current);
+
+        // first look for a new segment in the active list
+        struct NonmovingSegment *new_current = pop_active_segment(alloca);
+
+        // there are no active segments, allocate new segment
+        if (new_current == NULL) {
+            new_current = nonmovingAllocSegment(cap->node);
+            nonmovingInitSegment(new_current, NONMOVING_ALLOCA0 + allocator_idx);
+        }
+
+        // make it current
+        new_current->link = NULL;
+        alloca->current[cap->no] = new_current;
+    }
+
+    return ret;
+}
+
+/* Allocate a nonmovingAllocator */
+static struct NonmovingAllocator *alloc_nonmoving_allocator(uint32_t n_caps)
+{
+    size_t allocator_sz =
+        sizeof(struct NonmovingAllocator) +
+        sizeof(void*) * n_caps; // current segment pointer for each capability
+    struct NonmovingAllocator *alloc =
+        stgMallocBytes(allocator_sz, "nonmovingInit");
+    memset(alloc, 0, allocator_sz);
+    return alloc;
+}
+
+void nonmovingInit(void)
+{
+    for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+        nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(n_capabilities);
+    }
+}
+
+void nonmovingExit(void)
+{
+}
+
+/*
+ * Wait for any concurrent collections to finish. Called during shutdown to
+ * ensure we don't steal capabilities that the nonmoving collector has yet to
+ * synchronize with.
+ */
+void nonmovingWaitUntilFinished(void)
+{
+}
+
+/*
+ * Assumes that no garbage collector or mutator threads are running to safely
+ * resize the nonmoving_allocators.
+ *
+ * Must hold sm_mutex.
+ */
+void nonmovingAddCapabilities(uint32_t new_n_caps)
+{
+    unsigned int old_n_caps = nonmovingHeap.n_caps;
+    struct NonmovingAllocator **allocs = nonmovingHeap.allocators;
+
+    for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+        struct NonmovingAllocator *old = allocs[i];
+        allocs[i] = alloc_nonmoving_allocator(new_n_caps);
+
+        // Copy the old state
+        allocs[i]->filled = old->filled;
+        allocs[i]->active = old->active;
+        for (unsigned int j = 0; j < old_n_caps; j++) {
+            allocs[i]->current[j] = old->current[j];
+        }
+        stgFree(old);
+
+        // Initialize current segments for the new capabilities
+        for (unsigned int j = old_n_caps; j < new_n_caps; j++) {
+            allocs[i]->current[j] = nonmovingAllocSegment(capabilities[j]->node);
+            nonmovingInitSegment(allocs[i]->current[j], NONMOVING_ALLOCA0 + i);
+            allocs[i]->current[j]->link = NULL;
+        }
+    }
+    nonmovingHeap.n_caps = new_n_caps;
+}
+
+static void nonmovingClearBitmap(struct NonmovingSegment *seg)
+{
+    unsigned int n = nonmovingSegmentBlockCount(seg);
+    memset(seg->bitmap, 0, n);
+}
+
+static void nonmovingClearSegmentBitmaps(struct NonmovingSegment *seg)
+{
+    while (seg) {
+        nonmovingClearBitmap(seg);
+        seg = seg->link;
+    }
+}
+
+static void nonmovingClearAllBitmaps(void)
+{
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+        nonmovingClearSegmentBitmaps(alloca->filled);
+    }
+
+    // Clear large object bits
+    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
+        bd->flags &= ~BF_MARKED;
+    }
+}
+
+/* Prepare the heap bitmaps and snapshot metadata for a mark */
+static void nonmovingPrepareMark(void)
+{
+    nonmovingClearAllBitmaps();
+    nonmovingBumpEpoch();
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+
+        // Update current segments' snapshot pointers
+        for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) {
+            struct NonmovingSegment *seg = alloca->current[cap_n];
+            seg->next_free_snap = seg->next_free;
+        }
+
+        // Update filled segments' snapshot pointers
+        struct NonmovingSegment *seg = alloca->filled;
+        while (seg) {
+            seg->next_free_snap = seg->next_free;
+            seg = seg->link;
+        }
+
+        // N.B. It's not necessary to update snapshot pointers of active segments;
+        // they were set after they were swept and haven't seen any allocation
+        // since.
+    }
+
+    ASSERT(oldest_gen->scavenged_large_objects == NULL);
+    bdescr *next;
+    for (bdescr *bd = oldest_gen->large_objects; bd; bd = next) {
+        next = bd->link;
+        bd->flags |= BF_NONMOVING_SWEEPING;
+        dbl_link_onto(bd, &nonmoving_large_objects);
+    }
+    n_nonmoving_large_blocks += oldest_gen->n_large_blocks;
+    oldest_gen->large_objects = NULL;
+    oldest_gen->n_large_words = 0;
+    oldest_gen->n_large_blocks = 0;
+    nonmoving_live_words = 0;
+
+#if defined(DEBUG)
+    debug_caf_list_snapshot = debug_caf_list;
+    debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
+#endif
+}
+
+// Mark weak pointers in the non-moving heap. They'll either end up in
+// dead_weak_ptr_list or stay in weak_ptr_list. Either way they need to be kept
+// during sweep. See `MarkWeak.c:markWeakPtrList` for the moving heap variant
+// of this.
+static void nonmovingMarkWeakPtrList(MarkQueue *mark_queue, StgWeak *dead_weak_ptr_list)
+{
+    for (StgWeak *w = oldest_gen->weak_ptr_list; w; w = w->link) {
+        markQueuePushClosure_(mark_queue, (StgClosure*)w);
+        // Do not mark finalizers and values here, those fields will be marked
+        // in `nonmovingMarkDeadWeaks` (for dead weaks) or
+        // `nonmovingTidyWeaks` (for live weaks)
+    }
+
+    // We need to mark dead_weak_ptr_list too. This is subtle:
+    //
+    // - By the beginning of this GC we evacuated all weaks to the non-moving
+    //   heap (in `markWeakPtrList`)
+    //
+    // - During the scavenging of the moving heap we discovered that some of
+    //   those weaks are dead and moved them to `dead_weak_ptr_list`. Note that
+    //   because of the fact above _all weaks_ are in the non-moving heap at
+    //   this point.
+    //
+    // - So, to be able to traverse `dead_weak_ptr_list` and run finalizers we
+    //   need to mark it.
+    for (StgWeak *w = dead_weak_ptr_list; w; w = w->link) {
+        markQueuePushClosure_(mark_queue, (StgClosure*)w);
+        nonmovingMarkDeadWeak(mark_queue, w);
+    }
+}
+
+void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
+{
+    resizeGenerations();
+
+    nonmovingPrepareMark();
+    nonmovingPrepareSweep();
+
+    // N.B. These should have been cleared at the end of the last sweep.
+    ASSERT(nonmoving_marked_large_objects == NULL);
+    ASSERT(n_nonmoving_marked_large_blocks == 0);
+
+    MarkQueue *mark_queue = stgMallocBytes(sizeof(MarkQueue), "mark queue");
+    initMarkQueue(mark_queue);
+    current_mark_queue = mark_queue;
+
+    // Mark roots
+    markCAFs((evac_fn)markQueueAddRoot, mark_queue);
+    for (unsigned int n = 0; n < n_capabilities; ++n) {
+        markCapability((evac_fn)markQueueAddRoot, mark_queue,
+                capabilities[n], true/*don't mark sparks*/);
+    }
+    markScheduler((evac_fn)markQueueAddRoot, mark_queue);
+    nonmovingMarkWeakPtrList(mark_queue, *dead_weaks);
+    markStablePtrTable((evac_fn)markQueueAddRoot, mark_queue);
+
+    // Mark threads resurrected during moving heap scavenging
+    for (StgTSO *tso = *resurrected_threads; tso != END_TSO_QUEUE; tso = tso->global_link) {
+        markQueuePushClosure_(mark_queue, (StgClosure*)tso);
+    }
+
+    // Roots marked, mark threads and weak pointers
+
+    // At this point all threads are moved to threads list (from old_threads)
+    // and all weaks are moved to weak_ptr_list (from old_weak_ptr_list) by
+    // the previous scavenge step, so we need to move them to "old" lists
+    // again.
+
+    // Fine to override old_threads because any live or resurrected threads are
+    // moved to threads or resurrected_threads lists.
+    ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
+    ASSERT(nonmoving_old_threads == END_TSO_QUEUE);
+    nonmoving_old_threads = oldest_gen->threads;
+    oldest_gen->threads = END_TSO_QUEUE;
+
+    // Make sure we don't lose any weak ptrs here. Weaks in old_weak_ptr_list
+    // will either be moved to `dead_weaks` (if dead) or `weak_ptr_list` (if
+    // alive).
+    ASSERT(oldest_gen->old_weak_ptr_list == NULL);
+    ASSERT(nonmoving_old_weak_ptr_list == NULL);
+    nonmoving_old_weak_ptr_list = oldest_gen->weak_ptr_list;
+    oldest_gen->weak_ptr_list = NULL;
+
+    // We are now safe to start concurrent marking
+
+    // Note that in concurrent mark we can't use dead_weaks and
+    // resurrected_threads from the preparation to add new weaks and threads as
+    // that would cause races between minor collection and mark. So we only pass
+    // those lists to mark function in sequential case. In concurrent case we
+    // allocate fresh lists.
+
+    // Use the weak and thread lists from the preparation for any new weaks and
+    // threads found to be dead in mark.
+    nonmovingMark_(mark_queue, dead_weaks, resurrected_threads);
+}
+
+/* Mark mark queue, threads, and weak pointers until no more weaks have been
+ * resuscitated
+ */
+static void nonmovingMarkThreadsWeaks(MarkQueue *mark_queue)
+{
+    while (true) {
+        // Propagate marks
+        nonmovingMark(mark_queue);
+
+        // Tidy threads and weaks
+        nonmovingTidyThreads();
+
+        if (! nonmovingTidyWeaks(mark_queue))
+            return;
+    }
+}
+
+static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads)
+{
+    debugTrace(DEBUG_nonmoving_gc, "Starting mark...");
+
+    // Do concurrent marking; most of the heap will get marked here.
+    nonmovingMarkThreadsWeaks(mark_queue);
+
+    nonmovingResurrectThreads(mark_queue, resurrected_threads);
+
+    // No more resurrecting threads after this point
+
+    // Do last marking of weak pointers
+    while (true) {
+        // Propagate marks
+        nonmovingMark(mark_queue);
+
+        if (!nonmovingTidyWeaks(mark_queue))
+            break;
+    }
+
+    nonmovingMarkDeadWeaks(mark_queue, dead_weaks);
+
+    // Propagate marks
+    nonmovingMark(mark_queue);
+
+    // Now remove all dead objects from the mut_list to ensure that a younger
+    // generation collection doesn't attempt to look at them after we've swept.
+    nonmovingSweepMutLists();
+
+    debugTrace(DEBUG_nonmoving_gc,
+               "Done marking, resurrecting threads before releasing capabilities");
+
+#if defined(DEBUG)
+    // Zap CAFs that we will sweep
+    nonmovingGcCafs(mark_queue);
+#endif
+
+    ASSERT(mark_queue->top->head == 0);
+    ASSERT(mark_queue->blocks->link == NULL);
+
+    // Update oldest_gen thread and weak lists
+    // Note that we need to append these lists as a concurrent minor GC may have
+    // added stuff to them while we're doing mark-sweep concurrently
+    {
+        StgTSO **threads = &oldest_gen->threads;
+        while (*threads != END_TSO_QUEUE) {
+            threads = &(*threads)->global_link;
+        }
+        *threads = nonmoving_threads;
+        nonmoving_threads = END_TSO_QUEUE;
+        nonmoving_old_threads = END_TSO_QUEUE;
+    }
+
+    {
+        StgWeak **weaks = &oldest_gen->weak_ptr_list;
+        while (*weaks) {
+            weaks = &(*weaks)->link;
+        }
+        *weaks = nonmoving_weak_ptr_list;
+        nonmoving_weak_ptr_list = NULL;
+        nonmoving_old_weak_ptr_list = NULL;
+    }
+
+    current_mark_queue = NULL;
+    freeMarkQueue(mark_queue);
+    stgFree(mark_queue);
+
+    oldest_gen->live_estimate = nonmoving_live_words;
+    oldest_gen->n_old_blocks = 0;
+    resizeGenerations();
+
+    /****************************************************
+     * Sweep
+     ****************************************************/
+
+    // Because we can't mark large object blocks (no room for mark bit) we
+    // collect them in a map in mark_queue and we pass it here to sweep large
+    // objects
+    nonmovingSweepLargeObjects();
+    nonmovingSweepStableNameTable();
+
+    nonmovingSweep();
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+    debugTrace(DEBUG_nonmoving_gc, "Finished sweeping.");
+
+    // TODO: Remainder of things done by GarbageCollect (update stats)
+}
+
+#if defined(DEBUG)
+
+// Use this with caution: this doesn't work correctly during the scavenge phase
+// when we're doing parallel scavenging. Use it in the mark phase or later
+// (where we no longer allocate).
+void assert_in_nonmoving_heap(StgPtr p)
+{
+    if (!HEAP_ALLOCED_GC(p))
+        return;
+
+    bdescr *bd = Bdescr(p);
+    if (bd->flags & BF_LARGE) {
+        // It should be in a capability (if it's not filled yet) or in non-moving heap
+        for (uint32_t cap = 0; cap < n_capabilities; ++cap) {
+            if (bd == capabilities[cap]->pinned_object_block) {
+                return;
+            }
+        }
+        ASSERT(bd->flags & BF_NONMOVING);
+        return;
+    }
+
+    // Search snapshot segments
+    for (struct NonmovingSegment *seg = nonmovingHeap.sweep_list; seg; seg = seg->link) {
+        if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+            return;
+        }
+    }
+
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+        // Search current segments
+        for (uint32_t cap_idx = 0; cap_idx < n_capabilities; ++cap_idx) {
+            struct NonmovingSegment *seg = alloca->current[cap_idx];
+            if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                return;
+            }
+        }
+
+        // Search active segments
+        int seg_idx = 0;
+        struct NonmovingSegment *seg = alloca->active;
+        while (seg) {
+            if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+
+        // Search filled segments
+        seg_idx = 0;
+        seg = alloca->filled;
+        while (seg) {
+            if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+    }
+
+    // We don't search free segments as they're unused
+
+    barf("%p is not in nonmoving heap\n", (void*)p);
+}
+
+void nonmovingPrintSegment(struct NonmovingSegment *seg)
+{
+    int num_blocks = nonmovingSegmentBlockCount(seg);
+
+    debugBelch("Segment with %d blocks of size 2^%d (%d bytes, %lu words, scan: %p)\n",
+               num_blocks,
+               seg->block_size,
+               1 << seg->block_size,
+               ROUNDUP_BYTES_TO_WDS(1 << seg->block_size),
+               (void*)Bdescr((P_)seg)->u.scan);
+
+    for (nonmoving_block_idx p_idx = 0; p_idx < seg->next_free; ++p_idx) {
+        StgClosure *p = (StgClosure*)nonmovingSegmentGetBlock(seg, p_idx);
+        if (nonmovingGetMark(seg, p_idx) != 0) {
+            debugBelch("%d (%p)* :\t", p_idx, (void*)p);
+        } else {
+            debugBelch("%d (%p)  :\t", p_idx, (void*)p);
+        }
+        printClosure(p);
+    }
+
+    debugBelch("End of segment\n\n");
+}
+
+void nonmovingPrintAllocator(struct NonmovingAllocator *alloc)
+{
+    debugBelch("Allocator at %p\n", (void*)alloc);
+    debugBelch("Filled segments:\n");
+    for (struct NonmovingSegment *seg = alloc->filled; seg != NULL; seg = seg->link) {
+        debugBelch("%p ", (void*)seg);
+    }
+    debugBelch("\nActive segments:\n");
+    for (struct NonmovingSegment *seg = alloc->active; seg != NULL; seg = seg->link) {
+        debugBelch("%p ", (void*)seg);
+    }
+    debugBelch("\nCurrent segments:\n");
+    for (uint32_t i = 0; i < n_capabilities; ++i) {
+        debugBelch("%p ", alloc->current[i]);
+    }
+    debugBelch("\n");
+}
+
+void locate_object(P_ obj)
+{
+    // Search allocators
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+        for (uint32_t cap = 0; cap < n_capabilities; ++cap) {
+            struct NonmovingSegment *seg = alloca->current[cap];
+            if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                debugBelch("%p is in current segment of capability %d of allocator %d at %p\n", obj, cap, alloca_idx, (void*)seg);
+                return;
+            }
+        }
+        int seg_idx = 0;
+        struct NonmovingSegment *seg = alloca->active;
+        while (seg) {
+            if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                debugBelch("%p is in active segment %d of allocator %d at %p\n", obj, seg_idx, alloca_idx, (void*)seg);
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+
+        seg_idx = 0;
+        seg = alloca->filled;
+        while (seg) {
+            if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                debugBelch("%p is in filled segment %d of allocator %d at %p\n", obj, seg_idx, alloca_idx, (void*)seg);
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+    }
+
+    struct NonmovingSegment *seg = nonmovingHeap.free;
+    int seg_idx = 0;
+    while (seg) {
+        if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+            debugBelch("%p is in free segment %d at %p\n", obj, seg_idx, (void*)seg);
+            return;
+        }
+        seg_idx++;
+        seg = seg->link;
+    }
+
+    // Search nurseries
+    for (uint32_t nursery_idx = 0; nursery_idx < n_nurseries; ++nursery_idx) {
+        for (bdescr* nursery_block = nurseries[nursery_idx].blocks; nursery_block; nursery_block = nursery_block->link) {
+            if (obj >= nursery_block->start && obj < nursery_block->start + nursery_block->blocks*BLOCK_SIZE_W) {
+                debugBelch("%p is in nursery %d\n", obj, nursery_idx);
+                return;
+            }
+        }
+    }
+
+    // Search generations
+    for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++g) {
+        generation *gen = &generations[g];
+        for (bdescr *blk = gen->blocks; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " blocks\n", obj, g);
+                return;
+            }
+        }
+        for (bdescr *blk = gen->old_blocks; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " old blocks\n", obj, g);
+                return;
+            }
+        }
+    }
+
+    // Search large objects
+    for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++g) {
+        generation *gen = &generations[g];
+        for (bdescr *large_block = gen->large_objects; large_block; large_block = large_block->link) {
+            if ((P_)large_block->start == obj) {
+                debugBelch("%p is in large blocks of generation %d\n", obj, g);
+                return;
+            }
+        }
+    }
+
+    for (bdescr *large_block = nonmoving_large_objects; large_block; large_block = large_block->link) {
+        if ((P_)large_block->start == obj) {
+            debugBelch("%p is in nonmoving_large_objects\n", obj);
+            return;
+        }
+    }
+
+    for (bdescr *large_block = nonmoving_marked_large_objects; large_block; large_block = large_block->link) {
+        if ((P_)large_block->start == obj) {
+            debugBelch("%p is in nonmoving_marked_large_objects\n", obj);
+            return;
+        }
+    }
+}
+
+void nonmovingPrintSweepList(void)
+{
+    debugBelch("==== SWEEP LIST =====\n");
+    int i = 0;
+    for (struct NonmovingSegment *seg = nonmovingHeap.sweep_list; seg; seg = seg->link) {
+        debugBelch("%d: %p\n", i++, (void*)seg);
+    }
+    debugBelch("= END OF SWEEP LIST =\n");
+}
+
+void check_in_mut_list(StgClosure *p)
+{
+    for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) {
+        for (bdescr *bd = capabilities[cap_n]->mut_lists[oldest_gen->no]; bd; bd = bd->link) {
+            for (StgPtr q = bd->start; q < bd->free; ++q) {
+                if (*((StgPtr**)q) == (StgPtr*)p) {
+                    debugBelch("Object is in mut list of cap %d: %p\n", cap_n, capabilities[cap_n]->mut_lists[oldest_gen->no]);
+                    return;
+                }
+            }
+        }
+    }
+
+    debugBelch("Object is not in a mut list\n");
+}
+
+void print_block_list(bdescr* bd)
+{
+    while (bd) {
+        debugBelch("%p, ", (void*)bd);
+        bd = bd->link;
+    }
+    debugBelch("\n");
+}
+
+void print_thread_list(StgTSO* tso)
+{
+    while (tso != END_TSO_QUEUE) {
+        printClosure((StgClosure*)tso);
+        tso = tso->global_link;
+    }
+}
+
+#endif


=====================================
rts/sm/NonMoving.h
=====================================
@@ -0,0 +1,285 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#if !defined(CMINUSMINUS)
+
+#include <string.h>
+#include "HeapAlloc.h"
+#include "NonMovingMark.h"
+
+#include "BeginPrivate.h"
+
+// Segments
+#define NONMOVING_SEGMENT_BITS 15   // 2^15 = 32kByte
+// Mask to find base of segment
+#define NONMOVING_SEGMENT_MASK ((1 << NONMOVING_SEGMENT_BITS) - 1)
+// In bytes
+#define NONMOVING_SEGMENT_SIZE (1 << NONMOVING_SEGMENT_BITS)
+// In words
+#define NONMOVING_SEGMENT_SIZE_W ((1 << NONMOVING_SEGMENT_BITS) / SIZEOF_VOID_P)
+// In blocks
+#define NONMOVING_SEGMENT_BLOCKS (NONMOVING_SEGMENT_SIZE / BLOCK_SIZE)
+
+_Static_assert(NONMOVING_SEGMENT_SIZE % BLOCK_SIZE == 0,
+               "non-moving segment size must be multiple of block size");
+
+// The index of a block within a segment
+typedef uint16_t nonmoving_block_idx;
+
+// A non-moving heap segment
+struct NonmovingSegment {
+    struct NonmovingSegment *link;     // for linking together segments into lists
+    struct NonmovingSegment *todo_link; // NULL when not in todo list
+    nonmoving_block_idx next_free;      // index of the next unallocated block
+    nonmoving_block_idx next_free_snap; // snapshot of next_free
+    uint8_t block_size;                 // log2 of block size in bytes
+    uint8_t bitmap[];                   // liveness bitmap
+    // After the liveness bitmap comes the data blocks. Note that we need to
+    // ensure that the size of this struct (including the bitmap) is a multiple
+    // of the word size since GHC assumes that all object pointers are
+    // so-aligned.
+};
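+
+// An illustrative sketch (not to scale) of the layout implied by the comment
+// above; the data area begins at the first word boundary after the bitmap:
+//
+//    seg
+//     |
+//     v
+//     +---------------+------------------+---------+---------+-----+-------+
+//     | header fields | bitmap[n_blocks] | block 0 | block 1 | ... | blk n |
+//     +---------------+------------------+---------+---------+-----+-------+
+//     |<------------------- NONMOVING_SEGMENT_SIZE ----------------------->|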
+
+// This is how we mark the end of a todo list. It is not NULL because
+// todo_link == NULL means the segment is not in any todo list.
+#define END_NONMOVING_TODO_LIST ((struct NonmovingSegment*)1)
+
+// A non-moving allocator for a particular block size
+struct NonmovingAllocator {
+    struct NonmovingSegment *filled;
+    struct NonmovingSegment *active;
+    // indexed by capability number
+    struct NonmovingSegment *current[];
+};
+
+// first allocator is of size 2^NONMOVING_ALLOCA0 (in bytes)
+#define NONMOVING_ALLOCA0 3
+
+// allocators cover block sizes from 2^NONMOVING_ALLOCA0 up to
+// 2^(NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT - 1) (in bytes)
+#define NONMOVING_ALLOCA_CNT 12
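+
+// For illustration only (a sketch, not the code used by this patch): a
+// request of sz words would be served by the allocator for the smallest
+// sufficient power-of-two block size, along the lines of
+//
+//     unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
+//     if (log_block_size < NONMOVING_ALLOCA0)
+//         log_block_size = NONMOVING_ALLOCA0;
+//     struct NonmovingAllocator *alloca =
+//         nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0];
+//
+// where log2_ceil is a hypothetical helper rounding up to the next
+// power-of-two exponent; the real mapping is done inside nonmovingAllocate.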
+
+// maximum number of free segments to hold on to
+#define NONMOVING_MAX_FREE 16
+
+struct NonmovingHeap {
+    struct NonmovingAllocator *allocators[NONMOVING_ALLOCA_CNT];
+    // free segment list. This is a cache where we keep up to
+    // NONMOVING_MAX_FREE segments to avoid thrashing the block allocator.
+    // Note that segments in this list are still counted towards
+    // oldest_gen->n_blocks.
+    struct NonmovingSegment *free;
+    // how many segments in free segment list? accessed atomically.
+    unsigned int n_free;
+
+    // records the current length of the NonmovingAllocator.current arrays
+    unsigned int n_caps;
+
+    // The set of segments being swept in this GC. Segments are moved here from
+    // the filled list during preparation and moved back to either the filled,
+    // active, or free lists during sweep.  Should be NULL before mark and
+    // after sweep.
+    struct NonmovingSegment *sweep_list;
+};
+
+extern struct NonmovingHeap nonmovingHeap;
+
+extern uint64_t nonmoving_live_words;
+
+void nonmovingInit(void);
+void nonmovingExit(void);
+void nonmovingWaitUntilFinished(void);
+
+
+// dead_weaks and resurrected_threads lists are used for two things:
+//
+// - The weaks and threads in those lists are found to be dead during
+//   preparation, but the weaks will be used for finalization and threads will
+//   be scheduled again (aka. resurrection) so we need to keep them alive in the
+//   non-moving heap as well. So we treat them as roots and mark them.
+//
+// - In non-threaded runtime we add weaks and threads found to be dead in the
+//   non-moving heap to those lists so that they'll be finalized and scheduled
+//   as other weaks and threads. In threaded runtime we can't do this as that'd
+//   cause races between a minor collection and non-moving collection. Instead
+//   in non-moving heap we finalize the weaks and resurrect the threads
+//   directly, but in a pause.
+//
+void nonmovingCollect(StgWeak **dead_weaks,
+                       StgTSO **resurrected_threads);
+
+void *nonmovingAllocate(Capability *cap, StgWord sz);
+void nonmovingAddCapabilities(uint32_t new_n_caps);
+void nonmovingPushFreeSegment(struct NonmovingSegment *seg);
+
+// Add a segment to the appropriate active list.
+INLINE_HEADER void nonmovingPushActiveSegment(struct NonmovingSegment *seg)
+{
+    struct NonmovingAllocator *alloc =
+        nonmovingHeap.allocators[seg->block_size - NONMOVING_ALLOCA0];
+    while (true) {
+        struct NonmovingSegment *current_active = (struct NonmovingSegment*)VOLATILE_LOAD(&alloc->active);
+        seg->link = current_active;
+        if (cas((StgVolatilePtr) &alloc->active, (StgWord) current_active, (StgWord) seg) == (StgWord) current_active) {
+            break;
+        }
+    }
+}
+
+// Add a segment to the appropriate filled list.
+INLINE_HEADER void nonmovingPushFilledSegment(struct NonmovingSegment *seg)
+{
+    struct NonmovingAllocator *alloc =
+        nonmovingHeap.allocators[seg->block_size - NONMOVING_ALLOCA0];
+    while (true) {
+        struct NonmovingSegment *current_filled = (struct NonmovingSegment*)VOLATILE_LOAD(&alloc->filled);
+        seg->link = current_filled;
+        if (cas((StgVolatilePtr) &alloc->filled, (StgWord) current_filled, (StgWord) seg) == (StgWord) current_filled) {
+            break;
+        }
+    }
+}
+
+// Assert that the pointer can be traced by the non-moving collector (e.g. in
+// mark phase). This means one of the following:
+//
+// - A static object
+// - A large object
+// - An object in the non-moving heap (e.g. in one of the segments)
+//
+void assert_in_nonmoving_heap(StgPtr p);
+
+// The block size of a given segment in bytes.
+INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *seg)
+{
+    return 1 << seg->block_size;
+}
+
+// How many blocks does the given segment contain? Also the size of the bitmap.
+INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+{
+  unsigned int sz = nonmovingSegmentBlockSize(seg);
+  unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment);
+  segment_data_size -= segment_data_size % SIZEOF_VOID_P;
+  return segment_data_size / (sz + 1);
+}
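+
+// For example (illustrative; the exact header size depends on padding): with
+// 32 kByte segments and 8-byte blocks, segment_data_size is roughly
+// 32768 - 24 = 32744 bytes, so the segment holds 32744 / (8 + 1) = 3638
+// blocks, each paired with one bitmap byte.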
+
+// Get a pointer to the given block index
+INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+  // Block size in bytes
+  unsigned int blk_size = nonmovingSegmentBlockSize(seg);
+  // Bitmap size in bytes
+  W_ bitmap_size = nonmovingSegmentBlockCount(seg) * sizeof(uint8_t);
+  // Where the actual data starts (address of the first block).
+  // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that
+  // ROUNDUP_BYTES_TO_WDS returns a size in _words_, not in _bytes_, so we
+  // convert it back to bytes by multiplying by the word size.
+  W_ data = ROUNDUP_BYTES_TO_WDS(((W_)seg) + sizeof(struct NonmovingSegment) + bitmap_size) * sizeof(W_);
+  return (void*)(data + i*blk_size);
+}
+
+// Get the segment which a closure resides in. Assumes that pointer points into
+// non-moving heap.
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    const uintptr_t mask = ~NONMOVING_SEGMENT_MASK;
+    return (struct NonmovingSegment *) (((uintptr_t) p) & mask);
+}
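+
+// For illustration: the mask above relies on segments being allocated at
+// NONMOVING_SEGMENT_SIZE-aligned addresses, so that e.g. (hypothetical
+// addresses) any pointer into 0x4200000..0x4207fff maps back to the segment
+// header at 0x4200000.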
+
+INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    struct NonmovingSegment *seg = nonmovingGetSegment(p);
+    ptrdiff_t blk0 = (ptrdiff_t)nonmovingSegmentGetBlock(seg, 0);
+    ptrdiff_t offset = (ptrdiff_t)p - blk0;
+    return (nonmoving_block_idx) (offset >> seg->block_size);
+}
+
+// TODO: Eliminate this
+extern uint8_t nonmovingMarkEpoch;
+
+INLINE_HEADER void nonmovingSetMark(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+    seg->bitmap[i] = nonmovingMarkEpoch;
+}
+
+INLINE_HEADER uint8_t nonmovingGetMark(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+    return seg->bitmap[i];
+}
+
+INLINE_HEADER void nonmovingSetClosureMark(StgPtr p)
+{
+    nonmovingSetMark(nonmovingGetSegment(p), nonmovingGetBlockIdx(p));
+}
+
+// TODO: Audit the uses of these
+/* Was the given closure marked this major GC cycle? */
+INLINE_HEADER bool nonmovingClosureMarkedThisCycle(StgPtr p)
+{
+    struct NonmovingSegment *seg = nonmovingGetSegment(p);
+    nonmoving_block_idx blk_idx = nonmovingGetBlockIdx(p);
+    return nonmovingGetMark(seg, blk_idx) == nonmovingMarkEpoch;
+}
+
+INLINE_HEADER bool nonmovingClosureMarked(StgPtr p)
+{
+    struct NonmovingSegment *seg = nonmovingGetSegment(p);
+    nonmoving_block_idx blk_idx = nonmovingGetBlockIdx(p);
+    return nonmovingGetMark(seg, blk_idx) != 0;
+}
+
+// Can be called during a major collection to determine whether a particular
+// segment is in the set of segments that will be swept this collection cycle.
+INLINE_HEADER bool nonmovingSegmentBeingSwept(struct NonmovingSegment *seg)
+{
+    unsigned int n = nonmovingSegmentBlockCount(seg);
+    return seg->next_free_snap >= n;
+}
+
+// Can be called during a major collection to determine whether a particular
+// closure lives in a segment that will be swept this collection cycle.
+// Note that this returns true for both large and normal objects.
+INLINE_HEADER bool nonmovingClosureBeingSwept(StgClosure *p)
+{
+    bdescr *bd = Bdescr((StgPtr) p);
+    if (HEAP_ALLOCED_GC(p)) {
+        if (bd->flags & BF_NONMOVING_SWEEPING) {
+            return true;
+        } else if (bd->flags & BF_NONMOVING) {
+            struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+            return nonmovingSegmentBeingSwept(seg);
+        } else {
+            // outside of the nonmoving heap
+            return false;
+        }
+    } else {
+        // a static object
+        return true;
+    }
+}
+
+#if defined(DEBUG)
+
+void nonmovingPrintSegment(struct NonmovingSegment *seg);
+void nonmovingPrintAllocator(struct NonmovingAllocator *alloc);
+void locate_object(P_ obj);
+void nonmovingPrintSweepList(void);
+// Check if the object is in one of non-moving heap mut_lists
+void check_in_mut_list(StgClosure *p);
+void print_block_list(bdescr *bd);
+void print_thread_list(StgTSO* tso);
+
+#endif
+
+#include "EndPrivate.h"
+
+#endif // CMINUSMINUS


=====================================
rts/sm/NonMovingMark.c
=====================================
@@ -0,0 +1,1217 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Mark phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+// We call evacuate, which expects the thread-local gc_thread to be valid;
+// this is sometimes declared as a register variable, so it is necessary to
+// include the declaration here so that the compiler doesn't clobber the register.
+#include "NonMovingMark.h"
+#include "NonMoving.h"
+#include "BlockAlloc.h"  /* for countBlocks */
+#include "HeapAlloc.h"
+#include "Task.h"
+#include "Trace.h"
+#include "HeapUtils.h"
+#include "Printer.h"
+#include "Schedule.h"
+#include "Weak.h"
+#include "STM.h"
+#include "MarkWeak.h"
+#include "sm/Storage.h"
+
+static void mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin);
+static void mark_tso (MarkQueue *queue, StgTSO *tso);
+static void mark_stack (MarkQueue *queue, StgStack *stack);
+static void mark_PAP_payload (MarkQueue *queue,
+                              StgClosure *fun,
+                              StgClosure **payload,
+                              StgWord size);
+
+// How many Array# entries to add to the mark queue at once?
+#define MARK_ARRAY_CHUNK_LENGTH 128
+
+/* Note [Large objects in the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The nonmoving collector keeps a separate list of its large objects, apart from
+ * oldest_gen->large_objects. There are two reasons for this:
+ *
+ *  1. oldest_gen is mutated by minor collections, which happen concurrently with
+ *     marking
+ *  2. the non-moving collector needs a consistent picture of the set of large
+ *     objects in its snapshot
+ *
+ * At the beginning of a major collection, nonmovingCollect takes the objects in
+ * oldest_gen->large_objects (which includes all large objects evacuated by the
+ * moving collector) and adds them to nonmoving_large_objects. This is the set
+ * of large objects that will be collected in the current major GC cycle.
+ *
+ * As the concurrent mark phase proceeds, the large objects in
+ * nonmoving_large_objects that are found to be live are moved to
+ * nonmoving_marked_large_objects. During sweep we discard all objects that remain
+ * in nonmoving_large_objects and move everything in nonmoving_marked_large_objects
+ * back to nonmoving_large_objects.
+ *
+ * During minor collections large objects will accumulate on
+ * oldest_gen->large_objects, where they will be picked up by the nonmoving
+ * collector and moved to nonmoving_large_objects during the next major GC.
+ * When this happens the block gets its BF_NONMOVING_SWEEPING flag set to
+ * indicate that it is part of the snapshot and consequently should be marked by
+ * the nonmoving mark phase.
+ */
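+
+/* For illustration only (a sketch; the sweep itself lives in
+ * NonMovingSweep.c): the treatment of the two lists described above amounts
+ * to roughly
+ *
+ *     freeChain(nonmoving_large_objects);   // whatever remains is unmarked
+ *     nonmoving_large_objects = nonmoving_marked_large_objects;
+ *     n_nonmoving_large_blocks = n_nonmoving_marked_large_blocks;
+ *     nonmoving_marked_large_objects = NULL;
+ *     n_nonmoving_marked_large_blocks = 0;
+ *
+ * where freeChain is the block allocator's routine for freeing a chain of
+ * block groups.
+ */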
+
+bdescr *nonmoving_large_objects = NULL;
+bdescr *nonmoving_marked_large_objects = NULL;
+memcount n_nonmoving_large_blocks = 0;
+memcount n_nonmoving_marked_large_blocks = 0;
+
+/*
+ * Where we keep our threads during collection: to safely resurrect threads we
+ * need a snapshot of the threads that lived in the nonmoving heap at the time
+ * the snapshot was taken.
+ */
+StgTSO *nonmoving_old_threads = END_TSO_QUEUE;
+/* Same for weak pointers */
+StgWeak *nonmoving_old_weak_ptr_list = NULL;
+/* Because we can "tidy" thread and weak lists concurrently with a minor GC we
+ * need to move marked threads and weaks to these lists until we pause for sync.
+ * Then we move them to oldest_gen lists. */
+StgTSO *nonmoving_threads = END_TSO_QUEUE;
+StgWeak *nonmoving_weak_ptr_list = NULL;
+
+#if defined(DEBUG)
+// TODO (osa): Document
+StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
+#endif
+
+/* Used to provide the current mark queue to the young generation
+ * collector for scavenging.
+ */
+MarkQueue *current_mark_queue = NULL;
+
+/*********************************************************
+ * Pushing to either the mark queue or remembered set
+ *********************************************************/
+
+STATIC_INLINE void
+push (MarkQueue *q, const MarkQueueEnt *ent)
+{
+    // Are we at the end of the block?
+    if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
+        // Yes, this block is full.
+        // allocate a fresh block.
+        ACQUIRE_SM_LOCK;
+        bdescr *bd = allocGroup(1);
+        bd->link = q->blocks;
+        q->blocks = bd;
+        q->top = (MarkQueueBlock *) bd->start;
+        q->top->head = 0;
+        RELEASE_SM_LOCK;
+    }
+
+    q->top->entries[q->top->head] = *ent;
+    q->top->head++;
+}
+
+static inline
+void push_closure (MarkQueue *q,
+                   StgClosure *p,
+                   StgClosure **origin)
+{
+    // TODO: Push this into callers where they already have the Bdescr
+    if (HEAP_ALLOCED_GC(p) && (Bdescr((StgPtr) p)->gen != oldest_gen))
+        return;
+
+#if defined(DEBUG)
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+    // Commenting out: too slow
+    // if (RtsFlags.DebugFlags.sanity) {
+    //     assert_in_nonmoving_heap((P_)p);
+    //     if (origin)
+    //         assert_in_nonmoving_heap((P_)origin);
+    // }
+#endif
+
+    MarkQueueEnt ent = {
+        .type = MARK_CLOSURE,
+        .mark_closure = {
+            .p = UNTAG_CLOSURE(p),
+            .origin = origin,
+        }
+    };
+    push(q, &ent);
+}
+
+static
+void push_array (MarkQueue *q,
+                 const StgMutArrPtrs *array,
+                 StgWord start_index)
+{
+    // TODO: Push this into callers where they already have the Bdescr
+    if (HEAP_ALLOCED_GC(array) && (Bdescr((StgPtr) array)->gen != oldest_gen))
+        return;
+
+    MarkQueueEnt ent = {
+        .type = MARK_ARRAY,
+        .mark_array = {
+            .array = array,
+            .start_index = start_index
+        }
+    };
+    push(q, &ent);
+}
+
+static
+void push_thunk_srt (MarkQueue *q, const StgInfoTable *info)
+{
+    const StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(info);
+    if (thunk_info->i.srt) {
+        push_closure(q, (StgClosure*)GET_SRT(thunk_info), NULL);
+    }
+}
+
+static
+void push_fun_srt (MarkQueue *q, const StgInfoTable *info)
+{
+    const StgFunInfoTable *fun_info = itbl_to_fun_itbl(info);
+    if (fun_info->i.srt) {
+        push_closure(q, (StgClosure*)GET_FUN_SRT(fun_info), NULL);
+    }
+}
+
+/*********************************************************
+ * Pushing to the mark queue
+ *********************************************************/
+
+void markQueuePush (MarkQueue *q, const MarkQueueEnt *ent)
+{
+    push(q, ent);
+}
+
+void markQueuePushClosure (MarkQueue *q,
+                              StgClosure *p,
+                              StgClosure **origin)
+{
+    push_closure(q, p, origin);
+}
+
+/* TODO: Do we really never want to specify the origin here? */
+void markQueueAddRoot (MarkQueue* q, StgClosure** root)
+{
+    markQueuePushClosure(q, *root, NULL);
+}
+
+/* Push a closure to the mark queue without origin information */
+void markQueuePushClosure_ (MarkQueue *q, StgClosure *p)
+{
+    markQueuePushClosure(q, p, NULL);
+}
+
+void markQueuePushFunSrt (MarkQueue *q, const StgInfoTable *info)
+{
+    push_fun_srt(q, info);
+}
+
+void markQueuePushThunkSrt (MarkQueue *q, const StgInfoTable *info)
+{
+    push_thunk_srt(q, info);
+}
+
+void markQueuePushArray (MarkQueue *q,
+                            const StgMutArrPtrs *array,
+                            StgWord start_index)
+{
+    push_array(q, array, start_index);
+}
+
+/*********************************************************
+ * Popping from the mark queue
+ *********************************************************/
+
+// Returns invalid MarkQueueEnt if queue is empty.
+static MarkQueueEnt markQueuePop (MarkQueue *q)
+{
+    MarkQueueBlock *top;
+
+again:
+    top = q->top;
+
+    // Are we at the beginning of the block?
+    if (top->head == 0) {
+        // Is this the first block of the queue?
+        if (q->blocks->link == NULL) {
+            // Yes, therefore queue is empty...
+            MarkQueueEnt none = { .type = NULL_ENTRY };
+            return none;
+        } else {
+            // No, unwind to the previous block and try popping again...
+            bdescr *old_block = q->blocks;
+            q->blocks = old_block->link;
+            q->top = (MarkQueueBlock*)q->blocks->start;
+            ACQUIRE_SM_LOCK;
+            freeGroup(old_block); // TODO: hold on to a block to avoid repeated allocation/deallocation?
+            RELEASE_SM_LOCK;
+            goto again;
+        }
+    }
+
+    top->head--;
+    MarkQueueEnt ent = top->entries[top->head];
+    return ent;
+}
+
+/*********************************************************
+ * Creating and destroying MarkQueues
+ *********************************************************/
+
+/* Must hold sm_mutex. */
+static void init_mark_queue_ (MarkQueue *queue)
+{
+    bdescr *bd = allocGroup(1);
+    queue->blocks = bd;
+    queue->top = (MarkQueueBlock *) bd->start;
+    queue->top->head = 0;
+}
+
+/* Must hold sm_mutex. */
+void initMarkQueue (MarkQueue *queue)
+{
+    init_mark_queue_(queue);
+    queue->marked_objects = allocHashTable();
+}
+
+void freeMarkQueue (MarkQueue *queue)
+{
+    bdescr* b = queue->blocks;
+    ACQUIRE_SM_LOCK;
+    while (b)
+    {
+        bdescr* b_ = b->link;
+        freeGroup(b);
+        b = b_;
+    }
+    RELEASE_SM_LOCK;
+    freeHashTable(queue->marked_objects, NULL);
+}
+
+
+/*********************************************************
+ * Marking
+ *********************************************************/
+
+/*
+ * N.B. Mutation of TRecHeaders is completely unprotected by any write
+ * barrier. Consequently it's quite important that we deeply mark
+ * any outstanding transactions.
+ */
+static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
+{
+    while (trec != NO_TREC) {
+        StgTRecChunk *chunk = trec->current_chunk;
+        markQueuePushClosure_(queue, (StgClosure *) trec);
+        markQueuePushClosure_(queue, (StgClosure *) chunk);
+        while (chunk != END_STM_CHUNK_LIST) {
+            for (StgWord i=0; i < chunk->next_entry_idx; i++) {
+                TRecEntry *ent = &chunk->entries[i];
+                markQueuePushClosure_(queue, (StgClosure *) ent->tvar);
+                markQueuePushClosure_(queue, ent->expected_value);
+                markQueuePushClosure_(queue, ent->new_value);
+            }
+            chunk = chunk->prev_chunk;
+        }
+        trec = trec->enclosing_trec;
+    }
+}
+
+static void mark_tso (MarkQueue *queue, StgTSO *tso)
+{
+    // TODO: Clear dirty if contains only old gen objects
+
+    if (tso->bound != NULL) {
+        markQueuePushClosure_(queue, (StgClosure *) tso->bound->tso);
+    }
+
+    markQueuePushClosure_(queue, (StgClosure *) tso->blocked_exceptions);
+    markQueuePushClosure_(queue, (StgClosure *) tso->bq);
+    mark_trec_header(queue, tso->trec);
+    markQueuePushClosure_(queue, (StgClosure *) tso->stackobj);
+    markQueuePushClosure_(queue, (StgClosure *) tso->_link);
+    if (   tso->why_blocked == BlockedOnMVar
+        || tso->why_blocked == BlockedOnMVarRead
+        || tso->why_blocked == BlockedOnBlackHole
+        || tso->why_blocked == BlockedOnMsgThrowTo
+        || tso->why_blocked == NotBlocked
+        ) {
+        markQueuePushClosure_(queue, tso->block_info.closure);
+    }
+}
+
+static void
+do_push_closure (StgClosure **p, void *user)
+{
+    MarkQueue *queue = (MarkQueue *) user;
+    // TODO: Origin? need reference to containing closure
+    markQueuePushClosure_(queue, *p);
+}
+
+static void
+mark_large_bitmap (MarkQueue *queue,
+                   StgClosure **p,
+                   StgLargeBitmap *large_bitmap,
+                   StgWord size)
+{
+    walk_large_bitmap(do_push_closure, p, large_bitmap, size, queue);
+}
+
+static void
+mark_small_bitmap (MarkQueue *queue, StgClosure **p, StgWord size, StgWord bitmap)
+{
+    while (size > 0) {
+        if ((bitmap & 1) == 0) {
+            // TODO: Origin?
+            markQueuePushClosure(queue, *p, NULL);
+        }
+        p++;
+        bitmap = bitmap >> 1;
+        size--;
+    }
+}
+
+static GNUC_ATTR_HOT
+void mark_PAP_payload (MarkQueue *queue,
+                       StgClosure *fun,
+                       StgClosure **payload,
+                       StgWord size)
+{
+    const StgFunInfoTable *fun_info = get_fun_itbl(UNTAG_CONST_CLOSURE(fun));
+    ASSERT(fun_info->i.type != PAP);
+    StgPtr p = (StgPtr) payload;
+
+    StgWord bitmap;
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+        goto small_bitmap;
+    case ARG_GEN_BIG:
+        mark_large_bitmap(queue, payload, GET_FUN_LARGE_BITMAP(fun_info), size);
+        break;
+    case ARG_BCO:
+        mark_large_bitmap(queue, payload, BCO_BITMAP(fun), size);
+        break;
+    default:
+        bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+    small_bitmap:
+        mark_small_bitmap(queue, (StgClosure **) p, size, bitmap);
+        break;
+    }
+}
+
+/* Helper for mark_stack; returns next stack frame. */
+static StgPtr
+mark_arg_block (MarkQueue *queue, const StgFunInfoTable *fun_info, StgClosure **args)
+{
+    StgWord bitmap, size;
+
+    StgPtr p = (StgPtr)args;
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+        size = BITMAP_SIZE(fun_info->f.b.bitmap);
+        goto small_bitmap;
+    case ARG_GEN_BIG:
+        size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+        mark_large_bitmap(queue, (StgClosure**)p, GET_FUN_LARGE_BITMAP(fun_info), size);
+        p += size;
+        break;
+    default:
+        bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+        size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+    small_bitmap:
+        mark_small_bitmap(queue, (StgClosure**)p, size, bitmap);
+        p += size;
+        break;
+    }
+    return p;
+}
+
+static GNUC_ATTR_HOT void
+mark_stack_ (MarkQueue *queue, StgPtr sp, StgPtr spBottom)
+{
+    ASSERT(sp <= spBottom);
+
+    while (sp < spBottom) {
+        const StgRetInfoTable *info = get_ret_itbl((StgClosure *)sp);
+        switch (info->i.type) {
+        case UPDATE_FRAME:
+        {
+            // See Note [upd-black-hole] in rts/Scav.c
+            StgUpdateFrame *frame = (StgUpdateFrame *) sp;
+            markQueuePushClosure_(queue, frame->updatee);
+            sp += sizeofW(StgUpdateFrame);
+            continue;
+        }
+
+            // small bitmap (< 32 entries, or 64 on a 64-bit machine)
+        case CATCH_STM_FRAME:
+        case CATCH_RETRY_FRAME:
+        case ATOMICALLY_FRAME:
+        case UNDERFLOW_FRAME:
+        case STOP_FRAME:
+        case CATCH_FRAME:
+        case RET_SMALL:
+        {
+            StgWord bitmap = BITMAP_BITS(info->i.layout.bitmap);
+            StgWord size   = BITMAP_SIZE(info->i.layout.bitmap);
+            // NOTE: the payload starts immediately after the info-ptr; we
+            // don't have an StgHeader in the same sense as a heap closure.
+            sp++;
+            mark_small_bitmap(queue, (StgClosure **) sp, size, bitmap);
+            sp += size;
+        }
+        follow_srt:
+            if (info->i.srt) {
+                markQueuePushClosure_(queue, (StgClosure*)GET_SRT(info));
+            }
+            continue;
+
+        case RET_BCO: {
+            sp++;
+            markQueuePushClosure_(queue, *(StgClosure**)sp);
+            StgBCO *bco = (StgBCO *)*sp;
+            sp++;
+            StgWord size = BCO_BITMAP_SIZE(bco);
+            mark_large_bitmap(queue, (StgClosure **) sp, BCO_BITMAP(bco), size);
+            sp += size;
+            continue;
+        }
+
+          // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+        case RET_BIG:
+        {
+            StgWord size;
+
+            size = GET_LARGE_BITMAP(&info->i)->size;
+            sp++;
+            mark_large_bitmap(queue, (StgClosure **) sp, GET_LARGE_BITMAP(&info->i), size);
+            sp += size;
+            // and don't forget to follow the SRT
+            goto follow_srt;
+        }
+
+        case RET_FUN:
+        {
+            StgRetFun *ret_fun = (StgRetFun *)sp;
+            const StgFunInfoTable *fun_info;
+
+            markQueuePushClosure_(queue, ret_fun->fun);
+            fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun));
+            sp = mark_arg_block(queue, fun_info, ret_fun->payload);
+            goto follow_srt;
+        }
+
+        default:
+            barf("mark_stack: weird activation record found on stack: %d", (int)(info->i.type));
+        }
+    }
+}
+
+static GNUC_ATTR_HOT void
+mark_stack (MarkQueue *queue, StgStack *stack)
+{
+    // TODO: Clear dirty if contains only old gen objects
+
+    mark_stack_(queue, stack->sp, stack->stack + stack->stack_size);
+}
+
+static GNUC_ATTR_HOT void
+mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
+{
+    (void)origin; // TODO: should be used for selector/thunk optimisations
+
+ try_again:
+    p = UNTAG_CLOSURE(p);
+
+#   define PUSH_FIELD(obj, field)                                \
+        markQueuePushClosure(queue,                           \
+                                (StgClosure *) (obj)->field,     \
+                                (StgClosure **) &(obj)->field)
+
+    if (!HEAP_ALLOCED_GC(p)) {
+        const StgInfoTable *info = get_itbl(p);
+        StgHalfWord type = info->type;
+
+        if (type == CONSTR_0_1 || type == CONSTR_0_2 || type == CONSTR_NOCAF) {
+            // no need to put these on the static linked list, they don't need
+            // to be marked.
+            return;
+        }
+
+        if (lookupHashTable(queue->marked_objects, (W_)p)) {
+            // already marked
+            return;
+        }
+
+        insertHashTable(queue->marked_objects, (W_)p, (P_)1);
+
+        switch (type) {
+
+        case THUNK_STATIC:
+            if (info->srt != 0) {
+                markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above
+            }
+            return;
+
+        case FUN_STATIC:
+            if (info->srt != 0 || info->layout.payload.ptrs != 0) {
+                markQueuePushFunSrt(queue, info); // TODO this function repeats the check above
+
+                // a FUN_STATIC can also be an SRT, so it may have pointer
+                // fields.  See Note [SRTs] in CmmBuildInfoTables, specifically
+                // the [FUN] optimisation.
+                // TODO (osa) I don't understand this comment
+                for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                    PUSH_FIELD(p, payload[i]);
+                }
+            }
+            return;
+
+        case IND_STATIC:
+            PUSH_FIELD((StgInd *) p, indirectee);
+            return;
+
+        case CONSTR:
+        case CONSTR_1_0:
+        case CONSTR_2_0:
+        case CONSTR_1_1:
+            for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                PUSH_FIELD(p, payload[i]);
+            }
+            return;
+
+        case WHITEHOLE:
+            while (get_itbl(p)->type == WHITEHOLE);
+                // busy_wait_nop(); // FIXME
+            goto try_again;
+
+        default:
+            barf("mark_closure(static): strange closure type %d", (int)(info->type));
+        }
+    }
+
+    bdescr *bd = Bdescr((StgPtr) p);
+
+    if (bd->gen != oldest_gen) {
+        // Here we have an object living outside of the non-moving heap. Since
+        // we moved everything to the non-moving heap before starting the major
+        // collection, we know that we don't need to trace it: it was allocated
+        // after we took our snapshot.
+
+        // This should never happen in the non-concurrent case
+        barf("Closure outside of non-moving heap: %p", p);
+    }
+
+    ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
+
+    ASSERT(!IS_FORWARDING_PTR(p->header.info));
+
+    if (bd->flags & BF_NONMOVING) {
+
+        if (bd->flags & BF_LARGE) {
+            if (! (bd->flags & BF_NONMOVING_SWEEPING)) {
+                // Not in the snapshot
+                return;
+            }
+            if (bd->flags & BF_MARKED) {
+                return;
+            }
+
+            // Mark contents
+            p = (StgClosure*)bd->start;
+        } else {
+            struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+            nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+
+            /* We don't mark blocks that,
+             *  - were not live at the time that the snapshot was taken, or
+             *  - we have already marked this cycle
+             */
+            uint8_t mark = nonmovingGetMark(seg, block_idx);
+            /* Don't mark things we've already marked (since we may loop) */
+            if (mark == nonmovingMarkEpoch)
+                return;
+
+            StgClosure *snapshot_loc =
+              (StgClosure *) nonmovingSegmentGetBlock(seg, seg->next_free_snap);
+            if (p >= snapshot_loc && mark == 0) {
+                /*
+                 * In this case we are looking at a block that wasn't allocated
+                 * at the time that the snapshot was taken. We mustn't trace
+                 * things above the allocation pointer that aren't marked since
+                 * they may not be valid objects.
+                 */
+                return;
+            }
+        }
+    }
+
+    // A pinned object that is still attached to a capability (because it's not
+    // filled yet). No need to trace it: pinned objects can't contain pointers.
+    else if (bd->flags & BF_PINNED) {
+#if defined(DEBUG)
+        bool found_it = false;
+        for (uint32_t i = 0; i < n_capabilities; ++i) {
+            if (capabilities[i]->pinned_object_block == bd) {
+                found_it = true;
+                break;
+            }
+        }
+        ASSERT(found_it);
+#endif
+        return;
+    }
+
+    else {
+        barf("Strange closure in nonmoving mark: %p", p);
+    }
+
+    /////////////////////////////////////////////////////
+    // Trace pointers
+    /////////////////////////////////////////////////////
+
+    const StgInfoTable *info = get_itbl(p);
+    switch (info->type) {
+
+    case MVAR_CLEAN:
+    case MVAR_DIRTY: {
+        StgMVar *mvar = (StgMVar *) p;
+        PUSH_FIELD(mvar, head);
+        PUSH_FIELD(mvar, tail);
+        PUSH_FIELD(mvar, value);
+        break;
+    }
+
+    case TVAR: {
+        StgTVar *tvar = ((StgTVar *)p);
+        PUSH_FIELD(tvar, current_value);
+        PUSH_FIELD(tvar, first_watch_queue_entry);
+        break;
+    }
+
+    case FUN_2_0:
+        markQueuePushFunSrt(queue, info);
+        PUSH_FIELD(p, payload[1]);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case THUNK_2_0: {
+        StgThunk *thunk = (StgThunk *) p;
+        markQueuePushThunkSrt(queue, info);
+        PUSH_FIELD(thunk, payload[1]);
+        PUSH_FIELD(thunk, payload[0]);
+        break;
+    }
+
+    case CONSTR_2_0:
+        PUSH_FIELD(p, payload[1]);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case THUNK_1_0:
+        markQueuePushThunkSrt(queue, info);
+        PUSH_FIELD((StgThunk *) p, payload[0]);
+        break;
+
+    case FUN_1_0:
+        markQueuePushFunSrt(queue, info);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case CONSTR_1_0:
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case THUNK_0_1:
+        markQueuePushThunkSrt(queue, info);
+        break;
+
+    case FUN_0_1:
+        markQueuePushFunSrt(queue, info);
+        break;
+
+    case CONSTR_0_1:
+    case CONSTR_0_2:
+        break;
+
+    case THUNK_0_2:
+        markQueuePushThunkSrt(queue, info);
+        break;
+
+    case FUN_0_2:
+        markQueuePushFunSrt(queue, info);
+        break;
+
+    case THUNK_1_1:
+        markQueuePushThunkSrt(queue, info);
+        PUSH_FIELD((StgThunk *) p, payload[0]);
+        break;
+
+    case FUN_1_1:
+        markQueuePushFunSrt(queue, info);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case CONSTR_1_1:
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case FUN:
+        markQueuePushFunSrt(queue, info);
+        goto gen_obj;
+
+    case THUNK: {
+        markQueuePushThunkSrt(queue, info);
+        for (StgWord i = 0; i < info->layout.payload.ptrs; i++) {
+            StgClosure **field = &((StgThunk *) p)->payload[i];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    gen_obj:
+    case CONSTR:
+    case CONSTR_NOCAF:
+    case WEAK:
+    case PRIM:
+    {
+        for (StgWord i = 0; i < info->layout.payload.ptrs; i++) {
+            StgClosure **field = &((StgClosure *) p)->payload[i];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    case BCO: {
+        StgBCO *bco = (StgBCO *)p;
+        PUSH_FIELD(bco, instrs);
+        PUSH_FIELD(bco, literals);
+        PUSH_FIELD(bco, ptrs);
+        break;
+    }
+
+
+    case IND:
+    case BLACKHOLE:
+        PUSH_FIELD((StgInd *) p, indirectee);
+        break;
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+        PUSH_FIELD((StgMutVar *)p, var);
+        break;
+
+    case BLOCKING_QUEUE: {
+        StgBlockingQueue *bq = (StgBlockingQueue *)p;
+        PUSH_FIELD(bq, bh);
+        PUSH_FIELD(bq, owner);
+        PUSH_FIELD(bq, queue);
+        PUSH_FIELD(bq, link);
+        break;
+    }
+
+    case THUNK_SELECTOR:
+        PUSH_FIELD((StgSelector *) p, selectee);
+        // TODO: selector optimization
+        break;
+
+    case AP_STACK: {
+        StgAP_STACK *ap = (StgAP_STACK *)p;
+        PUSH_FIELD(ap, fun);
+        mark_stack_(queue, (StgPtr) ap->payload, (StgPtr) ap->payload + ap->size);
+        break;
+    }
+
+    case PAP: {
+        StgPAP *pap = (StgPAP *) p;
+        PUSH_FIELD(pap, fun);
+        mark_PAP_payload(queue, pap->fun, pap->payload, pap->n_args);
+        break;
+    }
+
+    case AP: {
+        StgAP *ap = (StgAP *) p;
+        PUSH_FIELD(ap, fun);
+        mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
+        break;
+    }
+
+    case ARR_WORDS:
+        // nothing to follow
+        break;
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN_CLEAN:
+    case MUT_ARR_PTRS_FROZEN_DIRTY:
+        // TODO: Check this against Scav.c
+        markQueuePushArray(queue, (StgMutArrPtrs *) p, 0);
+        break;
+
+    case SMALL_MUT_ARR_PTRS_CLEAN:
+    case SMALL_MUT_ARR_PTRS_DIRTY:
+    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: {
+        StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs *) p;
+        for (StgWord i = 0; i < arr->ptrs; i++) {
+            StgClosure **field = &arr->payload[i];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    case TSO:
+        mark_tso(queue, (StgTSO *) p);
+        break;
+
+    case STACK: {
+        // See Note [StgStack dirtiness flags and concurrent marking]
+        StgStack *stack = (StgStack *) p;
+        mark_stack(queue, stack);
+        break;
+    }
+
+    case MUT_PRIM: {
+        for (StgHalfWord p_idx = 0; p_idx < info->layout.payload.ptrs; ++p_idx) {
+            StgClosure **field = &p->payload[p_idx];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    case TREC_CHUNK: {
+        // TODO: Should we abort here? This should have already been marked
+        // when we dirtied the TSO
+        StgTRecChunk *tc = ((StgTRecChunk *) p);
+        PUSH_FIELD(tc, prev_chunk);
+        TRecEntry *end = &tc->entries[tc->next_entry_idx];
+        for (TRecEntry *e = &tc->entries[0]; e < end; e++) {
+            markQueuePushClosure_(queue, (StgClosure *) e->tvar);
+            markQueuePushClosure_(queue, (StgClosure *) e->expected_value);
+            markQueuePushClosure_(queue, (StgClosure *) e->new_value);
+        }
+        break;
+    }
+
+    case WHITEHOLE:
+        while (get_itbl(p)->type == WHITEHOLE);
+            // busy_wait_nop(); // FIXME
+        goto try_again;
+
+    default:
+        barf("mark_closure: unimplemented/strange closure type %d @ %p",
+             info->type, p);
+    }
+
+#   undef PUSH_FIELD
+
+    /* Set the mark bit: it's important that we do this only after we actually push
+     * the object's pointers since in the case of marking stacks there may be a
+     * mutator waiting for us to finish so it can start execution.
+     */
+    if (bd->flags & BF_LARGE) {
+        if (! (bd->flags & BF_MARKED)) {
+            // Remove the object from nonmoving_large_objects and link it to
+            // nonmoving_marked_large_objects
+            dbl_link_remove(bd, &nonmoving_large_objects);
+            dbl_link_onto(bd, &nonmoving_marked_large_objects);
+            n_nonmoving_large_blocks -= bd->blocks;
+            n_nonmoving_marked_large_blocks += bd->blocks;
+            bd->flags |= BF_MARKED;
+        }
+    } else {
+        // TODO: Kill repetition
+        struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+        nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+        nonmovingSetMark(seg, block_idx);
+        nonmoving_live_words += nonmovingSegmentBlockSize(seg) / sizeof(W_);
+    }
+}
+
+/* This is the main mark loop.
+ * Invariants:
+ *
+ *  a. nonmovingPrepareMark has been called.
+ *  b. the nursery has been fully evacuated into the non-moving generation.
+ *  c. the mark queue has been seeded with a set of roots.
+ *
+ */
+GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue)
+{
+    debugTrace(DEBUG_nonmoving_gc, "Starting mark pass");
+    unsigned int count = 0;
+    while (true) {
+        count++;
+        MarkQueueEnt ent = markQueuePop(queue);
+
+        switch (ent.type) {
+        case MARK_CLOSURE:
+            mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin);
+            break;
+        case MARK_ARRAY: {
+            const StgMutArrPtrs *arr = ent.mark_array.array;
+            StgWord start = ent.mark_array.start_index;
+            StgWord end = start + MARK_ARRAY_CHUNK_LENGTH;
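+            // Mark at most MARK_ARRAY_CHUNK_LENGTH entries now; if the array
+            // extends past `end`, re-queue the remainder. For example, a
+            // 300-element array is processed as [0,128), [128,256), [256,300).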
+            if (end < arr->ptrs) {
+                markQueuePushArray(queue, ent.mark_array.array, end);
+            } else {
+                end = arr->ptrs;
+            }
+            for (StgWord i = start; i < end; i++) {
+                markQueuePushClosure_(queue, arr->payload[i]);
+            }
+            break;
+        }
+        case NULL_ENTRY:
+            // Nothing more to do
+            debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
+            return;
+        }
+    }
+}
+
+// A variant of `isAlive` that works for non-moving heap. Used for:
+//
+// - Collecting weak pointers; checking key of a weak pointer.
+// - Resurrecting threads; checking if a thread is dead.
+// - Sweeping object lists: large_objects, mut_list, stable_name_table.
+//
+// This may only be used after a full mark but before nonmovingSweep as it
+// relies on the correctness of the next_free_snap and mark bitmaps.
+bool nonmovingIsAlive (StgClosure *p)
+{
+    // Ignore static closures. See comments in `isAlive`.
+    if (!HEAP_ALLOCED_GC(p)) {
+        return true;
+    }
+
+    bdescr *bd = Bdescr((P_)p);
+
+    // All non-static objects in the non-moving heap should be marked as
+    // BF_NONMOVING
+    ASSERT(bd->flags & BF_NONMOVING);
+
+    if (bd->flags & BF_LARGE) {
+        return (bd->flags & BF_NONMOVING_SWEEPING) == 0
+                   // the large object wasn't in the snapshot and therefore wasn't marked
+            || (bd->flags & BF_MARKED) != 0;
+                   // The object was marked
+    } else {
+        struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+        nonmoving_block_idx i = nonmovingGetBlockIdx((StgPtr) p);
+        uint8_t mark =  nonmovingGetMark(seg, i);
+        if (i >= seg->next_free_snap) {
+            // If the object is allocated after next_free_snap then one of the
+            // following must be true:
+            //
+            // * if its mark is 0 then the block was not allocated last time
+            //   the segment was swept; however, it may have been allocated since
+            //   then and therefore we must conclude that the block is alive.
+            //
+            // * if its mark is equal to nonmovingMarkEpoch then we found that
+            //   the object was alive in the snapshot of the current GC (recall
+            //   that this function may only be used after a mark).
+            //   Consequently we must conclude that the object is still alive.
+            //
+            // * if its mark is not equal to nonmovingMarkEpoch then we found
+            //   that the object was not reachable in the last snapshot.
+            //   Assuming that the mark is complete we can conclude that the
+            //   object is dead since the snapshot invariant guarantees that
+            //   all objects alive in the snapshot would be marked.
+            //
+            return mark == nonmovingMarkEpoch || mark == 0;
+        } else {
+            // If the object is below next_free_snap then the snapshot
+            // invariant guarantees that it is marked if reachable.
+            return mark == nonmovingMarkEpoch;
+        }
+    }
+}
+
+// Check whether a snapshotted object is alive. That is, for an object that we
+// know to be in the snapshot: is its mark bit set? It is imperative that the
+// object is in the snapshot (e.g. was in the nonmoving heap at the time that
+// the snapshot was taken) since we assume that its mark bit reflects its
+// reachability.
+//
+// This is used when
+//
+// - Collecting weak pointers; checking key of a weak pointer.
+// - Resurrecting threads; checking if a thread is dead.
+// - Sweeping object lists: large_objects, mut_list, stable_name_table.
+//
+static bool nonmovingIsNowAlive (StgClosure *p)
+{
+    // Ignore static closures. See comments in `isAlive`.
+    if (!HEAP_ALLOCED_GC(p)) {
+        return true;
+    }
+
+    bdescr *bd = Bdescr((P_)p);
+
+    // All non-static objects in the non-moving heap should be marked as
+    // BF_NONMOVING
+    ASSERT(bd->flags & BF_NONMOVING);
+
+    if (bd->flags & BF_LARGE) {
+        return (bd->flags & BF_NONMOVING_SWEEPING) == 0
+                   // the large object wasn't in the snapshot and therefore wasn't marked
+            || (bd->flags & BF_MARKED) != 0;
+                   // The object was marked
+    } else {
+        return nonmovingClosureMarkedThisCycle((P_)p);
+    }
+}
+
+// Non-moving heap variant of `tidyWeakList`
+bool nonmovingTidyWeaks (struct MarkQueue_ *queue)
+{
+    bool did_work = false;
+
+    StgWeak **last_w = &nonmoving_old_weak_ptr_list;
+    StgWeak *next_w;
+    for (StgWeak *w = nonmoving_old_weak_ptr_list; w != NULL; w = next_w) {
+        if (w->header.info == &stg_DEAD_WEAK_info) {
+            // finalizeWeak# was called on the weak
+            next_w = w->link;
+            *last_w = next_w;
+            continue;
+        }
+
+        // Otherwise it's a live weak
+        ASSERT(w->header.info == &stg_WEAK_info);
+
+        if (nonmovingIsNowAlive(w->key)) {
+            nonmovingMarkLiveWeak(queue, w);
+            did_work = true;
+
+            // remove this weak ptr from old_weak_ptr list
+            *last_w = w->link;
+            next_w = w->link;
+
+            // and put it on the weak ptr list
+            w->link = nonmoving_weak_ptr_list;
+            nonmoving_weak_ptr_list = w;
+        } else {
+            last_w = &(w->link);
+            next_w = w->link;
+        }
+    }
+
+    return did_work;
+}
+
+void nonmovingMarkDeadWeak (struct MarkQueue_ *queue, StgWeak *w)
+{
+    if (w->cfinalizers != &stg_NO_FINALIZER_closure) {
+        markQueuePushClosure_(queue, w->value);
+    }
+    markQueuePushClosure_(queue, w->finalizer);
+}
+
+void nonmovingMarkLiveWeak (struct MarkQueue_ *queue, StgWeak *w)
+{
+    ASSERT(nonmovingClosureMarkedThisCycle((P_)w));
+    markQueuePushClosure_(queue, w->value);
+    markQueuePushClosure_(queue, w->finalizer);
+    markQueuePushClosure_(queue, w->cfinalizers);
+}
+
+// When we're done with marking, any weak pointers with non-marked keys will be
+// considered "dead". We mark values and finalizers of such weaks, and then
+// schedule them for finalization in `scheduleFinalizers` (which we run during
+// synchronization).
+void nonmovingMarkDeadWeaks (struct MarkQueue_ *queue, StgWeak **dead_weaks)
+{
+    StgWeak *next_w;
+    for (StgWeak *w = nonmoving_old_weak_ptr_list; w; w = next_w) {
+        ASSERT(!nonmovingClosureMarkedThisCycle((P_)(w->key)));
+        nonmovingMarkDeadWeak(queue, w);
+        next_w = w ->link;
+        w->link = *dead_weaks;
+        *dead_weaks = w;
+    }
+}
+
+// Non-moving heap variant of `tidyThreadList`
+void nonmovingTidyThreads (void)
+{
+    StgTSO *next;
+    StgTSO **prev = &nonmoving_old_threads;
+    for (StgTSO *t = nonmoving_old_threads; t != END_TSO_QUEUE; t = next) {
+
+        next = t->global_link;
+
+        // N.B. This thread is in old_threads, consequently we *know* it is in
+        // the snapshot and it is therefore safe to rely on the bitmap to
+        // determine its reachability.
+        if (nonmovingIsNowAlive((StgClosure*)t)) {
+            // alive
+            *prev = next;
+
+            // move this thread onto threads list
+            t->global_link = nonmoving_threads;
+            nonmoving_threads = t;
+        } else {
+            // not alive (yet): leave this thread on the old_threads list
+            prev = &(t->global_link);
+        }
+    }
+}
+
+void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_threads)
+{
+    StgTSO *next;
+    for (StgTSO *t = nonmoving_old_threads; t != END_TSO_QUEUE; t = next) {
+        next = t->global_link;
+
+        switch (t->what_next) {
+        case ThreadKilled:
+        case ThreadComplete:
+            continue;
+        default:
+            markQueuePushClosure_(queue, (StgClosure*)t);
+            t->global_link = *resurrected_threads;
+            *resurrected_threads = t;
+        }
+    }
+}
+
+#if defined(DEBUG)
+
+void printMarkQueueEntry (MarkQueueEnt *ent)
+{
+    if (ent->type == MARK_CLOSURE) {
+        debugBelch("Closure: ");
+        printClosure(ent->mark_closure.p);
+    } else if (ent->type == MARK_ARRAY) {
+        debugBelch("Array\n");
+    } else {
+        debugBelch("End of mark\n");
+    }
+}
+
+void printMarkQueue (MarkQueue *q)
+{
+    debugBelch("======== MARK QUEUE ========\n");
+    for (bdescr *block = q->blocks; block; block = block->link) {
+        MarkQueueBlock *queue = (MarkQueueBlock*)block->start;
+        for (uint32_t i = 0; i < queue->head; ++i) {
+            printMarkQueueEntry(&queue->entries[i]);
+        }
+    }
+    debugBelch("===== END OF MARK QUEUE ====\n");
+}
+
+#endif


=====================================
rts/sm/NonMovingMark.h
=====================================
@@ -0,0 +1,140 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Mark phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "Hash.h"
+#include "Task.h"
+#include "NonMoving.h"
+
+#include "BeginPrivate.h"
+
+#include "Hash.h"
+
+enum EntryType {
+    NULL_ENTRY = 0,
+    MARK_CLOSURE,
+    MARK_ARRAY
+};
+
+/* Note [Origin references in the nonmoving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * To implement indirection short-cutting and the selector optimisation the
+ * collector needs to know where it found references, so it can update the
+ * reference if it later turns out that points to an indirection. For this
+ * reason, each mark queue entry contains two things:
+ *
+ * - a pointer to the object to be marked (p), and
+ *
+ * - a pointer to the field where we found the reference (origin)
+ *
+ * Note that the origin pointer is an interior pointer: it points not to a
+ * valid closure (with info table pointer) but rather to a field inside a closure.
+ * Since such references can't be safely scavenged we establish the invariant
+ * that the origin pointer may only point to a field of an object living in the
+ * nonmoving heap, where no scavenging is needed.
+ *
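+ * For illustration, once shortcutting is actually implemented (mark_closure
+ * currently ignores its origin argument), discovering that p is an
+ * indirection would permit an update along the lines of
+ *
+ *     if (origin != NULL) {
+ *         *origin = ((StgInd *) p)->indirectee;
+ *     }
+ *
+ * This is a sketch only; no such update is performed in this patch.
+ *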
+ */
+
+typedef struct {
+    enum EntryType type;
+    // All pointers should be untagged
+    union {
+        struct {
+            StgClosure *p;        // the object to be marked
+            StgClosure **origin;  // field where this reference was found.
+                                  // See Note [Origin references in the nonmoving collector]
+        } mark_closure;
+        struct {
+            const StgMutArrPtrs *array;
+            StgWord start_index;
+        } mark_array;
+    };
+} MarkQueueEnt;
+
+typedef struct {
+    // index of first *unused* queue entry
+    uint32_t head;
+
+    MarkQueueEnt entries[];
+} MarkQueueBlock;
+
+/* The mark queue is not capable of concurrent read or write.
+ *
+ * invariants:
+ *
+ *  a. top == blocks->start;
+ *  b. there is always a valid MarkQueueBlock, although it may be empty
+ *     (i.e. top->head == 0).
+ */
+typedef struct MarkQueue_ {
+    // A singly-linked list of blocks, each containing a MarkQueueBlock.
+    bdescr *blocks;
+
+    // Cached value of blocks->start.
+    MarkQueueBlock *top;
+
+    // Marked objects outside of nonmoving heap, namely large and static
+    // objects.
+    HashTable *marked_objects;
+} MarkQueue;
+
+// The length of MarkQueueBlock.entries
+#define MARK_QUEUE_BLOCK_ENTRIES ((BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt))
+
+extern bdescr *nonmoving_large_objects, *nonmoving_marked_large_objects;
+extern memcount n_nonmoving_large_blocks, n_nonmoving_marked_large_blocks;
+
+extern StgTSO *nonmoving_old_threads;
+extern StgWeak *nonmoving_old_weak_ptr_list;
+extern StgTSO *nonmoving_threads;
+extern StgWeak *nonmoving_weak_ptr_list;
+
+#if defined(DEBUG)
+extern StgIndStatic *debug_caf_list_snapshot;
+#endif
+
+extern MarkQueue *current_mark_queue;
+
+void markQueueAddRoot(MarkQueue* q, StgClosure** root);
+
+void initMarkQueue(MarkQueue *queue);
+void freeMarkQueue(MarkQueue *queue);
+void nonmovingMark(struct MarkQueue_ *restrict queue);
+
+bool nonmovingTidyWeaks(struct MarkQueue_ *queue);
+void nonmovingTidyThreads(void);
+void nonmovingMarkDeadWeaks(struct MarkQueue_ *queue, StgWeak **dead_weak_ptr_list);
+void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_threads);
+bool nonmovingIsAlive(StgClosure *p);
+void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w);
+void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w);
+
+void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent);
+void markQueuePushClosure(MarkQueue *q,
+                             StgClosure *p,
+                             StgClosure **origin);
+void markQueuePushClosure_(MarkQueue *q, StgClosure *p);
+void markQueuePushThunkSrt(MarkQueue *q, const StgInfoTable *info);
+void markQueuePushFunSrt(MarkQueue *q, const StgInfoTable *info);
+void markQueuePushArray(MarkQueue *q, const StgMutArrPtrs *array, StgWord start_index);
+
+INLINE_HEADER bool markQueueIsEmpty(MarkQueue *q)
+{
+    return (q->blocks == NULL) || (q->top->head == 0 && q->blocks->link == NULL);
+}
+
+#if defined(DEBUG)
+
+void printMarkQueueEntry(MarkQueueEnt *ent);
+void printMarkQueue(MarkQueue *q);
+
+#endif
+
+#include "EndPrivate.h"
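
For readers unfamiliar with the indirection-shortcutting machinery, here is a
minimal, self-contained sketch (not RTS code; all types and names below are
made up) of why MarkQueueEnt carries an origin pointer, as described in the
Note above: when the queued object turns out to be an indirection, the field
the reference came from can be rewritten to point directly at the indirectee.

    /* indirection_shortcut_sketch.c -- illustrative only, not RTS code */
    #include <stdio.h>
    #include <stddef.h>

    typedef struct Obj {
        int is_indirection;        /* stands in for the closure's info table */
        struct Obj *indirectee;
        int value;
    } Obj;

    /* cf. MarkQueueEnt.mark_closure: the object plus the field it came from */
    typedef struct {
        Obj  *p;
        Obj **origin;
    } Entry;

    static void mark_one(Entry e)
    {
        Obj *p = e.p;
        while (p->is_indirection)
            p = p->indirectee;             /* follow the indirection chain */
        if (e.origin != NULL && *e.origin == e.p)
            *e.origin = p;                 /* shortcut the original reference */
        /* a real collector would also set p's bit in the segment bitmap here */
    }

    int main(void)
    {
        Obj target = { 0, NULL, 42 };
        Obj ind    = { 1, &target, 0 };
        Obj parent = { 0, &ind, 0 };       /* parent->indirectee is the origin field */

        Entry e = { parent.indirectee, &parent.indirectee };
        mark_one(e);
        printf("parent now points at %d\n", parent.indirectee->value);   /* 42 */
        return 0;
    }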


=====================================
rts/sm/NonMovingScav.c
=====================================
@@ -0,0 +1,366 @@
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "NonMoving.h"
+#include "NonMovingScav.h"
+#include "Capability.h"
+#include "Scav.h"
+#include "Evac.h"
+#include "GCThread.h" // for GCUtils.h
+#include "GCUtils.h"
+#include "Printer.h"
+#include "MarkWeak.h" // scavengeLiveWeak
+
+void
+nonmovingScavengeOne (StgClosure *q)
+{
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
+    StgPtr p = (StgPtr)q;
+    const StgInfoTable *info = get_itbl(q);
+
+    switch (info->type) {
+
+    case MVAR_CLEAN:
+    case MVAR_DIRTY:
+    {
+        StgMVar *mvar = ((StgMVar *)p);
+        evacuate((StgClosure **)&mvar->head);
+        evacuate((StgClosure **)&mvar->tail);
+        evacuate((StgClosure **)&mvar->value);
+        if (gct->failed_to_evac) {
+            mvar->header.info = &stg_MVAR_DIRTY_info;
+        } else {
+            mvar->header.info = &stg_MVAR_CLEAN_info;
+        }
+        break;
+    }
+
+    case TVAR:
+    {
+        StgTVar *tvar = ((StgTVar *)p);
+        evacuate((StgClosure **)&tvar->current_value);
+        evacuate((StgClosure **)&tvar->first_watch_queue_entry);
+        if (gct->failed_to_evac) {
+            tvar->header.info = &stg_TVAR_DIRTY_info;
+        } else {
+            tvar->header.info = &stg_TVAR_CLEAN_info;
+        }
+        break;
+    }
+
+    case FUN_2_0:
+        scavenge_fun_srt(info);
+        evacuate(&((StgClosure *)p)->payload[1]);
+        evacuate(&((StgClosure *)p)->payload[0]);
+        break;
+
+    case THUNK_2_0:
+        scavenge_thunk_srt(info);
+        evacuate(&((StgThunk *)p)->payload[1]);
+        evacuate(&((StgThunk *)p)->payload[0]);
+        break;
+
+    case CONSTR_2_0:
+        evacuate(&((StgClosure *)p)->payload[1]);
+        evacuate(&((StgClosure *)p)->payload[0]);
+        break;
+
+    case THUNK_1_0:
+        scavenge_thunk_srt(info);
+        evacuate(&((StgThunk *)p)->payload[0]);
+        break;
+
+    case FUN_1_0:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_1_0:
+        evacuate(&((StgClosure *)p)->payload[0]);
+        break;
+
+    case THUNK_0_1:
+        scavenge_thunk_srt(info);
+        break;
+
+    case FUN_0_1:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_0_1:
+        break;
+
+    case THUNK_0_2:
+        scavenge_thunk_srt(info);
+        break;
+
+    case FUN_0_2:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_0_2:
+        break;
+
+    case THUNK_1_1:
+        scavenge_thunk_srt(info);
+        evacuate(&((StgThunk *)p)->payload[0]);
+        break;
+
+    case FUN_1_1:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_1_1:
+        evacuate(&q->payload[0]);
+        break;
+
+    case FUN:
+        scavenge_fun_srt(info);
+        goto gen_obj;
+
+    case THUNK:
+    {
+        scavenge_thunk_srt(info);
+        StgPtr end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+        for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+            evacuate((StgClosure **)p);
+        }
+        break;
+    }
+
+    gen_obj:
+    case CONSTR:
+    case CONSTR_NOCAF:
+    case WEAK:
+    case PRIM:
+    {
+        StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+            evacuate((StgClosure **)p);
+        }
+        break;
+    }
+
+    case BCO: {
+        StgBCO *bco = (StgBCO *)p;
+        evacuate((StgClosure **)&bco->instrs);
+        evacuate((StgClosure **)&bco->literals);
+        evacuate((StgClosure **)&bco->ptrs);
+        break;
+    }
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+        evacuate(&((StgMutVar *)p)->var);
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+        }
+        break;
+
+    case BLOCKING_QUEUE:
+    {
+        StgBlockingQueue *bq = (StgBlockingQueue *)p;
+
+        evacuate(&bq->bh);
+        evacuate((StgClosure**)&bq->owner);
+        evacuate((StgClosure**)&bq->queue);
+        evacuate((StgClosure**)&bq->link);
+
+        if (gct->failed_to_evac) {
+            bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
+        } else {
+            bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info;
+        }
+        break;
+    }
+
+    case THUNK_SELECTOR:
+    {
+        StgSelector *s = (StgSelector *)p;
+        evacuate(&s->selectee);
+        break;
+    }
+
+    // A chunk of stack saved in a heap object
+    case AP_STACK:
+    {
+        StgAP_STACK *ap = (StgAP_STACK *)p;
+
+        evacuate(&ap->fun);
+        scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+        break;
+    }
+
+    case PAP:
+        p = scavenge_PAP((StgPAP *)p);
+        break;
+
+    case AP:
+        scavenge_AP((StgAP *)p);
+        break;
+
+    case ARR_WORDS:
+        // nothing to follow
+        break;
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    {
+        // We don't eagerly promote objects pointed to by a mutable
+        // array, but if we find the array only points to objects in
+        // the same or an older generation, we mark it "clean" and
+        // avoid traversing it during minor GCs.
+        scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+        }
+        gct->failed_to_evac = true; // always put it on the mutable list.
+        break;
+    }
+
+    case MUT_ARR_PTRS_FROZEN_CLEAN:
+    case MUT_ARR_PTRS_FROZEN_DIRTY:
+        // follow everything
+    {
+        scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
+
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+        }
+        break;
+    }
+
+    case SMALL_MUT_ARR_PTRS_CLEAN:
+    case SMALL_MUT_ARR_PTRS_DIRTY:
+        // follow everything
+    {
+        // We don't eagerly promote objects pointed to by a mutable
+        // array, but if we find the array only points to objects in
+        // the same or an older generation, we mark it "clean" and
+        // avoid traversing it during minor GCs.
+        StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p);
+        for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) {
+            evacuate((StgClosure **)p);
+        }
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_CLEAN_info;
+        }
+        gct->failed_to_evac = true; // always put it on the mutable list.
+        break;
+    }
+
+    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+        // follow everything
+    {
+        StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p);
+        for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) {
+            evacuate((StgClosure **)p);
+        }
+
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+        }
+        break;
+    }
+
+    case TSO:
+    {
+        scavengeTSO((StgTSO *)p);
+        break;
+    }
+
+    case STACK:
+    {
+        StgStack *stack = (StgStack*)p;
+
+        scavenge_stack(stack->sp, stack->stack + stack->stack_size);
+        stack->dirty = gct->failed_to_evac;
+        // TODO (osa): There may be something special about stacks that we're
+        // missing. All other mutable objects are marked dirty by switching to
+        // a different info table; stacks use the dirty field instead.
+
+        break;
+    }
+
+    case MUT_PRIM:
+    {
+        StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+            evacuate((StgClosure **)p);
+        }
+        gct->failed_to_evac = true; // mutable
+        break;
+    }
+
+    case TREC_CHUNK:
+      {
+        StgWord i;
+        StgTRecChunk *tc = ((StgTRecChunk *) p);
+        TRecEntry *e = &(tc -> entries[0]);
+        evacuate((StgClosure **)&tc->prev_chunk);
+        for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+          evacuate((StgClosure **)&e->tvar);
+          evacuate((StgClosure **)&e->expected_value);
+          evacuate((StgClosure **)&e->new_value);
+        }
+        gct->failed_to_evac = true; // mutable
+        break;
+      }
+
+    case IND:
+    case BLACKHOLE:
+    case IND_STATIC:
+        evacuate(&((StgInd *)p)->indirectee);
+        break;
+
+    default:
+        barf("nonmoving scavenge: unimplemented/strange closure type %d @ %p",
+             info->type, p);
+    }
+
+    if (gct->failed_to_evac) {
+        // Mutable object or points to a younger object, add to the mut_list
+        gct->failed_to_evac = false;
+        if (oldest_gen->no > 0) {
+            recordMutableGen_GC(q, oldest_gen->no);
+        }
+    }
+}
+
+/* Scavenge objects evacuated into a nonmoving segment by a minor GC */
+void
+scavengeNonmovingSegment (struct NonmovingSegment *seg)
+{
+    const StgWord blk_size = nonmovingSegmentBlockSize(seg);
+    gct->evac_gen_no = oldest_gen->no;
+    gct->failed_to_evac = false;
+
+    // scavenge objects between scan and free_ptr whose bitmap bits are 0
+    bdescr *seg_block = Bdescr((P_)seg);
+
+    ASSERT(seg_block->u.scan >= (P_)nonmovingSegmentGetBlock(seg, 0));
+    ASSERT(seg_block->u.scan <= (P_)nonmovingSegmentGetBlock(seg, seg->next_free));
+
+    StgPtr scan_end = (P_)nonmovingSegmentGetBlock(seg, seg->next_free);
+    if (seg_block->u.scan == scan_end)
+        return;
+
+    nonmoving_block_idx p_idx = nonmovingGetBlockIdx(seg_block->u.scan);
+    while (seg_block->u.scan < scan_end) {
+        StgClosure *p = (StgClosure*)seg_block->u.scan;
+
+        // bit set = was allocated in a previous GC, no need to scavenge
+        // bit not set = new allocation, so scavenge
+        if (nonmovingGetMark(seg, p_idx) == 0) {
+            nonmovingScavengeOne(p);
+        }
+
+        p_idx++;
+        seg_block->u.scan = (P_)(((uint8_t*)seg_block->u.scan) + blk_size);
+    }
+}
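
A rough, self-contained illustration (not RTS code; the block count and size
below are made up) of how the scan loop in scavengeNonmovingSegment above
advances: a segment is a contiguous run of equally sized blocks, so the scan
pointer and the block index move in lock-step, and only blocks whose bitmap
entry is still clear, i.e. blocks freshly evacuated in this GC, get scavenged.

    /* segment_scan_sketch.c -- illustrative only, not RTS code */
    #include <stdint.h>
    #include <stdio.h>

    #define SEG_BLOCKS 8          /* blocks per segment (illustrative) */
    #define BLOCK_SIZE 64         /* bytes per block (illustrative)    */

    static uint8_t segment[SEG_BLOCKS * BLOCK_SIZE];
    static uint8_t bitmap[SEG_BLOCKS];   /* non-zero = survived an earlier GC */

    int main(void)
    {
        unsigned next_free = 5;                              /* cf. seg->next_free    */
        uint8_t *scan      = segment;                        /* cf. seg_block->u.scan */
        uint8_t *scan_end  = segment + next_free * BLOCK_SIZE;
        unsigned idx       = 0;                              /* cf. p_idx             */

        bitmap[1] = 1;   /* pretend block 1 was already live before this GC */

        while (scan < scan_end) {
            if (bitmap[idx] == 0)
                printf("scavenge new object in block %u (offset %u)\n",
                       idx, idx * BLOCK_SIZE);
            idx++;
            scan += BLOCK_SIZE;                              /* one fixed-size block */
        }
        return 0;
    }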


=====================================
rts/sm/NonMovingScav.h
=====================================
@@ -0,0 +1,10 @@
+#pragma once
+
+#include "NonMoving.h"
+
+#include "BeginPrivate.h"
+
+void nonmovingScavengeOne(StgClosure *p);
+void scavengeNonmovingSegment(struct NonmovingSegment *seg);
+
+#include "EndPrivate.h"


=====================================
rts/sm/NonMovingSweep.c
=====================================
@@ -0,0 +1,273 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Sweep phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "NonMovingSweep.h"
+#include "NonMoving.h"
+#include "NonMovingMark.h" // for nonmovingIsAlive
+#include "Capability.h"
+#include "GCThread.h" // for GCUtils.h
+#include "GCUtils.h"
+#include "Storage.h"
+#include "Trace.h"
+#include "StableName.h"
+
+static struct NonmovingSegment *pop_all_filled_segments(struct NonmovingAllocator *alloc)
+{
+    while (true) {
+        struct NonmovingSegment *head = alloc->filled;
+        if (cas((StgVolatilePtr) &alloc->filled, (StgWord) head, (StgWord) NULL) == (StgWord) head)
+            return head;
+    }
+}
+
+void nonmovingPrepareSweep()
+{
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+
+    // Move blocks in the allocators' filled lists into sweep_list
+    for (unsigned int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++)
+    {
+        struct NonmovingAllocator *alloc = nonmovingHeap.allocators[alloc_idx];
+        struct NonmovingSegment *filled = pop_all_filled_segments(alloc);
+
+        // Link filled to sweep_list
+        if (filled) {
+            struct NonmovingSegment *filled_head = filled;
+            // Find end of filled list
+            while (filled->link) {
+                filled = filled->link;
+            }
+            filled->link = nonmovingHeap.sweep_list;
+            nonmovingHeap.sweep_list = filled_head;
+        }
+    }
+}
+
+// On which list should a particular segment be placed?
+enum SweepResult {
+    SEGMENT_FREE,     // segment is empty: place on free list
+    SEGMENT_PARTIAL,  // segment is partially filled: place on active list
+    SEGMENT_FILLED    // segment is full: place on filled list
+};
+
+// Determine which list a marked segment should be placed on and initialize
+// next_free indices as appropriate.
+GNUC_ATTR_HOT static enum SweepResult
+nonmovingSweepSegment(struct NonmovingSegment *seg)
+{
+    bool found_free = false;
+    bool found_live = false;
+
+    for (nonmoving_block_idx i = 0;
+         i < nonmovingSegmentBlockCount(seg);
+         ++i)
+    {
+        if (seg->bitmap[i] == nonmovingMarkEpoch) {
+            found_live = true;
+        } else if (!found_free) {
+            found_free = true;
+            seg->next_free = i;
+            seg->next_free_snap = i;
+            Bdescr((P_)seg)->u.scan = (P_)nonmovingSegmentGetBlock(seg, i);
+            seg->bitmap[i] = 0;
+        } else {
+            seg->bitmap[i] = 0;
+        }
+
+        if (found_free && found_live) {
+            // zero the remaining dead objects' mark bits
+            for (; i < nonmovingSegmentBlockCount(seg); ++i) {
+                if (seg->bitmap[i] != nonmovingMarkEpoch) {
+                    seg->bitmap[i] = 0;
+                }
+            }
+            return SEGMENT_PARTIAL;
+        }
+    }
+
+    if (found_live) {
+        return SEGMENT_FILLED;
+    } else {
+        ASSERT(seg->next_free == 0);
+        ASSERT(seg->next_free_snap == 0);
+        return SEGMENT_FREE;
+    }
+}
+
+#if defined(DEBUG)
+
+void nonmovingGcCafs(struct MarkQueue_ *queue)
+{
+    uint32_t i = 0;
+    StgIndStatic *next;
+
+    for (StgIndStatic *caf = debug_caf_list_snapshot;
+         caf != (StgIndStatic*) END_OF_CAF_LIST;
+         caf = next)
+    {
+        next = (StgIndStatic*)caf->saved_info;
+
+        const StgInfoTable *info = get_itbl((StgClosure*)caf);
+        ASSERT(info->type == IND_STATIC);
+
+        if (lookupHashTable(queue->marked_objects, (StgWord) caf) == NULL) {
+            debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf);
+            SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it
+        } else {
+            // CAF is alive, move it back to the debug_caf_list
+            ++i;
+            debugTrace(DEBUG_gccafs, "CAF alive at 0x%p", caf);
+            ACQUIRE_SM_LOCK; // debug_caf_list is global, locked by sm_mutex
+            caf->saved_info = (const StgInfoTable*)debug_caf_list;
+            debug_caf_list = caf;
+            RELEASE_SM_LOCK;
+        }
+    }
+
+    debugTrace(DEBUG_gccafs, "%d CAFs live", i);
+    debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
+}
+
+static void
+clear_segment(struct NonmovingSegment* seg)
+{
+    size_t end = ((size_t)seg) + NONMOVING_SEGMENT_SIZE;
+    memset(&seg->bitmap, 0, end - (size_t)&seg->bitmap);
+}
+
+static void
+clear_segment_free_blocks(struct NonmovingSegment* seg)
+{
+    unsigned int block_size = nonmovingSegmentBlockSize(seg);
+    for (unsigned int p_idx = 0; p_idx < nonmovingSegmentBlockCount(seg); ++p_idx) {
+        // after mark, so bit not set == dead
+        if (nonmovingGetMark(seg, p_idx) == 0) {
+            memset(nonmovingSegmentGetBlock(seg, p_idx), 0, block_size);
+        }
+    }
+}
+
+#endif
+
+GNUC_ATTR_HOT void nonmovingSweep(void)
+{
+    while (nonmovingHeap.sweep_list) {
+        struct NonmovingSegment *seg = nonmovingHeap.sweep_list;
+
+        // Pushing the segment to one of the free/active/filled segments
+        // updates the link field, so update sweep_list here
+        nonmovingHeap.sweep_list = seg->link;
+
+        enum SweepResult ret = nonmovingSweepSegment(seg);
+
+        switch (ret) {
+        case SEGMENT_FREE:
+            IF_DEBUG(sanity, clear_segment(seg));
+            nonmovingPushFreeSegment(seg);
+            break;
+        case SEGMENT_PARTIAL:
+            IF_DEBUG(sanity, clear_segment_free_blocks(seg));
+            nonmovingPushActiveSegment(seg);
+            break;
+        case SEGMENT_FILLED:
+            nonmovingPushFilledSegment(seg);
+            break;
+        default:
+            barf("nonmovingSweep: weird sweep return: %d\n", ret);
+        }
+    }
+}
+
+/* N.B. This happens during the pause so we own all capabilities. */
+void nonmovingSweepMutLists()
+{
+    for (uint32_t n = 0; n < n_capabilities; n++) {
+        Capability *cap = capabilities[n];
+        bdescr *old_mut_list = cap->mut_lists[oldest_gen->no];
+        cap->mut_lists[oldest_gen->no] = allocBlockOnNode_sync(cap->node);
+        for (bdescr *bd = old_mut_list; bd; bd = bd->link) {
+            for (StgPtr p = bd->start; p < bd->free; p++) {
+                StgClosure **q = (StgClosure**)p;
+                if (nonmovingIsAlive(*q)) {
+                    recordMutableCap(*q, cap, oldest_gen->no);
+                }
+            }
+        }
+        freeChain(old_mut_list);
+    }
+}
+
+void nonmovingSweepLargeObjects()
+{
+    freeChain_lock(nonmoving_large_objects);
+    nonmoving_large_objects = nonmoving_marked_large_objects;
+    n_nonmoving_large_blocks = n_nonmoving_marked_large_blocks;
+    nonmoving_marked_large_objects = NULL;
+    n_nonmoving_marked_large_blocks = 0;
+}
+
+// Helper for nonmovingSweepStableNameTable. Essentially nonmovingIsAlive,
+// but also copes with objects that died in the moving heap: anything outside
+// the swept segments is conservatively treated as alive. See the Note below.
+static bool is_alive(StgClosure *p)
+{
+    if (!HEAP_ALLOCED_GC(p)) {
+        return true;
+    }
+
+    if (nonmovingClosureBeingSwept(p)) {
+        return nonmovingIsAlive(p);
+    } else {
+        // We don't want to sweep any stable names which weren't in the
+        // set of segments that we swept.
+        // See Note [Sweeping stable names in the concurrent collector]
+        return true;
+    }
+}
+
+void nonmovingSweepStableNameTable()
+{
+    // See comments in gcStableTables
+
+    /* Note [Sweeping stable names in the concurrent collector]
+     * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+     *
+     * When collecting concurrently we need to take care to avoid freeing
+     * stable names that we didn't sweep this collection cycle. For instance,
+     * consider the following situation:
+     *
+     *  1. We take a snapshot and start collection
+     *  2. A mutator allocates a new object, then makes a stable name for it
+     *  3. The mutator performs a minor GC and promotes the new object to the nonmoving heap
+     *  4. The GC thread gets to the sweep phase and, when traversing the stable
+     *     name table, finds the new object unmarked. It then assumes that the
+     *     object is dead and removes the stable name from the stable name table.
+     *
+     */
+
+    // FIXME: We can't use nonmovingIsAlive here without first using isAlive:
+    // a stable name can die during moving heap collection and we can't use
+    // nonmovingIsAlive on those objects. Inefficient.
+
+    stableNameLock();
+    FOR_EACH_STABLE_NAME(
+        p, {
+            if (p->sn_obj != NULL) {
+                if (!is_alive((StgClosure*)p->sn_obj)) {
+                    p->sn_obj = NULL; // Just to make an assertion happy
+                    freeSnEntry(p);
+                } else if (p->addr != NULL) {
+                    if (!is_alive((StgClosure*)p->addr)) {
+                        p->addr = NULL;
+                    }
+                }
+            }
+        });
+    stableNameUnlock();
+}
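
The classification performed by nonmovingSweepSegment above boils down to a
single pass over the bitmap, recording the first dead block as the next
allocation point and stopping early once both a live and a dead block have
been seen. A standalone sketch under made-up constants (not RTS code):

    /* sweep_classify_sketch.c -- illustrative only, not RTS code */
    #include <stdio.h>

    enum Result { SEG_FREE, SEG_PARTIAL, SEG_FILLED };

    static enum Result classify(const unsigned char *bitmap, int n,
                                unsigned char epoch, int *next_free)
    {
        int found_free = 0, found_live = 0;
        for (int i = 0; i < n; i++) {
            if (bitmap[i] == epoch) {
                found_live = 1;
            } else if (!found_free) {
                found_free = 1;
                *next_free = i;            /* allocation resumes here */
            }
            if (found_free && found_live)
                return SEG_PARTIAL;        /* mixed: goes on the active list */
        }
        return found_live ? SEG_FILLED : SEG_FREE;
    }

    int main(void)
    {
        unsigned char all_dead[4] = {0, 0, 0, 0};
        unsigned char mixed[4]    = {1, 0, 1, 0};
        unsigned char all_live[4] = {1, 1, 1, 1};
        int nf = 0;
        printf("%d %d %d\n",
               classify(all_dead, 4, 1, &nf),   /* 0: free list   */
               classify(mixed,    4, 1, &nf),   /* 1: active list */
               classify(all_live, 4, 1, &nf));  /* 2: filled list */
        return 0;
    }

The real function additionally clears the mark bits of the remaining dead
blocks before returning SEGMENT_PARTIAL, which the sketch omits.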


=====================================
rts/sm/NonMovingSweep.h
=====================================
@@ -0,0 +1,32 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Sweep phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "NonMoving.h"
+#include "Hash.h"
+
+GNUC_ATTR_HOT void nonmovingSweep(void);
+
+// Remove unmarked entries in oldest generation mut_lists
+void nonmovingSweepMutLists(void);
+
+// Remove unmarked entries in oldest generation scavenged_large_objects list
+void nonmovingSweepLargeObjects(void);
+
+// Remove dead entries in the stable name table
+void nonmovingSweepStableNameTable(void);
+
+// Collect the set of segments to be collected during a major GC into
+// nonmovingHeap.sweep_list.
+void nonmovingPrepareSweep(void);
+
+#if defined(DEBUG)
+// The non-moving equivalent of the moving collector's gcCAFs.
+void nonmovingGcCafs(struct MarkQueue_ *queue);
+#endif


=====================================
rts/sm/Sanity.c
=====================================
@@ -29,6 +29,8 @@
 #include "Arena.h"
 #include "RetainerProfile.h"
 #include "CNF.h"
+#include "sm/NonMoving.h"
+#include "sm/NonMovingMark.h"
 #include "Profiling.h" // prof_arena
 
 /* -----------------------------------------------------------------------------
@@ -40,6 +42,9 @@ static void  checkLargeBitmap    ( StgPtr payload, StgLargeBitmap*, uint32_t );
 static void  checkClosureShallow ( const StgClosure * );
 static void  checkSTACK          (StgStack *stack);
 
+static W_    countNonMovingSegments ( struct NonmovingSegment *segs );
+static W_    countNonMovingHeap     ( struct NonmovingHeap *heap );
+
 /* -----------------------------------------------------------------------------
    Check stack sanity
    -------------------------------------------------------------------------- */
@@ -481,6 +486,41 @@ void checkHeapChain (bdescr *bd)
     }
 }
 
+/* -----------------------------------------------------------------------------
+ * Check nonmoving heap sanity
+ *
+ * After a concurrent sweep the nonmoving heap can be checked for validity.
+ * -------------------------------------------------------------------------- */
+
+static void checkNonmovingSegments (struct NonmovingSegment *seg)
+{
+    while (seg != NULL) {
+        const nonmoving_block_idx count = nonmovingSegmentBlockCount(seg);
+        for (nonmoving_block_idx i=0; i < count; i++) {
+            if (seg->bitmap[i] == nonmovingMarkEpoch) {
+                StgPtr p = nonmovingSegmentGetBlock(seg, i);
+                checkClosure((StgClosure *) p);
+            } else if (i < seg->next_free_snap) {
+                seg->bitmap[i] = 0;
+            }
+        }
+        seg = seg->link;
+    }
+}
+
+void checkNonmovingHeap (const struct NonmovingHeap *heap)
+{
+    for (unsigned int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+        const struct NonmovingAllocator *alloc = heap->allocators[i];
+        checkNonmovingSegments(alloc->filled);
+        checkNonmovingSegments(alloc->active);
+        for (unsigned int cap=0; cap < n_capabilities; cap++) {
+            checkNonmovingSegments(alloc->current[cap]);
+        }
+    }
+}
+
+
 void
 checkHeapChunk(StgPtr start, StgPtr end)
 {
@@ -753,16 +793,25 @@ static void checkGeneration (generation *gen,
     uint32_t n;
     gen_workspace *ws;
 
-    ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+    //ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
     ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
 
 #if defined(THREADED_RTS)
-    // heap sanity checking doesn't work with SMP, because we can't
-    // zero the slop (see Updates.h).  However, we can sanity-check
-    // the heap after a major gc, because there is no slop.
+    // heap sanity checking doesn't work with SMP for two reasons:
+    //   * we can't zero the slop (see Updates.h).  However, we can sanity-check
+    //     the heap after a major gc, because there is no slop.
+    //
+    //   * the nonmoving collector may be mutating its large object lists, unless we
+    //     were in fact called by the nonmoving collector.
     if (!after_major_gc) return;
 #endif
 
+    if (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen) {
+        ASSERT(countBlocks(nonmoving_large_objects) == n_nonmoving_large_blocks);
+        ASSERT(countBlocks(nonmoving_marked_large_objects) == n_nonmoving_marked_large_blocks);
+        ASSERT(countNonMovingSegments(nonmovingHeap.free) == (W_) nonmovingHeap.n_free * NONMOVING_SEGMENT_BLOCKS);
+    }
+
     checkHeapChain(gen->blocks);
 
     for (n = 0; n < n_capabilities; n++) {
@@ -811,6 +860,15 @@ markCompactBlocks(bdescr *bd)
     }
 }
 
+static void
+markNonMovingSegments(struct NonmovingSegment *seg)
+{
+    while (seg) {
+        markBlocks(Bdescr((P_)seg));
+        seg = seg->link;
+    }
+}
+
 // If memInventory() calculates that we have a memory leak, this
 // function will try to find the block(s) that are leaking by marking
 // all the ones that we know about, and search through memory to find
@@ -821,7 +879,7 @@ markCompactBlocks(bdescr *bd)
 static void
 findMemoryLeak (void)
 {
-    uint32_t g, i;
+    uint32_t g, i, j;
     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
         for (i = 0; i < n_capabilities; i++) {
             markBlocks(capabilities[i]->mut_lists[g]);
@@ -843,6 +901,23 @@ findMemoryLeak (void)
         markBlocks(capabilities[i]->pinned_object_block);
     }
 
+    if (RtsFlags.GcFlags.useNonmoving) {
+        markBlocks(nonmoving_large_objects);
+        markBlocks(nonmoving_marked_large_objects);
+        for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+            struct NonmovingAllocator *alloc = nonmovingHeap.allocators[i];
+            markNonMovingSegments(alloc->filled);
+            markNonMovingSegments(alloc->active);
+            for (j = 0; j < n_capabilities; j++) {
+                markNonMovingSegments(alloc->current[j]);
+            }
+        }
+        markNonMovingSegments(nonmovingHeap.sweep_list);
+        markNonMovingSegments(nonmovingHeap.free);
+        if (current_mark_queue)
+            markBlocks(current_mark_queue->blocks);
+    }
+
 #if defined(PROFILING)
   // TODO:
   // if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
@@ -901,14 +976,63 @@ void findSlop(bdescr *bd)
 static W_
 genBlocks (generation *gen)
 {
-    ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+    W_ ret = 0;
+    if (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen) {
+        // See Note [Live data accounting in nonmoving collector].
+        ASSERT(countNonMovingHeap(&nonmovingHeap) == gen->n_blocks);
+        ret += countAllocdBlocks(nonmoving_large_objects);
+        ret += countAllocdBlocks(nonmoving_marked_large_objects);
+        ret += countNonMovingHeap(&nonmovingHeap);
+        if (current_mark_queue)
+            ret += countBlocks(current_mark_queue->blocks);
+    } else {
+        ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+        ret += gen->n_blocks;
+    }
+
     ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
     ASSERT(countCompactBlocks(gen->compact_objects) == gen->n_compact_blocks);
     ASSERT(countCompactBlocks(gen->compact_blocks_in_import) == gen->n_compact_blocks_in_import);
-    return gen->n_blocks + gen->n_old_blocks +
+
+    ret += gen->n_old_blocks +
         countAllocdBlocks(gen->large_objects) +
         countAllocdCompactBlocks(gen->compact_objects) +
         countAllocdCompactBlocks(gen->compact_blocks_in_import);
+    return ret;
+}
+
+static W_
+countNonMovingSegments(struct NonmovingSegment *segs)
+{
+    W_ ret = 0;
+    while (segs) {
+        ret += countBlocks(Bdescr((P_)segs));
+        segs = segs->link;
+    }
+    return ret;
+}
+
+static W_
+countNonMovingAllocator(struct NonmovingAllocator *alloc)
+{
+    W_ ret = countNonMovingSegments(alloc->filled)
+           + countNonMovingSegments(alloc->active);
+    for (uint32_t i = 0; i < n_capabilities; ++i) {
+        ret += countNonMovingSegments(alloc->current[i]);
+    }
+    return ret;
+}
+
+static W_
+countNonMovingHeap(struct NonmovingHeap *heap)
+{
+    W_ ret = 0;
+    for (int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++) {
+        ret += countNonMovingAllocator(heap->allocators[alloc_idx]);
+    }
+    ret += countNonMovingSegments(heap->sweep_list);
+    ret += countNonMovingSegments(heap->free);
+    return ret;
 }
 
 void
@@ -916,8 +1040,8 @@ memInventory (bool show)
 {
   uint32_t g, i;
   W_ gen_blocks[RtsFlags.GcFlags.generations];
-  W_ nursery_blocks, retainer_blocks,
-      arena_blocks, exec_blocks, gc_free_blocks = 0;
+  W_ nursery_blocks = 0, retainer_blocks = 0,
+      arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0;
   W_ live_blocks = 0, free_blocks = 0;
   bool leak;
 
@@ -934,20 +1058,19 @@ memInventory (bool show)
       gen_blocks[g] += genBlocks(&generations[g]);
   }
 
-  nursery_blocks = 0;
   for (i = 0; i < n_nurseries; i++) {
       ASSERT(countBlocks(nurseries[i].blocks) == nurseries[i].n_blocks);
       nursery_blocks += nurseries[i].n_blocks;
   }
   for (i = 0; i < n_capabilities; i++) {
-      gc_free_blocks += countBlocks(gc_threads[i]->free_blocks);
+      W_ n = countBlocks(gc_threads[i]->free_blocks);
+      gc_free_blocks += n;
       if (capabilities[i]->pinned_object_block != NULL) {
           nursery_blocks += capabilities[i]->pinned_object_block->blocks;
       }
       nursery_blocks += countBlocks(capabilities[i]->pinned_object_blocks);
   }
 
-  retainer_blocks = 0;
 #if defined(PROFILING)
   if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
       retainer_blocks = retainerStackBlocks();
@@ -963,6 +1086,8 @@ memInventory (bool show)
   /* count the blocks on the free list */
   free_blocks = countFreeList();
 
+  }
+
   live_blocks = 0;
   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
       live_blocks += gen_blocks[g];


=====================================
rts/sm/Sanity.h
=====================================
@@ -31,6 +31,7 @@ void checkStaticObjects ( StgClosure* static_objects );
 void checkStackChunk    ( StgPtr sp, StgPtr stack_end );
 StgOffset checkStackFrame ( StgPtr sp );
 StgOffset checkClosure  ( const StgClosure* p );
+void checkNonmovingHeap ( const struct NonmovingHeap *heap );
 
 void checkRunQueue      (Capability *cap);
 


=====================================
rts/sm/Scav.c
=====================================
@@ -62,6 +62,8 @@
 #include "Hash.h"
 
 #include "sm/MarkWeak.h"
+#include "sm/NonMoving.h" // for nonmoving_set_closure_mark_bit
+#include "sm/NonMovingScav.h"
 
 static void scavenge_large_bitmap (StgPtr p,
                                    StgLargeBitmap *large_bitmap,
@@ -1652,7 +1654,10 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
                 ;
             }
 
-            if (scavenge_one(p)) {
+            if (RtsFlags.GcFlags.useNonmoving && major_gc && gen == oldest_gen) {
+                // We can't use scavenge_one here as we need to scavenge SRTs
+                nonmovingScavengeOne((StgClosure *)p);
+            } else if (scavenge_one(p)) {
                 // didn't manage to promote everything, so put the
                 // object back on the list.
                 recordMutableGen_GC((StgClosure *)p,gen_no);
@@ -1664,7 +1669,14 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
 void
 scavenge_capability_mut_lists (Capability *cap)
 {
-    uint32_t g;
+    // In a major GC, only the nonmoving heap's mut lists are roots.
+    if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+        uint32_t g = oldest_gen->no;
+        scavenge_mutable_list(cap->saved_mut_lists[g], oldest_gen);
+        freeChain_sync(cap->saved_mut_lists[g]);
+        cap->saved_mut_lists[g] = NULL;
+        return;
+    }
 
     /* Mutable lists from each generation > N
      * we want to *scavenge* these roots, not evacuate them: they're not
@@ -1672,7 +1684,7 @@ scavenge_capability_mut_lists (Capability *cap)
      * Also do them in reverse generation order, for the usual reason:
      * namely to reduce the likelihood of spurious old->new pointers.
      */
-    for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+    for (uint32_t g = RtsFlags.GcFlags.generations-1; g > N; g--) {
         scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
         freeChain_sync(cap->saved_mut_lists[g]);
         cap->saved_mut_lists[g] = NULL;
@@ -2042,6 +2054,16 @@ loop:
     for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
         ws = &gct->gens[g];
 
+        if (ws->todo_seg != END_NONMOVING_TODO_LIST) {
+            struct NonmovingSegment *seg = ws->todo_seg;
+            ASSERT(seg->todo_link);
+            ws->todo_seg = seg->todo_link;
+            seg->todo_link = NULL;
+            scavengeNonmovingSegment(seg);
+            did_something = true;
+            break;
+        }
+
         gct->scan_bd = NULL;
 
         // If we have a scan block with some work to do,


=====================================
rts/sm/Storage.c
=====================================
@@ -29,6 +29,7 @@
 #include "Trace.h"
 #include "GC.h"
 #include "Evac.h"
+#include "NonMoving.h"
 #if defined(ios_HOST_OS)
 #include "Hash.h"
 #endif
@@ -82,7 +83,7 @@ Mutex sm_mutex;
 static void allocNurseries (uint32_t from, uint32_t to);
 static void assignNurseriesToCapabilities (uint32_t from, uint32_t to);
 
-static void
+void
 initGeneration (generation *gen, int g)
 {
     gen->no = g;
@@ -170,6 +171,19 @@ initStorage (void)
   }
   oldest_gen->to = oldest_gen;
 
+  // The nonmoving heap uses oldest_gen, so it must be initialized after oldest_gen.
+  if (RtsFlags.GcFlags.useNonmoving)
+      nonmovingInit();
+
+#if defined(THREADED_RTS)
+  // nonmovingAddCapabilities allocates segments, which requires taking the
+  // gc_alloc_block_sync spinlock, so initialize that lock first.
+  initSpinLock(&gc_alloc_block_sync);
+#endif
+
+  if (RtsFlags.GcFlags.useNonmoving)
+      nonmovingAddCapabilities(n_capabilities);
+
   /* The oldest generation has one step. */
   if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) {
       if (RtsFlags.GcFlags.generations == 1) {
@@ -195,9 +209,6 @@ initStorage (void)
 
   exec_block = NULL;
 
-#if defined(THREADED_RTS)
-  initSpinLock(&gc_alloc_block_sync);
-#endif
   N = 0;
 
   for (n = 0; n < n_numa_nodes; n++) {
@@ -1226,8 +1237,8 @@ W_ countOccupied (bdescr *bd)
 
 W_ genLiveWords (generation *gen)
 {
-    return gen->n_words + gen->n_large_words +
-        gen->n_compact_blocks * BLOCK_SIZE_W;
+    return (gen->live_estimate ? gen->live_estimate : gen->n_words) +
+        gen->n_large_words + gen->n_compact_blocks * BLOCK_SIZE_W;
 }
 
 W_ genLiveBlocks (generation *gen)
@@ -1283,9 +1294,9 @@ calcNeeded (bool force_major, memcount *blocks_needed)
     for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; g++) {
         generation *gen = &generations[g];
 
-        W_ blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?)
-                  + gen->n_large_blocks
-                  + gen->n_compact_blocks;
+        W_ blocks = gen->live_estimate ? (gen->live_estimate / BLOCK_SIZE_W) : gen->n_blocks;
+        blocks += gen->n_large_blocks
+                + gen->n_compact_blocks;
 
         // we need at least this much space
         needed += blocks;
@@ -1303,7 +1314,7 @@ calcNeeded (bool force_major, memcount *blocks_needed)
                 //  mark stack:
                 needed += gen->n_blocks / 100;
             }
-            if (gen->compact) {
+            if (gen->compact || (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
                 continue; // no additional space needed for compaction
             } else {
                 needed += gen->n_blocks;
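
To make the calcNeeded change above concrete: when a generation has a
live-data estimate (which, presumably, the nonmoving heap provides after a
mark), that word count is converted to blocks; otherwise gen->n_blocks is used
as before. A hedged sketch of just this arithmetic, with an illustrative block
size (not RTS code):

    /* calc_needed_sketch.c -- illustrative only, not RTS code */
    #include <stdio.h>

    /* Words per block; 512 corresponds to a 4 KB block on a 64-bit machine,
     * but the value here is only illustrative. */
    #define BLOCK_SIZE_W 512

    static unsigned long needed_blocks(unsigned long live_estimate_words,
                                       unsigned long n_blocks)
    {
        /* cf. the change to calcNeeded: prefer the live estimate if we have one */
        return live_estimate_words ? live_estimate_words / BLOCK_SIZE_W
                                   : n_blocks;
    }

    int main(void)
    {
        /* 1,000,000 live words ~= 1953 blocks, even if the heap currently
         * holds many more (mostly empty) segments. */
        printf("%lu\n", needed_blocks(1000000UL, 8000UL));
        printf("%lu\n", needed_blocks(0UL,       8000UL));   /* fall back to n_blocks */
        return 0;
    }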


=====================================
rts/sm/Storage.h
=====================================
@@ -17,6 +17,7 @@
    -------------------------------------------------------------------------- */
 
 void initStorage(void);
+void initGeneration(generation *gen, int g);
 void exitStorage(void);
 void freeStorage(bool free_heap);
 


=====================================
testsuite/config/ghc
=====================================
@@ -24,7 +24,8 @@ config.other_ways         = ['prof', 'normal_h',
                              'profllvm', 'profoptllvm', 'profthreadedllvm',
                              'debug',
                              'ghci-ext', 'ghci-ext-prof',
-                             'ext-interp']
+                             'ext-interp',
+                             'nonmoving']
 
 if ghc_with_native_codegen:
     config.compile_ways.append('optasm')
@@ -94,7 +95,8 @@ config.way_flags = {
     'profthreadedllvm' : ['-O', '-prof', '-static', '-fprof-auto', '-threaded', '-fllvm'],
     'ghci-ext'         : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '+RTS', '-I0.1', '-RTS'],
     'ghci-ext-prof'    : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '-prof', '+RTS', '-I0.1', '-RTS'],
-    'ext-interp' : ['-fexternal-interpreter'],
+    'ext-interp'   : ['-fexternal-interpreter'],
+    'nonmoving'    : ['-debug'],
    }
 
 config.way_rts_flags = {
@@ -133,6 +135,7 @@ config.way_rts_flags = {
     'ghci-ext'         : [],
     'ghci-ext-prof'    : [],
     'ext-interp'       : [],
+    'nonmoving'        : ['-DS', '-xn'],
    }
 
 # Useful classes of ways that can be used with only_ways(), omit_ways() and




-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/25ce370a0dc1f0591b4f6296a0d16ad1d6a2a287...9cd98caaacd33094142a03a87b64c43884e837d5
You're receiving this email because of your account on gitlab.haskell.org.



