[Git][ghc/ghc][wip/gc/optimize] 10 commits: rts: Add prefetch macros

Ben Gamari gitlab at gitlab.haskell.org
Tue May 21 13:51:01 UTC 2019



Ben Gamari pushed to branch wip/gc/optimize at Glasgow Haskell Compiler / GHC


Commits:
d6b14a1f by Ben Gamari at 2019-05-19T18:13:02Z
rts: Add prefetch macros
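
These are thin wrappers around the GCC/Clang __builtin_prefetch intrinsic,
whose second argument selects a read (0) or write (1) prefetch; the exact
definitions are in the includes/Rts.h hunk below. A minimal, self-contained
usage sketch (the summing loop is illustrative only, not RTS code):

    #include <stddef.h>

    /* As added in includes/Rts.h */
    #define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
    #define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)

    /* Illustrative only: prefetch the element one iteration ahead of use. */
    static long sum_with_prefetch(const long *xs, size_t n)
    {
        long acc = 0;
        for (size_t i = 0; i < n; i++) {
            if (i + 1 < n)
                prefetchForRead(&xs[i + 1]);
            acc += xs[i];
        }
        return acc;
    }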

- - - - -
c9e5e5e0 by Ben Gamari at 2019-05-19T18:19:37Z
NonMoving: Prefetch when clearing bitmaps

Ensure that the bitmap of the segment that we will clear next is in
cache by the time we reach it.
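
In the cumulative diff below (after the later "fuse sweep preparation"
commit), the filled-segment loop in nonmovingPrepareMark issues prefetches
for the next segment and its bitmap before clearing the current one; a
trimmed sketch:

    struct NonmovingSegment *seg = filled;
    while (true) {
        prefetchForRead(seg->link);
        // Warm the cache line holding the *next* bitmap while we clear this one.
        prefetchForWrite(seg->link->bitmap);
        nonmovingClearBitmap(seg);
        seg->next_free_snap = seg->next_free;   // take the snapshot as well
        if (seg->link)
            seg = seg->link;
        else
            break;
    }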

- - - - -
b78483f0 by Ben Gamari at 2019-05-19T18:22:44Z
NonMoving: Inline nonmovingClearAllBitmaps

- - - - -
93178281 by Ben Gamari at 2019-05-19T18:24:25Z
NonMoving: Fuse sweep preparation into mark prep

- - - - -
f6704ef0 by Ben Gamari at 2019-05-19T18:27:16Z
NonMoving: Pre-fetch during mark

This improved overall runtime on nofib's constraints test by nearly 10%.
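
The mechanism (see markQueuePop in the rts/sm/NonMovingMark.c hunk below) is
a small ring buffer of depth MARK_PREFETCH_QUEUE_DEPTH: each entry popped
from the queue is prefetched and parked, and is only marked several pops
later, once its cache lines have had time to arrive. A heavily trimmed
sketch of the fill loop (the real code also drains the ring once the
underlying queue is empty):

    unsigned int i = q->prefetch_head;
    while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) {
        MarkQueueEnt new = markQueuePop_(q);
        // Warm the closure's info pointer and its segment header now; we
        // won't actually mark this entry until several pops from now.
        prefetchForRead(&new.mark_closure.p->header.info);
        prefetchForRead(&nonmovingGetSegment_unchecked((StgPtr) new.mark_closure.p)->block_size);
        q->prefetch_queue[i] = new;
        i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH;
    }
    MarkQueueEnt ret = q->prefetch_queue[i];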

- - - - -
67c6a5c8 by Ben Gamari at 2019-05-19T18:49:57Z
NonMoving: Prefetch segment header

- - - - -
cddfb6ab by Ben Gamari at 2019-05-19T18:50:01Z
NonMoving: Optimise allocator cache behavior

Previously we would look at the segment header to determine the block
size despite the fact that we already had the block size at hand.
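
The fix is to thread the block size through as an argument instead of
re-reading it from the (possibly uncached) segment header; in outline,
inside nonmovingAllocate:

    unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
    unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);
    ...
    // Both helpers now take the size we already computed rather than
    // dereferencing current->block_size.
    void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free);
    bool full = advance_next_free(current, block_count);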

- - - - -
9bc0f119 by Ben Gamari at 2019-05-19T18:50:02Z
NonMovingMark: Eliminate redundant check_in_nonmoving_heaps

- - - - -
57a995c4 by Ben Gamari at 2019-05-19T18:50:02Z
NonMoving: Don't do major GC if one is already running

Previously we would perform a preparatory moving collection, resulting
in many things being added to the mark queue. When we finished with this
we would realize in nonmovingCollect that there was already a collection
running, in which case we would simply not run the nonmoving collector.

However, it was very easy to end up in a "treadmilling" situation: all
subsequent GCs following the first failed major GC would be scheduled as
major GCs. Consequently we would continuously feed the concurrent
collector with more mark queue entries and it would never finish.

This patch aborts the major collection far earlier, meaning that we
avoid adding nonmoving objects to the mark queue and allow the
concurrent collector to finish.
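
The check sits early in GarbageCollect (rts/sm/GC.c hunk below) and simply
demotes the request while the concurrent mark is still in flight; in
outline:

    #if defined(THREADED_RTS)
      if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) {
          // A concurrent nonmoving collection is already running: downgrade
          // this cycle to a minor collection instead of feeding the mark queue.
          N--;
          collect_gen--;
          major_gc = false;
      }
    #endif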

- - - - -
0dc9f62d by Ben Gamari at 2019-05-19T18:50:02Z
Nonmoving: Ensure write barrier vanishes in non-threaded RTS
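
This works by making the guard macro a statically false condition when the
RTS is built without THREADED_RTS, so the compiler discards the guarded push
entirely; compare the two definitions added in rts/sm/NonMovingMark.h (and
the analogous C-- versions in includes/Cmm.h) below:

    #if defined(THREADED_RTS)
    #define IF_NONMOVING_WRITE_BARRIER_ENABLED \
        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
    #else
    // Constant-false condition: any guarded updateRemembSetPush* call is
    // dead code and is eliminated in the non-threaded RTS.
    #define IF_NONMOVING_WRITE_BARRIER_ENABLED \
        if (0)
    #endif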

- - - - -


19 changed files:

- includes/Cmm.h
- includes/Rts.h
- includes/rts/NonMoving.h
- nofib
- rts/Messages.c
- rts/PrimOps.cmm
- rts/STM.c
- rts/Schedule.c
- rts/ThreadPaused.c
- rts/Threads.c
- rts/Updates.h
- rts/sm/GC.c
- rts/sm/NonMoving.c
- rts/sm/NonMoving.h
- rts/sm/NonMovingMark.c
- rts/sm/NonMovingMark.h
- rts/sm/NonMovingSweep.c
- rts/sm/NonMovingSweep.h
- rts/sm/Storage.c


Changes:

=====================================
includes/Cmm.h
=====================================
@@ -935,19 +935,23 @@
     return (dst);
 
 
+//
+// Nonmoving write barrier helpers
+//
+// See Note [Update remembered set] in NonMovingMark.c.
+
 #if defined(THREADED_RTS)
-#define IF_WRITE_BARRIER_ENABLED                               \
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
     if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False)
 #else
 // A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C--
-#define IF_WRITE_BARRIER_ENABLED                               \
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
     if (0)
 #define nonmoving_write_barrier_enabled 0
 #endif
 
 // A useful helper for pushing a pointer to the update remembered set.
-// See Note [Update remembered set] in NonMovingMark.c.
 #define updateRemembSetPushPtr(p)                                    \
-    IF_WRITE_BARRIER_ENABLED {                                       \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                             \
       ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr");     \
     }


=====================================
includes/Rts.h
=====================================
@@ -68,6 +68,10 @@ extern "C" {
 #define RTS_UNREACHABLE abort()
 #endif
 
+/* Prefetch primitives */
+#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
+#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
+
 /* Fix for mingw stat problem (done here so it's early enough) */
 #if defined(mingw32_HOST_OS)
 #define __MSVCRT__ 1


=====================================
includes/rts/NonMoving.h
=====================================
@@ -21,4 +21,7 @@ void updateRemembSetPushClosure(Capability *cap, StgClosure *p);
 
 void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p);
 
+// Note that RTS code should not condition on this directly but rather
+// use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
+// the barrier is eliminated in the non-threaded RTS.
 extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);


=====================================
nofib
=====================================
@@ -1 +1 @@
-Subproject commit f87d446b4e361cc82f219cf78917db9681af69b3
+Subproject commit ac596ee3e71bed874b6830361a31ca23ff4aa1a6


=====================================
rts/Messages.c
=====================================
@@ -256,7 +256,7 @@ loop:
 
         // point to the BLOCKING_QUEUE from the BLACKHOLE
         write_barrier(); // make the BQ visible
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure*)p);
         }
         ((StgInd*)bh)->indirectee = (StgClosure *)bq;
@@ -287,7 +287,7 @@ loop:
         }
 #endif
 
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             // We are about to overwrite bq->queue; make sure its current value
             // makes it into the update remembered set
             updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);


=====================================
rts/PrimOps.cmm
=====================================
@@ -474,7 +474,7 @@ stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n)
     end = p + WDS(n);
 
 again:
-    IF_WRITE_BARRIER_ENABLED {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr");
     }
     p = p + WDS(1);
@@ -490,7 +490,7 @@ stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
     W_ dst_p, src_p, bytes;
 
     if (n > 0) {
-        IF_WRITE_BARRIER_ENABLED {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
                                       dst, dst_off, n);
         }
@@ -511,7 +511,7 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
     W_ dst_p, src_p, bytes;
 
     if (n > 0) {
-        IF_WRITE_BARRIER_ENABLED {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
                                       dst, dst_off, n);
         }


=====================================
rts/STM.c
=====================================
@@ -297,8 +297,10 @@ static StgClosure *lock_tvar(Capability *cap,
   } while (cas((void *)&(s -> current_value),
                (StgWord)result, (StgWord)trec) != (StgWord)result);
 
-  if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
-      updateRemembSetPushClosure(cap, result);
+
+  IF_NONMOVING_WRITE_BARRIER_ENABLED {
+      if (result)
+          updateRemembSetPushClosure(cap, result);
   }
   return result;
 }
@@ -323,8 +325,9 @@ static StgBool cond_lock_tvar(Capability *cap,
   TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected);
   w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec);
   result = (StgClosure *)w;
-  if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
-      updateRemembSetPushClosure(cap, expected);
+  IF_NONMOVING_WRITE_BARRIER_ENABLED {
+      if (result)
+          updateRemembSetPushClosure(cap, expected);
   }
   TRACE("%p : %s", trec, result ? "success" : "failure");
   return (result == expected);


=====================================
rts/Schedule.c
=====================================
@@ -2500,7 +2500,7 @@ resumeThread (void *task_)
     incall->suspended_tso = NULL;
     incall->suspended_cap = NULL;
     // we will modify tso->_link
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushClosure(cap, (StgClosure *)tso->_link);
     }
     tso->_link = END_TSO_QUEUE;


=====================================
rts/ThreadPaused.c
=====================================
@@ -330,15 +330,16 @@ threadPaused(Capability *cap, StgTSO *tso)
             }
 #endif
 
-            if (RTS_UNLIKELY(nonmoving_write_barrier_enabled
-                             && ip_THUNK(INFO_PTR_TO_STRUCT(bh_info)))) {
-                // We are about to replace a thunk with a blackhole.
-                // Add the free variables of the closure we are about to
-                // overwrite to the update remembered set.
-                // N.B. We caught the WHITEHOLE case above.
-                updateRemembSetPushThunkEager(cap,
-                                             THUNK_INFO_PTR_TO_STRUCT(bh_info),
-                                             (StgThunk *) bh);
+            IF_NONMOVING_WRITE_BARRIER_ENABLED {
+                if (ip_THUNK(INFO_PTR_TO_STRUCT(bh_info))) {
+                    // We are about to replace a thunk with a blackhole.
+                    // Add the free variables of the closure we are about to
+                    // overwrite to the update remembered set.
+                    // N.B. We caught the WHITEHOLE case above.
+                    updateRemembSetPushThunkEager(cap,
+                                                 THUNK_INFO_PTR_TO_STRUCT(bh_info),
+                                                 (StgThunk *) bh);
+                }
             }
 
             // The payload of the BLACKHOLE points to the TSO


=====================================
rts/Threads.c
=====================================
@@ -711,7 +711,7 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
             barf("threadStackUnderflow: not enough space for return values");
         }
 
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             // ensure that values that we copy into the new stack are marked
             // for the nonmoving collector. Note that these values won't
             // necessarily form a full closure so we need to handle them


=====================================
rts/Updates.h
=====================================
@@ -44,7 +44,7 @@
     W_ bd;                                                      \
                                                                 \
     OVERWRITING_CLOSURE(p1);                                    \
-    IF_WRITE_BARRIER_ENABLED {                                  \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                        \
       ccall updateRemembSetPushThunk_(BaseReg, p1 "ptr");       \
     }                                                           \
     StgInd_indirectee(p1) = p2;                                 \
@@ -73,7 +73,7 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,
     /* not necessarily true: ASSERT( !closure_IND(p1) ); */
     /* occurs in RaiseAsync.c:raiseAsync() */
     OVERWRITING_CLOSURE(p1);
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushThunk(cap, (StgThunk*)p1);
     }
     ((StgInd *)p1)->indirectee = p2;


=====================================
rts/sm/GC.c
=====================================
@@ -267,6 +267,14 @@ GarbageCollect (uint32_t collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);
 
+#if defined(THREADED_RTS)
+  if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) {
+      N--;
+      collect_gen--;
+      major_gc = false;
+  }
+#endif
+
   /* N.B. The nonmoving collector works a bit differently. See
    * Note [Static objects under the nonmoving collector].
    */


=====================================
rts/sm/NonMoving.c
=====================================
@@ -167,6 +167,20 @@ static struct NonmovingSegment *nonmovingPopFreeSegment(void)
     }
 }
 
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size)
+{
+  // We compute the overwhelmingly common size cases directly to avoid a very
+  // expensive integer division.
+  switch (log_block_size) {
+    case 3:  return nonmovingBlockCount(3);
+    case 4:  return nonmovingBlockCount(4);
+    case 5:  return nonmovingBlockCount(5);
+    case 6:  return nonmovingBlockCount(6);
+    case 7:  return nonmovingBlockCount(7);
+    default: return nonmovingBlockCount(log_block_size);
+  }
+}
+
 /*
  * Request a fresh segment from the free segment list or allocate one of the
  * given node.
@@ -215,10 +229,10 @@ static inline unsigned long log2_ceil(unsigned long x)
 }
 
 // Advance a segment's next_free pointer. Returns true if segment if full.
-static bool advance_next_free(struct NonmovingSegment *seg)
+static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count)
 {
     const uint8_t *bitmap = seg->bitmap;
-    const unsigned int blk_count = nonmovingSegmentBlockCount(seg);
+    ASSERT(blk_count == nonmovingSegmentBlockCount(seg));
 #if defined(NAIVE_ADVANCE_FREE)
     // reference implementation
     for (unsigned int i = seg->next_free+1; i < blk_count; i++) {
@@ -260,22 +274,23 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
 GNUC_ATTR_HOT
 void *nonmovingAllocate(Capability *cap, StgWord sz)
 {
-    unsigned int allocator_idx = log2_ceil(sz * sizeof(StgWord)) - NONMOVING_ALLOCA0;
+    unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
+    unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);
 
     // The max we ever allocate is 3276 bytes (anything larger is a large
     // object and not moved) which is covered by allocator 9.
-    ASSERT(allocator_idx < NONMOVING_ALLOCA_CNT);
+    ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT);
 
-    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[allocator_idx];
+    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0];
 
     // Allocate into current segment
     struct NonmovingSegment *current = alloca->current[cap->no];
     ASSERT(current); // current is never NULL
-    void *ret = nonmovingSegmentGetBlock(current, current->next_free);
+    void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free);
     ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment
 
     // Advance the current segment's next_free or allocate a new segment if full
-    bool full = advance_next_free(current);
+    bool full = advance_next_free(current, block_count);
     if (full) {
         // Current segment is full: update live data estimate link it to
         // filled, take an active segment if one exists, otherwise allocate a
@@ -283,8 +298,9 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
 
         // Update live data estimate.
         // See Note [Live data accounting in nonmoving collector].
-        unsigned int new_blocks =  nonmovingSegmentBlockCount(current) - current->next_free_snap;
-        atomic_inc(&oldest_gen->live_estimate, new_blocks * nonmovingSegmentBlockSize(current) / sizeof(W_));
+        unsigned int new_blocks = block_count - current->next_free_snap;
+        unsigned int block_size = 1 << log_block_size;
+        atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_));
 
         // push the current segment to the filled list
         nonmovingPushFilledSegment(current);
@@ -295,7 +311,7 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
         // there are no active segments, allocate new segment
         if (new_current == NULL) {
             new_current = nonmovingAllocSegment(cap->node);
-            nonmovingInitSegment(new_current, NONMOVING_ALLOCA0 + allocator_idx);
+            nonmovingInitSegment(new_current, log_block_size);
         }
 
         // make it current
@@ -379,33 +395,12 @@ void nonmovingAddCapabilities(uint32_t new_n_caps)
     nonmovingHeap.n_caps = new_n_caps;
 }
 
-static void nonmovingClearBitmap(struct NonmovingSegment *seg)
+static inline void nonmovingClearBitmap(struct NonmovingSegment *seg)
 {
     unsigned int n = nonmovingSegmentBlockCount(seg);
     memset(seg->bitmap, 0, n);
 }
 
-static void nonmovingClearSegmentBitmaps(struct NonmovingSegment *seg)
-{
-    while (seg) {
-        nonmovingClearBitmap(seg);
-        seg = seg->link;
-    }
-}
-
-static void nonmovingClearAllBitmaps(void)
-{
-    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
-        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
-        nonmovingClearSegmentBitmaps(alloca->filled);
-    }
-
-    // Clear large object bits
-    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
-        bd->flags &= ~BF_MARKED;
-    }
-}
-
 /* Prepare the heap bitmaps and snapshot metadata for a mark */
 static void nonmovingPrepareMark(void)
 {
@@ -414,7 +409,9 @@ static void nonmovingPrepareMark(void)
     static_flag =
         static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
 
-    nonmovingClearAllBitmaps();
+    // Should have been cleared by the last sweep
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+
     nonmovingBumpEpoch();
     for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
         struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
@@ -425,11 +422,28 @@ static void nonmovingPrepareMark(void)
             seg->next_free_snap = seg->next_free;
         }
 
-        // Update filled segments' snapshot pointers
-        struct NonmovingSegment *seg = alloca->filled;
-        while (seg) {
-            seg->next_free_snap = seg->next_free;
-            seg = seg->link;
+        // Update filled segments' snapshot pointers and move to sweep_list
+        uint32_t n_filled = 0;
+        struct NonmovingSegment *const filled = alloca->filled;
+        alloca->filled = NULL;
+        if (filled) {
+            struct NonmovingSegment *seg = filled;
+            while (true) {
+                n_filled++;
+                prefetchForRead(seg->link);
+                // Clear bitmap
+                prefetchForWrite(seg->link->bitmap);
+                nonmovingClearBitmap(seg);
+                // Set snapshot
+                seg->next_free_snap = seg->next_free;
+                if (seg->link)
+                    seg = seg->link;
+                else
+                    break;
+            }
+            // add filled segments to sweep_list
+            seg->link = nonmovingHeap.sweep_list;
+            nonmovingHeap.sweep_list = filled;
         }
 
         // N.B. It's not necessary to update snapshot pointers of active segments;
@@ -450,6 +464,12 @@ static void nonmovingPrepareMark(void)
     oldest_gen->n_large_blocks = 0;
     nonmoving_live_words = 0;
 
+    // Clear large object bits
+    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
+        bd->flags &= ~BF_MARKED;
+    }
+
+
 #if defined(DEBUG)
     debug_caf_list_snapshot = debug_caf_list;
     debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
@@ -500,7 +520,6 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
     resizeGenerations();
 
     nonmovingPrepareMark();
-    nonmovingPrepareSweep();
 
     // N.B. These should have been cleared at the end of the last sweep.
     ASSERT(nonmoving_marked_large_objects == NULL);


=====================================
rts/sm/NonMoving.h
=====================================
@@ -92,6 +92,9 @@ struct NonmovingHeap {
 extern struct NonmovingHeap nonmovingHeap;
 
 extern uint64_t nonmoving_live_words;
+#if defined(THREADED_RTS)
+extern bool concurrent_coll_running;
+#endif
 
 void nonmovingInit(void);
 void nonmovingExit(void);
@@ -170,28 +173,24 @@ INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size)
   return segment_data_size / (blk_size + 1);
 }
 
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size);
+
 // How many blocks does the given segment contain? Also the size of the bitmap.
 INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
 {
-  // We compute the overwhelmingly common size cases directly to avoid a very
-  // expensive integer division.
-  switch (seg->block_size) {
-    case 3:  return nonmovingBlockCount(3);
-    case 4:  return nonmovingBlockCount(4);
-    case 5:  return nonmovingBlockCount(5);
-    case 6:  return nonmovingBlockCount(6);
-    case 7:  return nonmovingBlockCount(7);
-    default: return nonmovingBlockCount(seg->block_size);
-  }
+  return nonmovingBlockCountFromSize(seg->block_size);
 }
 
-// Get a pointer to the given block index
-INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+// Get a pointer to the given block index assuming that the block size is as
+// given (avoiding a potential cache miss when this information is already
+// available). The log_block_size argument must be equal to seg->block_size.
+INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i)
 {
+  ASSERT(log_block_size == seg->block_size);
   // Block size in bytes
-  unsigned int blk_size = nonmovingSegmentBlockSize(seg);
+  unsigned int blk_size = 1 << log_block_size;
   // Bitmap size in bytes
-  W_ bitmap_size = nonmovingSegmentBlockCount(seg) * sizeof(uint8_t);
+  W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t);
   // Where the actual data starts (address of the first block).
   // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that
   // ROUNDUP_BYTES_TO_WDS returns in _words_, not in _bytes_, so convert it back
@@ -200,15 +199,26 @@ INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmo
   return (void*)(data + i*blk_size);
 }
 
+// Get a pointer to the given block index.
+INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+  return nonmovingSegmentGetBlock_(seg, seg->block_size, i);
+}
+
 // Get the segment which a closure resides in. Assumes that pointer points into
 // non-moving heap.
-INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p)
 {
-    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
     const uintptr_t mask = ~NONMOVING_SEGMENT_MASK;
     return (struct NonmovingSegment *) (((uintptr_t) p) & mask);
 }
 
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    return nonmovingGetSegment_unchecked(p);
+}
+
 INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p)
 {
     ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));


=====================================
rts/sm/NonMovingMark.c
=====================================
@@ -410,11 +410,8 @@ void push_closure (MarkQueue *q,
                    StgClosure *p,
                    StgClosure **origin)
 {
-    // TODO: Push this into callers where they already have the Bdescr
-    if (HEAP_ALLOCED_GC(p) && (Bdescr((StgPtr) p)->gen != oldest_gen))
-        return;
-
 #if defined(DEBUG)
+    ASSERT(!HEAP_ALLOCED_GC(p) || (Bdescr((StgPtr) p)->gen == oldest_gen));
     ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
     // Commenting out: too slow
     // if (RtsFlags.DebugFlags.sanity) {
@@ -527,15 +524,11 @@ void updateRemembSetPushThunkEager(Capability *cap,
         MarkQueue *queue = &cap->upd_rem_set.queue;
         push_thunk_srt(queue, &info->i);
 
-        // Don't record the origin of objects living outside of the nonmoving
-        // heap; we can't perform the selector optimisation on them anyways.
-        bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk);
-
         for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
             if (check_in_nonmoving_heap(thunk->payload[i])) {
-                push_closure(queue,
-                             thunk->payload[i],
-                             record_origin ? &thunk->payload[i] : NULL);
+                // Don't bother to push origin; it makes the barrier needlessly
+                // expensive with little benefit.
+                push_closure(queue, thunk->payload[i], NULL);
             }
         }
         break;
@@ -544,7 +537,9 @@ void updateRemembSetPushThunkEager(Capability *cap,
     {
         MarkQueue *queue = &cap->upd_rem_set.queue;
         StgAP *ap = (StgAP *) thunk;
-        push_closure(queue, ap->fun, &ap->fun);
+        if (check_in_nonmoving_heap(ap->fun)) {
+            push_closure(queue, ap->fun, NULL);
+        }
         mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
         break;
     }
@@ -565,9 +560,10 @@ void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p)
 
 inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p)
 {
-    if (!check_in_nonmoving_heap(p)) return;
-    MarkQueue *queue = &cap->upd_rem_set.queue;
-    push_closure(queue, p, NULL);
+    if (check_in_nonmoving_heap(p)) {
+        MarkQueue *queue = &cap->upd_rem_set.queue;
+        push_closure(queue, p, NULL);
+    }
 }
 
 void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p)
@@ -664,7 +660,10 @@ void markQueuePushClosure (MarkQueue *q,
                            StgClosure *p,
                            StgClosure **origin)
 {
-    push_closure(q, p, origin);
+    // TODO: Push this into callers where they already have the Bdescr
+    if (check_in_nonmoving_heap(p)) {
+        push_closure(q, p, origin);
+    }
 }
 
 /* TODO: Do we really never want to specify the origin here? */
@@ -701,7 +700,7 @@ void markQueuePushArray (MarkQueue *q,
  *********************************************************/
 
 // Returns invalid MarkQueueEnt if queue is empty.
-static MarkQueueEnt markQueuePop (MarkQueue *q)
+static MarkQueueEnt markQueuePop_ (MarkQueue *q)
 {
     MarkQueueBlock *top;
 
@@ -732,6 +731,47 @@ again:
     return ent;
 }
 
+static MarkQueueEnt markQueuePop (MarkQueue *q)
+{
+#if MARK_PREFETCH_QUEUE_DEPTH == 0
+    return markQueuePop_(q);
+#else
+    unsigned int i = q->prefetch_head;
+    while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) {
+        MarkQueueEnt new = markQueuePop_(q);
+        if (nonmovingMarkQueueEntryType(&new) == NULL_ENTRY) {
+            // Mark queue is empty; look for any valid entries in the prefetch
+            // queue
+            for (unsigned int j = (i+1) % MARK_PREFETCH_QUEUE_DEPTH;
+                 j != i;
+                 j = (j+1) % MARK_PREFETCH_QUEUE_DEPTH)
+            {
+                if (nonmovingMarkQueueEntryType(&q->prefetch_queue[j]) != NULL_ENTRY) {
+                    i = j;
+                    goto done;
+                }
+            }
+            return new;
+        }
+
+        // The entry may not be a MARK_CLOSURE but it doesn't matter, our
+        // MarkQueueEnt encoding always places the pointer to the object to be
+        // marked first.
+        prefetchForRead(&new.mark_closure.p->header.info);
+        prefetchForRead(&nonmovingGetSegment_unchecked((StgPtr) new.mark_closure.p)->block_size);
+        q->prefetch_queue[i] = new;
+        i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH;
+    }
+
+  done:
+    ;
+    MarkQueueEnt ret = q->prefetch_queue[i];
+    q->prefetch_queue[i].null_entry.p = NULL;
+    q->prefetch_head = i;
+    return ret;
+#endif
+}
+
 /*********************************************************
  * Creating and destroying MarkQueues and UpdRemSets
  *********************************************************/
@@ -743,6 +783,10 @@ static void init_mark_queue_ (MarkQueue *queue)
     queue->blocks = bd;
     queue->top = (MarkQueueBlock *) bd->start;
     queue->top->head = 0;
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    memset(&queue->prefetch_queue, 0, sizeof(queue->prefetch_queue));
+    queue->prefetch_head = 0;
+#endif
 }
 
 /* Must hold sm_mutex. */


=====================================
rts/sm/NonMovingMark.h
=====================================
@@ -84,6 +84,9 @@ typedef struct {
     MarkQueueEnt entries[];
 } MarkQueueBlock;
 
+// How far ahead in mark queue to prefetch?
+#define MARK_PREFETCH_QUEUE_DEPTH 5
+
 /* The mark queue is not capable of concurrent read or write.
  *
  * invariants:
@@ -101,6 +104,13 @@ typedef struct MarkQueue_ {
 
     // Is this a mark queue or a capability-local update remembered set?
     bool is_upd_rem_set;
+
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    // A ring-buffer of entries which we will mark next
+    MarkQueueEnt prefetch_queue[MARK_PREFETCH_QUEUE_DEPTH];
+    // The first free slot in prefetch_queue.
+    uint8_t prefetch_head;
+#endif
 } MarkQueue;
 
 /* While it shares its representation with MarkQueue, UpdRemSet differs in
@@ -133,6 +143,15 @@ extern StgIndStatic *debug_caf_list_snapshot;
 extern MarkQueue *current_mark_queue;
 extern bdescr *upd_rem_set_block_list;
 
+// A similar macro is defined in includes/Cmm.h for C-- code.
+#if defined(THREADED_RTS)
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+#else
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (0)
+#endif
+
 void nonmovingMarkInitUpdRemSet(void);
 
 void init_upd_rem_set(UpdRemSet *rset);


=====================================
rts/sm/NonMovingSweep.c
=====================================
@@ -17,38 +17,6 @@
 #include "Trace.h"
 #include "StableName.h"
 
-static struct NonmovingSegment *pop_all_filled_segments(struct NonmovingAllocator *alloc)
-{
-    while (true) {
-        struct NonmovingSegment *head = alloc->filled;
-        if (cas((StgVolatilePtr) &alloc->filled, (StgWord) head, (StgWord) NULL) == (StgWord) head)
-            return head;
-    }
-}
-
-void nonmovingPrepareSweep()
-{
-    ASSERT(nonmovingHeap.sweep_list == NULL);
-
-    // Move blocks in the allocators' filled lists into sweep_list
-    for (unsigned int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++)
-    {
-        struct NonmovingAllocator *alloc = nonmovingHeap.allocators[alloc_idx];
-        struct NonmovingSegment *filled = pop_all_filled_segments(alloc);
-
-        // Link filled to sweep_list
-        if (filled) {
-            struct NonmovingSegment *filled_head = filled;
-            // Find end of filled list
-            while (filled->link) {
-                filled = filled->link;
-            }
-            filled->link = nonmovingHeap.sweep_list;
-            nonmovingHeap.sweep_list = filled_head;
-        }
-    }
-}
-
 // On which list should a particular segment be placed?
 enum SweepResult {
     SEGMENT_FREE,     // segment is empty: place on free list


=====================================
rts/sm/NonMovingSweep.h
=====================================
@@ -22,10 +22,6 @@ void nonmovingSweepLargeObjects(void);
 // Remove dead entries in the stable name table
 void nonmovingSweepStableNameTable(void);
 
-// Collect the set of segments to be collected during a major GC into
-// nonmovingHeap.sweep_list.
-void nonmovingPrepareSweep(void);
-
 #if defined(DEBUG)
 // The non-moving equivalent of the moving collector's gcCAFs.
 void nonmovingGcCafs(void);


=====================================
rts/sm/Storage.c
=====================================
@@ -478,7 +478,7 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     // reference should be in SRTs
     ASSERT(orig_info_tbl->layout.payload.ptrs == 0);
     // Becuase the payload is empty we just push the SRT
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(orig_info_tbl);
         if (thunk_info->i.srt) {
             updateRemembSetPushClosure(cap, GET_SRT(thunk_info));
@@ -1205,7 +1205,7 @@ dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
     if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) {
         mvar->header.info = &stg_MUT_VAR_DIRTY_info;
         recordClosureMutated(cap, (StgClosure *) mvar);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure_(reg, old);
         }
     }
@@ -1224,7 +1224,7 @@ dirty_TVAR(Capability *cap, StgTVar *p,
     if (p->header.info == &stg_TVAR_CLEAN_info) {
         p->header.info = &stg_TVAR_DIRTY_info;
         recordClosureMutated(cap,(StgClosure*)p);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, old);
         }
     }
@@ -1241,8 +1241,9 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
     if (tso->dirty == 0) {
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure *) tso->_link);
+        }
     }
     tso->_link = target;
 }
@@ -1253,8 +1254,9 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
     if (tso->dirty == 0) {
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
-        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev);
+        }
     }
     tso->block_info.prev = target;
 }
@@ -1267,8 +1269,9 @@ dirty_TSO (Capability *cap, StgTSO *tso)
         recordClosureMutated(cap,(StgClosure*)tso);
     }
 
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushTSO(cap, tso);
+    }
 }
 
 void
@@ -1276,8 +1279,9 @@ dirty_STACK (Capability *cap, StgStack *stack)
 {
     // First push to upd_rem_set before we set stack->dirty since we
     // the nonmoving collector may already be marking the stack.
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         updateRemembSetPushStack(cap, stack);
+    }
 
     if (! (stack->dirty & STACK_DIRTY)) {
         stack->dirty = STACK_DIRTY;
@@ -1301,7 +1305,7 @@ void
 update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val)
 {
     Capability *cap = regTableToCapability(reg);
-    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
         StgMVar *mvar = (StgMVar *) p;
         updateRemembSetPushClosure(cap, old_val);
         updateRemembSetPushClosure(cap, (StgClosure *) mvar->head);



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/f36efd897f39a5f0f835d63072d6aee942276e21...0dc9f62d456db8b9614662666ab3192ab684617b


