[Git][ghc/ghc][master] 3 commits: docs: move -xn flag beside --nonmoving-gc
Marge Bot (@marge-bot)
gitlab at gitlab.haskell.org
Tue Sep 12 08:31:32 UTC 2023
Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
98166389 by Teo Camarasu at 2023-09-12T04:30:54-04:00
docs: move -xn flag beside --nonmoving-gc
It makes sense to have these beside each other as they are aliases.
- - - - -
f367835c by Teo Camarasu at 2023-09-12T04:30:55-04:00
nonmoving: introduce a family of dense allocators
Supplement the existing power-of-2-sized nonmoving allocators with a family
of dense allocators up to a configurable threshold.
This should reduce waste from rounding up block sizes while keeping the
number of allocator sizes manageable.
This patch:
- Adds a new configuration option `--nonmoving-dense-allocator-count`
to control the number of these new dense allocators.
- Adds some constants to `NonmovingAllocator` in order to keep
marking fast with the new allocators.
Resolves #23340
- - - - -
2b07bf2e by Teo Camarasu at 2023-09-12T04:30:55-04:00
Add changelog entry for #23340
- - - - -
17 changed files:
- docs/users_guide/9.10.1-notes.rst
- docs/users_guide/eventlog-formats.rst
- docs/users_guide/runtime_control.rst
- rts/RtsFlags.c
- rts/Trace.c
- rts/Trace.h
- rts/eventlog/EventLog.c
- rts/eventlog/EventLog.h
- rts/gen_event_types.py
- rts/include/rts/Flags.h
- rts/include/rts/storage/Block.h
- rts/sm/NonMoving.c
- rts/sm/NonMoving.h
- rts/sm/NonMovingAllocate.c
- rts/sm/NonMovingCensus.c
- rts/sm/Sanity.c
- rts/sm/Storage.c
Changes:
=====================================
docs/users_guide/9.10.1-notes.rst
=====================================
@@ -60,6 +60,11 @@ GHCi
Runtime system
~~~~~~~~~~~~~~
+- Internal fragmentation incurred by the non-moving GC's allocator has been reduced for small objects.
+ In one real-world application, this has reduced resident set size by about 20% and modestly improved run-time.
+ See :ghc-ticket:`23340`.
+ :rts-flag:`--nonmoving-dense-allocator-count=⟨count⟩` has been added to fine-tune this behaviour.
+
``base`` library
~~~~~~~~~~~~~~~~
=====================================
docs/users_guide/eventlog-formats.rst
=====================================
@@ -922,7 +922,7 @@ heap.
:tag: 207
:length: fixed
- :field Word8: base-2 logarithm of *blk_sz*.
+ :field Word16: *blk_sz* in bytes.
:field Word32: number of active segments.
:field Word32: number of filled segments.
:field Word32: number of live blocks.
=====================================
docs/users_guide/runtime_control.rst
=====================================
@@ -411,6 +411,29 @@ performance.
Note that :rts-flag:`--nonmoving-gc` cannot be used with ``-G1``,
:rts-flag:`profiling <-hc>` nor :rts-flag:`-c`.
+.. rts-flag:: -xn
+
+ :default: off
+ :since: 8.10.1
+
+ An alias for :rts-flag:`--nonmoving-gc`
+
+.. rts-flag:: --nonmoving-dense-allocator-count=⟨count⟩
+
+ :default: 16
+ :since: 9.10.1
+ :reverse: none
+
+ Specify the number of dense allocators used by the non-moving garbage collector.
+
+ Increasing this value is likely to decrease the amount of memory lost to
+ internal fragmentation while marginally increasing the baseline memory requirements
+ and potentially regressing other metrics.
+
+ Large values are likely to lead to diminishing returns as, in practice,
+ the Haskell heap tends to be dominated by small objects.
+
+
.. rts-flag:: -w
:default: off
@@ -422,13 +445,6 @@ performance.
(:rts-flag:`-hT`) unless linked against the profiling runtime system with
:ghc-flag:`-prof`.
-.. rts-flag:: -xn
-
- :default: off
- :since: 8.10.1
-
- An alias for :rts-flag:`--nonmoving-gc`
-
.. rts-flag:: -A ⟨size⟩
:default: 4MB
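
A quick usage sketch, not taken from the patch: the new count flag only matters when the non-moving collector is in use, so for a hypothetical binary ./app linked with -rtsopts one might run

    ./app +RTS --nonmoving-gc --nonmoving-dense-allocator-count=32 -RTS

which doubles the default of 16 dense allocators and, on a 64-bit machine, extends the exact-fit block sizes from 128 bytes up to 8 + 31*8 = 256 bytes.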
=====================================
rts/RtsFlags.c
=====================================
@@ -166,6 +166,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.oldGenFactor = 2;
RtsFlags.GcFlags.returnDecayFactor = 4;
RtsFlags.GcFlags.useNonmoving = false;
+ RtsFlags.GcFlags.nonmovingDenseAllocatorCount = 16;
RtsFlags.GcFlags.generations = 2;
RtsFlags.GcFlags.squeezeUpdFrames = true;
RtsFlags.GcFlags.compact = false;
@@ -1028,6 +1029,17 @@ error = true;
OPTION_SAFE;
RtsFlags.GcFlags.useNonmoving = true;
}
+ else if (!strncmp("nonmoving-dense-allocator-count=",
+ &rts_argv[arg][2], 32)) {
+ OPTION_SAFE;
+ int32_t threshold = strtol(rts_argv[arg]+34, (char **) NULL, 10);
+ if (threshold < 1 || threshold > (uint16_t)-1) {
+ errorBelch("bad value for --nonmoving-dense-allocator-count");
+ error = true;
+ } else {
+ RtsFlags.GcFlags.nonmovingDenseAllocatorCount = threshold;
+ }
+ }
#if defined(THREADED_RTS)
#if defined(mingw32_HOST_OS)
else if (!strncmp("io-manager-threads",
=====================================
rts/Trace.c
=====================================
@@ -918,11 +918,11 @@ void traceConcUpdRemSetFlush(Capability *cap)
postConcUpdRemSetFlush(cap);
}
-void traceNonmovingHeapCensus(uint32_t log_blk_size,
+void traceNonmovingHeapCensus(uint16_t blk_size,
const struct NonmovingAllocCensus *census)
{
if (eventlog_enabled && TRACE_nonmoving_gc)
- postNonmovingHeapCensus(log_blk_size, census);
+ postNonmovingHeapCensus(blk_size, census);
}
void traceThreadStatus_ (StgTSO *tso USED_IF_DEBUG)
=====================================
rts/Trace.h
=====================================
@@ -329,7 +329,7 @@ void traceConcSyncEnd(void);
void traceConcSweepBegin(void);
void traceConcSweepEnd(void);
void traceConcUpdRemSetFlush(Capability *cap);
-void traceNonmovingHeapCensus(uint32_t log_blk_size,
+void traceNonmovingHeapCensus(uint16_t blk_size,
const struct NonmovingAllocCensus *census);
void traceIPE(const InfoProvEnt *ipe);
=====================================
rts/eventlog/EventLog.c
=====================================
@@ -1132,12 +1132,12 @@ void postConcMarkEnd(StgWord32 marked_obj_count)
RELEASE_LOCK(&eventBufMutex);
}
-void postNonmovingHeapCensus(int log_blk_size,
+void postNonmovingHeapCensus(uint16_t blk_size,
const struct NonmovingAllocCensus *census)
{
ACQUIRE_LOCK(&eventBufMutex);
postEventHeader(&eventBuf, EVENT_NONMOVING_HEAP_CENSUS);
- postWord8(&eventBuf, log_blk_size);
+ postWord16(&eventBuf, blk_size);
postWord32(&eventBuf, census->n_active_segs);
postWord32(&eventBuf, census->n_filled_segs);
postWord32(&eventBuf, census->n_live_blocks);
=====================================
rts/eventlog/EventLog.h
=====================================
@@ -194,7 +194,7 @@ void postIPE(const InfoProvEnt *ipe);
void postConcUpdRemSetFlush(Capability *cap);
void postConcMarkEnd(StgWord32 marked_obj_count);
-void postNonmovingHeapCensus(int log_blk_size,
+void postNonmovingHeapCensus(uint16_t blk_size,
const struct NonmovingAllocCensus *census);
#if defined(TICKY_TICKY)
=====================================
rts/gen_event_types.py
=====================================
@@ -131,7 +131,7 @@ event_types = [
EventType(204, 'CONC_SWEEP_BEGIN', [], 'Begin concurrent sweep phase'),
EventType(205, 'CONC_SWEEP_END', [], 'End concurrent sweep phase'),
EventType(206, 'CONC_UPD_REM_SET_FLUSH', [CapNo], 'Update remembered set flushed'),
- EventType(207, 'NONMOVING_HEAP_CENSUS', [Word8, Word32, Word32, Word32], 'Nonmoving heap census'),
+ EventType(207, 'NONMOVING_HEAP_CENSUS', [Word16, Word32, Word32, Word32], 'Nonmoving heap census'),
# Ticky-ticky profiling
EventType(210, 'TICKY_COUNTER_DEF', VariableLength, 'Ticky-ticky entry counter definition'),
=====================================
rts/include/rts/Flags.h
=====================================
@@ -54,6 +54,7 @@ typedef struct _GC_FLAGS {
double pcFreeHeap;
bool useNonmoving; // default = false
+ uint16_t nonmovingDenseAllocatorCount; // Number of dense nonmoving allocators. See Note [Allocator sizes]
uint32_t generations;
bool squeezeUpdFrames;
=====================================
rts/include/rts/storage/Block.h
=====================================
@@ -87,7 +87,8 @@
struct NonmovingSegmentInfo {
- StgWord8 log_block_size;
+ StgWord16 allocator_idx; // nonmovingHeap.allocators[allocator_idx] is
+ // this segment's allocator.
StgWord16 next_free_snap;
};
=====================================
rts/sm/NonMoving.c
=====================================
@@ -33,6 +33,8 @@
struct NonmovingHeap nonmovingHeap;
uint8_t nonmovingMarkEpoch = 1;
+uint8_t nonmoving_alloca_dense_cnt;
+uint8_t nonmoving_alloca_cnt;
static void nonmovingBumpEpoch(void) {
nonmovingMarkEpoch = nonmovingMarkEpoch == 1 ? 2 : 1;
@@ -244,6 +246,8 @@ static void nonmovingBumpEpoch(void) {
* - Note [Sync phase marking budget] describes how we avoid long mutator
* pauses during the sync phase
*
+ * - Note [Allocator sizes] goes into detail about our choice of allocator sizes.
+ *
* [ueno 2016]:
* Katsuhiro Ueno and Atsushi Ohori. 2016. A fully concurrent garbage
* collector for functional programs on multicore processors. SIGPLAN Not. 51,
@@ -527,6 +531,35 @@ static void nonmovingBumpEpoch(void) {
* TODO: Perhaps sync_phase_marking_budget should be controllable via a
* command-line argument?
*
+ *
+ * Note [Allocator sizes]
+ * ~~~~~~~~~~~~~~~~~~~~~~
+ * Our choice of allocator sizes has to balance several considerations:
+ * - Allocator sizes should be available for the most commonly requested block sizes,
+ * in order to avoid excessive waste from rounding up to the next size (internal fragmentation).
+ * - It should be possible to efficiently determine which allocator services
+ * a certain block size.
+ * - The number of allocators should be kept down to avoid overheads
+ * (e.g., each capability must have an allocator of each size)
+ * and the risk of fragmentation.
+ * - It should be possible to efficiently divide by the allocator size.
+ * This is necessary to implement marking efficiently. It's trivial
+ * to efficiently divide by powers of 2. But to do so efficiently with
+ * arbitrary allocator sizes, we need to do some precomputation and make
+ * use of the integer division by constants optimisation.
+ *
+ * We currently try to balance these considerations by adopting the following scheme.
+ * We have nonmoving_alloca_dense_cnt "dense" allocators starting with size
+ * NONMOVING_ALLOCA0, and incrementing by NONMOVING_ALLOCA_DENSE_INCREMENT.
+ * These service the vast majority of allocations.
+ * In practice, Haskell programs tend to allocate a lot of small objects.
+ *
+ * Other allocations are handled by a family of "sparse" allocators, each providing
+ * blocks up to a power of 2. This places an upper bound on the waste at half the
+ * required block size.
+ *
+ * See #23340
+ *
*/
memcount nonmoving_segment_live_words = 0;
@@ -535,6 +568,8 @@ memcount nonmoving_segment_live_words = 0;
MarkBudget sync_phase_marking_budget = 200000;
static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads, bool concurrent);
+static void nonmovingInitAllocator(struct NonmovingAllocator* alloc, uint16_t block_size);
+static void nonmovingInitAllocators(void);
static void nonmovingInitConcurrentWorker(void);
static void nonmovingStartConcurrentMark(MarkQueue *roots);
@@ -566,23 +601,42 @@ void nonmovingPushFreeSegment(struct NonmovingSegment *seg)
__sync_add_and_fetch(&nonmovingHeap.n_free, 1);
}
-unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size)
+void nonmovingInitAllocator(struct NonmovingAllocator* alloc, uint16_t block_size)
{
- // We compute the overwhelmingly common size cases directly to avoid a very
- // expensive integer division.
- switch (log_block_size) {
- case 3: return nonmovingBlockCount(3);
- case 4: return nonmovingBlockCount(4);
- case 5: return nonmovingBlockCount(5);
- case 6: return nonmovingBlockCount(6);
- case 7: return nonmovingBlockCount(7);
- default: return nonmovingBlockCount(log_block_size);
- }
+ *alloc = (struct NonmovingAllocator)
+ { .filled = NULL,
+ .saved_filled = NULL,
+ .active = NULL,
+ .block_size = block_size,
+ .block_count = nonmovingBlockCount(block_size),
+ .block_division_constant = ((uint32_t) -1) / block_size + 1
+ };
}
+void nonmovingInitAllocators(void)
+{
+ nonmoving_alloca_dense_cnt = RtsFlags.GcFlags.nonmovingDenseAllocatorCount;
+ uint16_t first_sparse_allocator = nonmoving_first_sparse_allocator_size();
+ uint16_t nonmoving_alloca_sparse_cnt = log2_ceil(NONMOVING_SEGMENT_SIZE) - first_sparse_allocator;
+ nonmoving_alloca_cnt = nonmoving_alloca_dense_cnt + nonmoving_alloca_sparse_cnt;
+
+ nonmovingHeap.allocators = stgMallocBytes(sizeof(struct NonmovingAllocator) * nonmoving_alloca_cnt, "allocators array");
+
+ // Initialise allocator sizes
+ for (unsigned int i = 0; i < nonmoving_alloca_dense_cnt; i++) {
+ nonmovingInitAllocator(&nonmovingHeap.allocators[i], NONMOVING_ALLOCA0 + i * sizeof(StgWord));
+ }
+ for (unsigned int i = nonmoving_alloca_dense_cnt; i < nonmoving_alloca_cnt; i++) {
+ uint16_t block_size = 1 << (i + first_sparse_allocator - nonmoving_alloca_dense_cnt);
+ nonmovingInitAllocator(&nonmovingHeap.allocators[i], block_size);
+ }
+}
+
+
void nonmovingInit(void)
{
if (! RtsFlags.GcFlags.useNonmoving) return;
+ nonmovingInitAllocators();
nonmovingInitConcurrentWorker();
nonmovingMarkInit();
}
@@ -606,7 +660,7 @@ static void nonmovingPrepareMark(void)
nonmovingHeap.n_caps = n_capabilities;
nonmovingBumpEpoch();
- for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+ for (int alloca_idx = 0; alloca_idx < nonmoving_alloca_cnt; ++alloca_idx) {
struct NonmovingAllocator *alloca = &nonmovingHeap.allocators[alloca_idx];
// Update current segments' snapshot pointers
@@ -990,7 +1044,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
// Walk the list of filled segments that we collected during preparation,
// updated their snapshot pointers and move them to the sweep list.
- for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+ for (int alloca_idx = 0; alloca_idx < nonmoving_alloca_cnt; ++alloca_idx) {
struct NonmovingSegment *filled = nonmovingHeap.allocators[alloca_idx].saved_filled;
if (filled) {
struct NonmovingSegment *seg = filled;
@@ -1220,7 +1274,7 @@ void assert_in_nonmoving_heap(StgPtr p)
}
}
- for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+ for (int alloca_idx = 0; alloca_idx < nonmoving_alloca_cnt; ++alloca_idx) {
struct NonmovingAllocator *alloca = &nonmovingHeap.allocators[alloca_idx];
// Search current segments
@@ -1259,13 +1313,12 @@ void assert_in_nonmoving_heap(StgPtr p)
void nonmovingPrintSegment(struct NonmovingSegment *seg)
{
int num_blocks = nonmovingSegmentBlockCount(seg);
- uint8_t log_block_size = nonmovingSegmentLogBlockSize(seg);
+ uint16_t block_size = nonmovingSegmentBlockSize(seg);
- debugBelch("Segment with %d blocks of size 2^%d (%d bytes, %u words, scan: %p)\n",
+ debugBelch("Segment with %d blocks of size: %d bytes, %u words, scan: %p\n",
num_blocks,
- log_block_size,
- 1 << log_block_size,
- (unsigned int) ROUNDUP_BYTES_TO_WDS(1 << log_block_size),
+ block_size,
+ (unsigned int) ROUNDUP_BYTES_TO_WDS(block_size),
(void*)Bdescr((P_)seg)->u.scan);
for (nonmoving_block_idx p_idx = 0; p_idx < seg->next_free; ++p_idx) {
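
The block_division_constant initialised above implements the usual multiply-by-reciprocal trick, so that marking can divide a within-segment offset by an arbitrary (not necessarily power-of-two) block size without an integer division. A minimal standalone sketch of the identity it relies on, not RTS code, assuming 16-bit numerators as is the case for offsets within a nonmoving segment:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* (x * c) >> 32 == x / d for c = UINT32_MAX / d + 1, checked for every
     * 16-bit numerator x and every block size d that is a multiple of 8 up
     * to 2048; c mirrors block_division_constant above. */
    int main(void) {
        for (uint32_t d = 8; d <= 2048; d += 8) {
            uint32_t c = UINT32_MAX / d + 1;
            for (uint32_t x = 0; x <= 0xFFFF; x++) {
                assert((uint32_t)(((uint64_t)x * c) >> 32) == x / d);
            }
        }
        printf("division-by-constant identity holds\n");
        return 0;
    }

The shift of 32 is NONMOVING_ALLOCA_DIVIDE_SHIFT; the identity holds because the numerator (an offset within a segment) stays well below 2^32 divided by the block size.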
=====================================
rts/sm/NonMoving.h
=====================================
@@ -70,10 +70,11 @@ struct NonmovingSegment {
// N.B. There are also bits of information which are stored in the
// NonmovingBlockInfo stored in the segment's block descriptor. Namely:
//
- // * the block size can be found in nonmovingBlockInfo(seg)->log_block_size.
// * the next_free snapshot can be found in
// nonmovingBlockInfo(seg)->next_free_snap.
//
+ // Some other information about the block size is stored on NonmovingAllocator.
+ //
// This allows us to mark a nonmoving closure without bringing the
// NonmovingSegment header into cache.
};
@@ -88,20 +89,41 @@ struct NonmovingAllocator {
struct NonmovingSegment *saved_filled;
struct NonmovingSegment *active;
// N.B. Per-capabilty "current" segment lives in Capability
+
+ // The size of each block for this allocator.
+ StgWord16 block_size;
+ // The number of blocks in a segment of this allocator.
+ // See nonmovingBlockCount for how this is calculated.
+ StgWord16 block_count;
+ // A constant for implementing the "division by a constant" optimisation.
+ // Invariant:
+ // (x * block_division_constant >> NONMOVING_ALLOCA_DIVIDE_SHIFT)
+ // = x / block_size
+ StgWord32 block_division_constant;
};
-// first allocator is of size 2^NONMOVING_ALLOCA0 (in bytes)
-#define NONMOVING_ALLOCA0 3
+// first allocator is of size NONMOVING_ALLOCA0 (in bytes)
+#define NONMOVING_ALLOCA0 8
+
+// used in conjunction with NonmovingAllocator.block_division_constant
+// to implement the "division by a constant" optimisation
+#define NONMOVING_ALLOCA_DIVIDE_SHIFT 32
+
+// Number of dense allocators.
+// These cover block sizes starting with NONMOVING_ALLOCA0
+// and increase in increments of NONMOVING_ALLOCA_INCREMENT
+extern uint8_t nonmoving_alloca_dense_cnt;
-// allocators cover block sizes of 2^NONMOVING_ALLOCA0 to
-// 2^(NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT) (in bytes)
-#define NONMOVING_ALLOCA_CNT 12
+// Total number of allocators (dense and sparse).
+// allocators cover block sizes of NONMOVING_ALLOCA0 to
+// NONMOVING_SEGMENT_SIZE (in bytes)
+extern uint8_t nonmoving_alloca_cnt;
// maximum number of free segments to hold on to
#define NONMOVING_MAX_FREE 16
struct NonmovingHeap {
- struct NonmovingAllocator allocators[NONMOVING_ALLOCA_CNT];
+ struct NonmovingAllocator *allocators;
// free segment list. This is a cache where we keep up to
// NONMOVING_MAX_FREE segments to avoid thrashing the block allocator.
// Note that segments in this list are still counted towards
@@ -151,19 +173,44 @@ void nonmovingCollect(StgWeak **dead_weaks,
void nonmovingPushFreeSegment(struct NonmovingSegment *seg);
+INLINE_HEADER unsigned long log2_ceil(unsigned long x)
+{
+ return (sizeof(unsigned long)*8) - __builtin_clzl(x-1);
+}
+
INLINE_HEADER struct NonmovingSegmentInfo *nonmovingSegmentInfo(struct NonmovingSegment *seg) {
return &Bdescr((StgPtr) seg)->nonmoving_segment;
}
-INLINE_HEADER uint8_t nonmovingSegmentLogBlockSize(struct NonmovingSegment *seg) {
- return nonmovingSegmentInfo(seg)->log_block_size;
+// Find the allocator a segment belongs to
+INLINE_HEADER struct NonmovingAllocator nonmovingSegmentAllocator(struct NonmovingSegment *seg) {
+ return nonmovingHeap.allocators[nonmovingSegmentInfo(seg)->allocator_idx];
+}
+
+// Determine the index of the allocator for blocks of a certain size
+INLINE_HEADER uint8_t nonmovingAllocatorForSize(uint16_t block_size){
+ if (block_size - NONMOVING_ALLOCA0 < nonmoving_alloca_dense_cnt * (uint16_t) sizeof(StgWord)) {
+ // dense case
+ return (block_size - NONMOVING_ALLOCA0) / sizeof(StgWord);
+ }
+ else {
+ // sparse case
+ return log2_ceil(block_size)
+ - log2_ceil(NONMOVING_ALLOCA0 + sizeof(StgWord) * nonmoving_alloca_dense_cnt)
+ + nonmoving_alloca_dense_cnt;
+ }
+}
+
+// The block size of a given segment in bytes.
+INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *seg)
+{
+ return nonmovingSegmentAllocator(seg).block_size;
}
// Add a segment to the appropriate active list.
INLINE_HEADER void nonmovingPushActiveSegment(struct NonmovingSegment *seg)
{
- struct NonmovingAllocator *alloc =
- &nonmovingHeap.allocators[nonmovingSegmentLogBlockSize(seg) - NONMOVING_ALLOCA0];
+ struct NonmovingAllocator *alloc = &nonmovingHeap.allocators[nonmovingAllocatorForSize(nonmovingSegmentBlockSize(seg))];
SET_SEGMENT_STATE(seg, ACTIVE);
while (true) {
struct NonmovingSegment *current_active = RELAXED_LOAD(&alloc->active);
@@ -177,8 +224,7 @@ INLINE_HEADER void nonmovingPushActiveSegment(struct NonmovingSegment *seg)
// Add a segment to the appropriate filled list.
INLINE_HEADER void nonmovingPushFilledSegment(struct NonmovingSegment *seg)
{
- struct NonmovingAllocator *alloc =
- &nonmovingHeap.allocators[nonmovingSegmentLogBlockSize(seg) - NONMOVING_ALLOCA0];
+ struct NonmovingAllocator *alloc = &nonmovingHeap.allocators[nonmovingAllocatorForSize(nonmovingSegmentBlockSize(seg))];
SET_SEGMENT_STATE(seg, FILLED);
while (true) {
struct NonmovingSegment *current_filled = (struct NonmovingSegment*) RELAXED_LOAD(&alloc->filled);
@@ -197,52 +243,43 @@ INLINE_HEADER void nonmovingPushFilledSegment(struct NonmovingSegment *seg)
//
void assert_in_nonmoving_heap(StgPtr p);
-// The block size of a given segment in bytes.
-INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *seg)
-{
- return 1 << nonmovingSegmentLogBlockSize(seg);
-}
-
// How many blocks does a segment with the given block size have?
-INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size)
+INLINE_HEADER unsigned int nonmovingBlockCount(uint16_t block_size)
{
unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment);
segment_data_size -= segment_data_size % SIZEOF_VOID_P;
- unsigned int blk_size = 1 << log_block_size;
// N.B. +1 accounts for the byte in the mark bitmap.
- return segment_data_size / (blk_size + 1);
+ unsigned int block_count = segment_data_size / (block_size + 1);
+ ASSERT(block_count < 0xfff); // must fit into StgWord16
+ return block_count;
}
-unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size);
-
// How many blocks does the given segment contain? Also the size of the bitmap.
INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
{
- return nonmovingBlockCountFromSize(nonmovingSegmentLogBlockSize(seg));
+ return nonmovingSegmentAllocator(seg).block_count;
}
// Get a pointer to the given block index assuming that the block size is as
// given (avoiding a potential cache miss when this information is already
// available). The log_block_size argument must be equal to seg->block_size.
-INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i)
+INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint16_t block_size, uint16_t block_count, nonmoving_block_idx i)
{
- ASSERT(log_block_size == nonmovingSegmentLogBlockSize(seg));
- // Block size in bytes
- unsigned int blk_size = 1 << log_block_size;
+ ASSERT(block_size == nonmovingSegmentBlockSize(seg));
// Bitmap size in bytes
- W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t);
+ W_ bitmap_size = block_count * sizeof(uint8_t);
// Where the actual data starts (address of the first block).
// Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that
// ROUNDUP_BYTES_TO_WDS returns in _words_, not in _bytes_, so convert it back
// back to bytes by multiplying with word size.
W_ data = ROUNDUP_BYTES_TO_WDS(((W_)seg) + sizeof(struct NonmovingSegment) + bitmap_size) * sizeof(W_);
- return (void*)(data + i*blk_size);
+ return (void*)(data + i*block_size);
}
// Get a pointer to the given block index.
INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
{
- return nonmovingSegmentGetBlock_(seg, nonmovingSegmentLogBlockSize(seg), i);
+ return nonmovingSegmentGetBlock_(seg, nonmovingSegmentBlockSize(seg), nonmovingSegmentBlockCount(seg), i);
}
// Get the segment which a closure resides in. Assumes that pointer points into
@@ -267,12 +304,23 @@ INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
return nonmovingGetSegment_unchecked(p);
}
+// Divide x by the block size of the segment.
+INLINE_HEADER uint16_t nonmovingSegmentDivideBySize(struct NonmovingSegment *seg, uint16_t x)
+{
+ return ((StgWord64) x * nonmovingSegmentAllocator(seg).block_division_constant) >> NONMOVING_ALLOCA_DIVIDE_SHIFT;
+}
+
INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p)
{
struct NonmovingSegment *seg = nonmovingGetSegment(p);
ptrdiff_t blk0 = (ptrdiff_t)nonmovingSegmentGetBlock(seg, 0);
ptrdiff_t offset = (ptrdiff_t)p - blk0;
- return (nonmoving_block_idx) (offset >> nonmovingSegmentLogBlockSize(seg));
+ return (nonmoving_block_idx) nonmovingSegmentDivideBySize(seg, offset);
+}
+
+INLINE_HEADER uint16_t nonmoving_first_sparse_allocator_size (void)
+{
+ return log2_ceil(NONMOVING_ALLOCA0 + (nonmoving_alloca_dense_cnt - 1) * sizeof(StgWord) + 1);
}
// TODO: Eliminate this
@@ -311,7 +359,7 @@ INLINE_HEADER bool nonmovingClosureMarkedThisCycle(StgPtr p)
INLINE_HEADER bool nonmovingSegmentBeingSwept(struct NonmovingSegment *seg)
{
struct NonmovingSegmentInfo *seginfo = nonmovingSegmentInfo(seg);
- unsigned int n = nonmovingBlockCountFromSize(seginfo->log_block_size);
+ unsigned int n = nonmovingSegmentBlockCount(seg);
return seginfo->next_free_snap >= n;
}
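
A standalone sketch, not RTS code, of how nonmovingAllocatorForSize above lays out the allocator array, assuming the defaults introduced by this patch: NONMOVING_ALLOCA0 = 8 bytes, an 8-byte StgWord, and 16 dense allocators:

    #include <stdint.h>
    #include <stdio.h>

    #define ALLOCA0   8   /* smallest block size in bytes (NONMOVING_ALLOCA0) */
    #define WORD      8   /* sizeof(StgWord) on a 64-bit machine */
    #define DENSE_CNT 16  /* default --nonmoving-dense-allocator-count */

    static unsigned log2_ceil(unsigned long x) {
        return (sizeof(unsigned long) * 8) - __builtin_clzl(x - 1);
    }

    /* Mirrors nonmovingAllocatorForSize: exact-fit index in the dense range,
     * then one allocator per power of two in the sparse tail. */
    static unsigned allocator_for_size(uint16_t block_size) {
        if (block_size - ALLOCA0 < DENSE_CNT * WORD)
            return (block_size - ALLOCA0) / WORD;
        return log2_ceil(block_size)
             - log2_ceil(ALLOCA0 + WORD * DENSE_CNT)
             + DENSE_CNT;
    }

    int main(void) {
        const uint16_t sizes[] = { 8, 16, 64, 128, 256, 512, 1024 };
        for (unsigned i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
            printf("%4u-byte blocks -> allocator index %2u\n",
                   (unsigned) sizes[i], allocator_for_size(sizes[i]));
        return 0;
    }

With these defaults the dense allocators cover 8- to 128-byte blocks exactly (indices 0-15), and the first sparse allocator starts at 256 bytes, which is what nonmoving_first_sparse_allocator_size() computes.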
=====================================
rts/sm/NonMovingAllocate.c
=====================================
@@ -17,19 +17,14 @@
enum AllocLockMode { NO_LOCK, ALLOC_SPIN_LOCK, SM_LOCK };
-static inline unsigned long log2_ceil(unsigned long x);
static struct NonmovingSegment *nonmovingAllocSegment(enum AllocLockMode mode, uint32_t node);
static void nonmovingClearBitmap(struct NonmovingSegment *seg);
-static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t log_block_size);
+static void nonmovingInitSegment(struct NonmovingSegment *seg, uint16_t block_size);
static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count);
static struct NonmovingSegment *nonmovingPopFreeSegment(void);
static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca);
static void *nonmovingAllocate_(enum AllocLockMode mode, Capability *cap, StgWord sz);
-static inline unsigned long log2_ceil(unsigned long x)
-{
- return (sizeof(unsigned long)*8) - __builtin_clzl(x-1);
-}
static inline void acquire_alloc_lock(enum AllocLockMode mode) {
switch (mode) {
@@ -97,14 +92,14 @@ static void nonmovingClearBitmap(struct NonmovingSegment *seg)
memset(seg->bitmap, 0, n);
}
-static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t log_block_size)
+static void nonmovingInitSegment(struct NonmovingSegment *seg, uint16_t allocator_idx)
{
bdescr *bd = Bdescr((P_) seg);
seg->link = NULL;
seg->todo_link = NULL;
seg->next_free = 0;
SET_SEGMENT_STATE(seg, FREE);
- bd->nonmoving_segment.log_block_size = log_block_size;
+ bd->nonmoving_segment.allocator_idx = allocator_idx;
bd->nonmoving_segment.next_free_snap = 0;
bd->u.scan = nonmovingSegmentGetBlock(seg, 0);
nonmovingClearBitmap(seg);
@@ -115,10 +110,10 @@ void nonmovingInitCapability(Capability *cap)
{
// Initialize current segment array
struct NonmovingSegment **segs =
- stgMallocBytes(sizeof(struct NonmovingSegment*) * NONMOVING_ALLOCA_CNT, "current segment array");
- for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+ stgMallocBytes(sizeof(struct NonmovingSegment*) * nonmoving_alloca_cnt, "current segment array");
+ for (unsigned int i = 0; i < nonmoving_alloca_cnt; i++) {
segs[i] = nonmovingAllocSegment(NO_LOCK, cap->node);
- nonmovingInitSegment(segs[i], NONMOVING_ALLOCA0 + i);
+ nonmovingInitSegment(segs[i], i);
SET_SEGMENT_STATE(segs[i], CURRENT);
}
cap->current_segments = segs;
@@ -190,20 +185,27 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
static void *nonmovingAllocate_(enum AllocLockMode mode, Capability *cap, StgWord sz)
{
- unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
- unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);
+ unsigned int block_size;
+ if (sz * sizeof(StgWord) <= NONMOVING_ALLOCA0 + (nonmoving_alloca_dense_cnt-1)*sizeof(StgWord)) {
+ block_size = sizeof(StgWord) * sz;
+ } else {
+ unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
+ block_size = 1 << log_block_size;
+ }
- // The max we ever allocate is 3276 bytes (anything larger is a large
+ // The max we ever allocate is NONMOVING_SEGMENT_SIZE bytes (anything larger is a large
// object and not moved) which is covered by allocator 9.
- ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT);
+ ASSERT(block_size < NONMOVING_SEGMENT_SIZE);
- unsigned int alloca_idx = log_block_size - NONMOVING_ALLOCA0;
+ unsigned int alloca_idx = nonmovingAllocatorForSize(block_size);
struct NonmovingAllocator *alloca = &nonmovingHeap.allocators[alloca_idx];
// Allocate into current segment
struct NonmovingSegment *current = cap->current_segments[alloca_idx];
ASSERT(current); // current is never NULL
- void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free);
+ ASSERT(block_size == nonmovingSegmentBlockSize(current));
+ unsigned int block_count = nonmovingSegmentBlockCount(current);
+ void *ret = nonmovingSegmentGetBlock_(current, block_size, block_count, current->next_free);
ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment
// Advance the current segment's next_free or allocate a new segment if full
@@ -216,7 +218,6 @@ static void *nonmovingAllocate_(enum AllocLockMode mode, Capability *cap, StgWor
// Update live data estimate.
// See Note [Live data accounting in nonmoving collector].
unsigned int new_blocks = block_count - nonmovingSegmentInfo(current)->next_free_snap;
- unsigned int block_size = 1 << log_block_size;
atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_));
// push the current segment to the filled list
@@ -228,7 +229,7 @@ static void *nonmovingAllocate_(enum AllocLockMode mode, Capability *cap, StgWor
// there are no active segments, allocate new segment
if (new_current == NULL) {
new_current = nonmovingAllocSegment(mode, cap->node);
- nonmovingInitSegment(new_current, log_block_size);
+ nonmovingInitSegment(new_current, alloca_idx);
}
// make it current
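
To make the payoff of the new rounding in nonmovingAllocate_ concrete, here is a small comparison sketch, not RTS code, again assuming 8-byte words and the default 16 dense allocators: requests up to 128 bytes now keep their exact word-rounded size, whereas previously every request was rounded up to a power of two (so, for example, a 9-word / 72-byte closure used to occupy a 128-byte block and now occupies a 72-byte one):

    #include <stdio.h>

    static unsigned log2_ceil(unsigned long x) {
        return (sizeof(unsigned long) * 8) - __builtin_clzl(x - 1);
    }

    /* old scheme: always round up to the next power of two */
    static unsigned old_block_size(unsigned bytes) { return 1u << log2_ceil(bytes); }

    /* new scheme (sketch of the branch in nonmovingAllocate_ above):
     * dense exact fit up to 8 + 15*8 = 128 bytes, power of two beyond */
    static unsigned new_block_size(unsigned bytes) {
        return bytes <= 8 + 15 * 8 ? bytes : 1u << log2_ceil(bytes);
    }

    int main(void) {
        for (unsigned words = 2; words <= 20; words += 3) {
            unsigned bytes = words * 8;   /* allocation sizes are whole words */
            printf("%3u bytes: old block %4u, new block %4u\n",
                   bytes, old_block_size(bytes), new_block_size(bytes));
        }
        return 0;
    }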
=====================================
rts/sm/NonMovingCensus.c
=====================================
@@ -133,7 +133,7 @@ void nonmovingPrintAllocatorCensus(bool collect_live_words)
if (!RtsFlags.GcFlags.useNonmoving)
return;
- for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+ for (int i=0; i < nonmoving_alloca_cnt; i++) {
struct NonmovingAllocCensus census =
nonmovingAllocatorCensus_(i, collect_live_words);
@@ -147,10 +147,10 @@ void nonmovingTraceAllocatorCensus(void)
if (!RtsFlags.GcFlags.useNonmoving && !TRACE_nonmoving_gc)
return;
- for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+ for (int i=0; i < nonmoving_alloca_cnt; i++) {
const struct NonmovingAllocCensus census = nonmovingAllocatorCensus(i);
- const uint32_t log_blk_size = i + NONMOVING_ALLOCA0;
- traceNonmovingHeapCensus(log_blk_size, &census);
+ const uint32_t blk_size = nonmovingHeap.allocators[i].block_size;
+ traceNonmovingHeapCensus(blk_size, &census);
}
#endif
}
=====================================
rts/sm/Sanity.c
=====================================
@@ -638,7 +638,7 @@ void checkNonmovingHeap (const struct NonmovingHeap *heap)
checkLargeObjects(nonmoving_large_objects);
checkLargeObjects(nonmoving_marked_large_objects);
checkCompactObjects(nonmoving_compact_objects);
- for (unsigned int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+ for (unsigned int i=0; i < nonmoving_alloca_cnt; i++) {
const struct NonmovingAllocator *alloc = &heap->allocators[i];
checkNonmovingSegments(alloc->filled);
checkNonmovingSegments(alloc->saved_filled);
@@ -1110,7 +1110,7 @@ findMemoryLeak (void)
markBlocks(nonmoving_marked_large_objects);
markBlocks(nonmoving_compact_objects);
markBlocks(nonmoving_marked_compact_objects);
- for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+ for (i = 0; i < nonmoving_alloca_cnt; i++) {
struct NonmovingAllocator *alloc = &nonmovingHeap.allocators[i];
markNonMovingSegments(alloc->filled);
markNonMovingSegments(alloc->saved_filled);
@@ -1226,7 +1226,7 @@ static W_
countNonMovingHeap(struct NonmovingHeap *heap)
{
W_ ret = 0;
- for (int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++) {
+ for (int alloc_idx = 0; alloc_idx < nonmoving_alloca_cnt; alloc_idx++) {
struct NonmovingAllocator *alloc = &heap->allocators[alloc_idx];
ret += countNonMovingSegments(alloc->filled);
ret += countNonMovingSegments(alloc->saved_filled);
=====================================
rts/sm/Storage.c
=====================================
@@ -401,7 +401,7 @@ void listAllBlocks (ListBlocksCb cb, void *user)
// list capabilities' current segments
if(RtsFlags.GcFlags.useNonmoving) {
- for (s = 0; s < NONMOVING_ALLOCA_CNT; s++) {
+ for (s = 0; s < nonmoving_alloca_cnt; s++) {
listSegmentBlocks(cb, user, getCapability(i)->current_segments[s]);
}
}
@@ -409,7 +409,7 @@ void listAllBlocks (ListBlocksCb cb, void *user)
// list blocks on the nonmoving heap
if(RtsFlags.GcFlags.useNonmoving) {
- for(s = 0; s < NONMOVING_ALLOCA_CNT; s++) {
+ for(s = 0; s < nonmoving_alloca_cnt; s++) {
listSegmentBlocks(cb, user, nonmovingHeap.allocators[s].filled);
listSegmentBlocks(cb, user, nonmovingHeap.allocators[s].saved_filled);
listSegmentBlocks(cb, user, nonmovingHeap.allocators[s].active);
@@ -2007,7 +2007,7 @@ void rts_clearMemory(void) {
nonmovingClearSegment(seg);
}
- for (int i = 0; i < NONMOVING_ALLOCA_CNT; ++i) {
+ for (int i = 0; i < nonmoving_alloca_cnt; ++i) {
struct NonmovingAllocator *alloc = &nonmovingHeap.allocators[i];
for (struct NonmovingSegment *seg = alloc->active; seg; seg = seg->link) {
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/a1f0d55c91c7b180304cc5bc28671eef30f78d76...2b07bf2e8bcb24520fe78b469c3550b9f4099526