[Git][ghc/ghc][wip/hugepages] Implement support for 2MB hugepages

Teo Camarasu (@teo) gitlab at gitlab.haskell.org
Fri May 3 09:45:26 UTC 2024



Teo Camarasu pushed to branch wip/hugepages at Glasgow Haskell Compiler / GHC


Commits:
0feb436c by Teo Camarasu at 2024-05-03T10:45:13+01:00
Implement support for 2MB hugepages

We enable/disable it through a runtime flag (-xH).

When enabled, we ensure that we only (de)allocate memory in 2MB-aligned multiples of 2MB.

- - - - -


4 changed files:

- rts/include/rts/Constants.h
- rts/posix/OSMem.c
- rts/sm/BlockAlloc.c
- rts/sm/OSMem.h


Changes:

=====================================
rts/include/rts/Constants.h
=====================================
@@ -170,7 +170,7 @@
 #define BLOCK_SHIFT  12
 
 /* The size of a megablock (2^MBLOCK_SHIFT bytes) */
-#define MBLOCK_SHIFT   21
+#define MBLOCK_SHIFT   20
 
 /* -----------------------------------------------------------------------------
    Bitmap/size fields (used in info tables)


=====================================
rts/posix/OSMem.c
=====================================
@@ -60,12 +60,6 @@
 # endif
 #endif
 
-#if defined(HAVE_LINUX_MMAN_H)
-#include <linux/mman.h>
-
-#define HUGEPAGE_SIZE (2*1024*1024)
-#define HUGEPAGE_FLAGS (MAP_HUGETLB | MAP_HUGE_2MB)
-#endif
 
 #if !defined(darwin_HOST_OS)
 # undef RESERVE_FLAGS
@@ -244,19 +238,19 @@ my_mmap (void *addr, W_ size, int operation)
 # endif
     } else if (operation == MEM_COMMIT) {
         flags = MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE;
-#if defined(HUGEPAGE_SIZE)
+#if defined(HUGEPAGE_FLAGS)
         if ( RtsFlags.GcFlags.hugepages &&
             (size & (HUGEPAGE_SIZE - 1)) == 0) {
           huge_tried += 1;
           flags |= HUGEPAGE_FLAGS;
         }
-#endif /* defined(HUGEPAGE_SIZE) */
+#endif /* defined(HUGEPAGE_FLAGS) */
     } else {
         flags = MAP_ANON | MAP_PRIVATE;
     }
 
     ret = mmap(addr, size, prot, flags, -1, 0);
-#if defined(HUGEPAGE_SIZE)
+#if defined(HUGEPAGE_FLAGS)
     // If the mmap failed, and we tried with HUGEPAGE_FLAGS
     // then retry without.
     if (ret == MAP_FAILED && flags & HUGEPAGE_FLAGS){
@@ -698,6 +692,10 @@ void osDecommitMemory(void *at, W_ size)
     if(r < 0)
         sysErrorBelch("unable to make released memory unaccessible");
 #endif
+    if(RtsFlags.GcFlags.hugepages) {
+      ASSERT( ((HUGEPAGE_SIZE - 1) & (uintptr_t)at) == 0);
+      ASSERT( ((HUGEPAGE_SIZE - 1) & size) == 0);
+    }
 
 #if defined(MADV_FREE)
     // See Note [MADV_FREE and MADV_DONTNEED].


=====================================
rts/sm/BlockAlloc.c
=====================================
@@ -25,7 +25,8 @@
 
 #include <string.h>
 
-static void  initMBlock(void *mblock, uint32_t node);
+static void initMBlock(void *mblock, uint32_t node);
+static void free_mega_group (bdescr *mg);
 
 /*
  * By default the DEBUG RTS is built with block allocator assertions
@@ -478,13 +479,30 @@ alloc_mega_group (uint32_t node, StgWord mblocks)
     else
     {
         void *mblock;
+        StgWord hugepage_mblocks;
+        if(RtsFlags.GcFlags.hugepages) {
+          // Round up allocation to hugepage size
+          hugepage_mblocks = MBLOCK_ROUND_UP_HUGEPAGE(mblocks);
+        }
+        else {
+          hugepage_mblocks = mblocks;
+        }
+
         if (RtsFlags.GcFlags.numa) {
-            mblock = getMBlocksOnNode(node, mblocks);
+            mblock = getMBlocksOnNode(node, hugepage_mblocks);
         } else {
-            mblock = getMBlocks(mblocks);
+            mblock = getMBlocks(hugepage_mblocks);
         }
         initMBlock(mblock, node); // only need to init the 1st one
         bd = FIRST_BDESCR(mblock);
+
+        // Free the slop
+        if(hugepage_mblocks > mblocks) {
+          bdescr *mblock_slop_bd = FIRST_BDESCR((uintptr_t)mblock + (uintptr_t)mblocks*MBLOCK_SIZE);
+          initMBlock(MBLOCK_ROUND_DOWN(mblock_slop_bd), node); 
+          mblock_slop_bd->blocks = MBLOCK_GROUP_BLOCKS(hugepage_mblocks-mblocks);
+          free_mega_group(mblock_slop_bd);
+        }
     }
     bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks);
     return bd;
@@ -812,7 +830,7 @@ coalesce_mblocks (bdescr *p)
     return q;
 }
 
-static void
+void
 free_mega_group (bdescr *mg)
 {
     bdescr *bd, *prev;
@@ -1199,10 +1217,15 @@ uint32_t returnMemoryToOS(uint32_t n /* megablocks */)
     return 0;
 #else
     bdescr *bd;
+    bdescr *rejects;
+    bdescr *next;
     uint32_t node;
-    StgWord size;
+    StgWord size, unaligned_size, freeable_size;
     uint32_t init_n;
     init_n = n;
+    if(RtsFlags.GcFlags.hugepages) {
+      n = MBLOCK_ROUND_DOWN_HUGEPAGE(n);
+    }
 
     // TODO: This is inefficient because this loop will essentially result in
     // quadratic runtime behavior: for each call to `freeMBlocks`, the
@@ -1215,22 +1238,66 @@ uint32_t returnMemoryToOS(uint32_t n /* megablocks */)
     // ToDo: not fair, we free all the memory starting with node 0.
     for (node = 0; n > 0 && node < n_numa_nodes; node++) {
         bd = free_mblock_list[node];
+        rejects = NULL;
         while ((n > 0) && (bd != NULL)) {
             size = BLOCKS_TO_MBLOCKS(bd->blocks);
-            if (size > n) {
-                StgWord newSize = size - n;
-                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
-                freeAddr += newSize * MBLOCK_SIZE;
-                bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
-                freeMBlocks(freeAddr, n);
-                n = 0;
-            }
+            next = bd->link;
+            char *aligned_start;
+
+            if(RtsFlags.GcFlags.hugepages) {
+              aligned_start = (char*)MBLOCK_ROUND_DOWN(bd) + ((uintptr_t)MBLOCK_ROUND_DOWN(bd) & HUGEPAGE_MASK);
+              unaligned_size = (aligned_start - (char*)MBLOCK_ROUND_DOWN(bd)) / MBLOCK_SIZE;
+              freeable_size = MBLOCK_ROUND_DOWN_HUGEPAGE(size - unaligned_size);
+            } 
             else {
-                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
-                n -= size;
-                bd = bd->link;
-                freeMBlocks(freeAddr, size);
+              aligned_start = (char*)MBLOCK_ROUND_DOWN(bd);
+              unaligned_size = 0;
+              freeable_size = size;
             }
+
+            // We cannot free more than n
+            freeable_size = stg_min(n, freeable_size);
+
+            // Place the front unaligned section back on the list.
+            // If we can't free any of it then this is the entire thing.
+            if (unaligned_size > 0 || freeable_size == 0) { 
+              bd->link = rejects;
+              rejects = bd;
+              // If we are freeing some mblocks from the middle then initialise
+              // the first MBlock and update the sizes.
+              if (freeable_size > 0) {
+                bd->blocks = MBLOCK_GROUP_BLOCKS(unaligned_size);
+                bdescr *aligned_bd;
+                aligned_bd = FIRST_BDESCR(aligned_start);
+                aligned_bd->blocks = MBLOCK_GROUP_BLOCKS(freeable_size);
+                initMBlock(aligned_bd, node);
+              }
+            } 
+
+            if(freeable_size > 0) {
+                n -= freeable_size;
+                freeMBlocks(aligned_start, freeable_size);
+                // add the slop to the rejects list
+                if (size - unaligned_size - freeable_size > 0)
+                {
+                  void *slop = aligned_start + freeable_size * MBLOCK_SIZE;
+                  bdescr* slop_bd = FIRST_BDESCR(slop);
+                  initMBlock(slop_bd, node);
+                  slop_bd->blocks = MBLOCK_GROUP_BLOCKS(size - unaligned_size - freeable_size);
+                  slop_bd->link = rejects;
+                  rejects = slop_bd;
+                }
+            }
+            bd = next;
+        }
+        // Place the rejected mblocks back on the free list.
+        while(rejects) {
+          // pop the top of the rejects list.
+          next = rejects;
+          rejects = next->link;
+          // place it back on the free list.
+          next->link = bd;
+          bd = next;
         }
         free_mblock_list[node] = bd;
     }


=====================================
rts/sm/OSMem.h
=====================================
@@ -10,6 +10,20 @@
 
 #include "BeginPrivate.h"
 
+#if defined(HAVE_LINUX_MMAN_H)
+#include <linux/mman.h>
+
+#define HUGEPAGE_SHIFT 21
+#define HUGEPAGE_FLAGS (MAP_HUGETLB | MAP_HUGE_2MB)
+#else
+#define HUGEPAGE_SHIFT 20
+#endif
+
+#define HUGEPAGE_SIZE (1 << HUGEPAGE_SHIFT)
+#define HUGEPAGE_MASK ((1 << HUGEPAGE_SHIFT) - 1)
+#define MBLOCK_ROUND_DOWN_HUGEPAGE(x) ((x) & ~(HUGEPAGE_SHIFT - MBLOCK_SHIFT))
+#define MBLOCK_ROUND_UP_HUGEPAGE(x) ((x) + ((x) & (HUGEPAGE_SHIFT - MBLOCK_SHIFT)))
+
 void osMemInit(void);
 void *osGetMBlocks(uint32_t n);
 void osFreeMBlocks(void *addr, uint32_t n);



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/0feb436c982dfa95acf0224accea55a0c617ef83

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/0feb436c982dfa95acf0224accea55a0c617ef83
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240503/2db834e8/attachment-0001.html>


More information about the ghc-commits mailing list