[Git][ghc/ghc][wip/hugepages] rts: Implement support for 2MB hugepages

Teo Camarasu (@teo) gitlab at gitlab.haskell.org
Tue May 21 08:58:40 UTC 2024



Teo Camarasu pushed to branch wip/hugepages at Glasgow Haskell Compiler / GHC


Commits:
27528fc7 by Teo Camarasu at 2024-05-21T09:58:28+01:00
rts: Implement support for 2MB hugepages

We enable/disable it through a runtime flag (-xH).

When enabled we ensure we only (de)allocate in aligned multiples of 2MB.

Relates to #24760

Co-authored-by: Matthew Pickering <matthewtpickering at gmail.com>
Co-authored-by: Ben Gamari <bgamari.foss at gmail.com>

- - - - -


9 changed files:

- docs/users_guide/runtime_control.rst
- rts/RtsFlags.c
- rts/configure.ac
- rts/include/rts/Flags.h
- rts/posix/OSMem.c
- rts/sm/BlockAlloc.c
- rts/sm/OSMem.h
- testsuite/tests/rts/all.T
- + testsuite/tests/rts/testhugepagesmblockalloc.c


Changes:

=====================================
docs/users_guide/runtime_control.rst
=====================================
@@ -397,6 +397,16 @@ Miscellaneous RTS options
     heap larger than 1T. ``-xr`` is a no-op if GHC is configured with
     ``--disable-large-address-space`` or if the platform is 32-bit.
 
+.. rts-flag:: -xH
+
+    This option enables the use of huge pages to back memory allocations.
+    Huge pages can make memory accesses more efficient (fewer TLB misses)
+    for applications with high memory usage.
+    Currently only 2MB huge pages on Linux are supported.
+
+    If huge pages aren't available to back an allocation, the RTS falls back
+    to regular pages.
+
 .. _rts-options-gc:
 
 RTS options to control the garbage collector


=====================================
rts/RtsFlags.c
=====================================
@@ -184,6 +184,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.GcFlags.allocLimitGrace    = (100*1024) / BLOCK_SIZE;
     RtsFlags.GcFlags.numa               = false;
     RtsFlags.GcFlags.numaMask           = 1;
+    RtsFlags.GcFlags.hugepages          = false;
     RtsFlags.GcFlags.ringBell           = false;
     RtsFlags.GcFlags.longGCSync         = 0; /* detection turned off */
 
@@ -554,6 +555,7 @@ usage_text[] = {
 #endif
 "  -xq        The allocation limit given to a thread after it receives",
 "             an AllocationLimitExceeded exception. (default: 100k)",
+"  -xH        Try to use hugepages to allocate memory.",
 "",
 #if defined(USE_LARGE_ADDRESS_SPACE)
 "  -xr        The size of virtual memory address space reserved by the",
@@ -1830,11 +1832,11 @@ error = true;
                    */
 
                 case 'q':
-                  OPTION_UNSAFE;
-                  RtsFlags.GcFlags.allocLimitGrace
-                      = decodeSize(rts_argv[arg], 3, BLOCK_SIZE, HS_INT_MAX)
-                          / BLOCK_SIZE;
-                  break;
+                    OPTION_UNSAFE;
+                    RtsFlags.GcFlags.allocLimitGrace
+                        = decodeSize(rts_argv[arg], 3, BLOCK_SIZE, HS_INT_MAX)
+                            / BLOCK_SIZE;
+                    break;
 
                 case 'r':
                     OPTION_UNSAFE;
@@ -1842,7 +1844,12 @@ error = true;
                       = decodeSize(rts_argv[arg], 3, MBLOCK_SIZE, HS_WORD64_MAX);
                     break;
 
-                  default:
+                case 'H':
+                    OPTION_SAFE;
+                    RtsFlags.GcFlags.hugepages = true;
+                    break;
+
+                default:
                     OPTION_SAFE;
                     errorBelch("unknown RTS option: %s",rts_argv[arg]);
                     error = true;


=====================================
rts/configure.ac
=====================================
@@ -92,7 +92,7 @@ dnl    off_t, because it will affect the result of that test.
 AC_SYS_LARGEFILE
 
 dnl ** check for specific header (.h) files that we are interested in
-AC_CHECK_HEADERS([ctype.h dirent.h dlfcn.h errno.h fcntl.h grp.h limits.h locale.h nlist.h pthread.h pwd.h signal.h sys/param.h sys/mman.h sys/resource.h sys/select.h sys/time.h sys/timeb.h sys/timerfd.h sys/timers.h sys/times.h sys/utsname.h sys/wait.h termios.h utime.h windows.h winsock.h sched.h])
+AC_CHECK_HEADERS([ctype.h dirent.h dlfcn.h errno.h fcntl.h grp.h limits.h locale.h nlist.h pthread.h pwd.h signal.h sys/param.h sys/mman.h linux/mman.h sys/resource.h sys/select.h sys/time.h sys/timeb.h sys/timerfd.h sys/timers.h sys/times.h sys/utsname.h sys/wait.h termios.h utime.h windows.h winsock.h sched.h])
 
 dnl sys/cpuset.h needs sys/param.h to be included first on FreeBSD 9.1; #7708
 AC_CHECK_HEADERS([sys/cpuset.h], [], [],


=====================================
rts/include/rts/Flags.h
=====================================
@@ -91,6 +91,7 @@ typedef struct _GC_FLAGS {
     StgWord numaMask;
 
     StgWord64 addressSpaceSize;  /* large address space size in bytes */
+    bool hugepages;              /* Enable hugepages support */
 } GC_FLAGS;
 
 /* See Note [Synchronization of flags and base APIs] */


=====================================
rts/posix/OSMem.c
=====================================
@@ -60,6 +60,7 @@
 # endif
 #endif
 
+
 #if !defined(darwin_HOST_OS)
 # undef RESERVE_FLAGS
 # if defined(MAP_GUARD)
@@ -73,6 +74,9 @@
 # endif
 #endif
 
+int huge_tried = 0; // bumped each time my_mmap adds HUGEPAGE_FLAGS to a MEM_COMMIT request
+int huge_failed = 0; // bumped each time an mmap with HUGEPAGE_FLAGS fails and is retried without them
+
 static void *next_request = 0;
 
 void osMemInit(void)
@@ -233,12 +237,28 @@ my_mmap (void *addr, W_ size, int operation)
         errorBelch("my_mmap(,,MEM_RESERVE) not supported on this platform");
 # endif
     } else if (operation == MEM_COMMIT) {
-        flags = MAP_FIXED | MAP_ANON | MAP_PRIVATE;
+        flags = MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE;
+#if defined(HUGEPAGE_FLAGS)
+        if ( RtsFlags.GcFlags.hugepages &&
+            (size & (HUGEPAGE_SIZE - 1)) == 0) {
+          huge_tried += 1;
+          flags |= HUGEPAGE_FLAGS;
+        }
+#endif /* defined(HUGEPAGE_FLAGS) */
     } else {
         flags = MAP_ANON | MAP_PRIVATE;
     }
 
     ret = mmap(addr, size, prot, flags, -1, 0);
+#if defined(HUGEPAGE_FLAGS)
+    // If the mmap failed, and we tried with HUGEPAGE_FLAGS
+    // then retry without.
+    if (ret == MAP_FAILED && flags & HUGEPAGE_FLAGS){
+      huge_failed += 1;
+      flags &= ~HUGEPAGE_FLAGS;
+      ret = mmap(addr, size, prot, flags, -1, 0);
+    }
+#endif
 # if defined(linux_HOST_OS)
     if (ret == MAP_FAILED && errno == EPERM) {
         // Linux may return EPERM if it tried to give us
@@ -457,6 +477,7 @@ StgWord64 getPhysicalMemorySize (void)
 
 #if defined(USE_LARGE_ADDRESS_SPACE)
 
+
 static void *
 osTryReserveHeapMemory (W_ len, void *hint)
 {
@@ -470,6 +491,7 @@ osTryReserveHeapMemory (W_ len, void *hint)
        and then we discard what we don't need */
 
     base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
+
     if (base == NULL)
         return NULL;
 
@@ -670,6 +692,10 @@ void osDecommitMemory(void *at, W_ size)
     if(r < 0)
         sysErrorBelch("unable to make released memory unaccessible");
 #endif
+    if(RtsFlags.GcFlags.hugepages) {
+      ASSERT( ((HUGEPAGE_SIZE - 1) & (uintptr_t)at) == 0);
+      ASSERT( ((HUGEPAGE_SIZE - 1) & size) == 0);
+    }
 
 #if defined(MADV_FREE)
     // See Note [MADV_FREE and MADV_DONTNEED].


=====================================
rts/sm/BlockAlloc.c
=====================================
@@ -25,7 +25,8 @@
 
 #include <string.h>
 
-static void  initMBlock(void *mblock, uint32_t node);
+static void initMBlock(void *mblock, uint32_t node);
+static void free_mega_group (bdescr *mg);
 
 /*
  * By default the DEBUG RTS is built with block allocator assertions
@@ -525,13 +526,30 @@ alloc_mega_group (uint32_t node, StgWord mblocks)
     else
     {
         void *mblock;
+        StgWord hugepage_mblocks;
+        if(RtsFlags.GcFlags.hugepages) {
+          // Round up allocation to hugepage size
+          hugepage_mblocks = MBLOCK_ROUND_UP_HUGEPAGE(mblocks);
+        }
+        else {
+          hugepage_mblocks = mblocks;
+        }
+
         if (RtsFlags.GcFlags.numa) {
-            mblock = getMBlocksOnNode(node, mblocks);
+            mblock = getMBlocksOnNode(node, hugepage_mblocks);
         } else {
-            mblock = getMBlocks(mblocks);
+            mblock = getMBlocks(hugepage_mblocks);
         }
         initMBlock(mblock, node); // only need to init the 1st one
         bd = FIRST_BDESCR(mblock);
+
+        // Free the slop
+        if(hugepage_mblocks > mblocks) {
+          bdescr *mblock_slop_bd = FIRST_BDESCR((uintptr_t)mblock + (uintptr_t)mblocks*MBLOCK_SIZE);
+          initMBlock(MBLOCK_ROUND_DOWN(mblock_slop_bd), node);
+          mblock_slop_bd->blocks = MBLOCK_GROUP_BLOCKS(hugepage_mblocks - mblocks);
+          free_mega_group(mblock_slop_bd);
+        }
     }
     bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks);
     return bd;
@@ -859,7 +877,7 @@ coalesce_mblocks (bdescr *p)
     return q;
 }
 
-static void
+void
 free_mega_group (bdescr *mg)
 {
     bdescr *bd, *prev;
@@ -1246,10 +1264,17 @@ uint32_t returnMemoryToOS(uint32_t n /* megablocks */)
     return 0;
 #else
     bdescr *bd;
+    bdescr *rejects;
+    bdescr *next;
     uint32_t node;
-    StgWord size;
+    StgWord size, unaligned_size, freeable_size;
     uint32_t init_n;
     init_n = n;
+    if(RtsFlags.GcFlags.hugepages) {
+      // Invariant: n is always a multiple of the hugepage size
+      // as we can only free whole hugepages.
+      n = MBLOCK_ROUND_DOWN_HUGEPAGE(n);
+    }
 
     // TODO: This is inefficient because this loop will essentially result in
     // quadratic runtime behavior: for each call to `freeMBlocks`, the
@@ -1262,22 +1287,72 @@ uint32_t returnMemoryToOS(uint32_t n /* megablocks */)
     // ToDo: not fair, we free all the memory starting with node 0.
     for (node = 0; n > 0 && node < n_numa_nodes; node++) {
         bd = free_mblock_list[node];
+        // 'rejects' is a reversed list of mblocks that need to go back on the
+        // free list.
+        rejects = NULL;
         while ((n > 0) && (bd != NULL)) {
             size = BLOCKS_TO_MBLOCKS(bd->blocks);
-            if (size > n) {
-                StgWord newSize = size - n;
-                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
-                freeAddr += newSize * MBLOCK_SIZE;
-                bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
-                freeMBlocks(freeAddr, n);
-                n = 0;
+            next = bd->link;
+            char *aligned_start;
+
+            if(RtsFlags.GcFlags.hugepages) {
+              // we can only free hugepage aligned mblock groups
+              aligned_start = (char*)MBLOCK_ROUND_DOWN(bd) + ((uintptr_t)MBLOCK_ROUND_DOWN(bd) & HUGEPAGE_MASK);
+              unaligned_size = (aligned_start - (char*)MBLOCK_ROUND_DOWN(bd)) / MBLOCK_SIZE;
+              freeable_size = MBLOCK_ROUND_DOWN_HUGEPAGE(size - unaligned_size);
             }
             else {
-                char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
-                n -= size;
-                bd = bd->link;
-                freeMBlocks(freeAddr, size);
+              aligned_start = (char*)MBLOCK_ROUND_DOWN(bd);
+              unaligned_size = 0;
+              freeable_size = size;
             }
+
+            // We cannot free more than n
+            // Note: n is a multiple of the hugepage size,
+            // so freeable_size will also continue to be a multiple.
+            freeable_size = stg_min(n, freeable_size);
+
+            // Place the front unaligned section back on the list.
+            // If we can't free any of it then this is the entire thing.
+            if (unaligned_size > 0 || freeable_size == 0) {
+              bd->link = rejects;
+              rejects = bd;
+              // We are freeing some mblocks from the middle
+              if (freeable_size > 0) {
+                bd->blocks = MBLOCK_GROUP_BLOCKS(unaligned_size);
+                bdescr *aligned_bd;
+                aligned_bd = FIRST_BDESCR(aligned_start);
+                aligned_bd->blocks = MBLOCK_GROUP_BLOCKS(freeable_size);
+              }
+            }
+
+            if(freeable_size > 0) {
+                // Free the mblocks
+                n -= freeable_size;
+                freeMBlocks(aligned_start, freeable_size);
+                // add the slop to the rejects list
+                if (size - unaligned_size - freeable_size > 0)
+                {
+                  void *slop = aligned_start + freeable_size * MBLOCK_SIZE;
+                  bdescr* slop_bd = FIRST_BDESCR(slop);
+                  slop_bd->blocks = MBLOCK_GROUP_BLOCKS(size - unaligned_size - freeable_size);
+                  slop_bd->link = rejects;
+                  initMBlock(slop, node);
+                  rejects = slop_bd;
+                }
+            }
+            bd = next;
+        }
+        // Place the rejected mblocks back on the free list.
+        // Note: this preserves the order.
+        while(rejects) {
+          // pop the top of the rejects list.
+          next = rejects;
+          rejects = next->link;
+          // place it back on the free list.
+          next->link = bd;
+          ASSERT(next < bd || bd == NULL);
+          bd = next;
         }
         free_mblock_list[node] = bd;
     }


=====================================
rts/sm/OSMem.h
=====================================
@@ -10,6 +10,20 @@
 
 #include "BeginPrivate.h"
 
+#if defined(HAVE_LINUX_MMAN_H)
+#include <linux/mman.h>
+/* HUGEPAGE_SHIFT is log2 of the huge page size in bytes (2MB here). */
+#define HUGEPAGE_SHIFT 21
+#define HUGEPAGE_FLAGS (MAP_HUGETLB | MAP_HUGE_2MB)
+#else
+#define HUGEPAGE_SHIFT 20 /* fallback: HUGEPAGE_FLAGS stays undefined */
+#endif
+#define HUGEPAGE_MBLOCK_MASK ((1 << (HUGEPAGE_SHIFT - MBLOCK_SHIFT)) - 1) /* (mblocks per huge page) - 1 */
+#define HUGEPAGE_SIZE (1 << HUGEPAGE_SHIFT) /* bytes */
+#define HUGEPAGE_MASK ((1 << HUGEPAGE_SHIFT) - 1) /* byte offset within a huge page */
+#define MBLOCK_ROUND_DOWN_HUGEPAGE(x) ((x) & ~HUGEPAGE_MBLOCK_MASK) /* x: count of mblocks */
+#define MBLOCK_ROUND_UP_HUGEPAGE(x) (((x) + HUGEPAGE_MBLOCK_MASK) & ~HUGEPAGE_MBLOCK_MASK) /* x: count of mblocks; evaluated once */
+
 void osMemInit(void);
 void *osGetMBlocks(uint32_t n);
 void osFreeMBlocks(void *addr, uint32_t n);


=====================================
testsuite/tests/rts/all.T
=====================================
@@ -10,6 +10,12 @@ test('testmblockalloc',
 # which will crash because the mblocks we allocate are not in a state
 # the leak detector is expecting.
 
+# A variant of the above that tries to use hugepages
+test('testhugepagesmblockalloc',
+     [c_src, only_ways(['normal','threaded1']), extra_run_opts('+RTS -I0 -xr0.125T -xH'),
+      unless(opsys('linux'), skip)], # Huge pages are only currently supported on Linux
+     compile_and_run, [''])
+
 
 # See bug #101, test requires +RTS -c (or equivalently +RTS -M<something>)
 # only GHCi triggers the bug, but we run the test all ways for completeness.


=====================================
testsuite/tests/rts/testhugepagesmblockalloc.c
=====================================
@@ -0,0 +1,75 @@
+#include "Rts.h"
+
+#include <stdio.h>
+
+// 16 * 64 == max 1GB
+const int MAXALLOC = 16;
+const int ARRSIZE  = 64;
+
+const int LOOPS    = 1000;
+const int SEED     = 0xf00f00;
+
+extern StgWord mblocks_allocated;
+
+int main (int argc, char *argv[])
+{
+    /*
+     * Stress the megablock allocator with a reproducible pseudo-random
+     * workload of getMBlocks/freeMBlocks calls, asking the RTS to release
+     * free memory (releaseFreeMemory) between phases.  Nothing is checked
+     * explicitly here; hs_exit() performs the leak check at the end.
+     */
+    // a[k] is the start of the k-th live mblock group and sizes[k] is
+    // the number of megablocks that were requested for it.
+    void *a[ARRSIZE];
+    uint32_t sizes[ARRSIZE];
+
+    // Fixed seed, so every run issues the same sequence of requests.
+    srand(SEED);
+
+    // Bring up the RTS with every RTS option enabled (RtsOptsAll),
+    // starting from the default configuration.
+    RtsConfig conf = defaultRtsConfig;
+    conf.rts_opts_enabled = RtsOptsAll;
+    hs_init_ghc(&argc, &argv, conf);
+
+    // Phase 1: sweep the array LOOPS times; each slot's previous group
+    // (if any) is freed just before a new random-sized one replaces it.
+    for (int round = 0; round < LOOPS; round++) {
+        for (int slot = 0; slot < ARRSIZE; slot++) {
+            if (round != 0) {
+                freeMBlocks(a[slot], sizes[slot]);
+            }
+            int count = (rand() % MAXALLOC) + 1;
+            a[slot] = getMBlocks(count);
+            sizes[slot] = count;
+        }
+    }
+
+    releaseFreeMemory();
+
+    // Free the groups left over from the final sweep.
+    for (int slot = 0; slot < ARRSIZE; slot++) {
+        freeMBlocks(a[slot], sizes[slot]);
+    }
+
+    releaseFreeMemory();
+
+    // Phase 2: fill the array front to back, then free it back to front.
+    for (int round = 0; round < LOOPS; round++) {
+        for (int slot = 0; slot < ARRSIZE; slot++) {
+            int count = (rand() % MAXALLOC) + 1;
+            a[slot] = getMBlocks(count);
+            sizes[slot] = count;
+        }
+        for (int slot = ARRSIZE; slot-- > 0; ) {
+            freeMBlocks(a[slot], sizes[slot]);
+        }
+    }
+
+    releaseFreeMemory();
+
+    hs_exit(); // will do a memory leak test
+
+    exit(0);
+}



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/27528fc742038c38cbd46b5cfc630badaa5b1d33

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/27528fc742038c38cbd46b5cfc630badaa5b1d33
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240521/53155a6c/attachment-0001.html>


More information about the ghc-commits mailing list