[commit: ghc] master: Add NUMA support for Windows (c93813d)

Sat Oct 1 22:20:14 UTC 2016

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43/ghc

>---------------------------------------------------------------

commit c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43
Author: Tamar Christina <tamar at zhox.com>
Date:   Sun Sep 25 20:00:31 2016 +0100

    Add NUMA support for Windows
    
    Summary:
    NOTE: I have only been able to do simple testing on emulated NUMA nodes.
    Real hardware would be needed for a proper test.
    
    D2199 added NUMA support for Linux; this patch fills in the missing pieces for
    Windows, following the description of the Linux APIs.
    
    Test Plan:
    Use `bcdedit.exe /set groupsize 2` to modify the kernel again (Similar to D2533).
    
    This generates some NUMA nodes:
    
    ```
    Logical Processor to NUMA Node Map:
    NUMA Node 0:
    **
    --
    NUMA Node 1:
    --
    **
    
    Approximate Cross-NUMA Node Access Cost (relative to fastest):
         00  01
    00: 1.1 1.1
    01: 1.0 1.0
    ```
    
    Run `../test-numa.exe +RTS --numa -RTS`
    
    and check PerfMon for NUMA allocations.
    
    Reviewers: simonmar, erikd, bgamari, austin
    
    Reviewed By: simonmar
    
    Subscribers: thomie, #ghc_windows_task_force
    
    Differential Revision: https://phabricator.haskell.org/D2534
    
    GHC Trac Issues: #12602


>---------------------------------------------------------------

c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43
 docs/users_guide/8.2.1-notes.rst |  2 +
 rts/win32/OSMem.c                | 81 ++++++++++++++++++++++++++++++++++++----
 rts/win32/OSThreads.c            | 45 +++++++++++++++++++++-
 3 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/docs/users_guide/8.2.1-notes.rst b/docs/users_guide/8.2.1-notes.rst
index 033f8da..2147dbc 100644
--- a/docs/users_guide/8.2.1-notes.rst
+++ b/docs/users_guide/8.2.1-notes.rst
@@ -71,6 +71,8 @@ Runtime system
   event log, allowing heap profiles to be correlated with other tracing events
   (see :ghc-ticket:`11094`).
 
+- Added NUMA support for Windows.
+
 - Added processor group support for Windows. This allows the runtime to allocate
   threads to all cores in systems which have multiple processor groups.
   (e.g. > 64 cores, see :ghc-ticket:`11054`)
diff --git a/rts/win32/OSMem.c b/rts/win32/OSMem.c
index 3d9a304..b43636c 100644
--- a/rts/win32/OSMem.c
+++ b/rts/win32/OSMem.c
@@ -11,9 +11,7 @@
 #include "sm/HeapAlloc.h"
 #include "RtsUtils.h"
 
-#if HAVE_WINDOWS_H
 #include <windows.h>
-#endif
 
 typedef struct alloc_rec_ {
     char* base;    // non-aligned base address, directly from VirtualAlloc
@@ -39,11 +37,28 @@ static alloc_rec* allocs = NULL;
 /* free_blocks are kept in ascending order, and adjacent blocks are merged */
 static block_rec* free_blocks = NULL;
 
+/* Mingw-w64 does not currently declare this in its headers, so we have to import it ourselves. */
+typedef LPVOID(WINAPI *VirtualAllocExNumaProc)(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+
+/* Cached pointer to the VirtualAllocExNuma API, resolved in osMemInit. */
+VirtualAllocExNumaProc VirtualAllocExNuma;
+
 void
 osMemInit(void)
 {
     allocs = NULL;
     free_blocks = NULL;
+
+    /* Resolve and cache VirtualAllocExNuma. */
+    if (osNumaAvailable() && RtsFlags.GcFlags.numa)
+    {
+        VirtualAllocExNuma = (VirtualAllocExNumaProc)GetProcAddress(GetModuleHandleW(L"kernel32"), "VirtualAllocExNuma");
+        if (!VirtualAllocExNuma)
+        {
+            sysErrorBelch(
+                "osBindMBlocksToNode: VirtualAllocExNuma does not exist. How did you get this far?");
+        }
+    }
 }
 
 static
@@ -486,22 +501,72 @@ void osReleaseHeapMemory (void)
 
 rtsBool osNumaAvailable(void)
 {
-    return rtsFalse;
+    return osNumaNodes() > 1;
 }
 
 uint32_t osNumaNodes(void)
 {
-    return 1;
+    /* Cache the number of NUMA nodes. */
+    static ULONG numNumaNodes = 0;
+
+    /* Query the OS only while the cache is unset; assume one node if the query fails. */
+    if (!numNumaNodes && !GetNumaHighestNodeNumber(&numNumaNodes))
+    {
+        numNumaNodes = 1;
+    }
+
+    return numNumaNodes;
 }
 
 StgWord osNumaMask(void)
 {
-    return 1;
+    StgWord numaMask;
+    if (!GetNumaNodeProcessorMask(0, &numaMask))
+    {
+        return 1;
+    }
+    return numaMask;
 }
 
 void osBindMBlocksToNode(
-    void *addr STG_UNUSED,
-    StgWord size STG_UNUSED,
-    uint32_t node STG_UNUSED)
+    void *addr,
+    StgWord size,
+    uint32_t node)
 {
+    if (osNumaAvailable())
+    {
+        void* temp;
+        if (RtsFlags.GcFlags.numa) {
+            /* Note [base memory]
+               I would like to use addr here to specify the base
+               memory of allocation. The problem is that the address
+               we are requesting is too high. I can't figure out if it's
+               because of my NUMA-emulation or a bug in the code.
+
+               On Windows, -xb is also broken: it does nothing, so it can't be
+               used to tweak the address (see #12577). So for now, just let the OS decide.
+            */
+            temp = VirtualAllocExNuma(
+                          GetCurrentProcess(),
+                          NULL, // addr? See base memory
+                          size,
+                          MEM_RESERVE | MEM_COMMIT,
+                          PAGE_READWRITE,
+                          node
+                        );
+
+            if (!temp) {
+                if (GetLastError() == ERROR_NOT_ENOUGH_MEMORY) {
+                    errorBelch("out of memory");
+                }
+                else {
+                    sysErrorBelch(
+                        "osBindMBlocksToNode: VirtualAllocExNuma MEM_RESERVE %llu bytes "
+                        "at address %p bytes failed",
+                                        size, addr);
+                }
+                stg_exit(EXIT_FAILURE);
+            }
+        }
+    }
 }
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
index c9b594a..b36c3e5 100644
--- a/rts/win32/OSThreads.c
+++ b/rts/win32/OSThreads.c
@@ -9,6 +9,7 @@
 
 #include "Rts.h"
 #include <windows.h>
+#include "sm/OSMem.h"
 #if defined(THREADED_RTS)
 #include "RtsUtils.h"
 
@@ -572,8 +573,48 @@ interruptOSThread (OSThreadId id)
     CloseHandle(hdl);
 }
 
-void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ }
-void releaseThreadNode (void) { /* nothing */ }
+void setThreadNode (uint32_t node)
+{
+    if (osNumaAvailable())
+    {
+        StgWord mask = 0;
+        mask |= 1 << node;
+        if (!SetThreadAffinityMask(GetCurrentThread(), mask))
+        {
+            sysErrorBelch(
+                "setThreadNode: Error setting affinity of thread to NUMA node `%u': %lu.",
+                node, GetLastError());
+            stg_exit(EXIT_FAILURE);
+        }
+    }
+}
+
+void releaseThreadNode (void)
+{
+    if (osNumaAvailable())
+    {
+        StgWord processMask;
+        StgWord systemMask;
+        if (!GetProcessAffinityMask(GetCurrentProcess(),
+                                   &processMask,
+                                   &systemMask))
+        {
+            sysErrorBelch(
+                "releaseThreadNode: Error resetting affinity of thread: %lu",
+                GetLastError());
+            stg_exit(EXIT_FAILURE);
+        }
+
+        if (!SetThreadAffinityMask(GetCurrentThread(), processMask))
+        {
+            sysErrorBelch(
+                "releaseThreadNode: Error reseting NUMA affinity mask of thread: %lu.",
+                GetLastError());
+            stg_exit(EXIT_FAILURE);
+        }
+
+    }
+}
 
 #else /* !defined(THREADED_RTS) */