[commit: ghc] master: Add NUMA support for Windows (c93813d)
git at git.haskell.org
git at git.haskell.org
Sat Oct 1 22:20:14 UTC 2016
Repository : ssh://git@git.haskell.org/ghc
On branch : master
Link : http://ghc.haskell.org/trac/ghc/changeset/c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43/ghc
>---------------------------------------------------------------
commit c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43
Author: Tamar Christina <tamar at zhox.com>
Date: Sun Sep 25 20:00:31 2016 +0100
Add NUMA support for Windows
Summary:
NOTE: I have been able to do simple testing on emulated NUMA nodes.
Real hardware would be needed for a proper test.
D2199 added NUMA support for Linux; I have just filled in the missing pieces for Windows,
following the description of the Linux APIs.
Test Plan:
Use `bcdedit.exe /set groupsize 2` to modify the kernel again (similar to D2533).
This generates some NUMA nodes:
```
Logical Processor to NUMA Node Map:
NUMA Node 0:
**
--
NUMA Node 1:
--
**
Approximate Cross-NUMA Node Access Cost (relative to fastest):
00 01
00: 1.1 1.1
01: 1.0 1.0
```
Run `../test-numa.exe +RTS --numa -RTS`
and check PerfMon for NUMA allocations.
Reviewers: simonmar, erikd, bgamari, austin
Reviewed By: simonmar
Subscribers: thomie, #ghc_windows_task_force
Differential Revision: https://phabricator.haskell.org/D2534
GHC Trac Issues: #12602
>---------------------------------------------------------------
c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43
docs/users_guide/8.2.1-notes.rst | 2 +
rts/win32/OSMem.c | 81 ++++++++++++++++++++++++++++++++++++----
rts/win32/OSThreads.c | 45 +++++++++++++++++++++-
3 files changed, 118 insertions(+), 10 deletions(-)
diff --git a/docs/users_guide/8.2.1-notes.rst b/docs/users_guide/8.2.1-notes.rst
index 033f8da..2147dbc 100644
--- a/docs/users_guide/8.2.1-notes.rst
+++ b/docs/users_guide/8.2.1-notes.rst
@@ -71,6 +71,8 @@ Runtime system
event log, allowing heap profiles to be correlated with other tracing events
(see :ghc-ticket:`11094`).
+- Added NUMA support to Windows.
+
- Added processor group support for Windows. This allows the runtime to allocate
threads to all cores in systems which have multiple processor groups.
(e.g. > 64 cores, see :ghc-ticket:`11054`)
diff --git a/rts/win32/OSMem.c b/rts/win32/OSMem.c
index 3d9a304..b43636c 100644
--- a/rts/win32/OSMem.c
+++ b/rts/win32/OSMem.c
@@ -11,9 +11,7 @@
#include "sm/HeapAlloc.h"
#include "RtsUtils.h"
-#if HAVE_WINDOWS_H
#include <windows.h>
-#endif
typedef struct alloc_rec_ {
char* base; // non-aligned base address, directly from VirtualAlloc
@@ -39,11 +37,28 @@ static alloc_rec* allocs = NULL;
/* free_blocks are kept in ascending order, and adjacent blocks are merged */
static block_rec* free_blocks = NULL;
+/* Mingw-w64 does not currently have this in their header. So we have to import it.*/
+typedef LPVOID(WINAPI *VirtualAllocExNumaProc)(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+
+/* Cache NUMA API call. */
+VirtualAllocExNumaProc VirtualAllocExNuma;
+
void
osMemInit(void)
{
allocs = NULL;
free_blocks = NULL;
+
+ /* Resolve and cache VirtualAllocExNuma. */
+ if (osNumaAvailable() && RtsFlags.GcFlags.numa)
+ {
+ VirtualAllocExNuma = (VirtualAllocExNumaProc)GetProcAddress(GetModuleHandleW(L"kernel32"), "VirtualAllocExNuma");
+ if (!VirtualAllocExNuma)
+ {
+ sysErrorBelch(
+ "osBindMBlocksToNode: VirtualAllocExNuma does not exist. How did you get this far?");
+ }
+ }
}
static
@@ -486,22 +501,72 @@ void osReleaseHeapMemory (void)
rtsBool osNumaAvailable(void)
{
- return rtsFalse;
+ return osNumaNodes() > 1;
}
uint32_t osNumaNodes(void)
{
- return 1;
+ /* Cache the amount of NUMA values. */
+ static ULONG numNumaNodes = 0;
+
+ /* Cache the amount of NUMA nodes. */
+ if (!numNumaNodes && !GetNumaHighestNodeNumber(&numNumaNodes))
+ {
+ numNumaNodes = 1;
+ }
+
+ return numNumaNodes;
}
StgWord osNumaMask(void)
{
- return 1;
+ StgWord numaMask;
+ if (!GetNumaNodeProcessorMask(0, &numaMask))
+ {
+ return 1;
+ }
+ return numaMask;
}
void osBindMBlocksToNode(
- void *addr STG_UNUSED,
- StgWord size STG_UNUSED,
- uint32_t node STG_UNUSED)
+ void *addr,
+ StgWord size,
+ uint32_t node)
{
+ if (osNumaAvailable())
+ {
+ void* temp;
+ if (RtsFlags.GcFlags.numa) {
+ /* Note [base memory]
+ I would like to use addr here to specify the base
+ memory of allocation. The problem is that the address
+ we are requesting is too high. I can't figure out if it's
+ because of my NUMA-emulation or a bug in the code.
+
+ On windows also -xb is broken, it does nothing so that can't
+ be used to tweak it (see #12577). So for now, just let the OS decide.
+ */
+ temp = VirtualAllocExNuma(
+ GetCurrentProcess(),
+ NULL, // addr? See base memory
+ size,
+ MEM_RESERVE | MEM_COMMIT,
+ PAGE_READWRITE,
+ node
+ );
+
+ if (!temp) {
+ if (GetLastError() == ERROR_NOT_ENOUGH_MEMORY) {
+ errorBelch("out of memory");
+ }
+ else {
+ sysErrorBelch(
+ "osBindMBlocksToNode: VirtualAllocExNuma MEM_RESERVE %llu bytes "
+ "at address %p bytes failed",
+ size, addr);
+ }
+ stg_exit(EXIT_FAILURE);
+ }
+ }
+ }
}
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
index c9b594a..b36c3e5 100644
--- a/rts/win32/OSThreads.c
+++ b/rts/win32/OSThreads.c
@@ -9,6 +9,7 @@
#include "Rts.h"
#include <windows.h>
+#include "sm/OSMem.h"
#if defined(THREADED_RTS)
#include "RtsUtils.h"
@@ -572,8 +573,48 @@ interruptOSThread (OSThreadId id)
CloseHandle(hdl);
}
-void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ }
-void releaseThreadNode (void) { /* nothing */ }
+void setThreadNode (uint32_t node)
+{
+ if (osNumaAvailable())
+ {
+ StgWord mask = 0;
+ mask |= 1 << node;
+ if (!SetThreadAffinityMask(GetCurrentThread(), mask))
+ {
+ sysErrorBelch(
+ "setThreadNode: Error setting affinity of thread to NUMA node `%u': %lu.",
+ node, GetLastError());
+ stg_exit(EXIT_FAILURE);
+ }
+ }
+}
+
+void releaseThreadNode (void)
+{
+ if (osNumaAvailable())
+ {
+ StgWord processMask;
+ StgWord systemMask;
+ if (!GetProcessAffinityMask(GetCurrentProcess(),
+ &processMask,
+ &systemMask))
+ {
+ sysErrorBelch(
+ "releaseThreadNode: Error resetting affinity of thread: %lu",
+ GetLastError());
+ stg_exit(EXIT_FAILURE);
+ }
+
+ if (!SetThreadAffinityMask(GetCurrentThread(), processMask))
+ {
+ sysErrorBelch(
+ "releaseThreadNode: Error reseting NUMA affinity mask of thread: %lu.",
+ GetLastError());
+ stg_exit(EXIT_FAILURE);
+ }
+
+ }
+}
#else /* !defined(THREADED_RTS) */
More information about the ghc-commits
mailing list