[commit: ghc] master: Refactor GCTDecl.h, and mitigate #7602 a bit (28b031c)

git at git.haskell.org git at git.haskell.org
Tue Jan 28 14:27:31 UTC 2014


Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/28b031c506122e28e0230a562a4f6fd3d0256d0c/ghc

>---------------------------------------------------------------

commit 28b031c506122e28e0230a562a4f6fd3d0256d0c
Author: Austin Seipp <austin at well-typed.com>
Date:   Tue Jan 28 06:15:19 2014 -0600

    Refactor GCTDecl.h, and mitigate #7602 a bit
    
    This basically cleans a lot of GCTDecl up - I found it quite hard to
    read and a bit confusing. The changes are mostly cosmetic: better
    delineation between the alternative cases and light touchups, and tries
    to make every branch as consistent as possible.
    
    However, this patch does have one significant effect: it will ensure
    that any LLVM-based compilers will use __thread if they support it.
    Before, they would simply always use pthread_getspecific and
    pthread_setspecific, which are almost surely even *more* inefficient.
    
    The details are a bit too long and boring to go into here; see #7602.
    After talking with Simon, we decided to play it safe - __thread can at
    least be optimized by future clang releases even further on OS X if they
    choose, and it's safer until we can investigate the pthread
    implementation further on Mavericks.
    
    For Linux, the story isn't so bleak if you use Clang (for whatever
    reason) - Linux directly writes to `%fs` for __thread slots (while OS X
    will perform a load followed by an indirect call.) So it should still be
    fairly competitive, speed-wise.
    
    Signed-off-by: Austin Seipp <austin at well-typed.com>


>---------------------------------------------------------------

28b031c506122e28e0230a562a4f6fd3d0256d0c
 rts/sm/GCTDecl.h |  150 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 92 insertions(+), 58 deletions(-)

diff --git a/rts/sm/GCTDecl.h b/rts/sm/GCTDecl.h
index 2c08e10..affb852 100644
--- a/rts/sm/GCTDecl.h
+++ b/rts/sm/GCTDecl.h
@@ -1,6 +1,6 @@
 /* -----------------------------------------------------------------------------
  *
- * (c) The GHC Team 1998-2009
+ * (c) The GHC Team 1998-2014
  *
  * Documentation on the architecture of the Garbage Collector can be
  * found in the online commentary:
@@ -14,94 +14,128 @@
 
 #include "BeginPrivate.h"
 
-/* -----------------------------------------------------------------------------
-   The gct variable is thread-local and points to the current thread's
-   gc_thread structure.  It is heavily accessed, so we try to put gct
-   into a global register variable if possible; if we don't have a
-   register then use gcc's __thread extension to create a thread-local
-   variable.
-   -------------------------------------------------------------------------- */
+/* The gct variable is thread-local and points to the current thread's
+   gc_thread structure. It is heavily accessed, and thus high
+   performance access is crucial to parallel (-threaded) workloads.
+
+   First, we try to use a 'global register variable' which is a GCC
+   extension. This reserves the register globally.
+
+   If that's not possible, then we need to use __thread, which is a
+   compiler/OS specific TLS storage mechanism (assumed to be Fast
+   Enough.)
+
+   BUT, some older versions of OS X compilers (llvm-gcc, older Clangs)
+   do not support __thread at all. Modern clang however, does - but on
+   OS X it's not as fast as the Linux (which can write directly into a
+   segment register - see #7602.)
+
+   If we don't support __thread then we do the absolute worst thing:
+   we just use pthread_getspecific and pthread_setspecific (which are
+   horribly slow.)
+*/
+
+#define GCT_REG_DECL(type,name,reg) register type name REG(reg);
+
+
+/* -------------------------------------------------------------------------- */
+
+/* First: if we're not using the threaded RTS, it's easy: just fake it. */
+#if !defined(THREADED_RTS)
+extern StgWord8 the_gc_thread[];
+#define gct ((gc_thread*)&the_gc_thread)
+#define SET_GCT(to) /*nothing*/
+#define DECLARE_GCT /*nothing*/
 
-#if defined(THREADED_RTS)
+#else /* defined(THREADED_RTS) */
 
-#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg);
+/* -------------------------------------------------------------------------- */
 
-#ifdef llvm_CC_FLAVOR
+/* Now, llvm-gcc and some older Clang compilers do not support
+   __thread. So we have to fallback to the extremely slow case,
+   unfortunately. Note: clang_CC_FLAVOR implies llvm_CC_FLAVOR */
+#if defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 0)
+#define gct ((gc_thread *)(pthread_getspecific(gctKey)))
 #define SET_GCT(to) (pthread_setspecific(gctKey, to))
-#else
-#define SET_GCT(to) gct = (to)
-#endif
+#define DECLARE_GCT ThreadLocalKey gctKey;
 
+/* -------------------------------------------------------------------------- */
 
+/* However, if we *are* using an LLVM based compiler with __thread
+   support, then use that (since LLVM doesn't support global register
+   variables.) */
+#elif defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 1)
+extern __thread gc_thread* gct;
+#define SET_GCT(to) gct = (to)
+#define DECLARE_GCT __thread gc_thread* gct;
 
-#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS))
-// Using __thread is better than stealing a register on x86/Linux, because
-// we have too few registers available.  In my tests it was worth
-// about 5% in GC performance, but of course that might change as gcc
-// improves. -- SDM 2009/04/03
-//
-// For MacOSX, we can use an llvm-based C compiler which will store the gct
-// in a thread local variable using pthreads.
+/* -------------------------------------------------------------------------- */
 
+/* Next up: Using __thread is better than stealing a register on
+   x86/Linux, because we have too few registers available. In my
+   tests it was worth about 5% in GC performance, but of course that
+   might change as gcc improves. -- SDM 2009/04/03 */
+#elif (defined(i386_HOST_ARCH) && defined(linux_HOST_OS))
 extern __thread gc_thread* gct;
+#define SET_GCT(to) gct = (to)
 #define DECLARE_GCT __thread gc_thread* gct;
 
-#elif defined(llvm_CC_FLAVOR)
-// LLVM does not support the __thread extension and will generate
-// incorrect code for global register variables. If we are compiling
-// with a C compiler that uses an LLVM back end (clang or llvm-gcc) then we
-// use pthread_getspecific() to handle the thread local storage for gct.
-#define gct ((gc_thread *)(pthread_getspecific(gctKey)))
-#define DECLARE_GCT ThreadLocalKey gctKey;
+/* -------------------------------------------------------------------------- */
 
-#elif defined(sparc_HOST_ARCH)
-// On SPARC we can't pin gct to a register. Names like %l1 are just offsets
-//	into the register window, which change on each function call.
-//	
-//	There are eight global (non-window) registers, but they're used for other purposes.
-//	%g0     -- always zero
-//	%g1     -- volatile over function calls, used by the linker
-//	%g2-%g3 -- used as scratch regs by the C compiler (caller saves)
-//	%g4	-- volatile over function calls, used by the linker
-//	%g5-%g7	-- reserved by the OS
+/* Next up: On SPARC we can't pin gct to a register. Names like %l1
+   are just offsets into the register window, which change on each
+   function call.
 
+   There are eight global (non-window) registers, but they're used for other
+   purposes:
+
+    %g0     -- always zero
+    %g1     -- volatile over function calls, used by the linker
+    %g2-%g3 -- used as scratch regs by the C compiler (caller saves)
+    %g4     -- volatile over function calls, used by the linker
+    %g5-%g7 -- reserved by the OS
+*/
+#elif defined(sparc_HOST_ARCH)
 extern __thread gc_thread* gct;
+#define SET_GCT(to) gct = (to)
 #define DECLARE_GCT __thread gc_thread* gct;
 
+/* -------------------------------------------------------------------------- */
 
+/* Next up: generally, if REG_Base is defined and we're *not* using
+   i386, then actually declare the needed register. The catch for i386
+   here is that REG_Base is %ebx, but that is also used for -fPIC, so
+   it can't be stolen */
 #elif defined(REG_Base) && !defined(i386_HOST_ARCH)
-// on i386, REG_Base is %ebx which is also used for PIC, so we don't
-// want to steal it
-
-GLOBAL_REG_DECL(gc_thread*, gct, REG_Base)
+GCT_REG_DECL(gc_thread*, gct, REG_Base);
+#define SET_GCT(to) gct = (to)
 #define DECLARE_GCT /* nothing */
 
+/* -------------------------------------------------------------------------- */
 
+/* Next up: if REG_R1 is available after checking REG_Base, we're
+   gonna steal it in every case we can. */
 #elif defined(REG_R1)
-
-GLOBAL_REG_DECL(gc_thread*, gct, REG_R1)
+GCT_REG_DECL(gc_thread*, gct, REG_R1);
+#define SET_GCT(to) gct = (to)
 #define DECLARE_GCT /* nothing */
 
+/* -------------------------------------------------------------------------- */
 
-#elif defined(__GNUC__)
-
+/* Finally, as an absolute fallback, if none of the above tests check
+   out but we *do* have __thread support, then use that. */
+#elif CC_SUPPORTS_TLS == 1
 extern __thread gc_thread* gct;
+#define SET_GCT(to) gct = (to)
 #define DECLARE_GCT __thread gc_thread* gct;
 
-#else
-
-#error Cannot find a way to declare the thread-local gct
+/* -------------------------------------------------------------------------- */
 
+/* Impossible! */
+#else
+#error Cannot find a way to declare the thread-local gc variable!
 #endif
 
-#else  // not the threaded RTS
-
-extern StgWord8 the_gc_thread[];
-
-#define gct ((gc_thread*)&the_gc_thread)
-#define SET_GCT(to) /*nothing*/
-#define DECLARE_GCT /*nothing*/
-
 #endif // THREADED_RTS
 
 #include "EndPrivate.h"



More information about the ghc-commits mailing list