[commit: ghc] master: rts: enable parallel GC scan of large (32M+) allocation area (a5d26f2)

git at git.haskell.org git at git.haskell.org
Tue Aug 30 12:28:19 UTC 2016


Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/a5d26f26d33bc04f31eaff50b7d633444192b4cb/ghc

>---------------------------------------------------------------

commit a5d26f26d33bc04f31eaff50b7d633444192b4cb
Author: Sergei Trofimovich <slyfox at gentoo.org>
Date:   Tue Aug 30 12:10:54 2016 +0100

    rts: enable parallel GC scan of large (32M+) allocation area
    
    Parallel GC does not scan a large allocation area (-A)
    efficiently, as it does not do work stealing from the
    nursery by default.
    
    That leads to a large imbalance when only one of the threads
    overflows its allocation area: most of the GC threads finish
    quickly (as there is not much to collect) and sit idle,
    waiting while a single GC thread finishes scanning the
    allocation area for that thread.
    
    The patch enables work stealing (the equivalent of -qb0)
    for allocation areas of -A32M or larger.
    
    Tested on a highlighting-kate package from Trac #9221
    
    On an 8-core machine, wall-clock time improves by around 5%.
    On a 24-core VM the speedup is 20%.
    
    Signed-off-by: Sergei Trofimovich <siarheit at google.com>
    
    Test Plan: measured wall time and GC parallelism on highlighting-kate build
    
    Reviewers: austin, bgamari, erikd, simonmar
    
    Reviewed By: bgamari, simonmar
    
    Subscribers: thomie
    
    Differential Revision: https://phabricator.haskell.org/D2483
    
    GHC Trac Issues: #9221


>---------------------------------------------------------------

a5d26f26d33bc04f31eaff50b7d633444192b4cb
 docs/users_guide/runtime_control.rst |  2 +-
 rts/RtsFlags.c                       | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst
index 1ae51dd..3968065 100644
--- a/docs/users_guide/runtime_control.rst
+++ b/docs/users_guide/runtime_control.rst
@@ -449,7 +449,7 @@ performance.
 
 .. rts-flag:: -qb <gen>
 
-    :default: 1
+    :default: 1 for ``-A`` < 32M, 0 otherwise
     :since: 6.12.1
 
     Use load-balancing in the parallel GC in generation ⟨gen⟩ and higher.
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index e23f760..7e06d84 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -227,7 +227,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.ParFlags.parGcEnabled      = 1;
     RtsFlags.ParFlags.parGcGen          = 0;
     RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue;
-    RtsFlags.ParFlags.parGcLoadBalancingGen = 1;
+    RtsFlags.ParFlags.parGcLoadBalancingGen = ~0u; /* auto, based on -A */
     RtsFlags.ParFlags.parGcNoSyncWithIdle   = 0;
     RtsFlags.ParFlags.parGcThreads      = 0; /* defaults to -N */
     RtsFlags.ParFlags.setAffinity       = 0;
@@ -393,7 +393,8 @@ usage_text[] = {
 "  -qg[<n>]  Use parallel GC only for generations >= <n>",
 "            (default: 0, -qg alone turns off parallel GC)",
 "  -qb[<n>]  Use load-balancing in the parallel GC only for generations >= <n>",
-"            (default: 1, -qb alone turns off load-balancing)",
+"            (default: 1 for -A < 32M, 0 otherwise;"
+"             -qb alone turns off load-balancing)",
 "  -qn<n>    Use <n> threads for parallel GC (defaults to value of -N)",
 "  -qa       Use the OS to set thread affinity (experimental)",
 "  -qm       Don't automatically migrate threads between CPUs",
@@ -1450,6 +1451,22 @@ static void normaliseRtsOpts (void)
         errorUsage();
     }
 
+    if (RtsFlags.ParFlags.parGcLoadBalancingGen == ~0u) {
+        StgWord alloc_area_bytes
+            = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE;
+
+        // If the allocation area is larger than the CPU cache,
+        // we can finish scanning quicker by doing a work-stealing
+        // scan. Trac #9221
+        // 32M looks big enough not to fit into L2 cache
+        // of popular modern CPUs.
+        if (alloc_area_bytes >= 32 * 1024 * 1024) {
+            RtsFlags.ParFlags.parGcLoadBalancingGen = 0;
+        } else {
+            RtsFlags.ParFlags.parGcLoadBalancingGen = 1;
+        }
+    }
+
 #ifdef THREADED_RTS
     if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) {
         errorBelch("GC threads (-qn) must be between 1 and the value of -N");



More information about the ghc-commits mailing list