[Git][ghc/ghc][wip/memory-barriers] 7 commits: Clean up

Fri Jun 7 15:19:05 UTC 2019


Ben Gamari pushed to branch wip/memory-barriers at Glasgow Haskell Compiler / GHC


Commits:
b4fc5d93 by Ben Gamari at 2019-06-07T15:18:50Z
Clean up

- - - - -
a23aa325 by Matthew Pickering at 2019-06-07T15:18:50Z
Fix two lint failures in rts/linker/MachO.c

- - - - -
dd11684f by Matthew Pickering at 2019-06-07T15:18:50Z
gitlab-ci: Linters, don't allow to fail

Ben disabled them in cd85f8a71bb56cff332560e1d571b3406789fb71 but didn't
say how or why they were broken.

- - - - -
afebd872 by Matthew Pickering at 2019-06-07T15:18:50Z
gitlab-ci: Don't run two submodule checking jobs on Marge jobs

- - - - -
0bbf3f8b by Ben Gamari at 2019-06-07T15:18:50Z
gitlab-ci: Use --unshallow when fetching for linters

GitLab creates a shallow clone. However, this means that we may not have
the base commit of an MR when linting, causing `git merge-base` to fail.
Fix this by passing `--unshallow` to `git fetch`, ensuring that we have
the entire history.

- - - - -
442e4b0e by Ben Gamari at 2019-06-07T15:18:50Z
gitlab-ci: Fix submodule linter

The job script didn't even try to compute the base commit to lint with
respect to.

- - - - -
78574759 by Ben Gamari at 2019-06-07T15:18:50Z
gitlab-ci: A few clarifying comments

- - - - -


19 changed files:

- .gitlab-ci.yml
- includes/stg/SMP.h
- rts/Apply.cmm
- rts/Interpreter.c
- rts/Messages.c
- rts/PrimOps.cmm
- rts/RaiseAsync.c
- rts/RtsAPI.c
- rts/StgMiscClosures.cmm
- rts/ThreadPaused.c
- rts/Threads.c
- rts/Updates.h
- rts/Weak.c
- rts/linker/MachO.c
- rts/sm/CNF.c
- rts/sm/Compact.c
- rts/sm/MarkWeak.c
- rts/sm/Scav.c
- rts/sm/Storage.c


Changes:

=====================================
.gitlab-ci.yml
=====================================
@@ -49,13 +49,18 @@ stages:
 ############################################################
 
 ghc-linters:
-  allow_failure: true
   stage: lint
   image: "registry.gitlab.haskell.org/ghc/ci-images/linters:$DOCKER_REV"
   script:
-    - git fetch "$CI_MERGE_REQUEST_PROJECT_URL" $CI_MERGE_REQUEST_TARGET_BRANCH_NAME
+    # Note [Unshallow clone for linting]
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # GitLab creates a shallow clone which means that we may not have the base
+    # commit of the MR being tested (e.g. if the MR is quite old), causing `git
+    # merge-base` to fail.  Passing `--unshallow` to `git fetch` ensures that
+    # we have the entire history.
+    - git fetch --unshallow "$CI_MERGE_REQUEST_PROJECT_URL" $CI_MERGE_REQUEST_TARGET_BRANCH_NAME
     - base="$(git merge-base FETCH_HEAD $CI_COMMIT_SHA)"
-    - "echo Merge base $base"
+    - "echo Linting changes between $base..$CI_COMMIT_SHA"
     #    - validate-commit-msg .git $(git rev-list $base..$CI_COMMIT_SHA)
     - validate-whitespace .git $(git rev-list $base..$CI_COMMIT_SHA)
     - .gitlab/linters/check-makefiles.py $base $CI_COMMIT_SHA
@@ -75,6 +80,10 @@ ghc-linters:
   stage: lint
   image: "registry.gitlab.haskell.org/ghc/ci-images/linters:$DOCKER_REV"
   script:
+    # See Note [Unshallow clone for linting]
+    - git fetch --unshallow "$CI_MERGE_REQUEST_PROJECT_URL" $CI_MERGE_REQUEST_TARGET_BRANCH_NAME
+    - base="$(git merge-base FETCH_HEAD $CI_COMMIT_SHA)"
+    - "echo Linting changes between $base..$CI_COMMIT_SHA"
     - submodchecker .git $(git rev-list $base..$CI_COMMIT_SHA)
   dependencies: []
   tags:
@@ -97,10 +106,15 @@ lint-submods-marge:
 
 lint-submods-mr:
   extends: .lint-submods
+  # Allow failure since any necessary submodule patches may not be upstreamed
+  # yet.
   allow_failure: true
   only:
     refs:
       - merge_requests
+  except:
+    variables:
+      - $CI_MERGE_REQUEST_LABELS =~ /.*wip/marge_bot_batch_merge_job.*/
 
 .lint-changelogs:
   stage: lint
@@ -117,6 +131,7 @@ lint-submods-mr:
 
 lint-changelogs:
   extends: .lint-changelogs
+  # Allow failure since this isn't a final release.
   allow_failure: true
   only:
     refs:


=====================================
includes/stg/SMP.h
=====================================
@@ -98,15 +98,15 @@ EXTERN_INLINE void load_load_barrier(void);
 
 /*
  * Note [Heap memory barriers]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  * Machines with weak memory ordering semantics have consequences for how
- * closures are observed and mutated. For example, consider a closure that needs
+ * closures are observed and mutated. For example, consider a thunk that needs
  * to be updated to an indirection. In order for the indirection to be safe for
  * concurrent observers to enter, said observers must read the indirection's
- * info table before they read the indirectee. Furthermore, the entering
- * observer makes assumptions about the closure based on its info table
- * contents, e.g. an INFO_TYPE of IND imples the closure has an indirectee
- * pointer that is safe to follow.
+ * info table before they read the indirectee. Furthermore, the indirectee must
+ * be set before the info table pointer. This ensures that if the observer sees
+ * an IND info table then the indirectee is valid.
  *
  * When a closure is updated with an indirection, both its info table and its
  * indirectee must be written. With weak memory ordering, these two writes can
@@ -145,6 +145,24 @@ EXTERN_INLINE void load_load_barrier(void);
  * - Read the closure's info pointer.
  * - Read barrier.
  * - Read the closure's (non-info table) fields.
+ *
+ * We must also take care when we expose a newly-allocated closure to other cores
+ * by writing a pointer to it to some shared data structure (e.g. an MVar#, a Message,
+ * or MutVar#). Specifically, we need to ensure that all writes constructing the
+ * closure are visible *before* the write exposing the new closure is made visible:
+ *
+ * - Allocate memory for the closure
+ * - Write the closure's info pointer and fields (ordering betweeen this doesn't
+ *   matter since the closure isn't yet visible to anyone else).
+ * - Write barrier
+ * - Make closure visible to other cores
+ *
+ * Note that thread stacks are inherently thread-local and consequently allocating an
+ * object and introducing a reference to it to our stack needs no barrier.
+ *
+ * Finally, we take pains to ensure that we flush all write buffers before
+ * entering GC, since stacks may be read by other cores.
+ *
  */
 
 /* ----------------------------------------------------------------------------


=====================================
rts/Apply.cmm
=====================================
@@ -63,7 +63,7 @@ again:
     P_ untaggedfun;
     W_ arity;
     // We must obey the correct heap object observation pattern in
-    // note [Heap memory barriers] in SMP.h.
+    // Note [Heap memory barriers] in SMP.h.
     untaggedfun = UNTAG(fun);
     info = %INFO_PTR(untaggedfun);
     prim_read_barrier;
@@ -107,6 +107,7 @@ again:
                 CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
                 P_ pap;
                 pap = Hp - SIZEOF_StgPAP + WDS(1);
+                SET_HDR(pap, stg_PAP_info, CCCS);
                 StgPAP_arity(pap) = arity;
                 if (arity <= TAG_MASK) {
                   // TODO: Shouldn't this already be tagged? If not why did we
@@ -115,8 +116,6 @@ again:
                 }
                 StgPAP_fun(pap)   = fun;
                 StgPAP_n_args(pap) = 0;
-                prim_write_barrier;
-                SET_HDR(pap, stg_PAP_info, CCCS);
                 return (pap);
             }
         }
@@ -136,6 +135,7 @@ again:
                 pap = Hp - size + WDS(1);
                 // We'll lose the original PAP, so we should enter its CCS
                 ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(untaggedfun) "ptr");
+                SET_HDR(pap, stg_PAP_info, CCCS);
                 StgPAP_arity(pap) = StgPAP_arity(untaggedfun);
                 StgPAP_n_args(pap) = StgPAP_n_args(untaggedfun);
                 StgPAP_fun(pap)   = StgPAP_fun(fun);
@@ -143,8 +143,6 @@ again:
                 i = TO_W_(StgPAP_n_args(untaggedfun));
             loop:
                 if (i == 0) {
-                    prim_write_barrier;
-                    SET_HDR(pap, stg_PAP_info, CCCS);
                     return (pap);
                 }
                 i = i - 1;


=====================================
rts/Interpreter.c
=====================================
@@ -249,11 +249,10 @@ StgClosure * newEmptyPAP (Capability *cap,
                           uint32_t arity)
 {
     StgPAP *pap = (StgPAP *)allocate(cap, sizeofW(StgPAP));
+    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     pap->arity = arity;
     pap->n_args = 0;
     pap->fun = tagged_obj;
-    write_barrier();
-    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     return (StgClosure *)pap;
 }
 
@@ -274,7 +273,7 @@ StgClosure * copyPAP  (Capability *cap, StgPAP *oldpap)
     for (i = 0; i < ((StgPAP *)pap)->n_args; i++) {
         pap->payload[i] = oldpap->payload[i];
     }
-    write_barrier();
+    // No write barrier is needed here as this is a new allocation
     SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     return (StgClosure *)pap;
 }
@@ -483,9 +482,8 @@ eval_obj:
         {
             StgUpdateFrame *__frame;
             __frame = (StgUpdateFrame *)Sp;
-            __frame->updatee = (StgClosure *)(ap);
-            write_barrier();
             SET_INFO((StgClosure *)__frame, (StgInfoTable *)&stg_upd_frame_info);
+            __frame->updatee = (StgClosure *)(ap);
         }
 
         ENTER_CCS_THUNK(cap,ap);
@@ -811,7 +809,7 @@ do_apply:
                 for (i = 0; i < m; i++) {
                     new_pap->payload[pap->n_args + i] = (StgClosure *)SpW(i);
                 }
-                write_barrier();
+                // No write barrier is needed here as this is a new allocation
                 SET_HDR(new_pap,&stg_PAP_info,cap->r.rCCCS);
                 tagged_obj = (StgClosure *)new_pap;
                 Sp_addW(m);
@@ -854,7 +852,7 @@ do_apply:
                 for (i = 0; i < m; i++) {
                     pap->payload[i] = (StgClosure *)SpW(i);
                 }
-                write_barrier();
+                // No write barrier is needed here as this is a new allocation
                 SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS);
                 tagged_obj = (StgClosure *)pap;
                 Sp_addW(m);
@@ -1099,7 +1097,7 @@ run_BCO:
                      new_aps->payload[i] = (StgClosure *)SpW(i-2);
                   }
 
-                  write_barrier();
+                  // No write barrier is needed here as this is a new allocation
                   SET_HDR(new_aps,&stg_AP_STACK_info,cap->r.rCCCS);
 
                   // Arrange the stack to call the breakpoint IO action, and
@@ -1428,10 +1426,11 @@ run_BCO:
             StgAP* ap;
             int n_payload = BCO_NEXT;
             ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
+            SpW(-1) = (W_)ap;
             ap->n_args = n_payload;
-            write_barrier();
+            // No write barrier is needed here as this is a new allocation
+            // visible only from our stack
             SET_HDR(ap, &stg_AP_info, cap->r.rCCCS)
-            SpW(-1) = (W_)ap;
             Sp_subW(1);
             goto nextInsn;
         }
@@ -1440,10 +1439,11 @@ run_BCO:
             StgAP* ap;
             int n_payload = BCO_NEXT;
             ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
+            SpW(-1) = (W_)ap;
             ap->n_args = n_payload;
-            write_barrier();
+            // No write barrier is needed here as this is a new allocation
+            // visible only from our stack
             SET_HDR(ap, &stg_AP_NOUPD_info, cap->r.rCCCS)
-            SpW(-1) = (W_)ap;
             Sp_subW(1);
             goto nextInsn;
         }
@@ -1453,11 +1453,12 @@ run_BCO:
             int arity = BCO_NEXT;
             int n_payload = BCO_NEXT;
             pap = (StgPAP*)allocate(cap, PAP_sizeW(n_payload));
+            SpW(-1) = (W_)pap;
             pap->n_args = n_payload;
             pap->arity = arity;
-            write_barrier();
+            // No write barrier is needed here as this is a new allocation
+            // visible only from our stack
             SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS)
-            SpW(-1) = (W_)pap;
             Sp_subW(1);
             goto nextInsn;
         }
@@ -1538,7 +1539,8 @@ run_BCO:
             }
             Sp_addW(n_words);
             Sp_subW(1);
-            write_barrier();
+            // No write barrier is needed here as this is a new allocation
+            // visible only from our stack
             SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), cap->r.rCCCS);
             SpW(0) = (W_)con;
             IF_DEBUG(interpreter,


=====================================
rts/Messages.c
=====================================
@@ -28,7 +28,7 @@ void sendMessage(Capability *from_cap, Capability *to_cap, Message *msg)
 #if defined(DEBUG)
     {
         const StgInfoTable *i = msg->header.info;
-        load_load_barrier();
+        load_load_barrier();  // See Note [Heap memory barriers] in SMP.h
         if (i != &stg_MSG_THROWTO_info &&
             i != &stg_MSG_BLACKHOLE_info &&
             i != &stg_MSG_TRY_WAKEUP_info &&
@@ -71,7 +71,7 @@ executeMessage (Capability *cap, Message *m)
 loop:
     write_barrier(); // allow m->header to be modified by another thread
     i = m->header.info;
-    load_load_barrier();
+    load_load_barrier();  // See Note [Heap memory barriers] in SMP.h
     if (i == &stg_MSG_TRY_WAKEUP_info)
     {
         StgTSO *tso = ((MessageWakeup *)m)->tso;
@@ -175,7 +175,7 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
                   "blackhole %p", (W_)msg->tso->id, msg->bh);
 
     info = bh->header.info;
-    load_load_barrier();
+    load_load_barrier();  // See Note [Heap memory barriers] in SMP.h
 
     // If we got this message in our inbox, it might be that the
     // BLACKHOLE has already been updated, and GC has shorted out the
@@ -199,7 +199,7 @@ loop:
     // and turns this into an infinite loop.
     p = UNTAG_CLOSURE((StgClosure*)VOLATILE_LOAD(&((StgInd*)bh)->indirectee));
     info = p->header.info;
-    load_load_barrier();
+    load_load_barrier();  // See Note [Heap memory barriers] in SMP.h
 
     if (info == &stg_IND_info)
     {
@@ -241,8 +241,11 @@ loop:
         // a collision to update a BLACKHOLE and a BLOCKING_QUEUE
         // becomes orphaned (see updateThunk()).
         bq->link = owner->bq;
-        write_barrier();
         SET_HDR(bq, &stg_BLOCKING_QUEUE_DIRTY_info, CCS_SYSTEM);
+        // We are about to make the newly-constructed message visible to other cores;
+        // a barrier is necessary to ensure that all writes are visible.
+        // See Note [Heap memory barriers] in SMP.h.
+        write_barrier();
         owner->bq = bq;
         dirty_TSO(cap, owner); // we modified owner->bq
 
@@ -291,11 +294,14 @@ loop:
 
         msg->link = bq->queue;
         bq->queue = msg;
+        // No barrier is necessary here: we are only exposing the
+        // closure to the GC. See Note [Heap memory barriers] in SMP.h.
         recordClosureMutated(cap,(StgClosure*)msg);
 
         if (info == &stg_BLOCKING_QUEUE_CLEAN_info) {
-            write_barrier();
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
+            // No barrier is necessary here: we are only exposing the
+            // closure to the GC. See Note [Heap memory barriers] in SMP.h.
             recordClosureMutated(cap,(StgClosure*)bq);
         }
 
@@ -325,7 +331,7 @@ StgTSO * blackHoleOwner (StgClosure *bh)
     StgClosure *p;
 
     info = bh->header.info;
-    load_load_barrier();
+    load_load_barrier(); // XXX
 
     if (info != &stg_BLACKHOLE_info &&
         info != &stg_CAF_BLACKHOLE_info &&
@@ -341,7 +347,7 @@ loop:
     // and turns this into an infinite loop.
     p = UNTAG_CLOSURE((StgClosure*)VOLATILE_LOAD(&((StgInd*)bh)->indirectee));
     info = p->header.info;
-    load_load_barrier();
+    load_load_barrier(); // XXX
 
     if (info == &stg_IND_info) goto loop;
 


=====================================
rts/PrimOps.cmm
=====================================
@@ -65,9 +65,8 @@ stg_newByteArrayzh ( W_ n )
         jump stg_raisezh(base_GHCziIOziException_heapOverflow_closure);
     }
     TICK_ALLOC_PRIM(SIZEOF_StgArrBytes,WDS(payload_words),0);
-    StgArrBytes_bytes(p) = n;
-    prim_write_barrier;
     SET_HDR(p, stg_ARR_WORDS_info, CCCS);
+    StgArrBytes_bytes(p) = n;
     return (p);
 }
 
@@ -103,9 +102,9 @@ stg_newPinnedByteArrayzh ( W_ n )
        to BA_ALIGN bytes: */
     p = p + ((-p - SIZEOF_StgArrBytes) & BA_MASK);
 
-    StgArrBytes_bytes(p) = n;
-    prim_write_barrier;
+    /* No write barrier needed since this is a new allocation. */
     SET_HDR(p, stg_ARR_WORDS_info, CCCS);
+    StgArrBytes_bytes(p) = n;
     return (p);
 }
 
@@ -146,9 +145,9 @@ stg_newAlignedPinnedByteArrayzh ( W_ n, W_ alignment )
        <alignment> is a power of 2, which is technically not guaranteed */
     p = p + ((-p - SIZEOF_StgArrBytes) & (alignment - 1));
 
-    StgArrBytes_bytes(p) = n;
-    prim_write_barrier;
+    /* No write barrier needed since this is a new allocation. */
     SET_HDR(p, stg_ARR_WORDS_info, CCCS);
+    StgArrBytes_bytes(p) = n;
     return (p);
 }
 
@@ -257,6 +256,8 @@ stg_newArrayzh ( W_ n /* words */, gcptr init )
     }
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(size), 0);
 
+    /* No write barrier needed since this is a new allocation. */
+    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
     StgMutArrPtrs_ptrs(arr) = n;
     StgMutArrPtrs_size(arr) = size;
 
@@ -269,9 +270,6 @@ stg_newArrayzh ( W_ n /* words */, gcptr init )
         goto for;
     }
 
-    prim_write_barrier;
-    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
-
     return (arr);
 }
 
@@ -283,13 +281,11 @@ stg_unsafeThawArrayzh ( gcptr arr )
     // mut_list so no need to add it again. MUT_ARR_PTRS_FROZEN_CLEAN means it's
     // not and we should add it to a mut_list.
     if (StgHeader_info(arr) != stg_MUT_ARR_PTRS_FROZEN_DIRTY_info) {
-        prim_write_barrier; // see below:
         SET_INFO(arr,stg_MUT_ARR_PTRS_DIRTY_info);
         // must be done after SET_INFO, because it ASSERTs closure_MUTABLE():
         recordMutable(arr);
         return (arr);
     } else {
-        prim_write_barrier;
         SET_INFO(arr,stg_MUT_ARR_PTRS_DIRTY_info);
         return (arr);
     }
@@ -377,6 +373,7 @@ stg_newArrayArrayzh ( W_ n /* words */ )
     }
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(size), 0);
 
+    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(arr) = n;
     StgMutArrPtrs_size(arr) = size;
 
@@ -389,9 +386,6 @@ stg_newArrayArrayzh ( W_ n /* words */ )
         goto for;
     }
 
-    prim_write_barrier;
-    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
-
     return (arr);
 }
 
@@ -414,6 +408,8 @@ stg_newSmallArrayzh ( W_ n /* words */, gcptr init )
     }
     TICK_ALLOC_PRIM(SIZEOF_StgSmallMutArrPtrs, WDS(n), 0);
 
+    /* No write barrier needed since this is a new allocation. */
+    SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS);
     StgSmallMutArrPtrs_ptrs(arr) = n;
 
     // Initialise all elements of the array with the value in R2
@@ -428,9 +424,6 @@ stg_newSmallArrayzh ( W_ n /* words */, gcptr init )
         goto for;
     }
 
-    prim_write_barrier;
-    SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS);
-
     return (arr);
 }
 
@@ -439,13 +432,11 @@ stg_unsafeThawSmallArrayzh ( gcptr arr )
     // See stg_unsafeThawArrayzh
     if (StgHeader_info(arr) != stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info) {
         SET_INFO(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
-        prim_write_barrier;
         recordMutable(arr);
         // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
         return (arr);
     } else {
         SET_INFO(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
-        prim_write_barrier;
         return (arr);
     }
 }
@@ -475,14 +466,13 @@ stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
 {
     W_ dst_p, src_p, bytes;
 
+    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
     dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
     src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
     bytes = WDS(n);
     prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
 
-    prim_write_barrier;
-    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
-
     return ();
 }
 
@@ -490,6 +480,8 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
 {
     W_ dst_p, src_p, bytes;
 
+    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
     dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
     src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
     bytes = WDS(n);
@@ -499,9 +491,6 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
         prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
     }
 
-    prim_write_barrier;
-    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
-
     return ();
 }
 
@@ -537,9 +526,9 @@ stg_newMutVarzh ( gcptr init )
     ALLOC_PRIM_P (SIZEOF_StgMutVar, stg_newMutVarzh, init);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
-    StgMutVar_var(mv) = init;
-    prim_write_barrier;
+    /* No write barrier needed since this is a new allocation. */
     SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS);
+    StgMutVar_var(mv) = init;
 
     return (mv);
 }
@@ -622,18 +611,16 @@ stg_atomicModifyMutVar2zh ( gcptr mv, gcptr f )
     TICK_ALLOC_THUNK_2();
     CCCS_ALLOC(THUNK_2_SIZE);
     z = Hp - THUNK_2_SIZE + WDS(1);
+    SET_HDR(z, stg_ap_2_upd_info, CCCS);
     LDV_RECORD_CREATE(z);
     StgThunk_payload(z,0) = f;
-    prim_write_barrier;
-    SET_HDR(z, stg_ap_2_upd_info, CCCS);
 
     TICK_ALLOC_THUNK_1();
     CCCS_ALLOC(THUNK_1_SIZE);
     y = z - THUNK_1_SIZE;
+    SET_HDR(y, stg_sel_0_upd_info, CCCS);
     LDV_RECORD_CREATE(y);
     StgThunk_payload(y,0) = z;
-    prim_write_barrier;
-    SET_HDR(y, stg_sel_0_upd_info, CCCS);
 
   retry:
     x = StgMutVar_var(mv);
@@ -683,10 +670,9 @@ stg_atomicModifyMutVarzuzh ( gcptr mv, gcptr f )
     TICK_ALLOC_THUNK();
     CCCS_ALLOC(THUNK_SIZE);
     z = Hp - THUNK_SIZE + WDS(1);
+    SET_HDR(z, stg_ap_2_upd_info, CCCS);
     LDV_RECORD_CREATE(z);
     StgThunk_payload(z,0) = f;
-    prim_write_barrier;
-    SET_HDR(z, stg_ap_2_upd_info, CCCS);
 
   retry:
     x = StgMutVar_var(mv);
@@ -719,6 +705,8 @@ stg_mkWeakzh ( gcptr key,
     ALLOC_PRIM (SIZEOF_StgWeak)
 
     w = Hp - SIZEOF_StgWeak + WDS(1);
+    // No memory barrier needed as this is a new allocation.
+    SET_HDR(w, stg_WEAK_info, CCCS);
 
     StgWeak_key(w)         = key;
     StgWeak_value(w)       = value;
@@ -726,10 +714,6 @@ stg_mkWeakzh ( gcptr key,
     StgWeak_cfinalizers(w) = stg_NO_FINALIZER_closure;
 
     StgWeak_link(w) = Capability_weak_ptr_list_hd(MyCapability());
-
-    prim_write_barrier;
-    SET_HDR(w, stg_WEAK_info, CCCS);
-
     Capability_weak_ptr_list_hd(MyCapability()) = w;
     if (Capability_weak_ptr_list_tl(MyCapability()) == NULL) {
         Capability_weak_ptr_list_tl(MyCapability()) = w;
@@ -756,15 +740,13 @@ stg_addCFinalizzerToWeakzh ( W_ fptr,   // finalizer
     ALLOC_PRIM (SIZEOF_StgCFinalizerList)
 
     c = Hp - SIZEOF_StgCFinalizerList + WDS(1);
+    SET_HDR(c, stg_C_FINALIZER_LIST_info, CCCS);
 
     StgCFinalizerList_fptr(c) = fptr;
     StgCFinalizerList_ptr(c) = ptr;
     StgCFinalizerList_eptr(c) = eptr;
     StgCFinalizerList_flag(c) = flag;
 
-    prim_write_barrier;
-    SET_HDR(c, stg_C_FINALIZER_LIST_info, CCCS);
-
     LOCK_CLOSURE(w, info);
 
     if (info == stg_DEAD_WEAK_info) {
@@ -1485,12 +1467,12 @@ stg_newMVarzh ()
     ALLOC_PRIM_ (SIZEOF_StgMVar, stg_newMVarzh);
 
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
+    // No memory barrier needed as this is a new allocation.
+    SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
+        // MVARs start dirty: generation 0 has no mutable list
     StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-    prim_write_barrier;
-    SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
-        // MVARs start dirty: generation 0 has no mutable list
     return (mvar);
 }
 
@@ -1534,9 +1516,10 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ )
 
         StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
         StgMVarTSOQueue_tso(q)  = CurrentTSO;
-
-        prim_write_barrier;
         SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+        // Write barrier before we make the new MVAR_TSO_QUEUE
+        // visible to other cores.
+        prim_write_barrier;
 
         if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
             StgMVar_head(mvar) = q;
@@ -1958,10 +1941,10 @@ stg_makeStableNamezh ( P_ obj )
         // too complicated and doesn't buy us much. See D5342?id=18700.)
         ("ptr" sn_obj) = ccall allocate(MyCapability() "ptr",
                                         BYTES_TO_WDS(SIZEOF_StgStableName));
+        SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS);
         StgStableName_sn(sn_obj) = index;
-        snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry) = sn_obj;
         prim_write_barrier;
-        SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS);
+        snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry) = sn_obj;
     } else {
         sn_obj = snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry);
     }
@@ -2002,6 +1985,8 @@ stg_newBCOzh ( P_ instrs,
     ALLOC_PRIM (bytes);
 
     bco = Hp - bytes + WDS(1);
+    // No memory barrier necessary as this is a new allocation.
+    SET_HDR(bco, stg_BCO_info, CCS_MAIN);
 
     StgBCO_instrs(bco)     = instrs;
     StgBCO_literals(bco)   = literals;
@@ -2019,9 +2004,6 @@ for:
         goto for;
     }
 
-    prim_write_barrier;
-    SET_HDR(bco, stg_BCO_info, CCS_MAIN);
-
     return (bco);
 }
 
@@ -2040,13 +2022,12 @@ stg_mkApUpd0zh ( P_ bco )
     CCCS_ALLOC(SIZEOF_StgAP);
 
     ap = Hp - SIZEOF_StgAP + WDS(1);
+    // No memory barrier necessary as this is a new allocation.
+    SET_HDR(ap, stg_AP_info, CCS_MAIN);
 
     StgAP_n_args(ap) = HALF_W_(0);
     StgAP_fun(ap) = bco;
 
-    prim_write_barrier;
-    SET_HDR(ap, stg_AP_info, CCS_MAIN);
-
     return (ap);
 }
 
@@ -2075,6 +2056,7 @@ stg_unpackClosurezh ( P_ closure )
     dat_arr = Hp - dat_arr_sz + WDS(1);
 
 
+    SET_HDR(dat_arr, stg_ARR_WORDS_info, CCCS);
     StgArrBytes_bytes(dat_arr) = WDS(len);
     p = 0;
 for:
@@ -2089,9 +2071,6 @@ for:
     // Follow the pointers
     ("ptr" ptrArray) = foreign "C" heap_view_closurePtrs(MyCapability() "ptr", clos "ptr");
 
-    prim_write_barrier;
-    SET_HDR(dat_arr, stg_ARR_WORDS_info, CCCS);
-
     return (info, dat_arr, ptrArray);
 }
 


=====================================
rts/RaiseAsync.c
=====================================
@@ -870,7 +870,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
                 ap->payload[i] = (StgClosure *)*sp++;
             }
 
-            write_barrier();
+            write_barrier(); // XXX: Necessary?
             SET_HDR(ap,&stg_AP_STACK_info,
                     ((StgClosure *)frame)->header.prof.ccs /* ToDo */);
             TICK_ALLOC_UP_THK(WDS(words+1),0);
@@ -922,7 +922,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
                 ap->payload[i] = (StgClosure *)*sp++;
             }
 
-            write_barrier();
+            write_barrier(); // XXX: Necessary?
             SET_HDR(ap,&stg_AP_STACK_NOUPD_info,stack->header.prof.ccs);
             TICK_ALLOC_SE_THK(WDS(words+1),0);
 
@@ -961,7 +961,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
             //
             raise = (StgThunk *)allocate(cap,sizeofW(StgThunk)+1);
             TICK_ALLOC_SE_THK(WDS(1),0);
-            write_barrier();
+            write_barrier(); // XXX: Necessary?
             SET_HDR(raise,&stg_raise_info,cf->header.prof.ccs);
             raise->payload[0] = exception;
 
@@ -1042,9 +1042,8 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
 
                 atomically = (StgThunk*)allocate(cap,sizeofW(StgThunk)+1);
                 TICK_ALLOC_SE_THK(1,0);
-                atomically->payload[0] = af->code;
-                write_barrier();
                 SET_HDR(atomically,&stg_atomically_info,af->header.prof.ccs);
+                atomically->payload[0] = af->code;
 
                 // discard stack up to and including the ATOMICALLY_FRAME
                 frame += sizeofW(StgAtomicallyFrame);


=====================================
rts/RtsAPI.c
=====================================
@@ -30,9 +30,8 @@ HaskellObj
 rts_mkChar (Capability *cap, HsChar c)
 {
   StgClosure *p = (StgClosure *)allocate(cap, CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgWord)(StgChar)c;
-  write_barrier();
   SET_HDR(p, Czh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgWord)(StgChar)c;
   return p;
 }
 
@@ -40,9 +39,8 @@ HaskellObj
 rts_mkInt (Capability *cap, HsInt i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgInt)i;
-  write_barrier();
   SET_HDR(p, Izh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgInt)i;
   return p;
 }
 
@@ -50,10 +48,9 @@ HaskellObj
 rts_mkInt8 (Capability *cap, HsInt8 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
+  SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
   /* Make sure we mask out the bits above the lowest 8 */
   p->payload[0]  = (StgClosure *)(StgInt)i;
-  write_barrier();
-  SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -61,10 +58,9 @@ HaskellObj
 rts_mkInt16 (Capability *cap, HsInt16 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
+  SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
   /* Make sure we mask out the relevant bits */
   p->payload[0]  = (StgClosure *)(StgInt)i;
-  write_barrier();
-  SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -72,9 +68,8 @@ HaskellObj
 rts_mkInt32 (Capability *cap, HsInt32 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgInt)i;
-  write_barrier();
   SET_HDR(p, I32zh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgInt)i;
   return p;
 }
 
@@ -82,9 +77,8 @@ HaskellObj
 rts_mkInt64 (Capability *cap, HsInt64 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2));
-  ASSIGN_Int64((P_)&(p->payload[0]), i);
-  write_barrier();
   SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
+  ASSIGN_Int64((P_)&(p->payload[0]), i);
   return p;
 }
 
@@ -92,9 +86,8 @@ HaskellObj
 rts_mkWord (Capability *cap, HsWord i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgWord)i;
-  write_barrier();
   SET_HDR(p, Wzh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgWord)i;
   return p;
 }
 
@@ -103,9 +96,8 @@ rts_mkWord8 (Capability *cap, HsWord8 w)
 {
   /* see rts_mkInt* comments */
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgWord)(w & 0xff);
-  write_barrier();
   SET_HDR(p, W8zh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgWord)(w & 0xff);
   return p;
 }
 
@@ -114,9 +106,8 @@ rts_mkWord16 (Capability *cap, HsWord16 w)
 {
   /* see rts_mkInt* comments */
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffff);
-  write_barrier();
   SET_HDR(p, W16zh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffff);
   return p;
 }
 
@@ -125,9 +116,8 @@ rts_mkWord32 (Capability *cap, HsWord32 w)
 {
   /* see rts_mkInt* comments */
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffffffff);
-  write_barrier();
   SET_HDR(p, W32zh_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffffffff);
   return p;
 }
 
@@ -136,9 +126,8 @@ rts_mkWord64 (Capability *cap, HsWord64 w)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2));
   /* see mk_Int8 comment */
-  ASSIGN_Word64((P_)&(p->payload[0]), w);
-  write_barrier();
   SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
+  ASSIGN_Word64((P_)&(p->payload[0]), w);
   return p;
 }
 
@@ -147,9 +136,8 @@ HaskellObj
 rts_mkFloat (Capability *cap, HsFloat f)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  ASSIGN_FLT((P_)p->payload, (StgFloat)f);
-  write_barrier();
   SET_HDR(p, Fzh_con_info, CCS_SYSTEM);
+  ASSIGN_FLT((P_)p->payload, (StgFloat)f);
   return p;
 }
 
@@ -157,9 +145,8 @@ HaskellObj
 rts_mkDouble (Capability *cap, HsDouble d)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,sizeofW(StgDouble)));
-  ASSIGN_DBL((P_)p->payload, (StgDouble)d);
-  write_barrier();
   SET_HDR(p, Dzh_con_info, CCS_SYSTEM);
+  ASSIGN_DBL((P_)p->payload, (StgDouble)d);
   return p;
 }
 
@@ -167,9 +154,8 @@ HaskellObj
 rts_mkStablePtr (Capability *cap, HsStablePtr s)
 {
   StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
-  p->payload[0]  = (StgClosure *)s;
-  write_barrier();
   SET_HDR(p, StablePtr_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)s;
   return p;
 }
 
@@ -177,9 +163,8 @@ HaskellObj
 rts_mkPtr (Capability *cap, HsPtr a)
 {
   StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
-  p->payload[0]  = (StgClosure *)a;
-  write_barrier();
   SET_HDR(p, Ptr_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)a;
   return p;
 }
 
@@ -187,9 +172,8 @@ HaskellObj
 rts_mkFunPtr (Capability *cap, HsFunPtr a)
 {
   StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
-  p->payload[0]  = (StgClosure *)a;
-  write_barrier();
   SET_HDR(p, FunPtr_con_info, CCS_SYSTEM);
+  p->payload[0]  = (StgClosure *)a;
   return p;
 }
 
@@ -218,10 +202,9 @@ rts_apply (Capability *cap, HaskellObj f, HaskellObj arg)
     // Here we don't want to use CCS_SYSTEM, because it's a hidden cost centre,
     // and evaluating Haskell code under a hidden cost centre leads to
     // confusing profiling output. (#7753)
+    SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_MAIN);
     ap->payload[0] = f;
     ap->payload[1] = arg;
-    write_barrier();
-    SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_MAIN);
     return (StgClosure *)ap;
 }
 


=====================================
rts/StgMiscClosures.cmm
=====================================
@@ -317,8 +317,10 @@ retry:
 
         MessageBlackHole_tso(msg) = CurrentTSO;
         MessageBlackHole_bh(msg) = node;
-        prim_write_barrier;
         SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM);
+        // Write barrier to ensure that writes constructing Message are
+        // committed before we expose to other threads.
+        prim_write_barrier;
 
         (r) = ccall messageBlackHole(MyCapability() "ptr", msg "ptr");
 


=====================================
rts/ThreadPaused.c
=====================================
@@ -229,6 +229,7 @@ threadPaused(Capability *cap, StgTSO *tso)
 
             // If we've already marked this frame, then stop here.
             frame_info = frame->header.info;
+            // Ensure that read from frame->updatee below sees any pending writes
             load_load_barrier();
             if (frame_info == (StgInfoTable *)&stg_marked_upd_frame_info) {
                 if (prev_was_update_frame) {
@@ -239,12 +240,11 @@ threadPaused(Capability *cap, StgTSO *tso)
                 goto end;
             }
 
-            write_barrier();
             SET_INFO(frame, (StgInfoTable *)&stg_marked_upd_frame_info);
 
             bh = ((StgUpdateFrame *)frame)->updatee;
             bh_info = bh->header.info;
-            load_load_barrier();
+            load_load_barrier(); // XXX: Why is this needed?
 
 #if defined(THREADED_RTS)
         retry:


=====================================
rts/Threads.c
=====================================
@@ -82,14 +82,14 @@ createThread(Capability *cap, W_ size)
     stack_size = round_to_mblocks(size - sizeofW(StgTSO));
     stack = (StgStack *)allocate(cap, stack_size);
     TICK_ALLOC_STACK(stack_size);
+    SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS);
     stack->stack_size   = stack_size - sizeofW(StgStack);
     stack->sp           = stack->stack + stack->stack_size;
     stack->dirty        = 1;
-    write_barrier();
-    SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS);
 
     tso = (StgTSO *)allocate(cap, sizeofW(StgTSO));
     TICK_ALLOC_TSO();
+    SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
 
     // Always start with the compiled code evaluator
     tso->what_next = ThreadRunGHC;
@@ -116,9 +116,6 @@ createThread(Capability *cap, W_ size)
     tso->prof.cccs = CCS_MAIN;
 #endif
 
-    write_barrier();
-    SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
-
     // put a stop frame on the stack
     stack->sp -= sizeofW(StgStopFrame);
     SET_HDR((StgClosure*)stack->sp,
@@ -129,6 +126,8 @@ createThread(Capability *cap, W_ size)
     ACQUIRE_LOCK(&sched_mutex);
     tso->id = next_thread_id++;  // while we have the mutex
     tso->global_link = g0->threads;
+    /* Mutations above need no memory barrier since this lock will provide
+     * a release barrier */
     g0->threads = tso;
     RELEASE_LOCK(&sched_mutex);
 
@@ -261,8 +260,9 @@ tryWakeupThread (Capability *cap, StgTSO *tso)
         MessageWakeup *msg;
         msg = (MessageWakeup *)allocate(cap,sizeofW(MessageWakeup));
         msg->tso = tso;
-        write_barrier();
         SET_HDR(msg, &stg_MSG_TRY_WAKEUP_info, CCS_SYSTEM);
+        // Ensure that writes constructing Message are committed before sending.
+        write_barrier();
         sendMessage(cap, tso->cap, (Message*)msg);
         debugTraceCap(DEBUG_sched, cap, "message: try wakeup thread %ld on cap %d",
                       (W_)tso->id, tso->cap->no);
@@ -389,8 +389,6 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
 {
     StgBlockingQueue *bq, *next;
     StgClosure *p;
-    const StgInfoTable *bqinfo;
-    const StgInfoTable *pinfo;
 
     debugTraceCap(DEBUG_sched, cap,
                   "collision occurred; checking blocking queues for thread %ld",
@@ -399,8 +397,8 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
     for (bq = tso->bq; bq != (StgBlockingQueue*)END_TSO_QUEUE; bq = next) {
         next = bq->link;
 
-        bqinfo = bq->header.info;
-        load_load_barrier();
+        const StgInfoTable *bqinfo = bq->header.info;
+        load_load_barrier();  // XXX: Is this needed?
         if (bqinfo == &stg_IND_info) {
             // ToDo: could short it out right here, to avoid
             // traversing this IND multiple times.
@@ -408,7 +406,7 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
         }
 
         p = bq->bh;
-        pinfo = p->header.info;
+        const StgInfoTable *pinfo = p->header.info;
         load_load_barrier();
         if (pinfo != &stg_BLACKHOLE_info ||
             ((StgInd *)p)->indirectee != (StgClosure*)bq)
@@ -609,13 +607,12 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
     new_stack = (StgStack*) allocate(cap, chunk_size);
     cap->r.rCurrentTSO = NULL;
 
+    SET_HDR(new_stack, &stg_STACK_info, old_stack->header.prof.ccs);
     TICK_ALLOC_STACK(chunk_size);
 
     new_stack->dirty = 0; // begin clean, we'll mark it dirty below
     new_stack->stack_size = chunk_size - sizeofW(StgStack);
     new_stack->sp = new_stack->stack + new_stack->stack_size;
-    write_barrier();
-    SET_HDR(new_stack, &stg_STACK_info, old_stack->header.prof.ccs);
 
     tso->tot_stack_size += new_stack->stack_size;
 
@@ -664,9 +661,8 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
         } else {
             new_stack->sp -= sizeofW(StgUnderflowFrame);
             frame = (StgUnderflowFrame*)new_stack->sp;
-            frame->next_chunk  = old_stack;
-            write_barrier();
             frame->info = &stg_stack_underflow_frame_info;
+            frame->next_chunk  = old_stack;
         }
 
         // copy the stack chunk between tso->sp and sp to
@@ -681,6 +677,8 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
         new_stack->sp -= chunk_words;
     }
 
+    // No write barriers needed; all of the writes above are to structured
+    // owned by our capability.
     tso->stackobj = new_stack;
 
     // we're about to run it, better mark it dirty


=====================================
rts/Updates.h
=====================================
@@ -39,6 +39,12 @@
                  PROF_HDR_FIELDS(w_,ccs,p2)              \
                  p_ updatee
 
+/*
+ * Getting the memory barriers correct here is quite tricky. Essentially
+ * the write barrier ensures that any writes to the new indirectee are visible
+ * before we introduce the indirection.
+ * See Note [Heap memory barriers] in SMP.h.
+ */
 #define updateWithIndirection(p1, p2, and_then) \
     W_ bd;                                                      \
                                                                 \
@@ -69,6 +75,7 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,
     ASSERT( (P_)p1 != (P_)p2 );
     /* not necessarily true: ASSERT( !closure_IND(p1) ); */
     /* occurs in RaiseAsync.c:raiseAsync() */
+    /* See Note [Heap memory barriers] in SMP.h */
     write_barrier();
     OVERWRITING_CLOSURE(p1);
     ((StgInd *)p1)->indirectee = p2;


=====================================
rts/Weak.c
=====================================
@@ -42,7 +42,6 @@ void
 runAllCFinalizers(StgWeak *list)
 {
     StgWeak *w;
-    const StgInfoTable *winfo;
     Task *task;
 
     task = myTask();
@@ -58,7 +57,7 @@ runAllCFinalizers(StgWeak *list)
         // If there's no major GC between the time that the finalizer for the
         // object from the oldest generation is manually called and shutdown
         // we end up running the same finalizer twice. See #7170.
-        winfo = w->header.info;
+        const StgInfoTable *winfo = w->header.info;
         load_load_barrier();
         if (winfo != &stg_DEAD_WEAK_info) {
             runCFinalizers((StgCFinalizerList *)w->cfinalizers);
@@ -129,7 +128,6 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
         // there's a later call to finalizeWeak# on this weak pointer,
         // we don't run the finalizer again.
         SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
-        write_barrier();
     }
 
     n_finalizers = i;
@@ -142,6 +140,8 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
     size = n + mutArrPtrsCardTableSize(n);
     arr = (StgMutArrPtrs *)allocate(cap, sizeofW(StgMutArrPtrs) + size);
     TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0);
+    // No write barrier needed here; this array is only going to referred to by this core.
+    SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info, CCS_SYSTEM);
     arr->ptrs = n;
     arr->size = size;
 
@@ -157,9 +157,6 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
         arr->payload[i] = (StgClosure *)(W_)(-1);
     }
 
-    write_barrier();
-    SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info, CCS_SYSTEM);
-
     t = createIOThread(cap,
                        RtsFlags.GcFlags.initialStkSize,
                        rts_apply(cap,


=====================================
rts/linker/MachO.c
=====================================
@@ -1216,7 +1216,7 @@ ocGetNames_MachO(ObjectCode* oc)
     IF_DEBUG(linker, debugBelch("ocGetNames_MachO: will load %d sections\n",
                                 oc->n_sections));
 
-#if defined (ios_HOST_OS)
+#if defined(ios_HOST_OS)
     for(int i=0; i < oc->n_sections; i++)
     {
         MachOSection * section = &oc->info->macho_sections[i];
@@ -1641,7 +1641,7 @@ ocResolve_MachO(ObjectCode* oc)
     {
         IF_DEBUG(linker, debugBelch("ocResolve_MachO: relocating section %d\n", i));
 
-#if defined aarch64_HOST_ARCH
+#if defined(aarch64_HOST_ARCH)
         if (!relocateSectionAarch64(oc, &oc->sections[i]))
             return 0;
 #else


=====================================
rts/sm/CNF.c
=====================================
@@ -373,6 +373,7 @@ compactNew (Capability *cap, StgWord size)
                                          ALLOCATE_NEW);
 
     self = firstBlockGetCompact(block);
+    SET_HDR((StgClosure*)self, &stg_COMPACT_NFDATA_CLEAN_info, CCS_SYSTEM);
     self->autoBlockW = aligned_size / sizeof(StgWord);
     self->nursery = block;
     self->last = block;
@@ -389,9 +390,6 @@ compactNew (Capability *cap, StgWord size)
 
     debugTrace(DEBUG_compact, "compactNew: size %" FMT_Word, size);
 
-    write_barrier();
-    SET_HDR((StgClosure*)self, &stg_COMPACT_NFDATA_CLEAN_info, CCS_SYSTEM);
-
     return self;
 }
 


=====================================
rts/sm/Compact.c
=====================================
@@ -553,8 +553,6 @@ update_fwd_large( bdescr *bd )
 static /* STATIC_INLINE */ StgPtr
 thread_obj (const StgInfoTable *info, StgPtr p)
 {
-    load_load_barrier();
-
     switch (info->type) {
     case THUNK_0_1:
         return p + sizeofW(StgThunk) + 1;


=====================================
rts/sm/MarkWeak.c
=====================================
@@ -235,6 +235,7 @@ static bool tidyWeakList(generation *gen)
     for (w = gen->old_weak_ptr_list; w != NULL; w = next_w) {
 
         info = get_itbl((StgClosure *)w);
+        load_load_barrier();
 
         /* There might be a DEAD_WEAK on the list if finalizeWeak# was
          * called on a live weak pointer object.  Just remove it.


=====================================
rts/sm/Scav.c
=====================================
@@ -187,7 +187,6 @@ scavenge_compact(StgCompactNFData *str)
                str, Bdescr((P_)str)->gen_no, str->totalW * sizeof(W_))
 
     gct->eager_promotion = saved_eager;
-    write_barrier();
     if (gct->failed_to_evac) {
         ((StgClosure *)str)->header.info = &stg_COMPACT_NFDATA_DIRTY_info;
     } else {
@@ -453,7 +452,6 @@ scavenge_block (bdescr *bd)
         evacuate((StgClosure **)&mvar->value);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             mvar->header.info = &stg_MVAR_DIRTY_info;
         } else {
@@ -471,7 +469,6 @@ scavenge_block (bdescr *bd)
         evacuate((StgClosure **)&tvar->first_watch_queue_entry);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             tvar->header.info = &stg_TVAR_DIRTY_info;
         } else {
@@ -606,7 +603,6 @@ scavenge_block (bdescr *bd)
         evacuate(&((StgMutVar *)p)->var);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
         } else {
@@ -626,7 +622,6 @@ scavenge_block (bdescr *bd)
         evacuate((StgClosure**)&bq->link);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
         } else {
@@ -679,7 +674,6 @@ scavenge_block (bdescr *bd)
 
         p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -697,7 +691,6 @@ scavenge_block (bdescr *bd)
     {
         p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -723,7 +716,6 @@ scavenge_block (bdescr *bd)
         }
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -745,7 +737,6 @@ scavenge_block (bdescr *bd)
             evacuate((StgClosure **)p);
         }
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -886,7 +877,6 @@ scavenge_mark_stack(void)
             evacuate((StgClosure **)&mvar->value);
             gct->eager_promotion = saved_eager_promotion;
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 mvar->header.info = &stg_MVAR_DIRTY_info;
             } else {
@@ -903,7 +893,6 @@ scavenge_mark_stack(void)
             evacuate((StgClosure **)&tvar->first_watch_queue_entry);
             gct->eager_promotion = saved_eager_promotion;
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 tvar->header.info = &stg_TVAR_DIRTY_info;
             } else {
@@ -1010,7 +999,6 @@ scavenge_mark_stack(void)
             evacuate(&((StgMutVar *)p)->var);
             gct->eager_promotion = saved_eager_promotion;
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
             } else {
@@ -1030,7 +1018,6 @@ scavenge_mark_stack(void)
             evacuate((StgClosure**)&bq->link);
             gct->eager_promotion = saved_eager_promotion;
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
             } else {
@@ -1079,7 +1066,6 @@ scavenge_mark_stack(void)
 
             scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
             } else {
@@ -1099,7 +1085,6 @@ scavenge_mark_stack(void)
 
             scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
             } else {
@@ -1127,7 +1112,6 @@ scavenge_mark_stack(void)
             }
             gct->eager_promotion = saved_eager;
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
             } else {
@@ -1149,7 +1133,6 @@ scavenge_mark_stack(void)
                 evacuate((StgClosure **)p);
             }
 
-            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
             } else {
@@ -1256,7 +1239,6 @@ scavenge_one(StgPtr p)
         evacuate((StgClosure **)&mvar->value);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             mvar->header.info = &stg_MVAR_DIRTY_info;
         } else {
@@ -1273,7 +1255,6 @@ scavenge_one(StgPtr p)
         evacuate((StgClosure **)&tvar->first_watch_queue_entry);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             tvar->header.info = &stg_TVAR_DIRTY_info;
         } else {
@@ -1338,7 +1319,6 @@ scavenge_one(StgPtr p)
         evacuate(&((StgMutVar *)p)->var);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
         } else {
@@ -1358,7 +1338,6 @@ scavenge_one(StgPtr p)
         evacuate((StgClosure**)&bq->link);
         gct->eager_promotion = saved_eager_promotion;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
         } else {
@@ -1407,7 +1386,6 @@ scavenge_one(StgPtr p)
 
         scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -1425,7 +1403,6 @@ scavenge_one(StgPtr p)
         // follow everything
         scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -1453,7 +1430,6 @@ scavenge_one(StgPtr p)
         }
         gct->eager_promotion = saved_eager;
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -1475,7 +1451,6 @@ scavenge_one(StgPtr p)
             evacuate((StgClosure **)p);
         }
 
-        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -1599,10 +1574,6 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
     StgPtr p, q;
     uint32_t gen_no;
 
-#if defined(DEBUG)
-    const StgInfoTable *pinfo;
-#endif
-
     gen_no = gen->no;
     gct->evac_gen_no = gen_no;
     for (; bd != NULL; bd = bd->link) {
@@ -1611,6 +1582,7 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
             ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
 
 #if defined(DEBUG)
+            const StgInfoTable *pinfo;
             switch (get_itbl((StgClosure *)p)->type) {
             case MUT_VAR_CLEAN:
                 // can happen due to concurrent writeMutVars
@@ -1664,7 +1636,6 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
 
                 scavenge_mut_arr_ptrs_marked((StgMutArrPtrs *)p);
 
-                write_barrier();
                 if (gct->failed_to_evac) {
                     ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
                 } else {


=====================================
rts/sm/Storage.c
=====================================
@@ -408,8 +408,9 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     // Allocate the blackhole indirection closure
     bh = (StgInd *)allocate(cap, sizeofW(*bh));
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
-    write_barrier();
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
+    // Ensure that above writes are visible before we introduce reference as CAF indirectee.
+    write_barrier();
 
     caf->indirectee = (StgClosure *)bh;
     write_barrier();



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/9caa7ab8f878a1a5e4c4fcf2fad15d94fb0fe56c...7857475924202b7732d4beb1d88da59b22360a57

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/9caa7ab8f878a1a5e4c4fcf2fad15d94fb0fe56c...7857475924202b7732d4beb1d88da59b22360a57
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20190607/b5d97566/attachment-0001.html>