[Git][ghc/ghc][wip/tsan/fix-thunk-update] Fix thunk update ordering

Ben Gamari (@bgamari) gitlab at gitlab.haskell.org
Wed Dec 13 21:03:07 UTC 2023



Ben Gamari pushed to branch wip/tsan/fix-thunk-update at Glasgow Haskell Compiler / GHC


Commits:
f0682cb3 by Ben Gamari at 2023-12-13T16:02:24-05:00
Fix thunk update ordering

Previously we attempted to ensure soundness of concurrent thunk update
by synchronizing on the access of the thunk's info table pointer field.
This was believed to be sufficient since the indirectee (which may
expose a closure allocated by another core) would not be examined
until the info table pointer update is complete.

However, it turns out that this can result in data races in the presence
of multiple threads racing a update a single thunk. For instance,
consider this interleaving under the old scheme:

            Thread A                             Thread B
            ---------                            ---------
    t=0     Enter t
      1     Push update frame
      2     Begin evaluation

      4     Pause thread
      5     t.indirectee=tso
      6     Release t.info=BLACKHOLE

      7     ... (e.g. GC)

      8     Resume thread
      9     Finish evaluation
      10    Relaxed t.indirectee=x

      11                                         Load t.info
      12                                         Acquire fence
      13                                         Inspect t.indirectee

      14    Release t.info=BLACKHOLE

Here Thread A enters thunk `t` but is soon paused, resulting in `t`
being lazily blackholed at t=6. Then, at t=10 Thread A finishes
evaluation and updates `t.indirectee` with a relaxed store.

Meanwhile, Thread B enters the blackhole. Under the old scheme this
would introduce an acquire-fence but this would only synchronize with
Thread A at t=6. Consequently, the result of the evaluation, `x`, is not
visible to Thread B, introducing a data race.

We fix this by treating the `indirectee` field as we do all other
mutable fields. This means we must always access this field with
acquire-loads and release-stores.

See #23185.

- - - - -


20 changed files:

- compiler/GHC/StgToCmm/Bind.hs
- rts/Apply.cmm
- rts/Compact.cmm
- rts/Heap.c
- rts/Interpreter.c
- rts/Messages.c
- rts/PrimOps.cmm
- rts/StableName.c
- rts/StgMiscClosures.cmm
- rts/ThreadPaused.c
- rts/Threads.c
- rts/Updates.cmm
- rts/Updates.h
- rts/include/Cmm.h
- rts/include/rts/TSANUtils.h
- rts/include/stg/SMP.h
- rts/sm/Evac.c
- rts/sm/NonMovingMark.c
- rts/sm/Storage.c
- utils/genapply/Main.hs


Changes:

=====================================
compiler/GHC/StgToCmm/Bind.hs
=====================================
@@ -721,11 +721,19 @@ emitBlackHoleCode node = do
 
   when eager_blackholing $ do
     whenUpdRemSetEnabled $ emitUpdRemSetPushThunk node
-    emitStore (cmmOffsetW platform node (fixedHdrSizeW profile)) (currentTSOExpr platform)
+    emitAtomicStore platform MemOrderRelease
+        (cmmOffsetW platform node (fixedHdrSizeW profile))
+        (currentTSOExpr platform)
     -- See Note [Heap memory barriers] in SMP.h.
-    let w = wordWidth platform
-    emitPrimCall [] (MO_AtomicWrite w MemOrderRelease)
-        [node, CmmReg (CmmGlobal $ GlobalRegUse EagerBlackholeInfo $ bWord platform)]
+    emitAtomicStore platform MemOrderRelease
+        node
+        (CmmReg (CmmGlobal $ GlobalRegUse EagerBlackholeInfo $ bWord platform))
+
+emitAtomicStore :: Platform -> MemoryOrdering -> CmmExpr -> CmmExpr -> FCode ()
+emitAtomicStore platform mord addr val =
+    emitPrimCall [] (MO_AtomicWrite w mord) [addr, val]
+  where
+    w = typeWidth $ cmmExprType platform val
 
 setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode ()
         -- Nota Bene: this function does not change Node (even if it's a CAF),


=====================================
rts/Apply.cmm
=====================================
@@ -108,7 +108,7 @@ again:
             IND,
             IND_STATIC:
         {
-            fun = StgInd_indirectee(fun);
+            fun = %acquire StgInd_indirectee(fun);
             goto again;
         }
         case BCO:
@@ -693,7 +693,7 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
   }
   // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is
   // not reachable.
-  StgInd_indirectee(ap) = CurrentTSO;
+  %release StgInd_indirectee(ap) = CurrentTSO;
   SET_INFO_RELEASE(ap, __stg_EAGER_BLACKHOLE_info);
 
   /* ensure there is at least AP_STACK_SPLIM words of headroom available


=====================================
rts/Compact.cmm
=====================================
@@ -100,7 +100,7 @@ eval:
 
     // Follow indirections:
     case IND, IND_STATIC: {
-        p = StgInd_indirectee(p);
+        p = %acquire StgInd_indirectee(p);
         goto eval;
     }
 


=====================================
rts/Heap.c
=====================================
@@ -173,7 +173,7 @@ StgWord collect_pointers(StgClosure *closure, StgClosure *ptrs[]) {
         case IND:
         case IND_STATIC:
         case BLACKHOLE:
-            ptrs[nptrs++] = (StgClosure *)(((StgInd *)closure)->indirectee);
+            ptrs[nptrs++] = (StgClosure *) ACQUIRE_LOAD(&((StgInd *)closure)->indirectee);
             break;
 
         case MUT_ARR_PTRS_CLEAN:


=====================================
rts/Interpreter.c
=====================================
@@ -420,7 +420,7 @@ eval_obj:
     case IND:
     case IND_STATIC:
     {
-        tagged_obj = ((StgInd*)obj)->indirectee;
+        tagged_obj = ACQUIRE_LOAD(&((StgInd*)obj)->indirectee);
         goto eval_obj;
     }
 


=====================================
rts/Messages.c
=====================================
@@ -191,9 +191,6 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
     StgClosure *p;
     const StgInfoTable *info;
     do {
-        // If we are being called from stg_BLACKHOLE then TSAN won't know about the
-        // previous read barrier that makes the following access safe.
-        TSAN_ANNOTATE_BENIGN_RACE(&((StgInd*)bh)->indirectee, "messageBlackHole");
         p = UNTAG_CLOSURE(ACQUIRE_LOAD(&((StgInd*)bh)->indirectee));
         info = RELAXED_LOAD(&p->header.info);
     } while (info == &stg_IND_info);
@@ -291,7 +288,7 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
             // makes it into the update remembered set
             updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
         }
-        RELAXED_STORE(&msg->link, bq->queue);
+        msg->link = bq->queue;
         bq->queue = msg;
         // No barrier is necessary here: we are only exposing the
         // closure to the GC. See Note [Heap memory barriers] in SMP.h.


=====================================
rts/PrimOps.cmm
=====================================
@@ -1753,7 +1753,7 @@ loop:
     qinfo = GET_INFO_ACQUIRE(q);
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -1821,7 +1821,7 @@ loop:
 
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -1923,7 +1923,7 @@ loop:
 
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -2012,7 +2012,7 @@ loop:
 
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -2293,7 +2293,7 @@ loop:
     //Possibly IND added by removeFromMVarBlockedQueue
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 


=====================================
rts/StableName.c
=====================================
@@ -156,11 +156,11 @@ removeIndirections (StgClosure* p)
         switch (get_itbl(q)->type) {
         case IND:
         case IND_STATIC:
-            p = ((StgInd *)q)->indirectee;
+            p = ACQUIRE_LOAD(&((StgInd *)q)->indirectee);
             continue;
 
         case BLACKHOLE:
-            p = ((StgInd *)q)->indirectee;
+            p = ACQUIRE_LOAD(&((StgInd *)q)->indirectee);
             if (GET_CLOSURE_TAG(p) != 0) {
                 continue;
             } else {


=====================================
rts/StgMiscClosures.cmm
=====================================
@@ -520,8 +520,9 @@ INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
     (P_ node)
 {
     TICK_ENT_DYN_IND(); /* tick */
-    ACQUIRE_FENCE;
-    node = UNTAG(StgInd_indirectee(node));
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
+    node = %acquire StgInd_indirectee(node);
+    node = UNTAG(node);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(node) (node);
 }
@@ -529,8 +530,10 @@ INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
     /* explicit stack */
 {
     TICK_ENT_DYN_IND(); /* tick */
-    ACQUIRE_FENCE;
-    R1 = UNTAG(StgInd_indirectee(R1));
+    ACQUIRE_FENCE_ON(R1 + OFFSET_StgHeader_info);
+    P_ p;
+    p = %acquire StgInd_indirectee(R1);
+    R1 = UNTAG(p);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(R1) [R1];
 }
@@ -540,8 +543,10 @@ INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC")
     /* explicit stack */
 {
     TICK_ENT_STATIC_IND(); /* tick */
-    ACQUIRE_FENCE;
-    R1 = UNTAG(StgInd_indirectee(R1));
+    ACQUIRE_FENCE_ON(R1 + OFFSET_StgHeader_info);
+    P_ p;
+    p = %acquire StgInd_indirectee(R1);
+    R1 = UNTAG(p);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(R1) [R1];
 }
@@ -564,14 +569,11 @@ INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
     TICK_ENT_DYN_IND(); /* tick */
 
 retry:
-#if defined(TSAN_ENABLED)
-    // See Note [ThreadSanitizer and fences]
-    W_ unused; unused = %acquire GET_INFO(node);
-#endif
-    // Synchronizes with the release-store in updateWithIndirection.
+    // Synchronizes with the release-store in
+    // updateWithIndirection.
     // See Note [Heap memory barriers] in SMP.h.
-    ACQUIRE_FENCE;
-    p = %relaxed StgInd_indirectee(node);
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
+    p = %acquire StgInd_indirectee(node);
     if (GETTAG(p) != 0) {
         return (p);
     }
@@ -656,7 +658,7 @@ INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
     i = 0;
 loop:
     // spin until the WHITEHOLE is updated
-    info = StgHeader_info(node);
+    info = %relaxed StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
 #if defined(PROF_SPIN)
         W_[whitehole_lockClosure_spin] =
@@ -675,6 +677,7 @@ loop:
         // defined in CMM.
         goto loop;
     }
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
     jump %ENTRY_CODE(info) (node);
 #else
     ccall barf("WHITEHOLE object (%p) entered!", R1) never returns;


=====================================
rts/ThreadPaused.c
=====================================
@@ -352,7 +352,7 @@ threadPaused(Capability *cap, StgTSO *tso)
             OVERWRITING_CLOSURE_SIZE(bh, closure_sizeW_(bh, INFO_PTR_TO_STRUCT(bh_info)));
 
             // The payload of the BLACKHOLE points to the TSO
-            ((StgInd *)bh)->indirectee = (StgClosure *)tso;
+            RELEASE_STORE(&((StgInd *)bh)->indirectee, (StgClosure *)tso);
             SET_INFO_RELEASE(bh,&stg_BLACKHOLE_info);
 
             // .. and we need a write barrier, since we just mutated the closure:


=====================================
rts/Threads.c
=====================================
@@ -437,7 +437,7 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
         p = UNTAG_CLOSURE(bq->bh);
         const StgInfoTable *pinfo = ACQUIRE_LOAD(&p->header.info);
         if (pinfo != &stg_BLACKHOLE_info ||
-            ((StgInd *)p)->indirectee != (StgClosure*)bq)
+            (RELAXED_LOAD(&((StgInd *)p)->indirectee) != (StgClosure*)bq))
         {
             wakeBlockingQueue(cap,bq);
         }
@@ -468,7 +468,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
         return;
     }
 
-    v = UNTAG_CLOSURE(((StgInd*)thunk)->indirectee);
+    v = UNTAG_CLOSURE(ACQUIRE_LOAD(&((StgInd*)thunk)->indirectee));
 
     updateWithIndirection(cap, thunk, val);
 
@@ -808,7 +808,7 @@ loop:
     qinfo = ACQUIRE_LOAD(&q->header.info);
     if (qinfo == &stg_IND_info ||
         qinfo == &stg_MSG_NULL_info) {
-        q = (StgMVarTSOQueue*)((StgInd*)q)->indirectee;
+        q = (StgMVarTSOQueue*) ACQUIRE_LOAD(&((StgInd*)q)->indirectee);
         goto loop;
     }
 


=====================================
rts/Updates.cmm
=====================================
@@ -59,7 +59,7 @@ INFO_TABLE_RET ( stg_marked_upd_frame, UPDATE_FRAME,
     ASSERT(HpAlloc == 0); // Note [HpAlloc]
 
     // we know the closure is a BLACKHOLE
-    v = StgInd_indirectee(updatee);
+    v = %acquire StgInd_indirectee(updatee);
 
     if (GETTAG(v) != 0) (likely: False) {
         // updated by someone else: discard our value and use the


=====================================
rts/Updates.h
=====================================
@@ -261,6 +261,66 @@
  * `tso_1` and other blocked threads may be unblocked more quickly.
  *
  *
+ * Waking up blocking queues
+ * -------------------------
+ * As noted above, when a thread updates a `BLACKHOLE`'d thunk it may find that
+ * some threads have added themselves to the thunk's blocking queue. Naturally,
+ * we must ensure that these threads are woken up. However, this gets a bit
+ * subtle since multiple threads may have raced to enter the thunk.
+ *
+ * That is, we may end up in a situation like one of these (TODO audit):
+ *
+ * ### Race A
+ *
+ *     Thread 0                      Thread 1                     Thread 2
+ *     --------------------------    --------------------------   ----------------------
+ *     enter thnk
+ *                                   enter thnk
+ *     thnk.indirectee := tso_0
+ *                                   thnk.indirectee := tso_1
+ *     thnk.info := BLACKHOLE
+ *                                   thnk.info := BLACKHOLE
+ *                                                                enter, block on thnk
+ *                                                                send MSG_BLACKHOLE to tso_1->cap
+ *     finishes evaluation
+ *     thnk.indirectee := result
+ *                                   handle MSG_BLACKHOLE
+ *                                   add
+ *
+ * ### Race B
+ *
+ *     Thread 0                      Thread 1                     Thread 2
+ *     --------------------------    --------------------------   ----------------------
+ *     enter thnk
+ *                                   enter thnk
+ *     thnk.indirectee := tso_0
+ *                                   thnk.indirectee := tso_1
+ *     thnk.info := BLACKHOLE
+ *                                   thnk.info := BLACKHOLE
+ *                                                                enter, block on thnk
+ *                                                                send MSG_BLACKHOLE to tso_1->cap
+ *                                   handle MSG_BLACKHOLE
+ *                                   add
+ *     finishes evaluation
+ *     thnk.indirectee := result
+ *
+ * ### Race C
+ *
+ *     Thread 0                      Thread 1                     Thread 2
+ *     --------------------------    --------------------------   ----------------------
+ *     enter thnk
+ *                                   enter thnk
+ *     thnk.indirectee := tso_0
+ *     thnk.info := BLACKHOLE
+ *                                                                enter, block on thnk
+ *                                                                send MSG_BLACKHOLE to tso_0->cap
+ *     handle MSG_BLACKHOLE
+ *     thnk.indirectee := new BLOCKING_QUEUE
+ *
+ *                                   thnk.indirectee := tso_1
+ *                                   thnk.info := BLACKHOLE
+ *
+ *
  * Exception handling
  * ------------------
  * When an exception is thrown to a thread which is evaluating a thunk, it is
@@ -400,8 +460,8 @@
     }                                                           \
                                                                 \
     OVERWRITING_CLOSURE(p1);                                    \
-    %relaxed StgInd_indirectee(p1) = p2;                        \
-    SET_INFO_RELEASE(p1, stg_BLACKHOLE_info);                   \
+    %release StgInd_indirectee(p1) = p2;                        \
+    %release SET_INFO(p1, stg_BLACKHOLE_info);                  \
     LDV_RECORD_CREATE(p1);                                      \
     and_then;
 


=====================================
rts/include/Cmm.h
=====================================
@@ -35,6 +35,7 @@
 #define CMINUSMINUS 1
 
 #include "ghcconfig.h"
+#include "rts/TSANUtils.h"
 
 /* -----------------------------------------------------------------------------
    Types
@@ -311,7 +312,7 @@
 #define ENTER(x) ENTER_(return,x)
 #endif
 
-#define ENTER_R1() ENTER_(RET_R1,R1)
+#define ENTER_R1() P_ _r1; _r1 = R1; ENTER_(RET_R1, _r1)
 
 #define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
 
@@ -326,7 +327,7 @@
     IND,                                                \
     IND_STATIC:                                         \
    {                                                    \
-      x = StgInd_indirectee(x);                         \
+      x = %acquire StgInd_indirectee(x);                \
       goto again;                                       \
    }                                                    \
   case                                                  \
@@ -446,9 +447,17 @@
    HP_CHK_P(bytes);                             \
    TICK_ALLOC_RTS(bytes);
 
+// Load a field out of structure with relaxed ordering.
+#define RELAXED_LOAD_FIELD(fld, ptr) \
+    REP_##fld![(ptr) + OFFSET_##fld]
+
+// Load a field out of an StgClosure with relaxed ordering.
+#define RELAXED_LOAD_CLOSURE_FIELD(fld, ptr) \
+    REP_##fld![(ptr) + SIZEOF_StgHeader + OFFSET_##fld]
+
 #define CHECK_GC()                                                      \
   (bdescr_link(CurrentNursery) == NULL ||                               \
-   generation_n_new_large_words(W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
+   RELAXED_LOAD_FIELD(generation_n_new_large_words, W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
 
 // allocate() allocates from the nursery, so we check to see
 // whether the nursery is nearly empty in any function that uses
@@ -688,9 +697,13 @@
 #define RELEASE_FENCE prim %fence_release();
 #define ACQUIRE_FENCE prim %fence_acquire();
 
-// TODO
-#if 1
+#if TSAN_ENABLED
+// This is may be efficient than a fence but TSAN can reason about it.
+#if WORD_SIZE_IN_BITS == 64
 #define ACQUIRE_FENCE_ON(x) if (1) { W_ tmp; (tmp) = prim %load_acquire64(x); }
+#elif WORD_SIZE_IN_BITS == 32
+#define ACQUIRE_FENCE_ON(x) if (1) { W_ tmp; (tmp) = prim %load_acquire32(x); }
+#endif
 #else
 #define ACQUIRE_FENCE_ON(x) ACQUIRE_FENCE
 #endif


=====================================
rts/include/rts/TSANUtils.h
=====================================
@@ -78,6 +78,8 @@
 #error TSAN cannot be enabled without C11 atomics support.
 #endif
 
+#if !defined(CMINUSMINUS)
+
 #define TSAN_ANNOTATE_HAPPENS_BEFORE(addr)                              \
     AnnotateHappensBefore(__FILE__, __LINE__, (void*)(addr))
 #define TSAN_ANNOTATE_HAPPENS_AFTER(addr)                               \
@@ -106,3 +108,4 @@ uint32_t ghc_tsan_atomic32_compare_exchange(uint32_t *ptr, uint32_t expected, ui
 uint16_t ghc_tsan_atomic16_compare_exchange(uint16_t *ptr, uint16_t expected, uint16_t new_value, int success_memorder, int failure_memorder);
 uint8_t ghc_tsan_atomic8_compare_exchange(uint8_t *ptr, uint8_t expected, uint8_t new_value, int success_memorder, int failure_memorder);
 
+#endif


=====================================
rts/include/stg/SMP.h
=====================================
@@ -110,6 +110,47 @@ EXTERN_INLINE void busy_wait_nop(void);
 #endif // !IN_STG_CODE
 
 /*
+ * Note [C11 memory model]
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When it comes to memory, real multiprocessors provide a wide range of
+ * concurrency semantics due to out-of-order execution and caching.
+ * To provide consistent reasoning across architectures, GHC relies the C11
+ * memory model. Not only does this provide a well-studied, fairly
+ * easy-to-understand conceptual model, but the C11 memory model gives us
+ * access to a number of tools which help us verify the compiler (see Note
+ * [ThreadSanitizer] in rts/include/rts/TSANUtils.h).
+ *
+ * Under the C11 model, each processor can be imagined to have a potentially
+ * out-of-date view onto the system's memory, which can be manipulated with two
+ * classes of memory operations:
+ *
+ *  - non-atomic operations (e.g. loads and stores) operate strictly on the
+ *    processor's local view of memory and consequently may not be visible
+ *    from other processors.
+ *
+ *  - atomic operations (e.g. load, store, fetch-and-{add,subtract,and,or},
+ *    exchange, and compare-and-swap) parametrized by ordering semantics.
+ *
+ * The ordering semantics of an operation (acquire, release, or sequentially
+ * consistent) will determine the amount of synchronization the operation
+ * requires.
+ *
+ * A processor may synchronize its
+ * view of memory with that of another processor by performing an atomic
+ * memory operation.
+ *
+ * While non-atomic operations can be thought of as operating on a local
+ *
+ * See also:
+ *
+ *   - The C11 standard, ISO/IEC 14882 2011.
+ *
+ *   - Boehm, Adve. "Foundations of the C++ Concurrency Memory Model."
+ *     PLDI '08.
+ *
+ *   - Batty, Owens, Sarkar, Sewall, Weber. "Mathematizing C++ Concurrency."
+ *     POPL '11.
+ *
  * Note [Heap memory barriers]
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  * Machines with weak memory ordering semantics have consequences for how
@@ -118,31 +159,40 @@ EXTERN_INLINE void busy_wait_nop(void);
  * stores which formed the new object are visible (e.g. stores are flushed from
  * cache and the relevant cachelines invalidated in other cores).
  *
- * To ensure this we must use memory barriers. Which barriers are required to
- * access a field depends upon the type of the field. In general, fields come
- * in three flavours:
+ * To ensure this we must issue memory barriers when accessing closures and
+ * their fields. Since reasoning about concurrent memory access with barriers tends to be
+ * subtle and platform dependent, it is more common to instead write programs
+ * in terms of an abstract memory model and let the compiler (GHC and the
+ * system's C compiler) worry about what barriers are needed to realize the
+ * requested semantics on the target system. GHC relies on the widely used C11
+ * memory model for this; see Note [C11 memory model] for a brief introduction.
  *
- *  * Mutable GC Pointers (C type StgClosure*, Cmm type StgPtr)
- *  * Immutable GC Pointers (C type MUT_FIELD StgClosure*, Cmm type StgPtr)
- *  * Non-pointers (C type StgWord, Cmm type StdWord)
+ * Also note that the majority of this Note are only concerned with mutation
+ * by the mutator. The GC is free to change nearly any field (which is
+ * necessary for a moving GC). Naturally, doing this safely requires care which
+ * we discuss in the "Barriers during GC" section below.
  *
- * Note that Addr# fields are *not* GC pointers and therefore are classified
- * as non-pointers. Responsibility for barriers lies with the party
- * dereferencing the pointer.
+ * Field access
+ * ------------
+ * Which barriers are required to access a field of a closure depends upon the
+ * identity of the field. In general, fields come in three flavours:
  *
- * Also note that we are only concerned with mutation by the mutator. The GC
- * is free to change nearly any field as this is necessary for a moving GC.
- * Naturally, doing this safely requires care which we discuss in section
- * below.
+ *  * Mutable GC Pointers (C type `StgClosure*`, Cmm type `StgPtr`)
+ *  * Immutable GC Pointers (C type `MUT_FIELD StgClosure*`, Cmm type `StgPtr`)
+ *  * Non-pointers (C type `StgWord`, Cmm type `StgWord`)
+ *
+ * Note that Addr# fields are *not* GC pointers and therefore are classified
+ * as non-pointers. In this case responsibility for barriers lies with the
+ * party dereferencing the Addr#.
  *
  * Immutable pointer fields are those which the mutator cannot change after
  * an object is made visible on the heap. Most objects' fields are of this
  * flavour (e.g. all data constructor fields). As these fields are written
  * precisely once, no write barriers are needed on writes nor reads. This is
  * safe due to an argument hinging on causality: Consider an immutable field F
- * of an object O refers to object O'. Naturally, O' must have been visible to
- * the creator of O when O was constructed. Consequently, if O is visible to a
- * reader, O' must also be visible.
+ * of an object O which refers to object O'. Naturally, O' must have been
+ * visible to the creator of O when O was constructed. Consequently, if O is
+ * visible to a reader, O' must also be visible to the same reader.
  *
  * Mutable pointer fields are those which can be modified by the mutator. These
  * require a bit more care as they may break the causality argument given
@@ -151,6 +201,10 @@ EXTERN_INLINE void busy_wait_nop(void);
  * into F. Without explicit synchronization O' may not be visible to another
  * thread attempting to dereference F.
  *
+ * To ensure the visibility of the referent, writing to a mutable pointer field
+ * must be done via a release-store. Conversely, reading from such a field is
+ * done via an acquire-load.
+ *
  * Mutable fields include:
  *
  *   - StgMutVar: var
@@ -163,64 +217,102 @@ EXTERN_INLINE void busy_wait_nop(void);
  *   - StgMutArrPtrs: payload
  *   - StgSmallMutArrPtrs: payload
  *   - StgThunk although this is a somewhat special case; see below
- *
- * Writing to a mutable pointer field must be done via a release-store.
- * Reading from such a field is done via an acquire-load.
+ *   - StgInd: indirectee
  *
  * Finally, non-pointer fields can be safely mutated without barriers as
- * they do not refer to other memory. Technically, concurrent accesses to
- * non-pointer fields still do need to be atomic in many cases to avoid torn
- * accesses. However, this is something that we generally avoid by locking
- * closures prior to mutating non-pointer fields (see Locking closures below).
- *
- * Note that MUT_VARs offer both synchronized and unsynchronized primops.
- * Consequently, in these cases there is a burden on the user to ensure that
- * synchronization is provided where necessary.
+ * they do not refer to other memory locations. Technically, concurrent
+ * accesses to non-pointer fields still do need to be atomic in many cases to
+ * avoid torn accesses. However, this is something that we generally avoid by
+ * locking closures prior to mutating non-pointer fields (see Locking closures
+ * below).
  *
  * Locking closures
  * ----------------
  * Several primops temporarily turn closures into WHITEHOLEs to ensure that
  * they have exclusive access (see SMPClosureOps.h:reallyLockClosure).
+ * These include,
+ *
+ *   - takeMVar#, tryTakeMVar#
+ *   - putMVar#, tryPutMVar#
+ *   - readMVar#, tryReadMVar#
+ *   - readIOPort#
+ *   - writeIOPort#
+ *   - addCFinalizerToWeak#
+ *   - finalizeWeak#
+ *   - deRefWeak#
+ *
  * Locking is done via an atomic exchange operation on the closure's info table
  * pointer with sequential consistency (although only acquire ordering is
- * needed). This acquire ensures that we synchronize with any previous thread
- * that had locked the closure. Consequently, it is important that we take great
- * care in examining the mutable fields of a lockable closure prior to having
- * locked it.
- *
- * Naturally, unlocking is done via a release-store to restore the closure's
- * original info table pointer.
+ * needed). Similarly, unlocking is also done with an atomic exchange to
+ * restore the closure's original info table pointer (although
+ * this time only the release ordering is needed). This ensures
+ * that we synchronize with any previous thread that had locked the closure.
  *
  * Thunks
  * ------
  * As noted above, thunks are a rather special (yet quite common) case. In
- * particular, they have the unique property of being updatable, transforming
- * from a thunk to an indirection. This transformation requires its own
- * synchronization protocol. In particular, we must ensure that a reader
- * examining a thunk being updated can see the indirectee. Consequently, a
- * thunk update (see rts/Updates.h) does the following:
+ * particular, they have the unique property of being updatable (that is, can
+ * be transformed from a thunk into an indirection after evaluation). This
+ * transformation requires its own synchronization protocol to mediate the
+ * interaction between the updater and the reader. In particular, we
+ * must ensure that a reader examining a thunk being updated by another core
+ * can see the indirectee. Consequently, a thunk update (see rts/Updates.h)
+ * does the following:
+ *
+ *  U1. use a release-store to place the new indirectee into the thunk's
+ *      indirectee field
  *
- *  1. Use a relaxed-store to place the new indirectee into the thunk's
- *     indirectee field
- *  2. use a release-store to set the info table to stg_BLACKHOLE (which
- *     represents an indirection)
+ *  U2. use a release-store to set the info table to stg_BLACKHOLE (which
+ *      represents an indirection)
  *
  * Blackholing a thunk (either eagerly, by GHC.StgToCmm.Bind.emitBlackHoleCode,
  * or lazily, by ThreadPaused.c:threadPaused) is done similarly.
  *
- * Conversely, indirection entry (see the entry code of stg_BLACKHOLE, stg_IND,
- * and stg_IND_STATIC in rts/StgMiscClosure.cmm) does the following:
- *
- *  1. We jump into the entry code for, e.g., stg_BLACKHOLE; this of course
- *     implies that we have already read the thunk's info table pointer, which
- *     is done with a relaxed load.
- *  2. use an acquire-fence to ensure that our view on the thunk is
- *     up-to-date. This synchronizes with step (2) in the update
- *     procedure.
- *  3. relaxed-load the indirectee. Since thunks are updated at most
- *     once we know that the fence in the last step has given us
- *     an up-to-date view of the indirectee closure.
- *  4. enter the indirectee (or block if the indirectee is a TSO)
+ * Conversely, entering an indirection (see the entry code of stg_BLACKHOLE,
+ * stg_IND, and stg_IND_STATIC in rts/StgMiscClosure.cmm) does the
+ * following:
+ *
+ *  E1. jump into the entry code of the indirection (e.g. stg_BLACKHOLE);
+ *      this of course implies that we have already read the thunk's info table
+ *      pointer, which is done with a relaxed load.
+ *
+ *  E2. acquire-fence
+ *
+ *  E3. acquire-load the indirectee. Since thunks are updated at most
+ *      once we know that the fence in the last step has given us
+ *      an up-to-date view of the indirectee closure.
+ *
+ *  E4. enter the indirectee (or block if the indirectee is a TSO)
+ *
+ * The release/acquire pair (U2)/(E2) is somewhat surprising but is necessary as
+ * the C11 memory model does not guarantee that the store (U1) is visible to
+ * (E3) despite (U1) preceding (U2) in program-order (due to the relaxed
+ * ordering of (E3)). This is demonstrated by the following CppMem model:
+ *
+ *     int main() {
+ *       atomic_int x = 0;    // info table pointer
+ *       atomic_int y = 0;    // indirectee
+ *       {{{
+ *         {    // blackhole update
+ *           y.store(1, memory_order_release);                // U1
+ *           x.store(2, memory_order_release);                // U2
+ *         }
+ *       |||
+ *         {    // blackhole entry
+ *           r1=x.load(memory_order_relaxed).readsvalue(2);   // E1
+ *           //fence(memory_order_acquire);                   // E2
+ *           r2=y.load(memory_order_acquire);                 // E3
+ *         }
+ *       }}};
+ *       return 0;
+ *     }
+ *
+ * Under the C11 memory model this program admits an execution where the
+ * indirectee `r2=0`.
+ *
+ * Of course, this could also be addressed by strengthing the ordering of (E1)
+ * to acquire, but this would incur a significant cost on every closure entry
+ * (including non-blackholes).
  *
  * Other closures
  * --------------
@@ -328,6 +420,12 @@ EXTERN_INLINE void busy_wait_nop(void);
  * The work-stealing queue (WSDeque) also requires barriers; these are
  * documented in WSDeque.c.
  *
+ * Verifying memory ordering
+ * -------------------------
+ * To verify that GHC's RTS and the code produced by the compiler are free of
+ * data races we employ ThreadSaniziter. See Note [ThreadSanitizer] in TSANUtils.h
+ * for details on this facility.
+ *
  */
 
 /* ----------------------------------------------------------------------------


=====================================
rts/sm/Evac.c
=====================================
@@ -1542,7 +1542,7 @@ selector_loop:
 bale_out:
     // We didn't manage to evaluate this thunk; restore the old info
     // pointer.  But don't forget: we still need to evacuate the thunk itself.
-    SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr);
+    SET_INFO_RELAXED((StgClosure *)p, (const StgInfoTable *)info_ptr);
     // THREADED_RTS: we just unlocked the thunk, so another thread
     // might get in and update it.  copy() will lock it again and
     // check whether it was updated in the meantime.


=====================================
rts/sm/NonMovingMark.c
=====================================
@@ -688,8 +688,9 @@ void updateRemembSetPushThunkEager(Capability *cap,
     case IND:
     {
         StgInd *ind = (StgInd *) thunk;
-        if (check_in_nonmoving_heap(ind->indirectee)) {
-            push_closure(queue, ind->indirectee, NULL);
+        StgClosure *indirectee = ACQUIRE_LOAD(&ind->indirectee);
+        if (check_in_nonmoving_heap(indirectee)) {
+            push_closure(queue, indirectee, NULL);
         }
         break;
     }
@@ -1587,7 +1588,7 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
         // Synchronizes with the release-store in updateWithIndirection.
         // See Note [Heap memory barriers] in SMP.h.
         StgInd *ind = (StgInd *) p;
-        ACQUIRE_FENCE();
+        ACQUIRE_FENCE_ON(&p->header.info);
         StgClosure *indirectee = RELAXED_LOAD(&ind->indirectee);
         markQueuePushClosure(queue, indirectee, &ind->indirectee);
         if (GET_CLOSURE_TAG(indirectee) == 0 || origin == NULL) {


=====================================
rts/sm/Storage.c
=====================================
@@ -596,8 +596,6 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
 
-    // RELEASE ordering to ensure that above writes are visible before we
-    // introduce reference as CAF indirectee.
     RELEASE_STORE(&caf->indirectee, (StgClosure *) bh);
     SET_INFO_RELEASE((StgClosure*)caf, &stg_IND_STATIC_info);
 


=====================================
utils/genapply/Main.hs
=====================================
@@ -783,7 +783,11 @@ genApply regstatus args =
         text "case IND,",
         text "     IND_STATIC: {",
         nest 4 (vcat [
-          text "R1 = StgInd_indirectee(R1);",
+          -- N.B. annoyingly the %acquire syntax must place its result in a local register
+          -- as it is a Cmm prim call node.
+          text "P_ p;",
+          text "p = %acquire StgInd_indirectee(R1);",
+          text "R1 = p;",
             -- An indirection node might contain a tagged pointer
           text "goto again;"
          ]),



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f0682cb33419e7ae3f70d28fd8fd0d651cb1376e

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/f0682cb33419e7ae3f70d28fd8fd0d651cb1376e
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20231213/3f64f1f2/attachment-0001.html>


More information about the ghc-commits mailing list