[Git][ghc/ghc][wip/andreask/stm] STM: Be more optimistic when validating in-flight transactions.

Wed Mar 13 09:24:58 UTC 2024


Andreas Klebinger pushed to branch wip/andreask/stm at Glasgow Haskell Compiler / GHC


Commits:
4d1be3de by Andreas Klebinger at 2024-03-13T10:10:37+01:00
STM: Be more optimistic when validating in-flight transactions.

* Don't lock tvars when performing non-committal validation.
* If we encounter a locked tvar don't consider it a failure.

This means in-flight validation will only fail if committing at the
moment of validation is *guaranteed* to fail.

This prevents in-flight validation from failing spuriously if it happens in
parallel on multiple threads or parallel to thread comitting.

- - - - -


8 changed files:

- rts/Exception.cmm
- rts/STM.c
- rts/STM.h
- rts/Schedule.c
- rts/include/stg/SMP.h
- + testsuite/tests/rts/T24142.hs
- + testsuite/tests/rts/T24142.stdout
- testsuite/tests/rts/all.T


Changes:

=====================================
rts/Exception.cmm
=====================================
@@ -495,7 +495,7 @@ retry_pop_stack:
       W_ trec, outer;
       W_ r;
       trec = StgTSO_trec(CurrentTSO);
-      (r) = ccall stmValidateNestOfTransactions(MyCapability() "ptr", trec "ptr");
+      (r) = ccall stmValidateNestOfTransactions(MyCapability() "ptr", trec "ptr", 0);
       outer  = StgTRecHeader_enclosing_trec(trec);
       ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
       ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");


=====================================
rts/STM.c
=====================================
@@ -359,6 +359,8 @@ static StgTRecHeader *new_stg_trec_header(Capability *cap,
 // Allocation / deallocation functions that retain per-capability lists
 // of closures that can be re-used
 
+//TODO: I think some of these lack write barriers required by the non-moving gc.
+
 static StgTVarWatchQueue *alloc_stg_tvar_watch_queue(Capability *cap,
                                                      StgClosure *closure) {
   StgTVarWatchQueue *result = NULL;
@@ -681,6 +683,44 @@ static void revert_ownership(Capability *cap STG_UNUSED,
 
 /*......................................................................*/
 
+// validate_optimistic()
+StgBool validate_trec_optimistic (Capability *cap, StgTRecHeader *trec);
+
+StgBool validate_trec_optimistic (Capability *cap, StgTRecHeader *trec) {
+  StgBool result;
+  TRACE("cap %d, trec %p : validate_trec_optimistic",
+         cap->no, trec);
+
+  if (shake()) {
+    TRACE("%p : shake, pretending trec is invalid when it may not be", trec);
+    return false;
+  }
+
+  ASSERT((trec -> state == TREC_ACTIVE) ||
+         (trec -> state == TREC_WAITING) ||
+         (trec -> state == TREC_CONDEMNED));
+  result = !((trec -> state) == TREC_CONDEMNED);
+  if (result) {
+    FOR_EACH_ENTRY(trec, e, {
+      StgTVar *s;
+      s = e -> tvar;
+      StgClosure *current = RELAXED_LOAD(&s->current_value);
+      if(current != e->expected_value &&
+          //If the trec is locked we optimistically assume our trec will still be valid after it's unlocked.
+         (GET_INFO(UNTAG_CLOSURE(current)) != &stg_TREC_HEADER_info))
+      {   TRACE("%p : failed optimistic validate %p", trec, s);
+          result = false;
+          BREAK_FOR_EACH;
+      }
+    });
+  }
+
+
+  TRACE("%p : validate_trec_optimistic, result: %d", trec, result);
+  return result;
+}
+
+
 // validate_and_acquire_ownership : this performs the twin functions
 // of checking that the TVars referred to by entries in trec hold the
 // expected values and:
@@ -751,7 +791,7 @@ static StgBool validate_and_acquire_ownership (Capability *cap,
       revert_ownership(cap, trec, acquire_all);
   }
 
-  // TRACE("%p : validate_and_acquire_ownership, result: %d", trec, result);
+  TRACE("%p : validate_and_acquire_ownership, result: %d", trec, result);
   return result;
 }
 
@@ -941,17 +981,124 @@ void stmCondemnTransaction(Capability *cap,
   TRACE("%p : stmCondemnTransaction done", trec);
 }
 
-/*......................................................................*/
-
-// Check if a transaction is known to be invalid by this point.
-// Currently we use this to:
-// * Eagerly abort invalid transactions from the scheduler.
-// * If an exception occured inside a transaction, decide weither or not to
-//   abort by checking if the transaction was valid.
-StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec) {
+/*......................................................................
+
+Note [STM Validation]
+~~~~~~~~~~~~~~~~~~~~~
+We validate STM transactions for two purposes:
+* Ensure the trec (transaction log) is valid *after* execution. Either during
+  commit or after an exception has occured. Potentially locking the tvars in
+  the process. This is done by validate_and_acquire_ownership.
+* Terminate transactions early after their trec became invalid.
+  This is done by validate_trec_optimistic,
+
+Note that the second point is not merely a optimization. Consider transactions
+that are in an infinite loop as a result of seeing an inconsistent view of
+memory, e.g.
+
+      atomically $ do
+          [a,b] <- mapM readTVar [ta,tb]
+          -- a is never equal to b given a consistent view of memory.
+          when (a == b) loop
+
+We want to to always get a precise result for both checks. And indeed for the
+non-threaded runtime this is reasonably (see STM paper "Composable Memory Transactions").
+However for SMP things are more difficult.
+
+The easiest way to avoid false positives is to lock all relevant tvars during
+validation. And indeed that is what we still use for post-run validation.
+While this can lead to validation spuriously failing in edge cases when multiple
+threads perform validation in parallel the relevant transactions will simply be
+restarted and as long as the false-negative rate is reasonably low this is not
+problematic.
+
+However compared to post-run validation in-flight validation can happen multiple
+times per transaction. This means even a fairly low rate of spurious validation
+failures will result in a large performance hit. In the worst case preventing
+progress alltogether (See #24446).
+
+We don't want to reduce validation frequency too much to detect invalid
+transactions early. So we simply stick with the frequency "on return to scheduler"
+that's described in the stm paper.
+
+However we can improve the behaviour of in-flight validations by taking advantage
+of the fact that we can allow false positives for these.
+
+The biggest overhead we can reduce for in-flight validation is locking. We simply
+won't take any locks for in-flight validation. If the tvar is already locked we
+simply assume the value in our trec is still valid.
+
+This has the following effects:
+
+Benefits:
+* No lock contention between post-run and in-flight validations operating on the
+  same tvars. This reduces the false negative rate significantly for both.
+* Concurrent in-flight validations won't cause each other to fail spuriously
+  through lock contention.
+* No cas operations for in-flight validation reduces it's overhead significantly.
+
+Drawbacks:
+* We will sometimes fail to recognize invalid trecs as such by assuming locked
+  tvars contain valid values.
+
+Why can we simply not lock tvars for in-flight validations? Unlike with post-run
+validation if we miss part of an update which would invalidate the trec it will
+be either seen by a later validation (at the latest in the post-run validation
+which still locks). However there is one exception: Looping transactions.
+
+If a transaction loops it will *only* be validated optimistically. I think this
+is not an issue. The only way for in-flight validation to constantly
+result in false-positives is for the conflicting tvar(s) to get constantly locked
+for updates by post-run validations. Which seems impossibly unlikely over a long
+period of time. At at least not any more likely than some of the other similarly
+unlikely live-lock scenarious for the STM implementation.
+
+Alternatives:
+
+All of these primarily revolve around ways to ensure that we can recognize invalid
+looping transactions. However without proof this is a real problem implementing
+those seems not worthwhile.
+
+
+A1:
+Take locks for in-flight validation opportunistically to improve things.
+While this would solve lock contention/false positives caused
+by concurrent in-flight validations. It would still result in in-flight validation
+potentially triggering false-negatives during post-run validation by holding a
+lock a post-run validation is trying to take. Neither is it guaranteed to
+recognize a looping transaction as invalid, so this does not seem like an
+improvement to the lock-free inflight validation.
+
+A2:
+Perform occasional locking in-flight validation for long running transactions.
+This would solve the theoretical looping transaction recognition issue at the
+cost of some performance and complexity. This could done by adding a counter to
+the trec, counting the number of validations it has endured.
+
+A2.1:
+Like A2, but instead of counting the number of validations count the number of
+potentially false-positives by keeping track of how often we couldn't validate
+locked tvars. Could be done fine grained on a trec-entry bases or for the trec
+overall.
+
+A3:
+When encountering a locked tvar, validate the trec based on the value of the
+tvar before it was locked. This could be done by either adding another field
+to the tvar, or by looking for the expected value in the trec that holds the
+lock of the tvar. But neither option sounds great.
+
+
+*/
+
+// Check if a transaction is possibly invalid by this point.
+// Pessimistically - Currently we use this if an exception occured inside a transaction.
+//    To decide weither or not to abort by checking if the transaction was valid.
+// Optimistically - Currently we use this to eagerly abort invalid transactions from the scheduler.
+// See Note [STM Validation]
+StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec, StgBool optimistically) {
   StgTRecHeader *t;
 
-  TRACE("%p : stmValidateNestOfTransactions", trec);
+  TRACE("%p : stmValidateNestOfTransactions, %b", trec, optimistically);
   ASSERT(trec != NO_TREC);
   ASSERT((trec -> state == TREC_ACTIVE) ||
          (trec -> state == TREC_WAITING) ||
@@ -960,8 +1107,13 @@ StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec) {
   t = trec;
   StgBool result = true;
   while (t != NO_TREC) {
-    // TODO: I don't think there is a need to lock any tvars here, all even less so.
-    result &= validate_and_acquire_ownership(cap, t, true, false);
+    if(optimistically) {
+      result &= validate_trec_optimistic(cap, t);
+
+    } else {
+      // TODO: I don't think there is a need to lock all tvars here.
+      result &= validate_and_acquire_ownership(cap, t, true, false);
+    }
     t = t -> enclosing_trec;
   }
 
@@ -972,7 +1124,6 @@ StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec) {
   TRACE("%p : stmValidateNestOfTransactions()=%d", trec, result);
   return result;
 }
-
 /*......................................................................*/
 
 static TRecEntry *get_entry_for(StgTRecHeader *trec, StgTVar *tvar, StgTRecHeader **in) {


=====================================
rts/STM.h
=====================================
@@ -20,7 +20,7 @@
                   non-conflicting transactions to commit in parallel.
                   The implementation treats reads optimistically --
                   extra versioning information is retained in the
-                  saw_update_by field of the TVars so that they do not
+                  num_updates field of the TVars so that they do not
                   need to be locked for reading.
 
   STM.C contains more details about the locking schemes used.
@@ -84,16 +84,23 @@ void stmCondemnTransaction(Capability *cap, StgTRecHeader *trec);
    Validation
    ----------
 
-  Test whether the specified transaction record, and all those within which
-  it is nested, are still valid.
+   Test whether the specified transaction record, and all those within which
+   it is nested, are still valid.
+
+   stmValidateNestOfTransactions - optimistically
+      - Can return false positives when tvars are locked.
+      - Faster
+      - Does not take any locks
+
+   stmValidateNestOfTransactions - pessimistic
+      - Can return false negatives.
+      - Slower
+      - Takes locks, negatively affecting performance of other threads.
+      - Most importantly - no false positives!
 
-  Note: the caller can assume that once stmValidateTransaction has
-  returned false for a given trec then that transaction will never
-  again be valid -- we rely on this in Schedule.c when kicking invalid
-  threads at GC (in case they are stuck looping)
 */
 
-StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec);
+StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec, StgBool optimistically);
 
 /*----------------------------------------------------------------------
 
@@ -110,7 +117,7 @@ StgBool stmValidateNestOfTransactions(Capability *cap, StgTRecHeader *trec);
    Note that, for nested operations, validity here is solely in terms
    of the specified trec: it does not say whether those that it may be
    nested are themselves valid.  Callers can check this with
-   stmValidateNestOfTransactions.
+   stmValidateNestOfTransactionsPessimistic.
 
    The user of the STM should ensure that it is always safe to assume that a
    transaction context is not valid when in fact it is (i.e. to return false in


=====================================
rts/Schedule.c
=====================================
@@ -1099,7 +1099,7 @@ schedulePostRunThread (Capability *cap, StgTSO *t)
     // and a is never equal to b given a consistent view of memory.
     //
     if (t -> trec != NO_TREC && t -> why_blocked == NotBlocked) {
-        if (!stmValidateNestOfTransactions(cap, t -> trec)) {
+        if (!stmValidateNestOfTransactions(cap, t -> trec, true)) {
             debugTrace(DEBUG_sched | DEBUG_stm,
                        "trec %p found wasting its time", t);
 


=====================================
rts/include/stg/SMP.h
=====================================
@@ -201,14 +201,15 @@ EXTERN_INLINE void busy_wait_nop(void);
  *   - StgWeak: finalizer
  *   - StgMVar: head, tail, value
  *   - StgMVarTSOQueue: link
- *   - StgTVar: current_value, first_watch_queue_entry
- *   - StgTVarWatchQueue: {next,prev}_queue_entry
- *   - StgTRecChunk: TODO
  *   - StgMutArrPtrs: payload
  *   - StgSmallMutArrPtrs: payload
  *   - StgThunk although this is a somewhat special case; see below
  *   - StgInd: indirectee
  *   - StgTSO: block_info
+
+ *   - StgTVar: current_value, first_watch_queue_entry
+ *   - StgTVarWatchQueue: {next,prev}_queue_entry
+ *   - StgTRecChunk: TODO
  *
  * Finally, non-pointer fields can be safely mutated without barriers as
  * they do not refer to other memory locations. Technically, concurrent


=====================================
testsuite/tests/rts/T24142.hs
=====================================
@@ -0,0 +1,63 @@
+{- This test constructs a program that used to trigger an excessive amount of STM retries. -}
+{-# LANGUAGE NumericUnderscores #-}
+{-# LANGUAGE BangPatterns #-}
+module Main where
+
+import GHC.Conc
+import Control.Concurrent (newMVar, newEmptyMVar, takeMVar, putMVar)
+import Control.Monad
+import Control.Concurrent.STM.TArray
+import Data.Array.MArray
+import Data.IORef
+
+
+main :: IO ()
+main =
+    forM_ [2..40] $ \i -> do
+        -- Run the test with an increasing number of tvars
+        let tvar_count = i * 10
+        -- print $ "Tvars: " ++ show tvar_count
+        provokeLivelock tvar_count
+
+
+-- Forks two threads running a STM transactions, both accessing the same tvars but in opposite order.
+provokeLivelock :: Int -> IO ()
+provokeLivelock n = do
+    -- Use tvar array as a convenient way to bundle up n Tvars.
+    tvarArray <- atomically $ do
+        newListArray (0,n) [0.. fromIntegral n :: Integer] :: STM (TArray Int Integer)
+    m1 <- newEmptyMVar
+    m2 <- newEmptyMVar
+    updateCount <- newIORef (0 :: Int)
+
+    let useTvars :: [Int] -> Bool -> IO ()
+        useTvars tvar_order use_writes = atomically $ do
+            -- Walk the array once in the given order to add all tvars to the transaction log.
+            unsafeIOToSTM $ atomicModifyIORef' updateCount (\i -> (i+1,()))
+            mapM_ (\i -> readArray tvarArray i >>= \(!_n) -> return ()) tvar_order
+
+
+            -- Then we just enter the scheduler a lot
+            forM_ tvar_order $ \i -> do
+                -- when use_writes $
+                --     readArray tvarArray i >>= \(!n) -> writeArray tvarArray i (n+1 :: Integer)
+                unsafeIOToSTM yield
+
+    _ <- forkIO $ do
+                    useTvars [0..n] False
+                    -- print "Thread1 done."
+                    putMVar m1 True
+    _ <- forkIO $ do
+                    useTvars (reverse [0..n]) False
+                    -- print "Thread1 done."
+                    putMVar m2 True
+    -- Wait for forked threads.
+    _ <- takeMVar m1
+    _ <- takeMVar m2
+    updates <- readIORef updateCount
+    if updates > n
+        then putStrLn $ "TVars: " ++ show n ++ ", ERROR: more than " ++ show n ++ " transaction attempts. (" ++ show updates ++")\n"
+        else putStrLn $ "TVars: " ++ show n ++ ", OK: no more than " ++ show n ++ " transaction attempts."
+
+    return ()
+


=====================================
testsuite/tests/rts/T24142.stdout
=====================================
@@ -0,0 +1,39 @@
+TVars: 20, OK: no more than 20 transaction attempts.
+TVars: 30, OK: no more than 30 transaction attempts.
+TVars: 40, OK: no more than 40 transaction attempts.
+TVars: 50, OK: no more than 50 transaction attempts.
+TVars: 60, OK: no more than 60 transaction attempts.
+TVars: 70, OK: no more than 70 transaction attempts.
+TVars: 80, OK: no more than 80 transaction attempts.
+TVars: 90, OK: no more than 90 transaction attempts.
+TVars: 100, OK: no more than 100 transaction attempts.
+TVars: 110, OK: no more than 110 transaction attempts.
+TVars: 120, OK: no more than 120 transaction attempts.
+TVars: 130, OK: no more than 130 transaction attempts.
+TVars: 140, OK: no more than 140 transaction attempts.
+TVars: 150, OK: no more than 150 transaction attempts.
+TVars: 160, OK: no more than 160 transaction attempts.
+TVars: 170, OK: no more than 170 transaction attempts.
+TVars: 180, OK: no more than 180 transaction attempts.
+TVars: 190, OK: no more than 190 transaction attempts.
+TVars: 200, OK: no more than 200 transaction attempts.
+TVars: 210, OK: no more than 210 transaction attempts.
+TVars: 220, OK: no more than 220 transaction attempts.
+TVars: 230, OK: no more than 230 transaction attempts.
+TVars: 240, OK: no more than 240 transaction attempts.
+TVars: 250, OK: no more than 250 transaction attempts.
+TVars: 260, OK: no more than 260 transaction attempts.
+TVars: 270, OK: no more than 270 transaction attempts.
+TVars: 280, OK: no more than 280 transaction attempts.
+TVars: 290, OK: no more than 290 transaction attempts.
+TVars: 300, OK: no more than 300 transaction attempts.
+TVars: 310, OK: no more than 310 transaction attempts.
+TVars: 320, OK: no more than 320 transaction attempts.
+TVars: 330, OK: no more than 330 transaction attempts.
+TVars: 340, OK: no more than 340 transaction attempts.
+TVars: 350, OK: no more than 350 transaction attempts.
+TVars: 360, OK: no more than 360 transaction attempts.
+TVars: 370, OK: no more than 370 transaction attempts.
+TVars: 380, OK: no more than 380 transaction attempts.
+TVars: 390, OK: no more than 390 transaction attempts.
+TVars: 400, OK: no more than 400 transaction attempts.


=====================================
testsuite/tests/rts/all.T
=====================================
@@ -609,3 +609,6 @@ test('T23400', [], compile_and_run, ['-with-rtsopts -A8k'])
 test('IOManager', [js_skip, when(arch('wasm32'), skip), when(opsys('mingw32'), skip),
                    pre_cmd('$MAKE -s --no-print-directory IOManager.hs')],
                   compile_and_run, [''])
+
+test('T24142', [], compile_and_run, ['-threaded -with-rtsopts "-N2"'])
+



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4d1be3deb90759b53d78ea6d5603e33e7a28bd03

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4d1be3deb90759b53d78ea6d5603e33e7a28bd03
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240313/e18aa519/attachment-0001.html>