[Git][ghc/ghc][wip/T22038] compiler: Rework handling of mutator aborting

Ben Gamari (@bgamari) gitlab at gitlab.haskell.org
Fri Aug 19 21:10:22 UTC 2022



Ben Gamari pushed to branch wip/T22038 at Glasgow Haskell Compiler / GHC


Commits:
6d9802a6 by Ben Gamari at 2022-08-19T17:09:44-04:00
compiler: Rework handling of mutator aborting

Previously `-dtag-inference-checks`, `-dcheck-prim-bounds`, and
`-falignment-sanitization` all aborted by calling `barf` from the
mutator. However, this can lead to deadlocks in the threaded RTS. For
instance, in the case of `-dcheck-prim-bounds` the following can happen

1. the mutator takes a capability and begins execution
2. the bounds check fails, calling `barf`
3. `barf` calls `rtsFatalInternalErrorFn`, which in turn calls `endEventLogging`
4. `endEventLogging` calls `flushEventLog`, which it turn initiates a
   sync to request that all capabilities flush their local event logs
5. we deadlock as the the capability held by the crashing mutator can
   never join the sync

To avoid this we now have a more principled means of aborting: we return
to the scheduler setting the thread's return value to ThreadAborting.
The scheduler will see this and call `barf`.

Fixes #22038.

- - - - -


9 changed files:

- compiler/GHC/Cmm/CLabel.hs
- compiler/GHC/StgToCmm/Prim.hs
- compiler/GHC/StgToCmm/TagCheck.hs
- compiler/GHC/StgToCmm/Utils.hs
- rts/PrimOps.cmm
- rts/RtsMessages.c
- rts/Schedule.c
- rts/StgMiscClosures.cmm
- rts/include/rts/Constants.h


Changes:

=====================================
compiler/GHC/Cmm/CLabel.hs
=====================================
@@ -64,6 +64,7 @@ module GHC.Cmm.CLabel (
         mkSMAP_FROZEN_DIRTY_infoLabel,
         mkSMAP_DIRTY_infoLabel,
         mkBadAlignmentLabel,
+        mkTagInferenceCheckFailureLabel,
         mkOutOfBoundsAccessLabel,
         mkArrWords_infoLabel,
         mkSRTInfoLabel,
@@ -637,8 +638,9 @@ mkDirty_MUT_VAR_Label,
     mkTopTickyCtrLabel,
     mkCAFBlackHoleInfoTableLabel,
     mkSMAP_FROZEN_CLEAN_infoLabel, mkSMAP_FROZEN_DIRTY_infoLabel,
-    mkSMAP_DIRTY_infoLabel, mkBadAlignmentLabel,
-    mkOutOfBoundsAccessLabel, mkMUT_VAR_CLEAN_infoLabel :: CLabel
+    mkSMAP_DIRTY_infoLabel, mkMUT_VAR_CLEAN_infoLabel,
+    mkBadAlignmentLabel, mkTagInferenceCheckFailureLabel, mkOutOfBoundsAccessLabel
+    :: CLabel
 mkDirty_MUT_VAR_Label           = mkForeignLabel (fsLit "dirty_MUT_VAR") Nothing ForeignLabelInExternalPackage IsFunction
 mkNonmovingWriteBarrierEnabledLabel
                                 = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "nonmoving_write_barrier_enabled") CmmData
@@ -655,9 +657,10 @@ mkArrWords_infoLabel            = CmmLabel rtsUnitId (NeedExternDecl False) (fsL
 mkSMAP_FROZEN_CLEAN_infoLabel   = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN") CmmInfo
 mkSMAP_FROZEN_DIRTY_infoLabel   = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY") CmmInfo
 mkSMAP_DIRTY_infoLabel          = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_SMALL_MUT_ARR_PTRS_DIRTY") CmmInfo
-mkBadAlignmentLabel             = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_badAlignment")      CmmEntry
-mkOutOfBoundsAccessLabel        = mkForeignLabel (fsLit "rtsOutOfBoundsAccess") Nothing ForeignLabelInExternalPackage IsFunction
-mkMUT_VAR_CLEAN_infoLabel       = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_MUT_VAR_CLEAN")     CmmInfo
+mkMUT_VAR_CLEAN_infoLabel       = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_MUT_VAR_CLEAN")            CmmInfo
+mkBadAlignmentLabel             = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_badAlignment")             CmmPrimCall
+mkOutOfBoundsAccessLabel        = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_outOfBoundsAccess")        CmmPrimCall
+mkTagInferenceCheckFailureLabel = CmmLabel rtsUnitId (NeedExternDecl False) (fsLit "stg_tagInferenceCheckFailure") CmmPrimCall
 
 mkSRTInfoLabel :: Int -> CLabel
 mkSRTInfoLabel n = CmmLabel rtsUnitId (NeedExternDecl False) lbl CmmInfo


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -3211,7 +3211,7 @@ doBoundsCheck idx sz = do
     when do_bounds_check (doCheck platform)
   where
     doCheck platform = do
-        boundsCheckFailed <- getCode $ emitCCall [] (mkLblExpr mkOutOfBoundsAccessLabel) []
+        boundsCheckFailed <- getCode $ emitCall (NativeNodeCall, NativeReturn) (mkLblExpr mkOutOfBoundsAccessLabel) [idx, sz]
         emit =<< mkCmmIfThen' isOutOfBounds boundsCheckFailed (Just False)
       where
         uGE = cmmUGeWord platform


=====================================
compiler/GHC/StgToCmm/TagCheck.hs
=====================================
@@ -19,9 +19,13 @@ import GHC.Prelude
 import GHC.StgToCmm.Env
 import GHC.StgToCmm.Monad
 import GHC.StgToCmm.Utils
+import GHC.StgToCmm.Layout (emitCall)
+import GHC.StgToCmm.Lit (newStringCLit)
 import GHC.Cmm
 import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel (mkTagInferenceCheckFailureLabel)
 import GHC.Cmm.Graph as CmmGraph
+import GHC.Cmm.Utils
 
 import GHC.Core.Type
 import GHC.Types.Id
@@ -95,7 +99,8 @@ emitTagAssertion onWhat fun = do
   ; needsArgTag fun lbarf lret
 
   ; emitLabel lbarf
-  ; emitBarf ("Tag inference failed on:" ++ onWhat)
+  ; onWhat_str <- newStringCLit onWhat
+  ; _ <- emitCall (NativeNodeCall, NativeReturn) (mkLblExpr mkTagInferenceCheckFailureLabel) [CmmLit onWhat_str]
   ; emitLabel lret
   }
 


=====================================
compiler/GHC/StgToCmm/Utils.hs
=====================================
@@ -12,7 +12,6 @@ module GHC.StgToCmm.Utils (
         emitDataLits, emitRODataLits,
         emitDataCon,
         emitRtsCall, emitRtsCallWithResult, emitRtsCallGen,
-        emitBarf,
         assignTemp, newTemp,
 
         newUnboxedTupleRegs,
@@ -158,11 +157,6 @@ tagToClosure platform tycon tag
 --
 -------------------------------------------------------------------------
 
-emitBarf :: String -> FCode ()
-emitBarf msg = do
-  strLbl <- newStringCLit msg
-  emitRtsCall rtsUnitId (fsLit "barf") [(CmmLit strLbl,AddrHint)] False
-
 emitRtsCall :: UnitId -> FastString -> [(CmmExpr,ForeignHint)] -> Bool -> FCode ()
 emitRtsCall pkg fun = emitRtsCallGen [] (mkCmmCodeLabel pkg fun)
 


=====================================
rts/PrimOps.cmm
=====================================
@@ -42,7 +42,7 @@ import CLOSURE CCS_MAIN;
 
 #if defined(DEBUG)
 #define ASSERT_IN_BOUNDS(ind, sz) \
-    if (ind >= sz) { ccall rtsOutOfBoundsAccess(); }
+    if (ind >= sz) { ccall stg_outOfBoundsAccess(ind, sz); }
 #else
 #define ASSERT_IN_BOUNDS(ind, sz)
 #endif
@@ -1150,7 +1150,7 @@ stg_threadStatuszh ( gcptr tso )
  * TVar primitives
  * -------------------------------------------------------------------------- */
 
-stg_abort /* no arg list: explicit stack layout */
+stg_abort_tx /* no arg list: explicit stack layout */
 {
     W_ frame_type;
     W_ frame;
@@ -1159,7 +1159,7 @@ stg_abort /* no arg list: explicit stack layout */
     W_ r;
 
     // STM operations may allocate
-    MAYBE_GC_ (stg_abort); // NB. not MAYBE_GC(), we cannot make a
+    MAYBE_GC_ (stg_abort_tx); // NB. not MAYBE_GC(), we cannot make a
                            // function call in an explicit-stack proc
 
     // Find the enclosing ATOMICALLY_FRAME
@@ -1217,7 +1217,7 @@ INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
     } else {
         // Did not commit: abort and restart.
         StgTSO_trec(CurrentTSO) = outer;
-        jump stg_abort();
+        jump stg_abort_tx();
     }
 }
 


=====================================
rts/RtsMessages.c
=====================================
@@ -320,21 +320,3 @@ rtsDebugMsgFn(const char *s, va_list ap)
   return r;
 }
 
-
-// Used in stg_badAlignment_entry defined in StgStartup.cmm.
-void rtsBadAlignmentBarf(void) GNUC3_ATTRIBUTE(__noreturn__);
-
-void
-rtsBadAlignmentBarf()
-{
-    barf("Encountered incorrectly aligned pointer. This can't be good.");
-}
-
-// Used by code generator
-void rtsOutOfBoundsAccess(void) GNUC3_ATTRIBUTE(__noreturn__);
-
-void
-rtsOutOfBoundsAccess()
-{
-    barf("Encountered out of bounds array access.");
-}


=====================================
rts/Schedule.c
=====================================
@@ -571,6 +571,9 @@ run_thread:
         ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
         break;
 
+    case ThreadAborted:
+        barf("internal error");
+
     default:
       barf("schedule: invalid thread return code %d", (int)ret);
     }
@@ -3090,7 +3093,7 @@ findRetryFrameHelper (Capability *cap, StgTSO *tso)
 /* -----------------------------------------------------------------------------
    findAtomicallyFrameHelper
 
-   This function is called by stg_abort via catch_retry_frame primitive.  It is
+   This function is called by stg_abort_tx via catch_retry_frame primitive. It is
    like findRetryFrameHelper but it will only stop at ATOMICALLY_FRAME.
    -------------------------------------------------------------------------- */
 


=====================================
rts/StgMiscClosures.cmm
=====================================
@@ -1486,3 +1486,49 @@ section "data" {
 }
 
 #endif
+
+/* Note [Aborting from the mutator]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * GHC supports a number of runtime checking modes (largely for debugging
+ * purposes) which may need to abort execution at runtime. This include
+ * -dtag-inference-check, -dcheck-prim-bounds, and -falignment-sanitisation.
+ * To abort execution one might think that we could just call `barf`; however
+ * this is not ideal since it doesn't allow the RTS to gracefully shutdown.
+ *
+ * In #22038 we saw this manifest as a deadlock when -dcheck-prim-bounds
+ * failed. In particular, we saw the following:
+ *
+ * 1. the mutator takes a capability and begins execution
+ * 2. the bounds check fails, calling `barf`
+ * 3. `barf` calls `rtsFatalInternalErrorFn`, which in turn calls `endEventLogging`
+ * 4. `endEventLogging` calls `flushEventLog`, which it turn initiates a
+ *    sync to request that all capabilities flush their local event logs
+ * 5. we deadlock as the the capability held by the crashing mutator can
+ *    never yields to the sync
+ *
+ * Consequently, we instead crash in a more principled manner by yielding back
+ * to the scheduler, indicating that we should abort by setting the thread's
+ * return value to ThreadAborted. This is done by stg_abort().
+ */
+
+stg_tagInferenceCheckFailure(W_ what) {
+    ccall debugBelch("Tag inference failed on: %s\n", what);
+    jump stg_abort();
+}
+
+stg_outOfBoundsAccess(W_ ind, W_ sz) {
+    ccall debugBelch("Encountered out of bounds array access (index=%d, size=%d)", ind, sz);
+    jump stg_abort();
+}
+
+stg_badAlignment() {
+    ccall debugBelch("Encountered incorrectly aligned pointer. This can't be good.");
+    jump stg_abort();
+}
+
+stg_abort() {
+    StgTSO_what_next(CurrentTSO) = ThreadKilled :: I16;
+    StgRegTable_rRet(BaseReg) = ThreadAborted :: W_;
+    R1 = BaseReg;
+    jump stg_returnToSched [R1];
+}


=====================================
rts/include/rts/Constants.h
=====================================
@@ -268,6 +268,7 @@
 #define ThreadYielding 3
 #define ThreadBlocked  4
 #define ThreadFinished 5
+#define ThreadAborted  6                /* See Note [Aborting from the mutator] */
 
 /*
  * Flags for the tso->flags field.



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/6d9802a6e19b0873f58c85ccf61a3f219bac32c1

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/6d9802a6e19b0873f58c85ccf61a3f219bac32c1
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20220819/a97d95d2/attachment-0001.html>


More information about the ghc-commits mailing list