[Git][ghc/ghc][wip/marge_bot_batch_merge_job] 6 commits: hadrian: disable PIC for in-tree GMP on wasm32

Mon Jun 3 03:39:26 UTC 2024


Marge Bot pushed to branch wip/marge_bot_batch_merge_job at Glasgow Haskell Compiler / GHC


Commits:
f9c1ae12 by Cheng Shao at 2024-06-02T14:01:55-04:00
hadrian: disable PIC for in-tree GMP on wasm32

This patch disables PIC for in-tree GMP on wasm32 target. Enabling PIC
unconditionally adds undesired code size and runtime overhead for
wasm32.

- - - - -
1a32f828 by Cheng Shao at 2024-06-02T14:01:55-04:00
hadrian: disable in-tree gmp fft code path for wasm32

This patch disables in-tree GMP FFT code paths for wasm32 target in
order to give up some performance of multiplying very large operands
in exchange for reduced code size.

- - - - -
06277d56 by Cheng Shao at 2024-06-02T14:01:55-04:00
hadrian: build in-tree GMP with malloc-notreentrant on wasm32

This patch makes hadrian build in-tree GMP with the
--enable-alloca=malloc-notreentrant configure option. We will only
need malloc-reentrant when we have threaded RTS and SMP support on
wasm32, which will take some time to happen, before which we should
use malloc-notreentrant to avoid undesired runtime overhead.

- - - - -
9f614270 by ARATA Mizuki at 2024-06-02T14:02:35-04:00
Set package include paths when assembling .S files

Fixes #24839.

Co-authored-by: Sylvain Henry <hsyl20 at gmail.com>

- - - - -
7a814245 by Alex Mason at 2024-06-02T23:39:15-04:00
Improve performance of genericWordQuotRem2Op (#22966)

Implements the algorithm from compiler-rt's udiv128by64to64default. This
rewrite results in a roughly 24x improvement in runtime on AArch64 (and
likely any other arch that uses it).

- - - - -
8303b6fd by Cheng Shao at 2024-06-02T23:39:15-04:00
testsuite: mark T7773 as fragile on wasm

- - - - -


10 changed files:

- compiler/GHC/Driver/Pipeline/Execute.hs
- compiler/GHC/StgToCmm/Prim.hs
- hadrian/src/Settings/Builders/Configure.hs
- libraries/base/tests/all.T
- + testsuite/tests/driver/T24839.hs
- + testsuite/tests/driver/T24839.stdout
- testsuite/tests/driver/all.T
- + testsuite/tests/driver/t24839_sub.S
- testsuite/tests/numeric/should_run/all.T
- + testsuite/tests/numeric/should_run/quotRem2Large.hs


Changes:

=====================================
compiler/GHC/Driver/Pipeline/Execute.hs
=====================================
@@ -290,6 +290,7 @@ runGenericAsPhase :: (Logger -> DynFlags -> [Option] -> IO ()) -> [Option] -> Bo
 runGenericAsPhase run_as extra_opts with_cpp pipe_env hsc_env location input_fn = do
         let dflags     = hsc_dflags   hsc_env
         let logger     = hsc_logger   hsc_env
+        let unit_env   = hsc_unit_env hsc_env
 
         let cmdline_include_paths = includePaths dflags
         let pic_c_flags = picCCOpts dflags
@@ -300,16 +301,21 @@ runGenericAsPhase run_as extra_opts with_cpp pipe_env hsc_env location input_fn
         -- might be a hierarchical module.
         createDirectoryIfMissing True (takeDirectory output_fn)
 
-        let global_includes = [ GHC.SysTools.Option ("-I" ++ p)
-                              | p <- includePathsGlobal cmdline_include_paths ]
-        let local_includes = [ GHC.SysTools.Option ("-iquote" ++ p)
-                             | p <- includePathsQuote cmdline_include_paths ++
-                                includePathsQuoteImplicit cmdline_include_paths]
+        -- add package include paths
+        all_includes <- if not with_cpp
+          then pure []
+          else do
+            pkg_include_dirs <- mayThrowUnitErr (collectIncludeDirs <$> preloadUnitsInfo unit_env)
+            let global_includes = [ GHC.SysTools.Option ("-I" ++ p)
+                                  | p <- includePathsGlobal cmdline_include_paths ++ pkg_include_dirs]
+            let local_includes = [ GHC.SysTools.Option ("-iquote" ++ p)
+                                 | p <- includePathsQuote cmdline_include_paths ++ includePathsQuoteImplicit cmdline_include_paths]
+            pure (local_includes ++ global_includes)
         let runAssembler inputFilename outputFilename
               = withAtomicRename outputFilename $ \temp_outputFilename ->
                     run_as
                        logger dflags
-                       (local_includes ++ global_includes
+                       (all_includes
                        -- See Note [-fPIC for assembler]
                        ++ map GHC.SysTools.Option pic_c_flags
                        -- See Note [Produce big objects on Windows]


=====================================
compiler/GHC/StgToCmm/Prim.hs
=====================================
@@ -1894,53 +1894,179 @@ genericWordQuotRemOp width [res_q, res_r] [arg_x, arg_y]
                (CmmMachOp (MO_U_Rem  width) [arg_x, arg_y])
 genericWordQuotRemOp _ _ _ = panic "genericWordQuotRemOp"
 
+-- Based on the algorithm from LLVM's compiler-rt:
+-- https://github.com/llvm/llvm-project/blob/7339f7ba3053db7595ece1ca5f49bd2e4c3c8305/compiler-rt/lib/builtins/udivmodti4.c#L23
+-- See that file for licensing and copyright.
 genericWordQuotRem2Op :: Platform -> GenericOp
-genericWordQuotRem2Op platform [res_q, res_r] [arg_x_high, arg_x_low, arg_y]
-    = emit =<< f (widthInBits (wordWidth platform)) zero arg_x_high arg_x_low
-    where    ty = cmmExprType platform arg_x_high
-             shl   x i = CmmMachOp (MO_Shl   (wordWidth platform)) [x, i]
-             shr   x i = CmmMachOp (MO_U_Shr (wordWidth platform)) [x, i]
-             or    x y = CmmMachOp (MO_Or    (wordWidth platform)) [x, y]
-             ge    x y = CmmMachOp (MO_U_Ge  (wordWidth platform)) [x, y]
-             ne    x y = CmmMachOp (MO_Ne    (wordWidth platform)) [x, y]
-             minus x y = CmmMachOp (MO_Sub   (wordWidth platform)) [x, y]
-             times x y = CmmMachOp (MO_Mul   (wordWidth platform)) [x, y]
-             zero   = lit 0
-             one    = lit 1
-             negone = lit (fromIntegral (platformWordSizeInBits platform) - 1)
-             lit i = CmmLit (CmmInt i (wordWidth platform))
-
-             f :: Int -> CmmExpr -> CmmExpr -> CmmExpr -> FCode CmmAGraph
-             f 0 acc high _ = return (mkAssign (CmmLocal res_q) acc <*>
-                                      mkAssign (CmmLocal res_r) high)
-             f i acc high low =
-                 do roverflowedBit <- newTemp ty
-                    rhigh'         <- newTemp ty
-                    rhigh''        <- newTemp ty
-                    rlow'          <- newTemp ty
-                    risge          <- newTemp ty
-                    racc'          <- newTemp ty
-                    let high'         = CmmReg (CmmLocal rhigh')
-                        isge          = CmmReg (CmmLocal risge)
-                        overflowedBit = CmmReg (CmmLocal roverflowedBit)
-                    let this = catAGraphs
-                               [mkAssign (CmmLocal roverflowedBit)
-                                          (shr high negone),
-                                mkAssign (CmmLocal rhigh')
-                                          (or (shl high one) (shr low negone)),
-                                mkAssign (CmmLocal rlow')
-                                          (shl low one),
-                                mkAssign (CmmLocal risge)
-                                          (or (overflowedBit `ne` zero)
-                                              (high' `ge` arg_y)),
-                                mkAssign (CmmLocal rhigh'')
-                                          (high' `minus` (arg_y `times` isge)),
-                                mkAssign (CmmLocal racc')
-                                          (or (shl acc one) isge)]
-                    rest <- f (i - 1) (CmmReg (CmmLocal racc'))
-                                      (CmmReg (CmmLocal rhigh''))
-                                      (CmmReg (CmmLocal rlow'))
-                    return (this <*> rest)
+genericWordQuotRem2Op platform [res_q, res_r] [arg_u1, arg_u0, arg_v]
+    = do
+      -- v gets modified below based on clz v
+      v <- newTemp ty
+      emit $ mkAssign (CmmLocal v) arg_v
+      go arg_u1 arg_u0 v
+  where   ty = cmmExprType platform arg_u1
+          shl   x i = CmmMachOp (MO_Shl    (wordWidth platform)) [x, i]
+          shr   x i = CmmMachOp (MO_U_Shr  (wordWidth platform)) [x, i]
+          or    x y = CmmMachOp (MO_Or     (wordWidth platform)) [x, y]
+          ge    x y = CmmMachOp (MO_U_Ge   (wordWidth platform)) [x, y]
+          le    x y = CmmMachOp (MO_U_Le   (wordWidth platform)) [x, y]
+          eq    x y = CmmMachOp (MO_Eq     (wordWidth platform)) [x, y]
+          plus  x y = CmmMachOp (MO_Add    (wordWidth platform)) [x, y]
+          minus x y = CmmMachOp (MO_Sub    (wordWidth platform)) [x, y]
+          times x y = CmmMachOp (MO_Mul    (wordWidth platform)) [x, y]
+          udiv  x y = CmmMachOp (MO_U_Quot (wordWidth platform)) [x, y]
+          and   x y = CmmMachOp (MO_And    (wordWidth platform)) [x, y]
+          lit i     = CmmLit (CmmInt i (wordWidth platform))
+          one       = lit 1
+          zero      = lit 0
+          masklow   = lit ((1 `shiftL` (platformWordSizeInBits platform `div` 2)) - 1)
+          gotoIf pred target = emit =<< mkCmmIfGoto pred target
+          mkTmp ty = do
+            t <- newTemp ty
+            pure (t, CmmReg (CmmLocal t))
+          infixr 8 .=
+          r .= e = emit $ mkAssign (CmmLocal r) e
+
+          go :: CmmActual -> CmmActual -> LocalReg -> FCode ()
+          go u1 u0 v = do
+            -- Computes (ret,r) = (u1<<WORDSIZE*8+u0) `divMod` v
+            -- du_int udiv128by64to64default(du_int u1, du_int u0, du_int v, du_int *r)
+            -- const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
+            let n_udword_bits' = widthInBits (wordWidth platform)
+                n_udword_bits = fromIntegral n_udword_bits'
+            -- const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits)
+                b = 1 `shiftL` (n_udword_bits' `div` 2)
+                v' = CmmReg (CmmLocal v)
+            -- du_int un1, un0;                                // Norm. dividend LSD's
+            (un1, un1')   <- mkTmp ty
+            (un0, un0')   <- mkTmp ty
+            -- du_int vn1, vn0;                                // Norm. divisor digits
+            (vn1, vn1')   <- mkTmp ty
+            (vn0, vn0')   <- mkTmp ty
+            -- du_int q1, q0;                                  // Quotient digits
+            (q1, q1')     <- mkTmp ty
+            (q0, q0')     <- mkTmp ty
+            -- du_int un64, un21, un10;                        // Dividend digit pairs
+            (un64, un64') <- mkTmp ty
+            (un21, un21') <- mkTmp ty
+            (un10, un10') <- mkTmp ty
+
+            -- du_int rhat;                                    // A remainder
+            (rhat, rhat') <- mkTmp ty
+            -- si_int s;                                       // Shift amount for normalization
+            (s, s')       <- mkTmp ty
+
+            -- s = __builtin_clzll(v);
+            -- clz(0) in GHC returns N on N bit systems, whereas
+            -- __builtin_clzll returns 0 (or is undefined)
+            emitClzCall s v' (wordWidth platform)
+
+            if_else <- newBlockId
+            if_done <- newBlockId
+            -- if (s > 0) {
+            -- actually if (s > 0 && s /= wordSizeInBits) {
+            gotoIf (s' `eq` zero) if_else
+            gotoIf (s' `eq` lit n_udword_bits) if_else
+            do
+              --   // Normalize the divisor.
+              --   v = v << s;
+              v .= shl v' s'
+              --   un64 = (u1 << s) | (u0 >> (n_udword_bits - s));
+              un64 .= (u1 `shl` s') `or` (u0 `shr` (lit n_udword_bits `minus` s'))
+              --   un10 = u0 << s; // Shift dividend left
+              un10 .= shl u0 s'
+              emit $ mkBranch if_done
+            -- } else {
+            do
+              --   // Avoid undefined behavior of (u0 >> 64).
+              emitLabel if_else
+              --   un64 = u1;
+              un64 .= u1
+              --   un10 = u0;
+              un10 .= u0
+              s .= lit 0 -- Otherwise leads to >>/<< 64
+              -- }
+            emitLabel if_done
+
+            -- // Break divisor up into two 32-bit digits.
+            -- vn1 = v >> (n_udword_bits / 2);
+            vn1 .= v' `shr` lit (n_udword_bits `div` 2)
+            -- vn0 = v & 0xFFFFFFFF;
+            vn0 .= v' `and` masklow
+
+            -- // Break right half of dividend into two digits.
+            -- un1 = un10 >> (n_udword_bits / 2);
+            un1 .= un10' `shr`  lit (n_udword_bits `div` 2)
+            -- un0 = un10 & 0xFFFFFFFF;
+            un0 .= un10' `and` masklow
+
+            -- // Compute the first quotient digit, q1.
+            -- q1 = un64 / vn1;
+            q1 .= un64' `udiv` vn1'
+            -- rhat = un64 - q1 * vn1;
+            rhat .= un64' `minus` times q1' vn1'
+
+            while_1_entry <- newBlockId
+            while_1_body  <- newBlockId
+            while_1_done  <- newBlockId
+            -- // q1 has at most error 2. No more than 2 iterations.
+            -- while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+            emitLabel while_1_entry
+            gotoIf (q1' `ge` lit b) while_1_body
+            gotoIf (le (times q1' vn0')
+                       (times (lit b) rhat' `plus` un1'))
+                  while_1_done
+            do
+              emitLabel while_1_body
+              --   q1 = q1 - 1;
+              q1 .= q1' `minus` one
+              --   rhat = rhat + vn1;
+              rhat .= rhat' `plus` vn1'
+              --   if (rhat >= b)
+              --     break;
+              gotoIf (rhat' `ge` lit b)
+                      while_1_done
+              emit $ mkBranch while_1_entry
+            -- }
+            emitLabel while_1_done
+
+            -- un21 = un64 * b + un1 - q1 * v;
+            un21 .= (times un64' (lit b) `plus` un1') `minus` times q1' v'
+
+            -- // Compute the second quotient digit.
+            -- q0 = un21 / vn1;
+            q0 .= un21' `udiv` vn1'
+            -- rhat = un21 - q0 * vn1;
+            rhat .= un21' `minus` times q0' vn1'
+
+            -- // q0 has at most error 2. No more than 2 iterations.
+            while_2_entry <- newBlockId
+            while_2_body  <- newBlockId
+            while_2_done  <- newBlockId
+            emitLabel while_2_entry
+            -- while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+            gotoIf (q0' `ge` lit b)
+                   while_2_body
+            gotoIf (le (times q0' vn0')
+                       (times (lit b) rhat' `plus` un0'))
+                   while_2_done
+            do
+              emitLabel while_2_body
+              --   q0 = q0 - 1;
+              q0 .= q0' `minus` one
+              --   rhat = rhat + vn1;
+              rhat .= rhat' `plus` vn1'
+              --   if (rhat >= b)
+              --     break;
+              gotoIf (rhat' `ge` lit b) while_2_done
+              emit $ mkBranch while_2_entry
+            -- }
+            emitLabel while_2_done
+
+            --   r = (un21 * b + un0 - q0 * v) >> s;
+            res_r .= ((times un21' (lit b) `plus` un0') `minus` times q0' v') `shr` s'
+            -- return q1 * b + q0;
+            res_q .= times q1' (lit b) `plus` q0'
 genericWordQuotRem2Op _ _ _ = panic "genericWordQuotRem2Op"
 
 genericWordAdd2Op :: GenericOp


=====================================
hadrian/src/Settings/Builders/Configure.hs
=====================================
@@ -15,16 +15,23 @@ configureBuilderArgs = do
                 targetPlatform <- queryTarget targetPlatformTriple
                 buildPlatform <- queryBuild targetPlatformTriple
                 pure $ [ "--enable-shared=no"
-                     , "--with-pic=yes"
                      , "--host=" ++ targetPlatform    -- GMP's host is our target
                      , "--build=" ++ buildPlatform ]
+                     -- Disable FFT logic on wasm32, sacrifice
+                     -- performance of multiplying very large operands
+                     -- to save code size
+                     <> [ "--disable-fft" | targetArch == "wasm32" ]
                      -- Disable GMP's alloca usage on wasm32, it may
                      -- cause stack overflow (#22602) due to the
                      -- rather small 64KB default stack size. See
                      -- https://gmplib.org/manual/Build-Options for
                      -- more detailed explanation of this configure
                      -- option.
-                     <> [ "--enable-alloca=malloc-reentrant" | targetArch == "wasm32" ]
+                     <> [ "--enable-alloca=malloc-notreentrant" | targetArch == "wasm32" ]
+                     -- Enable PIC unless target is wasm32, in which
+                     -- case we don't want libgmp.a to be bloated due
+                     -- to PIC overhead.
+                     <> [ "--with-pic=yes" | targetArch /= "wasm32" ]
 
             , builder (Configure libffiPath) ? do
                 top            <- expr topDirectory


=====================================
libraries/base/tests/all.T
=====================================
@@ -176,6 +176,7 @@ test('T7457', normal, compile_and_run, [''])
 test('T7773',
      [when(opsys('mingw32'), skip),
       js_broken(22261),
+      when(arch('wasm32'), fragile(24928)),
       expect_broken_for(23272, ['ghci-opt']) # unclear
      ],
      compile_and_run,


=====================================
testsuite/tests/driver/T24839.hs
=====================================
@@ -0,0 +1,8 @@
+import Data.Int
+import Foreign.Ptr
+import Foreign.Storable
+
+foreign import ccall "&" foo :: Ptr Int64
+
+main :: IO ()
+main = peek foo >>= print


=====================================
testsuite/tests/driver/T24839.stdout
=====================================
@@ -0,0 +1 @@
+24839


=====================================
testsuite/tests/driver/all.T
=====================================
@@ -327,3 +327,4 @@ test('T23339B', [extra_files(['T23339.hs']), req_c], makefile_test, [])
 test('T23613', normal, compile_and_run, ['-this-unit-id=foo'])
 test('T23944', [unless(have_dynamic(), skip), extra_files(['T23944A.hs'])], multimod_compile, ['T23944 T23944A', '-fprefer-byte-code -fbyte-code -fno-code -dynamic-too -fwrite-interface'])
 test('T24286', [cxx_src, unless(have_profiling(), skip), extra_files(['T24286.cpp'])], compile, ['-prof -no-hs-main'])
+test('T24839', [unless(arch('x86_64') or arch('aarch64'), skip), extra_files(["t24839_sub.S"])], compile_and_run, ['t24839_sub.S'])


=====================================
testsuite/tests/driver/t24839_sub.S
=====================================
@@ -0,0 +1,10 @@
+/* Note that the filename must begin with a lowercase letter, because GHC thinks it as a module name otherwise. */
+#include "ghcconfig.h"
+#if LEADING_UNDERSCORE
+    .globl _foo
+_foo:
+#else
+    .globl foo
+foo:
+#endif
+    .quad 24839


=====================================
testsuite/tests/numeric/should_run/all.T
=====================================
@@ -52,6 +52,7 @@ test('add2', normal, compile_and_run, ['-fobject-code'])
 test('mul2', normal, compile_and_run, ['-fobject-code'])
 test('mul2int', normal, compile_and_run, ['-fobject-code'])
 test('quotRem2', normal, compile_and_run, ['-fobject-code'])
+test('quotRem2Large', normal, compile_and_run, ['-fobject-code'])
 test('T5863', normal, compile_and_run, [''])
 
 test('T7014', js_skip, makefile_test, [])


=====================================
testsuite/tests/numeric/should_run/quotRem2Large.hs
=====================================
The diff for this file was not included because it is too large.


View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/dd0c1732c5f51bd946ba7b79ad965af3e283e5bd...8303b6fda3e0daa15c76b7bcece977a2cd63ed60

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/dd0c1732c5f51bd946ba7b79ad965af3e283e5bd...8303b6fda3e0daa15c76b7bcece977a2cd63ed60
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240602/e4daaea8/attachment-0001.html>