[Git][ghc/ghc][wip/marge_bot_batch_merge_job] 4 commits: x86-ncg: Fix fma codegen when arguments are globals
Marge Bot (@marge-bot)
gitlab at gitlab.haskell.org
Tue Mar 5 03:31:29 UTC 2024
Marge Bot pushed to branch wip/marge_bot_batch_merge_job at Glasgow Haskell Compiler / GHC
Commits:
82ccb801 by Andreas Klebinger at 2024-03-04T19:59:14-05:00
x86-ncg: Fix fma codegen when arguments are globals
Fix a bug in the x86 ncg where results would be wrong when the desired output
register and one of the input registers were the same global.
Also adds a tiny optimization to make use of the memory addressing
support when convenient.
Fixes #24496
- - - - -
4a1b0eb0 by Cheng Shao at 2024-03-04T22:31:07-05:00
rts: add -xr option to control two step allocator reserved space size
This patch adds a -xr RTS option to control the size of virtual memory
address space reserved by the two step allocator on a 64-bit platform,
see added documentation for explanation. Closes #24498.
- - - - -
2d218101 by Ben Gamari at 2024-03-04T22:31:07-05:00
filepath: Bump submodule to 1.5.2.0
- - - - -
fc22d8dc by Ben Gamari at 2024-03-04T22:31:07-05:00
os-string: Bump submodule to 2.0.2
- - - - -
12 changed files:
- compiler/GHC/CmmToAsm/X86/CodeGen.hs
- docs/users_guide/9.10.1-notes.rst
- docs/users_guide/runtime_control.rst
- libraries/filepath
- libraries/os-string
- rts/RtsFlags.c
- rts/include/rts/Flags.h
- rts/sm/MBlock.c
- + testsuite/tests/primops/should_run/T24496.hs
- + testsuite/tests/primops/should_run/T24496.stdout
- testsuite/tests/primops/should_run/all.T
- testsuite/tests/rts/all.T
Changes:
=====================================
compiler/GHC/CmmToAsm/X86/CodeGen.hs
=====================================
@@ -3424,7 +3424,6 @@ genFMA3Code :: Width
-> FMASign
-> CmmExpr -> CmmExpr -> CmmExpr -> NatM Register
genFMA3Code w signs x y z = do
-
-- For the FMA instruction, we want to compute x * y + z
--
-- There are three possible instructions we could emit:
@@ -3445,17 +3444,45 @@ genFMA3Code w signs x y z = do
--
-- Currently we follow neither of these optimisations,
-- opting to always use fmadd213 for simplicity.
+ --
+ -- We would like to compute the result directly into the requested register.
+ -- To do so we must first compute `x` into the destination register. This is
+ -- only possible if the other arguments don't use the destination register.
+ -- We check for this and if there is a conflict we move the result only after
+ -- the computation. See #24496 how this went wrong in the past.
let rep = floatFormat w
(y_reg, y_code) <- getNonClobberedReg y
- (z_reg, z_code) <- getNonClobberedReg z
+ (z_op, z_code) <- getNonClobberedOperand z
x_code <- getAnyReg x
+ x_tmp <- getNewRegNat rep
let
fma213 = FMA3 rep signs FMA213
- code dst
- = y_code `appOL`
+
+ code, code_direct, code_mov :: Reg -> InstrBlock
+ -- Ideal: Compute the result directly into dst
+ code_direct dst = x_code dst `snocOL`
+ fma213 z_op y_reg dst
+ -- Fallback: Compute the result into a tmp reg and then move it.
+ code_mov dst = x_code x_tmp `snocOL`
+ fma213 z_op y_reg x_tmp `snocOL`
+ MOV rep (OpReg x_tmp) (OpReg dst)
+
+ code dst =
+ y_code `appOL`
z_code `appOL`
- x_code dst `snocOL`
- fma213 (OpReg z_reg) y_reg dst
+ ( if arg_regs_conflict then code_mov dst else code_direct dst )
+
+ where
+
+ arg_regs_conflict =
+ y_reg == dst ||
+ case z_op of
+ OpReg z_reg -> z_reg == dst
+ OpAddr amode -> dst `elem` addrModeRegs amode
+ OpImm {} -> False
+
+ -- NB: Computing the result into a desired register using Any can be tricky.
+ -- So for now, we keep it simple. (See #24496).
return (Any rep code)
-----------
=====================================
docs/users_guide/9.10.1-notes.rst
=====================================
@@ -232,6 +232,10 @@ Runtime system
- Add a :rts-flag:`--no-automatic-time-samples` flag which stops time profiling samples being automatically started on
startup. Time profiling can be controlled manually using functions in ``GHC.Profiling``.
+- Add a :rts-flag:`-xr ⟨size⟩` which controls the size of virtual
+ memory address space reserved by the two step allocator on a 64-bit
+ platform. See :ghc-ticket:`24498`.
+
``base`` library
~~~~~~~~~~~~~~~~
=====================================
docs/users_guide/runtime_control.rst
=====================================
@@ -368,6 +368,18 @@ Miscellaneous RTS options
thread can execute its exception handlers. The ``-xq`` controls the
size of this additional quota.
+.. rts-flag:: -xr ⟨size⟩
+
+ :default: 0.25T on aarch64, 1T otherwise
+
+ This option controls the size of virtual memory address space
+ reserved by the two step allocator on a 64-bit platform. It can be
+ useful in scenarios where even reserving a large address range
+ without committing can be expensive (e.g. WSL1), or when you
+ actually have enough physical memory and want to support a Haskell
+ heap larger than 1T. ``-xr`` is a no-op if GHC is configured with
+ ``--disable-large-address-space`` or if the platform is 32-bit.
+
.. _rts-options-gc:
RTS options to control the garbage collector
=====================================
libraries/filepath
=====================================
@@ -1 +1 @@
-Subproject commit b55465e3d174ccd63914e7146079435503204187
+Subproject commit 4dd36add328032f9cbf0eff2a3511ab4369b18eb
=====================================
libraries/os-string
=====================================
@@ -1 +1 @@
-Subproject commit fb2711ba1f43fd609de0e231e161025ee8ed3216
+Subproject commit 6c567f572e62437b8431b0f64b91393c40b677c8
=====================================
rts/RtsFlags.c
=====================================
@@ -186,6 +186,14 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.ringBell = false;
RtsFlags.GcFlags.longGCSync = 0; /* detection turned off */
+#if defined(aarch64_HOST_ARCH)
+ // 1/4 TBytes, see 38c98e4f for rationale
+ RtsFlags.GcFlags.addressSpaceSize = (StgWord64)1 << 38;
+#else
+ // 1 TBytes
+ RtsFlags.GcFlags.addressSpaceSize = (StgWord64)1 << 40;
+#endif
+
RtsFlags.DebugFlags.scheduler = false;
RtsFlags.DebugFlags.interpreter = false;
RtsFlags.DebugFlags.weak = false;
@@ -552,6 +560,11 @@ usage_text[] = {
" -xq The allocation limit given to a thread after it receives",
" an AllocationLimitExceeded exception. (default: 100k)",
"",
+#if defined(USE_LARGE_ADDRESS_SPACE)
+" -xr The size of virtual memory address space reserved by the",
+" two step allocator (default: 0.25T on aarch64, 1T otherwise)",
+"",
+#endif
" -Mgrace=<n>",
" The amount of allocation after the program receives a",
" HeapOverflow exception before the exception is thrown again, if",
@@ -1820,6 +1833,12 @@ error = true;
/ BLOCK_SIZE;
break;
+ case 'r':
+ OPTION_UNSAFE;
+ RtsFlags.GcFlags.addressSpaceSize
+ = decodeSize(rts_argv[arg], 3, MBLOCK_SIZE, HS_INT_MAX);
+ break;
+
default:
OPTION_SAFE;
errorBelch("unknown RTS option: %s",rts_argv[arg]);
@@ -2118,7 +2137,9 @@ decodeSize(const char *flag, uint32_t offset, StgWord64 min, StgWord64 max)
m = atof(s);
c = s[strlen(s)-1];
- if (c == 'g' || c == 'G')
+ if (c == 't' || c == 'T')
+ m *= (StgWord64)1024*1024*1024*1024;
+ else if (c == 'g' || c == 'G')
m *= 1024*1024*1024;
else if (c == 'm' || c == 'M')
m *= 1024*1024;
@@ -2737,4 +2758,3 @@ doingErasProfiling( void )
|| RtsFlags.ProfFlags.eraSelector != 0);
}
#endif /* PROFILING */
-
=====================================
rts/include/rts/Flags.h
=====================================
@@ -89,6 +89,8 @@ typedef struct _GC_FLAGS {
bool numa; /* Use NUMA */
StgWord numaMask;
+
+ StgWord64 addressSpaceSize; /* large address space size in bytes */
} GC_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
=====================================
rts/sm/MBlock.c
=====================================
@@ -659,20 +659,14 @@ initMBlocks(void)
#if defined(USE_LARGE_ADDRESS_SPACE)
{
- W_ size;
-#if defined(aarch64_HOST_ARCH)
- size = (W_)1 << 38; // 1/4 TByte
-#else
- size = (W_)1 << 40; // 1 TByte
-#endif
void *startAddress = NULL;
if (RtsFlags.GcFlags.heapBase) {
startAddress = (void*) RtsFlags.GcFlags.heapBase;
}
- void *addr = osReserveHeapMemory(startAddress, &size);
+ void *addr = osReserveHeapMemory(startAddress, &RtsFlags.GcFlags.addressSpaceSize);
mblock_address_space.begin = (W_)addr;
- mblock_address_space.end = (W_)addr + size;
+ mblock_address_space.end = (W_)addr + RtsFlags.GcFlags.addressSpaceSize;
mblock_high_watermark = (W_)addr;
}
#elif SIZEOF_VOID_P == 8
=====================================
testsuite/tests/primops/should_run/T24496.hs
=====================================
@@ -0,0 +1,18 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+import GHC.Exts
+
+twoProductFloat# :: Float# -> Float# -> (# Float#, Float# #)
+twoProductFloat# x y = let !r = x `timesFloat#` y
+ in (# r, fmsubFloat# x y r #)
+{-# NOINLINE twoProductFloat# #-}
+
+twoProductDouble# :: Double# -> Double# -> (# Double#, Double# #)
+twoProductDouble# x y = let !r = x *## y
+ in (# r, fmsubDouble# x y r #)
+{-# NOINLINE twoProductDouble# #-}
+
+main :: IO ()
+main = do
+ print $ case twoProductFloat# 2.0# 3.0# of (# r, s #) -> (F# r, F# s)
+ print $ case twoProductDouble# 2.0## 3.0## of (# r, s #) -> (D# r, D# s)
=====================================
testsuite/tests/primops/should_run/T24496.stdout
=====================================
@@ -0,0 +1,2 @@
+(6.0,0.0)
+(6.0,0.0)
=====================================
testsuite/tests/primops/should_run/all.T
=====================================
@@ -77,3 +77,10 @@ test('FMA_ConstantFold'
test('T21624', normal, compile_and_run, [''])
test('T23071', ignore_stdout, compile_and_run, [''])
test('T22710', normal, compile_and_run, [''])
+test('T24496'
+ , [ when(have_cpu_feature('fma'), extra_hc_opts('-mfma'))
+ , js_skip # JS backend doesn't have an FMA implementation
+ , when(arch('wasm32'), skip)
+ , when(have_llvm(), extra_ways(["optllvm"]))
+ ]
+ , compile_and_run, ['-O'])
=====================================
testsuite/tests/rts/all.T
=====================================
@@ -3,7 +3,7 @@ test('testblockalloc',
compile_and_run, [''])
test('testmblockalloc',
- [c_src, only_ways(['normal','threaded1']), extra_run_opts('+RTS -I0'),
+ [c_src, only_ways(['normal','threaded1']), extra_run_opts('+RTS -I0 -xr0.125T'),
when(arch('wasm32'), skip)], # MBlocks can't be freed on wasm32, see Note [Megablock allocator on wasm] in rts
compile_and_run, [''])
# -I0 is important: the idle GC will run the memory leak detector,
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/31be53b2c08f1fe1d53917aeb227a3b8c49bbcf4...fc22d8dcf40f4a4eb8d020d8c2d7b8c6eca23449
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/31be53b2c08f1fe1d53917aeb227a3b8c49bbcf4...fc22d8dcf40f4a4eb8d020d8c2d7b8c6eca23449
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240304/bf3d5d12/attachment-0001.html>
More information about the ghc-commits
mailing list