[Git][ghc/ghc][master] 2 commits: codegen: unroll memcpy calls for small bytearrays
Marge Bot
gitlab at gitlab.haskell.org
Sun Apr 14 05:32:41 UTC 2019
Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC
Commits:
edcef7b3 by Artem Pyanykh at 2019-04-14T05:26:35Z
codegen: unroll memcpy calls for small bytearrays
- - - - -
6094d43f by Artem Pyanykh at 2019-04-14T05:26:35Z
docs: mention memcpy optimization for ByteArrays in 8.10.1-notes
- - - - -
7 changed files:
- compiler/cmm/CmmExpr.hs
- compiler/codeGen/StgCmmPrim.hs
- compiler/nativeGen/X86/CodeGen.hs
- docs/users_guide/8.10.1-notes.rst
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.asm
- + testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.hs
Changes:
=====================================
compiler/cmm/CmmExpr.hs
=====================================
@@ -5,7 +5,7 @@
{-# LANGUAGE UndecidableInstances #-}
module CmmExpr
- ( CmmExpr(..), cmmExprType, cmmExprWidth, maybeInvertCmmExpr
+ ( CmmExpr(..), cmmExprType, cmmExprWidth, cmmExprAlignment, maybeInvertCmmExpr
, CmmReg(..), cmmRegType, cmmRegWidth
, CmmLit(..), cmmLitType
, LocalReg(..), localRegType
@@ -43,6 +43,8 @@ import Unique
import Data.Set (Set)
import qualified Data.Set as Set
+import BasicTypes (Alignment, mkAlignment, alignmentOf)
+
-----------------------------------------------------------------------------
-- CmmExpr
-- An expression. Expressions have no side effects.
@@ -239,6 +241,13 @@ cmmLabelType dflags lbl
cmmExprWidth :: DynFlags -> CmmExpr -> Width
cmmExprWidth dflags e = typeWidth (cmmExprType dflags e)
+-- | Returns an alignment in bytes of a CmmExpr when it's a statically
+-- known integer constant, otherwise returns an alignment of 1 byte.
+-- The caller is responsible for using with a sensible CmmExpr
+-- argument.
+cmmExprAlignment :: CmmExpr -> Alignment
+cmmExprAlignment (CmmLit (CmmInt intOff _)) = alignmentOf (fromInteger intOff)
+cmmExprAlignment _ = mkAlignment 1
--------
--- Negation for conditional branches
=====================================
compiler/codeGen/StgCmmPrim.hs
=====================================
@@ -2035,8 +2035,8 @@ doCopyByteArrayOp = emitCopyByteArray copy
where
-- Copy data (we assume the arrays aren't overlapping since
-- they're of different types)
- copy _src _dst dst_p src_p bytes =
- emitMemcpyCall dst_p src_p bytes 1
+ copy _src _dst dst_p src_p bytes align =
+ emitMemcpyCall dst_p src_p bytes align
-- | Takes a source 'MutableByteArray#', an offset in the source
-- array, a destination 'MutableByteArray#', an offset into the
@@ -2050,22 +2050,26 @@ doCopyMutableByteArrayOp = emitCopyByteArray copy
-- The only time the memory might overlap is when the two arrays
-- we were provided are the same array!
-- TODO: Optimize branch for common case of no aliasing.
- copy src dst dst_p src_p bytes = do
+ copy src dst dst_p src_p bytes align = do
dflags <- getDynFlags
(moveCall, cpyCall) <- forkAltPair
- (getCode $ emitMemmoveCall dst_p src_p bytes 1)
- (getCode $ emitMemcpyCall dst_p src_p bytes 1)
+ (getCode $ emitMemmoveCall dst_p src_p bytes align)
+ (getCode $ emitMemcpyCall dst_p src_p bytes align)
emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall
emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
- -> FCode ())
+ -> Alignment -> FCode ())
-> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ()
emitCopyByteArray copy src src_off dst dst_off n = do
dflags <- getDynFlags
+ let byteArrayAlignment = wordAlignment dflags
+ srcOffAlignment = cmmExprAlignment src_off
+ dstOffAlignment = cmmExprAlignment dst_off
+ align = minimum [byteArrayAlignment, srcOffAlignment, dstOffAlignment]
dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
- copy src dst dst_p src_p n
+ copy src dst dst_p src_p n align
-- | Takes a source 'ByteArray#', an offset in the source array, a
-- destination 'Addr#', and the number of bytes to copy. Copies the given
@@ -2075,7 +2079,7 @@ doCopyByteArrayToAddrOp src src_off dst_p bytes = do
-- Use memcpy (we are allowed to assume the arrays aren't overlapping)
dflags <- getDynFlags
src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
- emitMemcpyCall dst_p src_p bytes 1
+ emitMemcpyCall dst_p src_p bytes (mkAlignment 1)
-- | Takes a source 'MutableByteArray#', an offset in the source array, a
-- destination 'Addr#', and the number of bytes to copy. Copies the given
@@ -2092,7 +2096,7 @@ doCopyAddrToByteArrayOp src_p dst dst_off bytes = do
-- Use memcpy (we are allowed to assume the arrays aren't overlapping)
dflags <- getDynFlags
dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
- emitMemcpyCall dst_p src_p bytes 1
+ emitMemcpyCall dst_p src_p bytes (mkAlignment 1)
-- ----------------------------------------------------------------------------
@@ -2107,9 +2111,7 @@ doSetByteArrayOp ba off len c = do
dflags <- getDynFlags
let byteArrayAlignment = wordAlignment dflags -- known since BA is allocated on heap
- offsetAlignment = case off of
- CmmLit (CmmInt intOff _) -> alignmentOf (fromInteger intOff)
- _ -> mkAlignment 1
+ offsetAlignment = cmmExprAlignment off
align = min byteArrayAlignment offsetAlignment
p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
@@ -2180,7 +2182,7 @@ doCopyArrayOp = emitCopyArray copy
copy _src _dst dst_p src_p bytes =
do dflags <- getDynFlags
emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
- (wORD_SIZE dflags)
+ (wordAlignment dflags)
-- | Takes a source 'MutableArray#', an offset in the source array, a
@@ -2198,9 +2200,9 @@ doCopyMutableArrayOp = emitCopyArray copy
dflags <- getDynFlags
(moveCall, cpyCall) <- forkAltPair
(getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
- (wORD_SIZE dflags))
+ (wordAlignment dflags))
(getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
- (wORD_SIZE dflags))
+ (wordAlignment dflags))
emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall
emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
@@ -2247,7 +2249,7 @@ doCopySmallArrayOp = emitCopySmallArray copy
copy _src _dst dst_p src_p bytes =
do dflags <- getDynFlags
emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
- (wORD_SIZE dflags)
+ (wordAlignment dflags)
doCopySmallMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
@@ -2261,9 +2263,9 @@ doCopySmallMutableArrayOp = emitCopySmallArray copy
dflags <- getDynFlags
(moveCall, cpyCall) <- forkAltPair
(getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
- (wORD_SIZE dflags))
+ (wordAlignment dflags))
(getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
- (wORD_SIZE dflags))
+ (wordAlignment dflags))
emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall
emitCopySmallArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
@@ -2328,7 +2330,7 @@ emitCloneArray info_p res_r src src_off n = do
(mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off)
emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
- (wORD_SIZE dflags)
+ (wordAlignment dflags)
emit $ mkAssign (CmmLocal res_r) (CmmReg arr)
@@ -2365,7 +2367,7 @@ emitCloneSmallArray info_p res_r src src_off n = do
(mkIntExpr dflags (smallArrPtrsHdrSizeW dflags)) src_off)
emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
- (wORD_SIZE dflags)
+ (wordAlignment dflags)
emit $ mkAssign (CmmLocal res_r) (CmmReg arr)
@@ -2493,19 +2495,19 @@ doCasByteArray res mba idx idx_ty old new = do
-- Helpers for emitting function calls
-- | Emit a call to @memcpy at .
-emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode ()
+emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemcpyCall dst src n align = do
emitPrimCall
[ {-no results-} ]
- (MO_Memcpy align)
+ (MO_Memcpy (alignmentBytes align))
[ dst, src, n ]
-- | Emit a call to @memmove at .
-emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode ()
+emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemmoveCall dst src n align = do
emitPrimCall
[ {- no results -} ]
- (MO_Memmove align)
+ (MO_Memmove (alignmentBytes align))
[ dst, src, n ]
-- | Emit a call to @memset at . The second argument must fit inside an
=====================================
compiler/nativeGen/X86/CodeGen.hs
=====================================
@@ -1767,12 +1767,11 @@ genCCall
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
--- Unroll memcpy calls if the source and destination pointers are at
--- least DWORD aligned and the number of bytes to copy isn't too
+-- Unroll memcpy calls if the number of bytes to copy isn't too
-- large. Otherwise, call C's memcpy.
-genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
+genCCall dflags _ (PrimTarget (MO_Memcpy align)) _
[dst, src, CmmLit (CmmInt n _)] _
- | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do
+ | fromInteger insns <= maxInlineMemcpyInsns dflags = do
code_dst <- getAnyReg dst
dst_r <- getNewRegNat format
code_src <- getAnyReg src
@@ -1785,7 +1784,9 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
-- instructions per move.
insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
- format = if align .&. 4 /= 0 then II32 else (archWordFormat is32Bit)
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
-- The size of each move, in bytes.
sizeBytes :: Integer
=====================================
docs/users_guide/8.10.1-notes.rst
=====================================
@@ -61,10 +61,11 @@ Compiler
:ghc-flag:`-Wredundant-record-wildcards` which warn users when they have
redundant or unused uses of a record wildcard match.
-- Calls to `memset` are now unrolled more aggressively and the
- produced code is more efficient on `x86_64` with added support for
- 64-bit `MOV`s. In particular, `setByteArray#` calls that were not
- optimized before, now will be. See :ghc-ticket:`16052`.
+- Calls to `memset` and `memcpy` are now unrolled more aggressively
+ and the produced code is more efficient on `x86_64` with added
+ support for 64-bit `MOV`s. In particular, `setByteArray#` and
+ `copyByteArray#` calls that were not optimized before, now will
+ be. See :ghc-ticket:`16052`.
Runtime system
~~~~~~~~~~~~~~
=====================================
testsuite/tests/codeGen/should_gen_asm/all.T
=====================================
@@ -8,3 +8,4 @@ test('memcpy-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
test('memcpy-unroll-conprop', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
test('memset-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
test('bytearray-memset-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])
+test('bytearray-memcpy-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])
=====================================
testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.asm
=====================================
@@ -0,0 +1,8 @@
+movw 0(%rax),%dx
+movw %dx,0(%rcx)
+movw 2(%rax),%dx
+movw %dx,2(%rcx)
+movw 4(%rax),%dx
+movw %dx,4(%rcx)
+movw 6(%rax),%ax
+movw %ax,6(%rcx)
=====================================
testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.hs
=====================================
@@ -0,0 +1,19 @@
+{-# language MagicHash #-}
+{-# language UnboxedTuples #-}
+
+module CopyArray
+ ( smallCopy
+ ) where
+
+import GHC.Exts
+import GHC.IO
+
+data ByteArray = ByteArray ByteArray#
+
+-- Does an 8 byte copy with sub-word (2 bytes) alignment
+-- Should be unrolled into 4 aligned stores (MOVWs)
+smallCopy :: ByteArray -> IO ByteArray
+smallCopy (ByteArray ba) = IO $ \s0 -> case newByteArray# 8# s0 of
+ (# s1, mut #) -> case copyByteArray# ba 2# mut 0# 8# s1 of
+ s2 -> case unsafeFreezeByteArray# mut s2 of
+ (# s3, frozen #) -> (# s3, ByteArray frozen #)
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/6febc444c0abea6c033174aa0e813c950b9b2877...6094d43f36bdab5ff3f246afca9a6018545fdd73
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/6febc444c0abea6c033174aa0e813c950b9b2877...6094d43f36bdab5ff3f246afca9a6018545fdd73
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20190414/f6f37ded/attachment-0001.html>
More information about the ghc-commits
mailing list