[Git][ghc/ghc][master] 2 commits: codegen: unroll memcpy calls for small bytearrays

Sun Apr 14 05:32:41 UTC 2019

Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
edcef7b3 by Artem Pyanykh at 2019-04-14T05:26:35Z
codegen: unroll memcpy calls for small bytearrays

- - - - -
6094d43f by Artem Pyanykh at 2019-04-14T05:26:35Z
docs: mention memcpy optimization for ByteArrays in 8.10.1-notes

- - - - -


7 changed files:

- compiler/cmm/CmmExpr.hs
- compiler/codeGen/StgCmmPrim.hs
- compiler/nativeGen/X86/CodeGen.hs
- docs/users_guide/8.10.1-notes.rst
- testsuite/tests/codeGen/should_gen_asm/all.T
- + testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.asm
- + testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.hs


Changes:

=====================================
compiler/cmm/CmmExpr.hs
=====================================
@@ -5,7 +5,7 @@
 {-# LANGUAGE UndecidableInstances #-}
 
 module CmmExpr
-    ( CmmExpr(..), cmmExprType, cmmExprWidth, maybeInvertCmmExpr
+    ( CmmExpr(..), cmmExprType, cmmExprWidth, cmmExprAlignment, maybeInvertCmmExpr
     , CmmReg(..), cmmRegType, cmmRegWidth
     , CmmLit(..), cmmLitType
     , LocalReg(..), localRegType
@@ -43,6 +43,8 @@ import Unique
 import Data.Set (Set)
 import qualified Data.Set as Set
 
+import BasicTypes (Alignment, mkAlignment, alignmentOf)
+
 -----------------------------------------------------------------------------
 --              CmmExpr
 -- An expression.  Expressions have no side effects.
@@ -239,6 +241,13 @@ cmmLabelType dflags lbl
 cmmExprWidth :: DynFlags -> CmmExpr -> Width
 cmmExprWidth dflags e = typeWidth (cmmExprType dflags e)
 
+-- | Returns an alignment in bytes of a CmmExpr when it's a statically
+-- known integer constant, otherwise returns an alignment of 1 byte.
+-- The caller is responsible for using with a sensible CmmExpr
+-- argument.
+cmmExprAlignment :: CmmExpr -> Alignment
+cmmExprAlignment (CmmLit (CmmInt intOff _)) = alignmentOf (fromInteger intOff)
+cmmExprAlignment _                          = mkAlignment 1
 --------
 --- Negation for conditional branches
 


=====================================
compiler/codeGen/StgCmmPrim.hs
=====================================
@@ -2035,8 +2035,8 @@ doCopyByteArrayOp = emitCopyByteArray copy
   where
     -- Copy data (we assume the arrays aren't overlapping since
     -- they're of different types)
-    copy _src _dst dst_p src_p bytes =
-        emitMemcpyCall dst_p src_p bytes 1
+    copy _src _dst dst_p src_p bytes align =
+        emitMemcpyCall dst_p src_p bytes align
 
 -- | Takes a source 'MutableByteArray#', an offset in the source
 -- array, a destination 'MutableByteArray#', an offset into the
@@ -2050,22 +2050,26 @@ doCopyMutableByteArrayOp = emitCopyByteArray copy
     -- The only time the memory might overlap is when the two arrays
     -- we were provided are the same array!
     -- TODO: Optimize branch for common case of no aliasing.
-    copy src dst dst_p src_p bytes = do
+    copy src dst dst_p src_p bytes align = do
         dflags <- getDynFlags
         (moveCall, cpyCall) <- forkAltPair
-            (getCode $ emitMemmoveCall dst_p src_p bytes 1)
-            (getCode $ emitMemcpyCall  dst_p src_p bytes 1)
+            (getCode $ emitMemmoveCall dst_p src_p bytes align)
+            (getCode $ emitMemcpyCall  dst_p src_p bytes align)
         emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall
 
 emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-                      -> FCode ())
+                      -> Alignment -> FCode ())
                   -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                   -> FCode ()
 emitCopyByteArray copy src src_off dst dst_off n = do
     dflags <- getDynFlags
+    let byteArrayAlignment = wordAlignment dflags
+        srcOffAlignment = cmmExprAlignment src_off
+        dstOffAlignment = cmmExprAlignment dst_off
+        align = minimum [byteArrayAlignment, srcOffAlignment, dstOffAlignment]
     dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
     src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
-    copy src dst dst_p src_p n
+    copy src dst dst_p src_p n align
 
 -- | Takes a source 'ByteArray#', an offset in the source array, a
 -- destination 'Addr#', and the number of bytes to copy.  Copies the given
@@ -2075,7 +2079,7 @@ doCopyByteArrayToAddrOp src src_off dst_p bytes = do
     -- Use memcpy (we are allowed to assume the arrays aren't overlapping)
     dflags <- getDynFlags
     src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
-    emitMemcpyCall dst_p src_p bytes 1
+    emitMemcpyCall dst_p src_p bytes (mkAlignment 1)
 
 -- | Takes a source 'MutableByteArray#', an offset in the source array, a
 -- destination 'Addr#', and the number of bytes to copy.  Copies the given
@@ -2092,7 +2096,7 @@ doCopyAddrToByteArrayOp src_p dst dst_off bytes = do
     -- Use memcpy (we are allowed to assume the arrays aren't overlapping)
     dflags <- getDynFlags
     dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
-    emitMemcpyCall dst_p src_p bytes 1
+    emitMemcpyCall dst_p src_p bytes (mkAlignment 1)
 
 
 -- ----------------------------------------------------------------------------
@@ -2107,9 +2111,7 @@ doSetByteArrayOp ba off len c = do
     dflags <- getDynFlags
 
     let byteArrayAlignment = wordAlignment dflags -- known since BA is allocated on heap
-        offsetAlignment = case off of
-            CmmLit (CmmInt intOff _) -> alignmentOf (fromInteger intOff)
-            _ -> mkAlignment 1
+        offsetAlignment = cmmExprAlignment off
         align = min byteArrayAlignment offsetAlignment
 
     p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
@@ -2180,7 +2182,7 @@ doCopyArrayOp = emitCopyArray copy
     copy _src _dst dst_p src_p bytes =
         do dflags <- getDynFlags
            emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
-               (wORD_SIZE dflags)
+               (wordAlignment dflags)
 
 
 -- | Takes a source 'MutableArray#', an offset in the source array, a
@@ -2198,9 +2200,9 @@ doCopyMutableArrayOp = emitCopyArray copy
         dflags <- getDynFlags
         (moveCall, cpyCall) <- forkAltPair
             (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
-             (wORD_SIZE dflags))
+             (wordAlignment dflags))
             (getCode $ emitMemcpyCall  dst_p src_p (mkIntExpr dflags bytes)
-             (wORD_SIZE dflags))
+             (wordAlignment dflags))
         emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall
 
 emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
@@ -2247,7 +2249,7 @@ doCopySmallArrayOp = emitCopySmallArray copy
     copy _src _dst dst_p src_p bytes =
         do dflags <- getDynFlags
            emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
-               (wORD_SIZE dflags)
+               (wordAlignment dflags)
 
 
 doCopySmallMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
@@ -2261,9 +2263,9 @@ doCopySmallMutableArrayOp = emitCopySmallArray copy
         dflags <- getDynFlags
         (moveCall, cpyCall) <- forkAltPair
             (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
-             (wORD_SIZE dflags))
+             (wordAlignment dflags))
             (getCode $ emitMemcpyCall  dst_p src_p (mkIntExpr dflags bytes)
-             (wORD_SIZE dflags))
+             (wordAlignment dflags))
         emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall
 
 emitCopySmallArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
@@ -2328,7 +2330,7 @@ emitCloneArray info_p res_r src src_off n = do
               (mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off)
 
     emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
-        (wORD_SIZE dflags)
+        (wordAlignment dflags)
 
     emit $ mkAssign (CmmLocal res_r) (CmmReg arr)
 
@@ -2365,7 +2367,7 @@ emitCloneSmallArray info_p res_r src src_off n = do
               (mkIntExpr dflags (smallArrPtrsHdrSizeW dflags)) src_off)
 
     emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
-        (wORD_SIZE dflags)
+        (wordAlignment dflags)
 
     emit $ mkAssign (CmmLocal res_r) (CmmReg arr)
 
@@ -2493,19 +2495,19 @@ doCasByteArray res mba idx idx_ty old new = do
 -- Helpers for emitting function calls
 
 -- | Emit a call to @memcpy at .
-emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode ()
+emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
 emitMemcpyCall dst src n align = do
     emitPrimCall
         [ {-no results-} ]
-        (MO_Memcpy align)
+        (MO_Memcpy (alignmentBytes align))
         [ dst, src, n ]
 
 -- | Emit a call to @memmove at .
-emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode ()
+emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
 emitMemmoveCall dst src n align = do
     emitPrimCall
         [ {- no results -} ]
-        (MO_Memmove align)
+        (MO_Memmove (alignmentBytes align))
         [ dst, src, n ]
 
 -- | Emit a call to @memset at .  The second argument must fit inside an


=====================================
compiler/nativeGen/X86/CodeGen.hs
=====================================
@@ -1767,12 +1767,11 @@ genCCall
 
 -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

--- Unroll memcpy calls if the source and destination pointers are at
--- least DWORD aligned and the number of bytes to copy isn't too
+-- Unroll memcpy calls if the number of bytes to copy isn't too
 -- large.  Otherwise, call C's memcpy.
-genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
+genCCall dflags _ (PrimTarget (MO_Memcpy align)) _
          [dst, src, CmmLit (CmmInt n _)] _
-    | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do
+    | fromInteger insns <= maxInlineMemcpyInsns dflags = do
         code_dst <- getAnyReg dst
         dst_r <- getNewRegNat format
         code_src <- getAnyReg src
@@ -1785,7 +1784,9 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
     -- instructions per move.
     insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
 
-    format = if align .&. 4 /= 0 then II32 else (archWordFormat is32Bit)
+    maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+    effectiveAlignment = min (alignmentOf align) maxAlignment
+    format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
 
     -- The size of each move, in bytes.
     sizeBytes :: Integer


=====================================
docs/users_guide/8.10.1-notes.rst
=====================================
@@ -61,10 +61,11 @@ Compiler
   :ghc-flag:`-Wredundant-record-wildcards`  which warn users when they have
   redundant or unused uses of a record wildcard match.
 
-- Calls to `memset` are now unrolled more aggressively and the
-  produced code is more efficient on `x86_64` with added support for
-  64-bit `MOV`s. In particular, `setByteArray#` calls that were not
-  optimized before, now will be. See :ghc-ticket:`16052`.
+- Calls to `memset` and `memcpy` are now unrolled more aggressively
+  and the produced code is more efficient on `x86_64` with added
+  support for 64-bit `MOV`s. In particular, `setByteArray#` and
+  `copyByteArray#` calls that were not optimized before, now will
+  be. See :ghc-ticket:`16052`.
 
 Runtime system
 ~~~~~~~~~~~~~~


=====================================
testsuite/tests/codeGen/should_gen_asm/all.T
=====================================
@@ -8,3 +8,4 @@ test('memcpy-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
 test('memcpy-unroll-conprop', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
 test('memset-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
 test('bytearray-memset-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])
+test('bytearray-memcpy-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])


=====================================
testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.asm
=====================================
@@ -0,0 +1,8 @@
+movw 0(%rax),%dx
+movw %dx,0(%rcx)
+movw 2(%rax),%dx
+movw %dx,2(%rcx)
+movw 4(%rax),%dx
+movw %dx,4(%rcx)
+movw 6(%rax),%ax
+movw %ax,6(%rcx)


=====================================
testsuite/tests/codeGen/should_gen_asm/bytearray-memcpy-unroll.hs
=====================================
@@ -0,0 +1,19 @@
+{-# language MagicHash #-}
+{-# language UnboxedTuples #-}
+
+module CopyArray
+  ( smallCopy
+  ) where
+
+import GHC.Exts
+import GHC.IO
+
+data ByteArray = ByteArray ByteArray#
+
+-- Does an 8 byte copy with sub-word (2 bytes) alignment
+-- Should be unrolled into 4 aligned stores (MOVWs)
+smallCopy :: ByteArray -> IO ByteArray
+smallCopy (ByteArray ba) = IO $ \s0 -> case newByteArray# 8# s0 of
+  (# s1, mut #) -> case copyByteArray# ba 2# mut 0# 8# s1 of
+    s2 -> case unsafeFreezeByteArray# mut s2 of
+          (# s3, frozen #) -> (# s3, ByteArray frozen #)



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/6febc444c0abea6c033174aa0e813c950b9b2877...6094d43f36bdab5ff3f246afca9a6018545fdd73

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/compare/6febc444c0abea6c033174aa0e813c950b9b2877...6094d43f36bdab5ff3f246afca9a6018545fdd73
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20190414/f6f37ded/attachment-0001.html>