[commit: ghc] master: Implement optimized NCG `MO_Ctz W64` op for i386 (#9340) (612f3d1)
git at git.haskell.org
git at git.haskell.org
Sat Oct 18 12:34:42 UTC 2014
Repository : ssh://git@git.haskell.org/ghc
On branch : master
Link : http://ghc.haskell.org/trac/ghc/changeset/612f3d120c65a461a4ad7f212d67bdae005f4975/ghc
>---------------------------------------------------------------
commit 612f3d120c65a461a4ad7f212d67bdae005f4975
Author: Herbert Valerio Riedel <hvr at gnu.org>
Date: Sat Oct 18 14:32:33 2014 +0200
Implement optimized NCG `MO_Ctz W64` op for i386 (#9340)
Summary:
This is an optimization to the CTZ primops introduced for #9340
Previously we called out to `hs_ctz64`, but we can actually generate
better hand-tuned code while avoiding the FFI ccall.
With this patch, the code
{-# LANGUAGE MagicHash #-}
module TestClz0 where
import GHC.Prim
ctz64 :: Word64# -> Word#
ctz64 x = ctz64# x
results in the following assembler generated by NCG on i386:
TestClz.ctz64_info:
movl (%ebp),%eax
movl 4(%ebp),%ecx
movl %ecx,%edx
orl %eax,%edx
movl $64,%edx
je _nAO
bsf %ecx,%ecx
addl $32,%ecx
bsf %eax,%eax
cmovne %eax,%ecx
movl %ecx,%edx
_nAO:
movl %edx,%esi
addl $8,%ebp
jmp *(%ebp)
For comparision, here's what LLVM 3.4 currently generates:
000000fc <TestClzz_ctzz64_info>:
fc: 0f bc 45 04 bsf 0x4(%ebp),%eax
100: b9 20 00 00 00 mov $0x20,%ecx
105: 0f 45 c8 cmovne %eax,%ecx
108: 83 c1 20 add $0x20,%ecx
10b: 8b 45 00 mov 0x0(%ebp),%eax
10e: 8b 55 08 mov 0x8(%ebp),%edx
111: 0f bc f0 bsf %eax,%esi
114: 85 c0 test %eax,%eax
116: 0f 44 f1 cmove %ecx,%esi
119: 83 c5 08 add $0x8,%ebp
11c: ff e2 jmp *%edx
Reviewed By: austin
Auditors: simonmar
Differential Revision: https://phabricator.haskell.org/D163
>---------------------------------------------------------------
612f3d120c65a461a4ad7f212d67bdae005f4975
compiler/nativeGen/X86/CodeGen.hs | 41 ++++++++++++++++++++++++++++++---------
1 file changed, 32 insertions(+), 9 deletions(-)
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 9d7cb78..abd87ed 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1799,14 +1799,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
size = if width == W8 then II16 else intSize width
lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width))
-genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src]
| is32Bit, width == W64 = do
- -- Fallback to `hs_ctz64` on i386
- targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
- let target = ForeignTarget targetExpr (ForeignConvention CCallConv
- [NoHint] [NoHint]
- CmmMayReturn)
- genCCall dflags is32Bit target dest_regs args
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let rhi = getHiVRegFromLo rlo
+ dst_r = getRegisterReg platform False (CmmLocal dst)
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ tmp_r <- getNewRegNat size
+
+ -- The following instruction sequence corresponds to the pseudo-code
+ --
+ -- if (src) {
+ -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+ -- } else {
+ -- dst = 64;
+ -- }
+ return $ vcode `appOL` toOL
+ ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
+ , OR II32 (OpReg rlo) (OpReg tmp_r)
+ , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+ , JXX EQQ lbl2
+ , JXX ALWAYS lbl1
+
+ , NEWBLOCK lbl1
+ , BSF II32 (OpReg rhi) dst_r
+ , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+ , BSF II32 (OpReg rlo) tmp_r
+ , CMOV NE II32 (OpReg tmp_r) dst_r
+ , JXX ALWAYS lbl2
+
+ , NEWBLOCK lbl2
+ ])
| otherwise = do
code_src <- getAnyReg src
@@ -1828,7 +1852,6 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
bw = widthInBits width
platform = targetPlatform dflags
size = if width == W8 then II16 else intSize width
- lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width))
genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
targetExpr <- cmmMakeDynamicReference dflags
@@ -2485,7 +2508,7 @@ outOfLineCmmOp mop res args
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
MO_Clz w -> fsLit $ clzLabel w
- MO_Ctz w -> fsLit $ ctzLabel w
+ MO_Ctz _ -> unsupported
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
More information about the ghc-commits
mailing list