[commit: ghc] master: Implement optimized NCG `MO_Ctz W64` op for i386 (#9340) (612f3d1)

Sat Oct 18 12:34:42 UTC 2014

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/612f3d120c65a461a4ad7f212d67bdae005f4975/ghc

>---------------------------------------------------------------

commit 612f3d120c65a461a4ad7f212d67bdae005f4975
Author: Herbert Valerio Riedel <hvr at gnu.org>
Date:   Sat Oct 18 14:32:33 2014 +0200

    Implement optimized NCG `MO_Ctz W64` op for i386 (#9340)
    
    Summary:
    This is an optimization to the CTZ primops introduced for #9340
    
    Previously we called out to `hs_ctz64`, but we can actually generate
    better hand-tuned code while avoiding the FFI ccall.
    
    With this patch, the code
    
      {-# LANGUAGE MagicHash #-}
      module TestClz0 where
      import GHC.Prim
      ctz64 :: Word64# -> Word#
      ctz64 x = ctz64# x
    
    results in the following assembler generated by NCG on i386:
    
      TestClz.ctz64_info:
          movl (%ebp),%eax
          movl 4(%ebp),%ecx
          movl %ecx,%edx
          orl %eax,%edx
          movl $64,%edx
          je _nAO
    
          bsf %ecx,%ecx
          addl $32,%ecx
          bsf %eax,%eax
          cmovne %eax,%ecx
          movl %ecx,%edx
      _nAO:
          movl %edx,%esi
          addl $8,%ebp
          jmp *(%ebp)
    
    For comparision, here's what LLVM 3.4 currently generates:
    
      000000fc <TestClzz_ctzz64_info>:
        fc:   0f bc 45 04             bsf    0x4(%ebp),%eax
       100:   b9 20 00 00 00          mov    $0x20,%ecx
       105:   0f 45 c8                cmovne %eax,%ecx
       108:   83 c1 20                add    $0x20,%ecx
       10b:   8b 45 00                mov    0x0(%ebp),%eax
       10e:   8b 55 08                mov    0x8(%ebp),%edx
       111:   0f bc f0                bsf    %eax,%esi
       114:   85 c0                   test   %eax,%eax
       116:   0f 44 f1                cmove  %ecx,%esi
       119:   83 c5 08                add    $0x8,%ebp
       11c:   ff e2                   jmp    *%edx
    
    Reviewed By: austin
    
    Auditors: simonmar
    
    Differential Revision: https://phabricator.haskell.org/D163


>---------------------------------------------------------------

612f3d120c65a461a4ad7f212d67bdae005f4975
 compiler/nativeGen/X86/CodeGen.hs | 41 ++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 9d7cb78..abd87ed 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1799,14 +1799,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
     size = if width == W8 then II16 else intSize width
     lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width))
 
-genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src]
   | is32Bit, width == W64 = do
-    -- Fallback to `hs_ctz64` on i386
-    targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
-    let target = ForeignTarget targetExpr (ForeignConvention CCallConv
-                                           [NoHint] [NoHint]
-                                           CmmMayReturn)
-    genCCall dflags is32Bit target dest_regs args
+      ChildCode64 vcode rlo <- iselExpr64 src
+      let rhi     = getHiVRegFromLo rlo
+          dst_r   = getRegisterReg platform False (CmmLocal dst)
+      lbl1 <- getBlockIdNat
+      lbl2 <- getBlockIdNat
+      tmp_r <- getNewRegNat size
+
+      -- The following instruction sequence corresponds to the pseudo-code
+      --
+      --  if (src) {
+      --    dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+      --  } else {
+      --    dst = 64;
+      --  }
+      return $ vcode `appOL` toOL
+               ([ MOV      II32 (OpReg rhi)         (OpReg tmp_r)
+                , OR       II32 (OpReg rlo)         (OpReg tmp_r)
+                , MOV      II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+                , JXX EQQ    lbl2
+                , JXX ALWAYS lbl1
+
+                , NEWBLOCK   lbl1
+                , BSF     II32 (OpReg rhi)         dst_r
+                , ADD     II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+                , BSF     II32 (OpReg rlo)         tmp_r
+                , CMOV NE II32 (OpReg tmp_r)       dst_r
+                , JXX ALWAYS lbl2
+
+                , NEWBLOCK   lbl2
+                ])
 
   | otherwise = do
     code_src <- getAnyReg src
@@ -1828,7 +1852,6 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
     bw = widthInBits width
     platform = targetPlatform dflags
     size = if width == W8 then II16 else intSize width
-    lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width))
 
 genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
     targetExpr <- cmmMakeDynamicReference dflags
@@ -2485,7 +2508,7 @@ outOfLineCmmOp mop res args
               MO_PopCnt _  -> fsLit "popcnt"
               MO_BSwap _   -> fsLit "bswap"
               MO_Clz w     -> fsLit $ clzLabel w
-              MO_Ctz w     -> fsLit $ ctzLabel w
+              MO_Ctz _     -> unsupported
 
               MO_AtomicRMW _ _ -> fsLit "atomicrmw"
               MO_AtomicRead _  -> fsLit "atomicread"