[GHC] #13629: sqrt should use machine instruction on x86_64

Fri Apr 28 23:58:17 UTC 2017

#13629: sqrt should use machine instruction on x86_64
-------------------------------------+-------------------------------------
        Reporter:  bgamari           |                Owner:  (none)
            Type:  bug               |               Status:  closed
        Priority:  normal            |            Milestone:  8.4.1
       Component:  Compiler (NCG)    |              Version:  8.0.1
      Resolution:  fixed             |             Keywords:
Operating System:  Unknown/Multiple  |         Architecture:
                                     |  Unknown/Multiple
 Type of failure:  Runtime           |            Test Case:
  performance bug                    |  numeric/num009
      Blocked By:                    |             Blocking:
 Related Tickets:  #13570            |  Differential Rev(s):  Phab:D3508
       Wiki Page:                    |
-------------------------------------+-------------------------------------

Comment (by bgamari):

 For the record, I did a quick, rather unscientific comparison of SSE2 and
 x87:
 {{{#!c++
 #include <time.h>
 #include <math.h>
 #include <stdio.h>
 #include <functional>

 // Iteration count for the loop in test(); time_it() also divides the elapsed
 // time by it to report a per-iteration figure. The literal 1e8 is a double
 // that is converted to int by this initialization.
 const int N = 1e8;

 // Invokes f(args...) exactly once, measures the elapsed time with
 // CLOCK_MONOTONIC, and prints the total in seconds together with the
 // per-iteration time in nanoseconds (total divided by the global N).
 // The value returned by f, if any, is discarded.
 //
 //   name  label printed at the start of the report line
 //   f     callable to be timed
 //   args  arguments forwarded (by value) to f
 template<typename Ret, typename... Args>
 void time_it(const char *name,
              std::function<Ret(Args...)> f,
              Args... args) {
         struct timespec t0, t1;

         clock_gettime(CLOCK_MONOTONIC, &t0);
         f(args...);
         clock_gettime(CLOCK_MONOTONIC, &t1);

         // Combine the nanosecond and whole-second differences into seconds.
         double frac_secs = (t1.tv_nsec - t0.tv_nsec) * 1e-9;
         double t = frac_secs + (t1.tv_sec - t0.tv_sec);

         // Seconds per iteration, rescaled to nanoseconds.
         double ns_per_iter = t / N / 1e-9;

         printf("%s: %f seconds / %d = %f ns per iter\n", name, t, N,
                ns_per_iter);
 }

 // Benchmark kernel: performs N iterations, each one advancing a running
 // argument by 1e-4 and adding the sine of that argument to x.
 // Returns the accumulated x.
 static double test(double x) {
         double arg = 0;
         for (int remaining = N; remaining > 0; --remaining) {
                 arg += 1e-4;
                 x += sin(arg);
         }
         return x;
 }

 // Entry point: times a single run of test() starting from x = 0.0.
 int main() {
         // Wrap test explicitly so that time_it can deduce Ret and Args
         // from the std::function type.
         std::function<double(double)> kernel(test);
         time_it("test", kernel, 0.0);
         return 0;
 }
 }}}

 For x87 I compiled this with,
 {{{
 $ g++ -lm hi.cpp -O2 -std=c++11 -march=core2   -ffast-math -mfpmath=387
 }}}
 Which produced (ignoring the epilogue responsible for preparing the return
 value)
 {{{#!objdump
 0000000000000b00 <_ZL4libmd>:
  b00:   f2 0f 11 44 24 f8       movsd  %xmm0,-0x8(%rsp)
  b06:   dd 44 24 f8             fldl   -0x8(%rsp)
  b0a:   b8 00 e1 f5 05          mov    $0x5f5e100,%eax
  b0f:   dd 05 53 01 00 00       fldl   0x153(%rip)        # c68
 <_ZTSPFddE+0xb>
  b15:   d9 ee                   fldz
  b17:   dd 05 53 01 00 00       fldl   0x153(%rip)        # c70
 <_ZTSPFddE+0x13>
  b1d:   dd 05 55 01 00 00       fldl   0x155(%rip)        # c78
 <_ZTSPFddE+0x1b>
  b23:   eb 0f                   jmp    b34 <_ZL4libmd+0x34>
  b25:   0f 1f 00                nopl   (%rax)
  b28:   dc c2                   fadd   %st,%st(2)
  b2a:   d9 ca                   fxch   %st(2)
  b2c:   d9 fe                   fsin
  b2e:   d9 cb                   fxch   %st(3)
  b30:   d9 cc                   fxch   %st(4)
  b32:   d9 ca                   fxch   %st(2)
  b34:   d9 c2                   fld    %st(2)
  b36:   83 e8 01                sub    $0x1,%eax
  b39:   d8 c2                   fadd   %st(2),%st
  b3b:   d9 cd                   fxch   %st(5)
  b3d:   de c4                   faddp  %st,%st(4)
  b3f:   75 e7                   jne    b28 <_ZL4libmd+0x28>
 ...
 }}}

 For SSE I compiled with,
 {{{
 $ g++ -lm hi.cpp -O2 -std=c++11 -march=core2   -ffast-math
 }}}
 Which produced,
 {{{#!objdump
 0000000000000b60 <_ZL4libmd>:
  b60:   53                      push   %rbx
  b61:   66 0f ef c9             pxor   %xmm1,%xmm1
  b65:   bb 00 e1 f5 05          mov    $0x5f5e100,%ebx
  b6a:   48 83 ec 10             sub    $0x10,%rsp
  b6e:   f2 0f 11 04 24          movsd  %xmm0,(%rsp)
  b73:   f2 0f 10 05 5d 01 00    movsd  0x15d(%rip),%xmm0        # cd8
 <_ZTSPFddE+0xb>
  b7a:   00
  b7b:   eb 1e                   jmp    b9b <_ZL4libmd+0x3b>
  b7d:   0f 1f 00                nopl   (%rax)
  b80:   f2 0f 10 05 60 01 00    movsd  0x160(%rip),%xmm0        # ce8
 <_ZTSPFddE+0x1b>
  b87:   00
  b88:   f2 0f 58 c1             addsd  %xmm1,%xmm0
  b8c:   e8 4f fd ff ff          callq  8e0 <sin@plt>
  b91:   f2 0f 10 54 24 08       movsd  0x8(%rsp),%xmm2
  b97:   66 0f 28 ca             movapd %xmm2,%xmm1
  b9b:   f2 0f 10 15 3d 01 00    movsd  0x13d(%rip),%xmm2        # ce0
 <_ZTSPFddE+0x13>
  ba2:   00
  ba3:   83 eb 01                sub    $0x1,%ebx
  ba6:   f2 0f 58 04 24          addsd  (%rsp),%xmm0
  bab:   f2 0f 58 d1             addsd  %xmm1,%xmm2
  baf:   f2 0f 11 04 24          movsd  %xmm0,(%rsp)
  bb4:   f2 0f 11 54 24 08       movsd  %xmm2,0x8(%rsp)
  bba:   75 c4                   jne    b80 <_ZL4libmd+0x20>
 }}}

 Note the call to `sin` which is sent through the PLT. Yuck.

 This gave the following results on my i7-6600U (Skylake @ 2.6GHz). I looked
 at a variety of iteration counts to ensure that the timing isn't dominated
 by setup time, and took a few measurements at each count to get a sense
 of the variance:
 ||= Method =||= iterations =||= Time per iteration (ns) =||
 || SSE      || 1e8          || 33.91 ||
 || SSE      || 1e8          || 34.08 ||
 || SSE      || 1e8          || 33.08 ||
 || SSE      || 5e7          || 34.47 ||
 || SSE      || 5e7          || 34.76 ||
 || SSE      || 5e7          || 36.36 ||
 || SSE      || 1e7          || 39.69 ||
 || SSE      || 1e7          || 36.45 ||
 || SSE      || 1e7          || 39.40 ||
 || X87      || 1e8          || 37.95 ||
 || X87      || 1e8          || 39.74 ||
 || X87      || 1e8          || 40.94 ||
 || X87      || 5e7          || 37.29 ||
 || X87      || 5e7          || 36.35 ||
 || X87      || 5e7          || 36.60 ||
 || X87      || 1e7          || 37.75 ||
 || X87      || 1e7          || 37.34 ||
 || X87      || 1e7          || 38.30 ||

 It seems to me that (in the case of this particularly terrible benchmark)
 the two implementations are relatively similar, with SSE perhaps having a
 slight edge. Intriguingly, replacing `sin` in the example with `sqrt`
 changes the outcome dramatically: both timings decrease markedly (as one
 might expect; `sin` isn't an easy thing to compute), but X87 is more than
 twice as fast as SSE (2.2 ns/iteration vs. 5.5 ns/iteration).

--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/13629#comment:11>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler