[GHC] #13094: Poor register allocation and redundant moves when using `foreign import prim`
GHC
ghc-devs at haskell.org
Tue Jan 10 04:59:20 UTC 2017
#13094: Poor register allocation and redundant moves when using `foreign import
prim`
---------------------------------+----------------------------------------
Reporter: jberryman | Owner:
Type: bug | Status: new
Priority: normal | Milestone:
Component: Compiler | Version: 8.0.1
Resolution: | Keywords:
Operating System: Linux | Architecture: Unknown/Multiple
Type of failure: None/Unknown | Test Case:
Blocked By: | Blocking:
Related Tickets: #12232 | Differential Rev(s):
Wiki Page: |
---------------------------------+----------------------------------------
Description changed by jberryman:
@@ -18,1 +18,1 @@
- {{{
+ {{{#!gas
New description:
This may well be the same as #12232 but I didn't want to risk hijacking
that one.
I've been looking at the dumped asm from a library where I use `foreign
import prim` to call a simple assembly routine (though that may be
irrelevant). I see many `movq` instructions, and when I traced them by
hand, many or most appeared to be superfluous saving and restoring of
registers.
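For context, the call goes through a {{{foreign import prim}}} declaration.
A minimal sketch of what such a declaration looks like (the Haskell-side
name and type here are illustrative, not necessarily the library's exact
signature):
{{{#!hs
{-# LANGUAGE GHCForeignImportPrim, MagicHash, UnboxedTuples, UnliftedFFITypes #-}

import GHC.Exts (Word#)

-- Illustrative only: the real declaration in hashabler may differ.
-- Arguments and results of a `foreign import prim` must be unlifted
-- types, and they are passed in GHC's STG registers rather than via
-- the C calling convention.
foreign import prim "sipRound_s_x2"
  sipRound_s_x2# :: Word# -> Word# -> Word# -> Word#
                 -> (# Word#, Word#, Word#, Word# #)
}}}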
Here is a contiguous and mostly self-contained snippet of the assembly,
with notes. There are four values I'm interested in, V0, V1, V2, and V3,
which I started tracing at the note "registers good for sipRound_s_x2 at
this point".
Below I specifically follow V3 through the assembly, marking lines with
{{{"*** V3 ***"}}}, and the moves don't seem sensible to my (untrained)
eye:
{{{#!gas
_c7Ns:
movq %r9,%rbx
movq %rax,%r9
movq %r8,%rax ; *** V3 ***
movq %rbx,%r8
movq %rdi,%rbx
movq %rax,%rdi ; *** V3 (back to rdi!) ***
movq %rsi,%rax
movq %rbx,%rsi
movq %r14,%rbx
movq %rax,%r14
movq %rcx,16(%rbp)
addq $16,%rbp
jmp *8(%rbp)
.align 8
.quad 836
.quad 32
block_info:
_c7Nk:
movq 16(%rbp),%rax
movq $8,16(%rbp)
movq 24(%rbp),%rcx
incq %rcx
movq %rcx,24(%rbp)
movq 32(%rbp),%rdx
incq %rdx
movq %rdx,32(%rbp)
xorq 8(%rbp),%rbx
addq $16,%rbp
movl $8,%r8d
xorl %r9d,%r9d
_n7T1:
movq %rax,64(%rsp)
movq %r8,%rax
; --- registers good for sipRound_s_x2 at this point ------
movq %rdi,%r8 ; moving V3 *OUT* of rdi *** V3 ***
movq %rsi,%rdi ; moving V2 *OUT* of rdi
movq %r14,%rsi ; moving V1 *OUT* of r14
movq %rbx,%r14 ; moving V0 *OUT* of rbx
movq 64(%rsp),%rbx
_c7My: ; (NOTE: there are a few jumps to here from other sections not included here)
cmpq 24(%rbx),%rdx
je _c7Ns
_c7Nr:
movq 16(%rbx),%r10
movzbl (%r10,%rdx,1),%r10d
cmpq $1,%rax
jne _c7N9
_c7Nl:
movq $block_info,-16(%rbp)
shlq $8,%r9
orq %r10,%r9
;----- at this point (name: register): V0: r14, V1: rsi, V2: rdi, V3: r8
movq %rdi,%rax ; save rdi, trying to be rsi
movq %r8,%rdi ; prepared rdi(V3) *** V3 ***
xorq %r9,%rdi
movq %rsi,%rcx
movq %rax,%rsi ; prepared rsi(V2)
movq %r14,%rax
movq %rcx,%r14 ; prepared r14(V1)
movq %rbx,%rcx
movq %rax,%rbx ; prepared rbx(V0)
movq %r9,-8(%rbp)
movq %rcx,(%rbp)
addq $-16,%rbp
jmp sipRound_s_x2
.align 8
.quad 1733
.quad 32
block_info:
_c7N2:
movq 24(%rbp),%rax
movq $7,24(%rbp)
movq 32(%rbp),%rcx
incq %rcx
movq %rcx,32(%rbp)
movq 40(%rbp),%rdx
incq %rdx
movq %rdx,40(%rbp)
movq 16(%rbp),%r8
xorq 8(%rbp),%rbx
addq $24,%rbp
movl $7,%r9d
_n7T2:
movq %rax,64(%rsp)
movq %r9,%rax
movq %r8,%r9
movq %rdi,%r8
movq %rsi,%rdi
movq %r14,%rsi
movq %rbx,%r14
movq 64(%rsp),%rbx
jmp _c7My
_c7N9:
cmpq $1,%rax
jbe _c7N3
_c7MW:
decq %rax
movq %rax,(%rbp)
incq %rcx
movq %rcx,8(%rbp)
incq %rdx
movq %rdx,16(%rbp)
shlq $8,%r9
orq %r10,%r9
jmp _c7My
_c7N3:
movq $block_info,-24(%rbp)
movq %rdi,%rax
movq %r8,%rdi ; *** V3 (back to rdi!) ***
xorq %r9,%rdi
movq %rsi,%rcx
movq %rax,%rsi
movq %r14,%rax
movq %rcx,%r14
movq %rbx,%rcx
movq %rax,%rbx
movq %r9,-16(%rbp)
movq %r10,-8(%rbp)
movq %rcx,(%rbp)
addq $-24,%rbp
jmp sipRound_s_x2
.size $whashRemainingBytes_info, .-$whashRemainingBytes_info
}}}
This is my first time looking closely at assembly, so maybe this is normal
or no big deal performance-wise (I haven't gotten around to correlating
the number of moves with the performance of my variations yet), or I'm
missing something obvious. I wasn't able to make sense of an objdump'd
version of the LLVM-compiled program. The code for the version where the
same {{{sipRound}}} stuff is implemented in normal Haskell also has a lot
of moves (even more, in fact), but they are interspersed throughout and
harder to sort through, e.g. a random snippet:
{{{
xorq %rdi,%r8
movq %r14,%rdi
shrq $32,%rdi
shlq $32,%r14
orq %rdi,%r14
addq %r8,%r14
movq %r14,%rdi
addq %rsi,%rdi
movq %rsi,%r10
shrq $51,%r10
shlq $13,%rsi
orq %r10,%rsi
xorq %rdi,%rsi
movq %r8,%r10
shrq $43,%r10
shlq $21,%r8
orq %r10,%r8
xorq %r14,%r8
movq %rax,%r10
shrq $32,%r10
shlq $32,%rax
orq %r10,%rax
addq %r8,%rax
movq %rax,%r10
}}}
Intuitively this also looks bad; after all, the Haskell we're compiling
maps almost trivially onto the assembly we'd like/expect (although perhaps
{{{rotl}}} is shaking things up here, since we have to implement it with
two shifts and an OR):
{{{#!hs
-- in the Identity monad
do v0 <- return $ v0 + v1
v1 <- return $ rotl v1 13
v1 <- return $ v1 `xor` v0
v0 <- return $ rotl v0 32
v2 <- return $ v2 + v3
v3 <- return $ rotl v3 16
v3 <- return $ v3 `xor` v2
v0 <- return $ v0 + v3
v3 <- return $ rotl v3 21
v3 <- return $ v3 `xor` v0
}}}
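For reference, the {{{rotl}}} above is the usual shift/shift/or rotation;
a minimal sketch, assuming 64-bit words and a rotate count in 1..63 (the
library's actual definition may differ):
{{{#!hs
import Data.Bits (shiftL, shiftR, (.|.))
import Data.Word (Word64)

-- A plain rotate-left built from two shifts and an OR, which is the
-- shrq/shlq/orq pattern visible in the dumped assembly above.
rotl :: Word64 -> Int -> Word64
rotl x n = (x `shiftL` n) .|. (x `shiftR` (64 - n))
}}}
Each such rotate accounts for one of the shrq/shlq/orq triplets in the
second snippet; the extra {{{movq}}}s around them are what looks
suspicious.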
You can check out the branch here, which I'll keep at 838b27a:
https://github.com/jberryman/hashabler/tree/saved-branch-exhibiting-seemingly-bad-register-allocation
You can build and observe the assembly with:
{{{
$ cabal configure -fdev --enable-benchmarks && cabal build core
$ gvim ./dist/build/core/core-tmp/core.dump-asm
}}}
--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/13094#comment:1>