[GHC] #12808: For primitive (Addr#) operations, Loop Invariant Code Flow not lifted outside the loop...
GHC
ghc-devs at haskell.org
Fri Nov 11 14:36:34 UTC 2016
#12808: For primitive (Addr#) operations, Loop Invariant Code Flow not lifted
outside the loop...
-------------------------------------+-------------------------------------
Reporter: GordonBGood | Owner:
Type: bug | Status: new
Priority: normal | Milestone: 8.2.1
Component: Compiler | Version: 8.0.1
Resolution: | Keywords:
Operating System: Unknown/Multiple | Architecture:
Type of failure: Runtime | Unknown/Multiple
performance bug | Test Case:
Blocked By: | Blocking:
Related Tickets: | Differential Rev(s):
Wiki Page: |
-------------------------------------+-------------------------------------
Comment (by GordonBGood):
Replying to [comment:9 simonpj]:
> > You can see that the STG code just reflects the original Haskell
source code and that the faulty register initialization has not yet been
dropped down to within the loop(s), so the problem is not here. The
problem is in the generation of the first CMM
>
> Aha! Could you possibly make the tiniest possible example that
illustrates precisely this point. You can motivate its importance by this
thread, but in thinking about how to fix it, it's MUCH easier to grok a
small example.
I can't cut the test program down to just a few lines as I believe that
the problem is related to pointers and pointer arithmetic (the Addr#
primitive) and thus there is some setup involved in their use in a loop
that shows the problems.
However, I have boiled the test down to a very simple tail-recursive loop
with only one cull operation per loop using an Addr# and an offset that
shows the problems; this loop is inside another loop to feed a variable
prime "p" to the inner loop so it doesn't get optimized away as constants,
and this is inside the setup code to produced the pinned byte array on
which the loop works in the following code:
{{{#!hs
-- SimpleEfficiencyBug
{-# LANGUAGE FlexibleContexts, BangPatterns, MagicHash, UnboxedTuples #-}
{-# OPTIONS_GHC -O3 -rtsopts -keep-s-files -ddump-stg -ddump-cmm -ddump-
opt-cmm -ddump-to-file -dumpdir . #-} -- or -O2 -fllvm -v -dcore-lint
-ddump-asm
import Data.Bits
import Data.Array.ST (runSTUArray)
import Data.Array.Base
import GHC.ST ( ST(..) )
import GHC.Exts
cull :: () -> [Int]
cull() = [i | (i, True) <- assocs arr ] where
arr = runSTUArray $ do
let bfBts = 1 `shiftL` 17 -- 16 Kilobytes worth of bits
bf <- newArray (0, bfBts - 1) False :: ST s (STUArray s Int Bool)
cullb bf
cullb (STUArray l u n marr#) = ST $ \s0# -> -- following is just setup
for the loop...
case getSizeofMutableByteArray# marr# s0# of { (# s1#, n# #) ->
case newPinnedByteArray# n# s1# of { (# s2#, marr'# #) ->
case copyMutableByteArray# marr# 0# marr'# 0# n# s2# of { s3# ->
case unsafeFreezeByteArray# marr'# s3# of { (# s4#, arr# #) -> --
must do this
case byteArrayContents# arr# of { adr# -> -- to obtain the
addr# of pinned marr' here
let cullp !p@(I# p#) sp# = -- for several prime values
if p > 5 then case copyMutableByteArray# marr'# 0# marr# 0# n#
sp# of
so# -> (# so#, STUArray l u n marr# #) else
let !r1@(I# r1#) = ((p .&. 7) + p) `shiftR` 3 in -- register
offset value
let !(I# szlmt#) = n `div` 8 - r1 in
let lmt# = plusAddr# adr# szlmt# in
let doit c# s# = -- all the work is done here; herein lies the
bugs...
case c# `ltAddr#` lmt# of
0# -> s#
_ ->
case readWord8OffAddr# c# r1# s# of { (# s0#, v0# #)
->
case writeWord8OffAddr# c# r1# (v0# `or#` (int2Word#
1#)) s0# of { s1# ->
doit (plusAddr# c# p#) s1# }} in
case doit adr# sp# of sd# -> cullp (p + 2) sd# in cullp 1 s4#
}}}}}
main = print $ length $ cull()
}}}
When compiled with the "-fllvm" compiler flag, the above code produces the
following STG code for the inner loop (located by searching for the first
"doit1_"):
{{{
let {
doit1_s7pL [Occ=LoopBreaker]
:: GHC.Prim.Addr#
-> GHC.Prim.State#
GHC.Prim.RealWorld
-> GHC.Prim.State#
GHC.Prim.RealWorld
[LclId,
Arity=2,
Str=DmdType <S,U><S,U>,
Unf=OtherCon []] =
sat-only \r srt:SRT:[] [c#_s7pM
s#_s7pN]
case
ltAddr# [c#_s7pM
lmt#1_s7pJ]
of
_ [Occ=Dead]
{ __DEFAULT ->
case
readWord8OffAddr# [c#_s7pM
r1#_s7pG
s#_s7pN]
of
_ [Occ=Dead]
{ (#,#) ipv6_s7pQ [Occ=Once]
ipv7_s7pR [Occ=Once] ->
case
or# [ipv7_s7pR
1##]
of
sat_s7pS
{ __DEFAULT ->
case
writeWord8OffAddr# [c#_s7pM
r1#_s7pG
sat_s7pS
ipv6_s7pQ]
of
s1#1_s7pT [OS=OneShot]
{ __DEFAULT ->
case
plusAddr# [c#_s7pM
ww_s7pC]
of
sat_s7pU
{ __DEFAULT ->
doit1_s7pL
sat_s7pU
s1#1_s7pT;
};
};
};
};
0# ->
s#_s7pN;
};
} in
}}}
Which first produces the following CMM code (found by search for
"doit1_"):
{{{
doit1_s7pL_entry() // [R2, R1]
{ info_tbl: [(c7Kw,
label: doit1_s7pL_info
rep:HeapRep 3 nonptrs { Fun {arity: 2 fun_type:
ArgSpec 4} })]
stack_info: arg_space: 8 updfr_space: Just 8
}
{offset
c7Kw:
_s7pM::I64 = R2;
_s7pL::P64 = R1;
goto c7Kp;
c7Kp:
if ((old + 0) - <highSp> < SpLim) goto c7Kx; else goto c7Ky;
c7Kx:
R2 = _s7pM::I64;
R1 = _s7pL::P64;
call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8;
c7Ky:
goto c7Ko;
c7Ko:
_s7pC::I64 = I64[_s7pL::P64 + 6]; // registers initialized
inside loop here
_s7pG::I64 = I64[_s7pL::P64 + 14];
_s7pJ::I64 = I64[_s7pL::P64 + 22]; // to here
_c7Kr::I64 = _s7pM::I64 < _s7pJ::I64;
_s7pO::I64 = _c7Kr::I64;
switch [-9223372036854775808 .. 9223372036854775807] _s7pO::I64
{
case 0 : goto c7Kv;
default: goto c7Ku;
}
c7Kv:
goto c7KG;
c7KG:
call (P64[(old + 8)])() args: 8, res: 0, upd: 8;
c7Ku:
goto c7KB;
c7KB:
_s7pR::I64 = %MO_UU_Conv_W8_W64(I8[_s7pM::I64 + (_s7pG::I64 <<
0)]);
_s7pR::I64 = _s7pR::I64;
_c7KJ::I64 = _s7pR::I64 | 1;
_s7pS::I64 = _c7KJ::I64;
I8[_s7pM::I64 + (_s7pG::I64 << 0)] =
%MO_UU_Conv_W64_W8(_s7pS::I64);
_c7KO::I64 = _s7pM::I64 + _s7pC::I64;
_s7pU::I64 = _c7KO::I64;
_s7pM::I64 = _s7pU::I64;
goto c7Ko;
}
},
}}}
then after many optimization passes produces the following optimized CMM
code:
{{{
==================== Optimised Cmm ====================
2016-11-11 13:14:21.2389114 UTC
doit1_s7pL_entry() // [R1, R2]
{ [(c7Kw,
doit1_s7pL_info:
const 8589934596;
const 12884901888;
const 9;)]
}
{offset
c7Kw:
_s7pM::I64 = R2;
_s7pL::P64 = R1;
goto c7Ko;
c7Ko:
switch [-9223372036854775808 .. 9223372036854775807] (_s7pM::I64
< I64[_s7pL::P64 + 22]) {
case 0 : goto c7Kv;
default: goto c7Ku;
}
c7Kv:
call (P64[Sp])() args: 8, res: 0, upd: 8;
c7Ku:
_s7pC::I64 = I64[_s7pL::P64 + 6]; // registers initialized
inside loop here
_s7pG::I64 = I64[_s7pL::P64 + 14]; // and here
I8[_s7pM::I64 + _s7pG::I64] =
%MO_UU_Conv_W64_W8(%MO_UU_Conv_W8_W64(I8[_s7pM::I64 + _s7pG::I64]) | 1);
_s7pM::I64 = _s7pM::I64 + _s7pC::I64;
goto c7Ko;
}
}
}}}
and finally the following assembly code:
{{{
s7pL_info$def:
# BB#0: # %c7Kw
cmpq %r14, 22(%rbx)
jbe .LBB18_2
.align 16, 0x90
.LBB18_1: # %c7Ku
# =>This Inner Loop Header:
Depth=1
movq 14(%rbx), %rax # registers initialized inside loop here
movq 6(%rbx), %rcx # and here
addq %r14, %rcx
orb $1, (%rax,%r14)
cmpq 22(%rbx), %rcx # and an additional unnecessary memory
load here by LLVM?
movq %rcx, %r14 # extra unnecessary instruction if code
reformulated
jb .LBB18_1
.LBB18_2: # %c7Kv
movq (%rbp), %rax
rex64 jmpq *%rax # TAILCALL
}}}
I find no problems with the STG code, but the problems persist through all
of the other codes including the initial CMM code. I have commented on
where the problems are in the above codes. I would like to see the
optimized CMM code look like the following:
{{{
doit1_s7pL_entry() // [R1, R2]
{ [(c7Kw,
doit1_s7pL_info:
const 8589934596;
const 12884901888;
const 9;)]
}
{offset
c7Kw:
_s7pM::I64 = R2;
_s7pL::P64 = R1;
_s7pC::I64 = I64[_s7pL::P64 + 6]; // registers initialized
outside loop here
_s7pG::I64 = I64[_s7pL::P64 + 14]; // and here
goto c7Ko;
c7Ko:
switch [-9223372036854775808 .. 9223372036854775807] (_s7pM::I64
< I64[_s7pL::P64 + 22]) {
case 0 : goto c7Kv;
default: goto c7Ku;
}
c7Kv:
call (P64[Sp])() args: 8, res: 0, upd: 8;
c7Ku:
I8[_s7pM::I64 + _s7pG::I64] =
%MO_UU_Conv_W64_W8(%MO_UU_Conv_W8_W64(I8[_s7pM::I64 + _s7pG::I64]) | 1);
_s7pM::I64 = _s7pM::I64 + _s7pC::I64;
goto c7Ko;
}
}
}}}
which should produce the following desired assembly code:
{{{
s7pL_info$def:
# BB#0: # %c7Kw
movq 14(%rbx), %rax # registers initialized outside loop here
movq 6(%rbx), %rcx # and here
movq 22(%rbx), %rbx # and here
cmpq %r14, %rbx
jbe .LBB18_2
.align 16, 0x90
.LBB18_1: # %c7Ku
# =>This Inner Loop Header:
Depth=1
orb $1, (%rax,%r14)
addq %rcx, %r14
cmpq %rbx, %rcx # use a register for comparison
jb .LBB18_1
.LBB18_2: # %c7Kv
movq (%rbp), %rax
rex64 jmpq *%rax # TAILCALL
}}}
The above assembly code is about as good as it gets in any language, and
GHC should be able to produce this, at least with the LLVM backend.
--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/12808#comment:10>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler
More information about the ghc-tickets
mailing list