[GHC] #12808: For primitive (Addr#) operations, Loop Invariant Code Flow not lifted outside the loop...

Fri Nov 11 14:36:34 UTC 2016

#12808: For primitive (Addr#) operations, Loop Invariant Code Flow not lifted
outside the loop...
-------------------------------------+-------------------------------------
        Reporter:  GordonBGood       |                Owner:
            Type:  bug               |               Status:  new
        Priority:  normal            |            Milestone:  8.2.1
       Component:  Compiler          |              Version:  8.0.1
      Resolution:                    |             Keywords:
Operating System:  Unknown/Multiple  |         Architecture:
 Type of failure:  Runtime           |  Unknown/Multiple
  performance bug                    |            Test Case:
      Blocked By:                    |             Blocking:
 Related Tickets:                    |  Differential Rev(s):
       Wiki Page:                    |
-------------------------------------+-------------------------------------

Comment (by GordonBGood):

 Replying to [comment:9 simonpj]:
 > > You can see that the STG code just reflects the original Haskell
 source code and that the faulty register initialization has not yet been
 dropped down to within the loop(s), so the problem is not here. The
 problem is in the generation of the first CMM
 >
 > Aha!  Could you possibly make the tiniest possible example that
 illustrates precisely this point. You can motivate its importance by this
 thread, but in thinking about how to fix it, it's MUCH easier to grok a
 small example.

 I can't cut the test program down to just a few lines as I believe that
 the problem is related to pointers and pointer arithmetic (the Addr#
 primitive) and thus there is some setup involved in their use in a loop
 that shows the problems.

 However, I have boiled the test down to a very simple tail-recursive loop
 with only one cull operation per loop using an Addr# and an offset that
 shows the problems; this loop is inside another loop to feed a variable
 prime "p" to the inner loop so it doesn't get optimized away as constants,
 and this is inside the setup code to produced the pinned byte array on
 which the loop works in the following code:
 {{{#!hs
 -- SimpleEfficiencyBug

 {-# LANGUAGE FlexibleContexts, BangPatterns, MagicHash, UnboxedTuples #-}
 {-# OPTIONS_GHC -O3 -rtsopts -keep-s-files -ddump-stg -ddump-cmm -ddump-
 opt-cmm -ddump-to-file -dumpdir . #-} -- or -O2 -fllvm -v -dcore-lint
 -ddump-asm

 import Data.Bits
 import Data.Array.ST (runSTUArray)
 import Data.Array.Base
 import GHC.ST ( ST(..) )
 import GHC.Exts

 cull :: () -> [Int]
 cull() = [i | (i, True) <- assocs arr ] where
   arr = runSTUArray $ do
     let bfBts = 1 `shiftL` 17 -- 16 Kilobytes worth of bits
     bf <- newArray (0, bfBts - 1) False :: ST s (STUArray s Int Bool)
     cullb bf
   cullb (STUArray l u n marr#) = ST $ \s0# -> -- following is just setup
 for the loop...
     case getSizeofMutableByteArray# marr# s0# of { (# s1#, n# #) ->
     case newPinnedByteArray# n# s1#         of { (# s2#, marr'# #) ->
     case copyMutableByteArray# marr# 0# marr'# 0# n# s2# of { s3# ->
     case unsafeFreezeByteArray# marr'# s3#  of { (# s4#, arr# #) -> --
 must do this
     case byteArrayContents# arr#            of { adr# -> -- to obtain the
 addr# of pinned marr' here
     let cullp !p@(I# p#) sp# = -- for several prime values
           if p > 5 then case copyMutableByteArray# marr'# 0# marr# 0# n#
 sp# of
                           so# -> (# so#, STUArray l u n marr# #) else
           let !r1@(I# r1#) = ((p .&. 7) + p) `shiftR` 3 in -- register
 offset value
           let !(I# szlmt#) = n `div` 8 - r1 in
           let lmt# = plusAddr# adr# szlmt# in
           let doit c# s# = -- all the work is done here; herein lies the
 bugs...
                 case c# `ltAddr#` lmt# of
                   0# -> s#
                   _ ->
                     case readWord8OffAddr# c# r1# s# of { (# s0#, v0# #)
 ->
                     case writeWord8OffAddr# c# r1# (v0# `or#` (int2Word#
 1#)) s0# of { s1# ->
                     doit (plusAddr# c# p#) s1# }} in
           case doit adr# sp# of sd# -> cullp (p + 2) sd# in cullp 1 s4#
 }}}}}

 main = print $ length $ cull()
 }}}
 When compiled with the "-fllvm" compiler flag, the above code produces the
 following STG code for the inner loop (located by searching for the first
 "doit1_"):
 {{{
 let {
   doit1_s7pL [Occ=LoopBreaker]
     :: GHC.Prim.Addr#
        -> GHC.Prim.State#
             GHC.Prim.RealWorld
        -> GHC.Prim.State#
             GHC.Prim.RealWorld
   [LclId,
    Arity=2,
    Str=DmdType <S,U><S,U>,
    Unf=OtherCon []] =
       sat-only \r srt:SRT:[] [c#_s7pM
                               s#_s7pN]
           case
               ltAddr# [c#_s7pM
                        lmt#1_s7pJ]
           of
           _ [Occ=Dead]
           { __DEFAULT ->
                 case
                     readWord8OffAddr# [c#_s7pM
                                        r1#_s7pG
                                        s#_s7pN]
                 of
                 _ [Occ=Dead]
                 { (#,#) ipv6_s7pQ [Occ=Once]
                         ipv7_s7pR [Occ=Once] ->
                       case
                           or# [ipv7_s7pR
                                1##]
                       of
                       sat_s7pS
                       { __DEFAULT ->
                             case
                                 writeWord8OffAddr# [c#_s7pM
                                                     r1#_s7pG
                                                     sat_s7pS
                                                     ipv6_s7pQ]
                             of
                             s1#1_s7pT [OS=OneShot]
                             { __DEFAULT ->
                                   case
                                       plusAddr# [c#_s7pM
                                                  ww_s7pC]
                                   of
                                   sat_s7pU
                                   { __DEFAULT ->
                                         doit1_s7pL
                                             sat_s7pU
                                             s1#1_s7pT;
                                   };
                             };
                       };
                 };
             0# ->
                 s#_s7pN;
           };
 } in
 }}}
 Which first produces the following CMM code (found by search for
 "doit1_"):
 {{{
  doit1_s7pL_entry() //  [R2, R1]
          { info_tbl: [(c7Kw,
                        label: doit1_s7pL_info
                        rep:HeapRep 3 nonptrs { Fun {arity: 2 fun_type:
 ArgSpec 4} })]
            stack_info: arg_space: 8 updfr_space: Just 8
          }
      {offset
        c7Kw:
            _s7pM::I64 = R2;
            _s7pL::P64 = R1;
            goto c7Kp;
        c7Kp:
            if ((old + 0) - <highSp> < SpLim) goto c7Kx; else goto c7Ky;
        c7Kx:
            R2 = _s7pM::I64;
            R1 = _s7pL::P64;
            call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8;
        c7Ky:
            goto c7Ko;
        c7Ko:
            _s7pC::I64 = I64[_s7pL::P64 + 6]; // registers initialized
 inside loop here
            _s7pG::I64 = I64[_s7pL::P64 + 14];
            _s7pJ::I64 = I64[_s7pL::P64 + 22]; // to here
            _c7Kr::I64 = _s7pM::I64 < _s7pJ::I64;
            _s7pO::I64 = _c7Kr::I64;
            switch [-9223372036854775808 .. 9223372036854775807] _s7pO::I64
 {
                case 0 : goto c7Kv;
                default: goto c7Ku;
            }
        c7Kv:
            goto c7KG;
        c7KG:
            call (P64[(old + 8)])() args: 8, res: 0, upd: 8;
        c7Ku:
            goto c7KB;
        c7KB:
            _s7pR::I64 = %MO_UU_Conv_W8_W64(I8[_s7pM::I64 + (_s7pG::I64 <<
 0)]);
            _s7pR::I64 = _s7pR::I64;
            _c7KJ::I64 = _s7pR::I64 | 1;
            _s7pS::I64 = _c7KJ::I64;
            I8[_s7pM::I64 + (_s7pG::I64 << 0)] =
 %MO_UU_Conv_W64_W8(_s7pS::I64);
            _c7KO::I64 = _s7pM::I64 + _s7pC::I64;
            _s7pU::I64 = _c7KO::I64;
            _s7pM::I64 = _s7pU::I64;
            goto c7Ko;
      }
  },
 }}}
 then after many optimization passes produces the following optimized CMM
 code:
 {{{
 ==================== Optimised Cmm ====================
 2016-11-11 13:14:21.2389114 UTC

 doit1_s7pL_entry() //  [R1, R2]
         { [(c7Kw,
             doit1_s7pL_info:
                 const 8589934596;
                 const 12884901888;
                 const 9;)]
         }
     {offset
       c7Kw:
           _s7pM::I64 = R2;
           _s7pL::P64 = R1;
           goto c7Ko;
       c7Ko:
           switch [-9223372036854775808 .. 9223372036854775807] (_s7pM::I64
 < I64[_s7pL::P64 + 22]) {
               case 0 : goto c7Kv;
               default: goto c7Ku;
           }
       c7Kv:
           call (P64[Sp])() args: 8, res: 0, upd: 8;
       c7Ku:
           _s7pC::I64 = I64[_s7pL::P64 + 6]; // registers initialized
 inside loop here
           _s7pG::I64 = I64[_s7pL::P64 + 14]; // and here
           I8[_s7pM::I64 + _s7pG::I64] =
 %MO_UU_Conv_W64_W8(%MO_UU_Conv_W8_W64(I8[_s7pM::I64 + _s7pG::I64]) | 1);
           _s7pM::I64 = _s7pM::I64 + _s7pC::I64;
           goto c7Ko;
     }
 }
 }}}
 and finally the following assembly code:
 {{{
 s7pL_info$def:
 # BB#0:                                 # %c7Kw
         cmpq    %r14, 22(%rbx)
         jbe     .LBB18_2
         .align  16, 0x90
 .LBB18_1:                               # %c7Ku
                                         # =>This Inner Loop Header:
 Depth=1
         movq    14(%rbx), %rax    # registers initialized inside loop here
         movq    6(%rbx), %rcx     # and here
         addq    %r14, %rcx
         orb     $1, (%rax,%r14)
         cmpq    22(%rbx), %rcx    # and an additional unnecessary memory
 load here by LLVM?
         movq    %rcx, %r14        # extra unnecessary instruction if code
 reformulated
         jb      .LBB18_1
 .LBB18_2:                               # %c7Kv
         movq    (%rbp), %rax
         rex64 jmpq      *%rax           # TAILCALL
 }}}
 I find no problems with the STG code, but the problems persist through all
 of the other codes including the initial CMM code.  I have commented on
 where the problems are in the above codes.  I would like to see the
 optimized CMM code look like the following:
 {{{
 doit1_s7pL_entry() //  [R1, R2]
         { [(c7Kw,
             doit1_s7pL_info:
                 const 8589934596;
                 const 12884901888;
                 const 9;)]
         }
     {offset
       c7Kw:
           _s7pM::I64 = R2;
           _s7pL::P64 = R1;
           _s7pC::I64 = I64[_s7pL::P64 + 6]; // registers initialized
 outside loop here
           _s7pG::I64 = I64[_s7pL::P64 + 14]; // and here
           goto c7Ko;
       c7Ko:
           switch [-9223372036854775808 .. 9223372036854775807] (_s7pM::I64
 < I64[_s7pL::P64 + 22]) {
               case 0 : goto c7Kv;
               default: goto c7Ku;
           }
       c7Kv:
           call (P64[Sp])() args: 8, res: 0, upd: 8;
       c7Ku:
           I8[_s7pM::I64 + _s7pG::I64] =
 %MO_UU_Conv_W64_W8(%MO_UU_Conv_W8_W64(I8[_s7pM::I64 + _s7pG::I64]) | 1);
           _s7pM::I64 = _s7pM::I64 + _s7pC::I64;
           goto c7Ko;
     }
 }
 }}}
 which should produce the following desired assembly code:
 {{{
 s7pL_info$def:
 # BB#0:                                 # %c7Kw
         movq    14(%rbx), %rax  # registers initialized outside loop here
         movq    6(%rbx), %rcx   # and here
         movq    22(%rbx), %rbx  # and here
         cmpq    %r14, %rbx
         jbe     .LBB18_2
         .align  16, 0x90
 .LBB18_1:                               # %c7Ku
                                         # =>This Inner Loop Header:
 Depth=1
         orb     $1, (%rax,%r14)
         addq    %rcx, %r14
         cmpq    %rbx, %rcx      # use a register for comparison
         jb      .LBB18_1
 .LBB18_2:                               # %c7Kv
         movq    (%rbp), %rax
         rex64 jmpq      *%rax           # TAILCALL
 }}}
 The above assembly code is about as good as it gets in any language, and
 GHC should be able to produce this, at least with the LLVM backend.

--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/12808#comment:10>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler