[GHC] #7367: float-out causes extra allocation

GHC ghc-devs at haskell.org
Fri Jul 31 17:30:31 UTC 2015


#7367: float-out causes extra allocation
-------------------------------------+-------------------------------------
        Reporter:  wurmli            |                   Owner:
            Type:  bug               |                  Status:  new
        Priority:  normal            |               Milestone:  7.12.1
       Component:  Compiler          |                 Version:  7.6.1
      Resolution:                    |                Keywords:
Operating System:  Linux             |            Architecture:  x86_64
 Type of failure:  Runtime           |  (amd64)
  performance bug                    |               Test Case:
      Blocked By:                    |                Blocking:
 Related Tickets:                    |  Differential Revisions:
-------------------------------------+-------------------------------------
Description changed by bgamari:

Old description:

> The Haskell fannkuchredux contribution of Louis Wasserman to "The
> Computer Language Benchmarks Game" at shootout.alioth.debian.org times
> out on the amd64 machines, but not on the i386. I can reproduce it on my
> Debian amd64 machine.
>
> It turns out that compiling without optimisation or with a simple -O
> produces a fast program, but with enourmously large heap space allocated
> (10G compared with 67k on a virtual i386 machine) and also more garbage
> collector activity.
>
> The source is below (because I don't find a way to attach the file). At
> the end  of the source I copied my make command line, run command line
> and output produced with option -sstderr.
>
> ---------------------
>

> {{{
> {-  The Computer Language Benchmarks Game
>     http://shootout.alioth.debian.org/
>     contributed by Louis Wasserman
>
>     This should be compiled with:
>         -threaded -O2 -fexcess-precision -fasm
>     and run with:
>         +RTS -N<number of cores> -RTS <input>
> -}
>
> import Control.Concurrent
> import Control.Monad
> import System.Environment
> import Foreign hiding (rotate)
> import Data.Monoid
>
> type Perm = Ptr Word8
>
> data F = F {-# UNPACK #-} !Int {-# UNPACK #-} !Int
>
> instance Monoid F where
>         mempty = F 0 0
>         F s1 m1 `mappend` F s2 m2 = F (s1 + s2) (max m1 m2)
>
> incPtr = (`advancePtr` 1)
> decPtr = (`advancePtr` (-1))
>
> flop :: Int -> Perm -> IO ()
> flop k xs = flopp xs (xs `advancePtr` k)
>  where flopp i j = when (i < j) $ swap i j >> flopp (incPtr i) (decPtr j)
>        swap i j = do
>         a <- peek i
>         b <- peek j
>         poke j a
>         poke i b
>
> flopS :: Perm -> (Int -> IO a) -> IO a
> flopS !xs f = do
>         let go !acc = do
>                 k <- peekElemOff xs 0
>                 if k == 0 then f acc else flop (fromIntegral k) xs >> go
> (acc+1)
>         go 0
>
> increment :: Ptr Word8 -> Ptr Word8 -> IO ()
> increment !p !ct = do
>         first <- peekElemOff p 1
>         pokeElemOff p 1 =<< peekElemOff p 0
>         pokeElemOff p 0 first
>
>         let go !i !first = do
>                 ci <- peekElemOff ct i
>                 if fromIntegral ci < i then pokeElemOff ct i (ci+1) else
> do
>                         pokeElemOff ct i 0
>                         let !i' = i + 1
>                         moveArray p (incPtr p) i'
>                         pokeElemOff p i' first
>                         go i' =<< peekElemOff p 0
>         go 1 first
>
> genPermutations :: Int -> Int -> Int -> Ptr Word8 -> Ptr Word8 -> IO F
> genPermutations !n !l !r !perm !count = allocaArray n $ \ destF -> do
>         let upd j !f run = do
>                 p0 <- peekElemOff perm 0
>                 if p0 == 0 then increment perm count >> run f else do
>                         copyArray destF perm n
>                         increment perm count
>                         flopS destF $ \ flops ->
>                                 run (f `mappend` F (checksum j flops)
> flops)
>         let go j !f = if j >= r then return f else upd j f (go (j+1))
>         go l mempty
>  where checksum i f = if i .&. 1 == 0 then f else -f
>
> facts :: [Int]
> facts = scanl (*) 1 [1..12]
>
> unrank :: Int -> Int -> (Ptr Word8 -> Ptr Word8 -> IO a) -> IO a
> unrank !idx !n f = allocaArray n $ \ p -> allocaArray n $ \ count ->
>   allocaArray n $ \ pp -> do
>         mapM_ (\ i -> pokeElemOff p i (fromIntegral i)) [0..n-1]
>         let go i !idx = when (i >= 0) $ do
>                 let fi = facts !! i
>                 let (q, r) = idx `quotRem` fi
>                 pokeElemOff count i (fromIntegral q)
>                 copyArray pp p (i+1)
>                 let go' j = when (j <= i) $ do
>                         let jq = j + q
>                         pokeElemOff p j =<< peekElemOff pp (if jq <= i
> then jq else jq - i - 1)
>                         go' (j+1)
>                 go' 0
>                 go (i-1) r
>         go (n-1) idx
>         f p count
>
> main = do
>    n <- fmap (read.head) getArgs
>    let fact = product [1..n]
>    let bk = fact `quot` 4
>    vars <- forM [0,bk..fact-1] $ \ ix -> do
>         var <- newEmptyMVar
>         forkIO (unrank ix n $ \ p -> genPermutations n ix (min fact (ix +
> bk)) p >=> putMVar var)
>         return var
>    F chksm mflops <- liftM mconcat (mapM takeMVar vars)
>    putStrLn $ (show chksm) ++ "\nPfannkuchen(" ++ (show n) ++ ") = " ++
> (show $ mflops)
>

> {-
>
> wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ghc  --make
>  -XBangPatterns -O -threaded -fllvm -rtsopts fannkuchredux.ghc-3.hs
> [1 of 1] Compiling Main             ( fannkuchredux.ghc-3.hs,
> fannkuchredux.ghc-3.o )
> Linking fannkuchredux.ghc-3 ...
> wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ./fannkuchredux.ghc-3 +RTS
> -N4 -sstderr   -RTS 12
> 3968050
> Pfannkuchen(12) = 65
>   10,538,122,952 bytes allocated in the heap
>          359,512 bytes copied during GC
>           47,184 bytes maximum residency (2 sample(s))
>           51,120 bytes maximum slop
>                4 MB total memory in use (1 MB lost due to fragmentation)
>
>                                     Tot time (elapsed)  Avg pause  Max
> pause
>   Gen  0      6053 colls,  6053 par    0.16s    0.04s     0.0000s
> 0.0001s
>   Gen  1         2 colls,     1 par    0.00s    0.00s     0.0001s
> 0.0001s
>
>   Parallel GC work balance: 40.82% (serial 0%, perfect 100%)
>
>   TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4)
>
>   SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
>
>   INIT    time    0.00s  (  0.00s elapsed)
>   MUT     time   44.73s  ( 11.51s elapsed)
>   GC      time    0.16s  (  0.04s elapsed)
>   EXIT    time    0.00s  (  0.00s elapsed)
>   Total   time   44.89s  ( 11.55s elapsed)
>
>   Alloc rate    235,589,887 bytes per MUT second
>
>   Productivity  99.6% of total user, 387.3% of total elapsed
>
> gc_alloc_block_sync: 31024
> whitehole_spin: 0
> gen[0].sync: 0
> gen[1].sync: 0
>
> -}
> }}}

New description:

 The Haskell fannkuchredux (included in nofib as fannkuch-redux)
 contribution of Louis Wasserman to "The Computer Language Benchmarks Game"
 at shootout.alioth.debian.org times out on the amd64 machines, but not on
 the i386. I can reproduce it on my Debian amd64 machine.

 It turns out that compiling without optimisation or with a simple -O
 produces a fast program, but with enormously large heap space allocated
 (10G compared with 67k on a virtual i386 machine) and also more garbage
 collector activity.

 The source is below (because I don't find a way to attach the file). At
 the end  of the source I copied my make command line, run command line and
 output produced with option -sstderr.

 ---------------------


 {{{
 {-  The Computer Language Benchmarks Game
     http://shootout.alioth.debian.org/
     contributed by Louis Wasserman

     This should be compiled with:
         -threaded -O2 -fexcess-precision -fasm
     and run with:
         +RTS -N<number of cores> -RTS <input>
 -}

 import Control.Concurrent
 import Control.Monad
 import System.Environment
 import Foreign hiding (rotate)
 import Data.Monoid

 type Perm = Ptr Word8

 data F = F {-# UNPACK #-} !Int {-# UNPACK #-} !Int

 instance Monoid F where
         mempty = F 0 0
         F s1 m1 `mappend` F s2 m2 = F (s1 + s2) (max m1 m2)

 incPtr = (`advancePtr` 1)
 decPtr = (`advancePtr` (-1))

 flop :: Int -> Perm -> IO ()
 flop k xs = flopp xs (xs `advancePtr` k)
  where flopp i j = when (i < j) $ swap i j >> flopp (incPtr i) (decPtr j)
        swap i j = do
         a <- peek i
         b <- peek j
         poke j a
         poke i b

 flopS :: Perm -> (Int -> IO a) -> IO a
 flopS !xs f = do
         let go !acc = do
                 k <- peekElemOff xs 0
                 if k == 0 then f acc else flop (fromIntegral k) xs >> go
 (acc+1)
         go 0

 increment :: Ptr Word8 -> Ptr Word8 -> IO ()
 increment !p !ct = do
         first <- peekElemOff p 1
         pokeElemOff p 1 =<< peekElemOff p 0
         pokeElemOff p 0 first

         let go !i !first = do
                 ci <- peekElemOff ct i
                 if fromIntegral ci < i then pokeElemOff ct i (ci+1) else
 do
                         pokeElemOff ct i 0
                         let !i' = i + 1
                         moveArray p (incPtr p) i'
                         pokeElemOff p i' first
                         go i' =<< peekElemOff p 0
         go 1 first

 genPermutations :: Int -> Int -> Int -> Ptr Word8 -> Ptr Word8 -> IO F
 genPermutations !n !l !r !perm !count = allocaArray n $ \ destF -> do
         let upd j !f run = do
                 p0 <- peekElemOff perm 0
                 if p0 == 0 then increment perm count >> run f else do
                         copyArray destF perm n
                         increment perm count
                         flopS destF $ \ flops ->
                                 run (f `mappend` F (checksum j flops)
 flops)
         let go j !f = if j >= r then return f else upd j f (go (j+1))
         go l mempty
  where checksum i f = if i .&. 1 == 0 then f else -f

 facts :: [Int]
 facts = scanl (*) 1 [1..12]

 unrank :: Int -> Int -> (Ptr Word8 -> Ptr Word8 -> IO a) -> IO a
 unrank !idx !n f = allocaArray n $ \ p -> allocaArray n $ \ count ->
   allocaArray n $ \ pp -> do
         mapM_ (\ i -> pokeElemOff p i (fromIntegral i)) [0..n-1]
         let go i !idx = when (i >= 0) $ do
                 let fi = facts !! i
                 let (q, r) = idx `quotRem` fi
                 pokeElemOff count i (fromIntegral q)
                 copyArray pp p (i+1)
                 let go' j = when (j <= i) $ do
                         let jq = j + q
                         pokeElemOff p j =<< peekElemOff pp (if jq <= i
 then jq else jq - i - 1)
                         go' (j+1)
                 go' 0
                 go (i-1) r
         go (n-1) idx
         f p count

 main = do
    n <- fmap (read.head) getArgs
    let fact = product [1..n]
    let bk = fact `quot` 4
    vars <- forM [0,bk..fact-1] $ \ ix -> do
         var <- newEmptyMVar
         forkIO (unrank ix n $ \ p -> genPermutations n ix (min fact (ix +
 bk)) p >=> putMVar var)
         return var
    F chksm mflops <- liftM mconcat (mapM takeMVar vars)
    putStrLn $ (show chksm) ++ "\nPfannkuchen(" ++ (show n) ++ ") = " ++
 (show $ mflops)


 {-

 wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ghc  --make
  -XBangPatterns -O -threaded -fllvm -rtsopts fannkuchredux.ghc-3.hs
 [1 of 1] Compiling Main             ( fannkuchredux.ghc-3.hs,
 fannkuchredux.ghc-3.o )
 Linking fannkuchredux.ghc-3 ...
 wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ./fannkuchredux.ghc-3 +RTS
 -N4 -sstderr   -RTS 12
 3968050
 Pfannkuchen(12) = 65
   10,538,122,952 bytes allocated in the heap
          359,512 bytes copied during GC
           47,184 bytes maximum residency (2 sample(s))
           51,120 bytes maximum slop
                4 MB total memory in use (1 MB lost due to fragmentation)

                                     Tot time (elapsed)  Avg pause  Max
 pause
   Gen  0      6053 colls,  6053 par    0.16s    0.04s     0.0000s
 0.0001s
   Gen  1         2 colls,     1 par    0.00s    0.00s     0.0001s
 0.0001s

   Parallel GC work balance: 40.82% (serial 0%, perfect 100%)

   TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4)

   SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)

   INIT    time    0.00s  (  0.00s elapsed)
   MUT     time   44.73s  ( 11.51s elapsed)
   GC      time    0.16s  (  0.04s elapsed)
   EXIT    time    0.00s  (  0.00s elapsed)
   Total   time   44.89s  ( 11.55s elapsed)

   Alloc rate    235,589,887 bytes per MUT second

   Productivity  99.6% of total user, 387.3% of total elapsed

 gc_alloc_block_sync: 31024
 whitehole_spin: 0
 gen[0].sync: 0
 gen[1].sync: 0

 -}
 }}}

--

--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/7367#comment:21>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler


More information about the ghc-tickets mailing list