[GHC] #7367: float-out causes extra allocation
GHC
ghc-devs at haskell.org
Fri Jul 31 17:30:31 UTC 2015
#7367: float-out causes extra allocation
-------------------------------------+-------------------------------------
Reporter: wurmli | Owner:
Type: bug | Status: new
Priority: normal | Milestone: 7.12.1
Component: Compiler | Version: 7.6.1
Resolution: | Keywords:
Operating System: Linux | Architecture: x86_64
Type of failure: Runtime | (amd64)
performance bug | Test Case:
Blocked By: | Blocking:
Related Tickets: | Differential Revisions:
-------------------------------------+-------------------------------------
Description changed by bgamari:
Old description:
> The Haskell fannkuchredux contribution of Louis Wasserman to "The
> Computer Language Benchmarks Game" at shootout.alioth.debian.org times
> out on the amd64 machines, but not on the i386. I can reproduce it on my
> Debian amd64 machine.
>
> It turns out that compiling without optimisation or with a simple -O
> produces a fast program, but with enourmously large heap space allocated
> (10G compared with 67k on a virtual i386 machine) and also more garbage
> collector activity.
>
> The source is below (because I don't find a way to attach the file). At
> the end of the source I copied my make command line, run command line
> and output produced with option -sstderr.
>
> ---------------------
>
> {{{
> {- The Computer Language Benchmarks Game
> http://shootout.alioth.debian.org/
> contributed by Louis Wasserman
>
> This should be compiled with:
> -threaded -O2 -fexcess-precision -fasm
> and run with:
> +RTS -N<number of cores> -RTS <input>
> -}
>
> import Control.Concurrent
> import Control.Monad
> import System.Environment
> import Foreign hiding (rotate)
> import Data.Monoid
>
> type Perm = Ptr Word8
>
> data F = F {-# UNPACK #-} !Int {-# UNPACK #-} !Int
>
> instance Monoid F where
> mempty = F 0 0
> F s1 m1 `mappend` F s2 m2 = F (s1 + s2) (max m1 m2)
>
> incPtr = (`advancePtr` 1)
> decPtr = (`advancePtr` (-1))
>
> flop :: Int -> Perm -> IO ()
> flop k xs = flopp xs (xs `advancePtr` k)
> where flopp i j = when (i < j) $ swap i j >> flopp (incPtr i) (decPtr j)
> swap i j = do
> a <- peek i
> b <- peek j
> poke j a
> poke i b
>
> flopS :: Perm -> (Int -> IO a) -> IO a
> flopS !xs f = do
> let go !acc = do
> k <- peekElemOff xs 0
> if k == 0 then f acc else flop (fromIntegral k) xs >> go
> (acc+1)
> go 0
>
> increment :: Ptr Word8 -> Ptr Word8 -> IO ()
> increment !p !ct = do
> first <- peekElemOff p 1
> pokeElemOff p 1 =<< peekElemOff p 0
> pokeElemOff p 0 first
>
> let go !i !first = do
> ci <- peekElemOff ct i
> if fromIntegral ci < i then pokeElemOff ct i (ci+1) else
> do
> pokeElemOff ct i 0
> let !i' = i + 1
> moveArray p (incPtr p) i'
> pokeElemOff p i' first
> go i' =<< peekElemOff p 0
> go 1 first
>
> genPermutations :: Int -> Int -> Int -> Ptr Word8 -> Ptr Word8 -> IO F
> genPermutations !n !l !r !perm !count = allocaArray n $ \ destF -> do
> let upd j !f run = do
> p0 <- peekElemOff perm 0
> if p0 == 0 then increment perm count >> run f else do
> copyArray destF perm n
> increment perm count
> flopS destF $ \ flops ->
> run (f `mappend` F (checksum j flops)
> flops)
> let go j !f = if j >= r then return f else upd j f (go (j+1))
> go l mempty
> where checksum i f = if i .&. 1 == 0 then f else -f
>
> facts :: [Int]
> facts = scanl (*) 1 [1..12]
>
> unrank :: Int -> Int -> (Ptr Word8 -> Ptr Word8 -> IO a) -> IO a
> unrank !idx !n f = allocaArray n $ \ p -> allocaArray n $ \ count ->
> allocaArray n $ \ pp -> do
> mapM_ (\ i -> pokeElemOff p i (fromIntegral i)) [0..n-1]
> let go i !idx = when (i >= 0) $ do
> let fi = facts !! i
> let (q, r) = idx `quotRem` fi
> pokeElemOff count i (fromIntegral q)
> copyArray pp p (i+1)
> let go' j = when (j <= i) $ do
> let jq = j + q
> pokeElemOff p j =<< peekElemOff pp (if jq <= i
> then jq else jq - i - 1)
> go' (j+1)
> go' 0
> go (i-1) r
> go (n-1) idx
> f p count
>
> main = do
> n <- fmap (read.head) getArgs
> let fact = product [1..n]
> let bk = fact `quot` 4
> vars <- forM [0,bk..fact-1] $ \ ix -> do
> var <- newEmptyMVar
> forkIO (unrank ix n $ \ p -> genPermutations n ix (min fact (ix +
> bk)) p >=> putMVar var)
> return var
> F chksm mflops <- liftM mconcat (mapM takeMVar vars)
> putStrLn $ (show chksm) ++ "\nPfannkuchen(" ++ (show n) ++ ") = " ++
> (show $ mflops)
>
> {-
>
> wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ghc --make
> -XBangPatterns -O -threaded -fllvm -rtsopts fannkuchredux.ghc-3.hs
> [1 of 1] Compiling Main ( fannkuchredux.ghc-3.hs,
> fannkuchredux.ghc-3.o )
> Linking fannkuchredux.ghc-3 ...
> wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ./fannkuchredux.ghc-3 +RTS
> -N4 -sstderr -RTS 12
> 3968050
> Pfannkuchen(12) = 65
> 10,538,122,952 bytes allocated in the heap
> 359,512 bytes copied during GC
> 47,184 bytes maximum residency (2 sample(s))
> 51,120 bytes maximum slop
> 4 MB total memory in use (1 MB lost due to fragmentation)
>
> Tot time (elapsed) Avg pause Max
> pause
> Gen 0 6053 colls, 6053 par 0.16s 0.04s 0.0000s
> 0.0001s
> Gen 1 2 colls, 1 par 0.00s 0.00s 0.0001s
> 0.0001s
>
> Parallel GC work balance: 40.82% (serial 0%, perfect 100%)
>
> TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4)
>
> SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
>
> INIT time 0.00s ( 0.00s elapsed)
> MUT time 44.73s ( 11.51s elapsed)
> GC time 0.16s ( 0.04s elapsed)
> EXIT time 0.00s ( 0.00s elapsed)
> Total time 44.89s ( 11.55s elapsed)
>
> Alloc rate 235,589,887 bytes per MUT second
>
> Productivity 99.6% of total user, 387.3% of total elapsed
>
> gc_alloc_block_sync: 31024
> whitehole_spin: 0
> gen[0].sync: 0
> gen[1].sync: 0
>
> -}
> }}}
New description:
The Haskell fannkuchredux (included in nofib as fannkuch-redux)
contribution of Louis Wasserman to "The Computer Language Benchmarks Game"
at shootout.alioth.debian.org times out on the amd64 machines, but not on
the i386. I can reproduce it on my Debian amd64 machine.
It turns out that compiling without optimisation or with a simple -O
produces a fast program, but with enormously large heap space allocated
(10G compared with 67k on a virtual i386 machine) and also more garbage
collector activity.
The source is below (because I don't find a way to attach the file). At
the end of the source I copied my make command line, run command line and
output produced with option -sstderr.
---------------------
{{{
{- The Computer Language Benchmarks Game
http://shootout.alioth.debian.org/
contributed by Louis Wasserman
This should be compiled with:
-threaded -O2 -fexcess-precision -fasm
and run with:
+RTS -N<number of cores> -RTS <input>
-}
import Control.Concurrent
import Control.Monad
import System.Environment
import Foreign hiding (rotate)
import Data.Monoid
type Perm = Ptr Word8
data F = F {-# UNPACK #-} !Int {-# UNPACK #-} !Int
instance Monoid F where
mempty = F 0 0
F s1 m1 `mappend` F s2 m2 = F (s1 + s2) (max m1 m2)
incPtr = (`advancePtr` 1)
decPtr = (`advancePtr` (-1))
flop :: Int -> Perm -> IO ()
flop k xs = flopp xs (xs `advancePtr` k)
where flopp i j = when (i < j) $ swap i j >> flopp (incPtr i) (decPtr j)
swap i j = do
a <- peek i
b <- peek j
poke j a
poke i b
flopS :: Perm -> (Int -> IO a) -> IO a
flopS !xs f = do
let go !acc = do
k <- peekElemOff xs 0
if k == 0 then f acc else flop (fromIntegral k) xs >> go
(acc+1)
go 0
increment :: Ptr Word8 -> Ptr Word8 -> IO ()
increment !p !ct = do
first <- peekElemOff p 1
pokeElemOff p 1 =<< peekElemOff p 0
pokeElemOff p 0 first
let go !i !first = do
ci <- peekElemOff ct i
if fromIntegral ci < i then pokeElemOff ct i (ci+1) else
do
pokeElemOff ct i 0
let !i' = i + 1
moveArray p (incPtr p) i'
pokeElemOff p i' first
go i' =<< peekElemOff p 0
go 1 first
genPermutations :: Int -> Int -> Int -> Ptr Word8 -> Ptr Word8 -> IO F
genPermutations !n !l !r !perm !count = allocaArray n $ \ destF -> do
let upd j !f run = do
p0 <- peekElemOff perm 0
if p0 == 0 then increment perm count >> run f else do
copyArray destF perm n
increment perm count
flopS destF $ \ flops ->
run (f `mappend` F (checksum j flops)
flops)
let go j !f = if j >= r then return f else upd j f (go (j+1))
go l mempty
where checksum i f = if i .&. 1 == 0 then f else -f
facts :: [Int]
facts = scanl (*) 1 [1..12]
unrank :: Int -> Int -> (Ptr Word8 -> Ptr Word8 -> IO a) -> IO a
unrank !idx !n f = allocaArray n $ \ p -> allocaArray n $ \ count ->
allocaArray n $ \ pp -> do
mapM_ (\ i -> pokeElemOff p i (fromIntegral i)) [0..n-1]
let go i !idx = when (i >= 0) $ do
let fi = facts !! i
let (q, r) = idx `quotRem` fi
pokeElemOff count i (fromIntegral q)
copyArray pp p (i+1)
let go' j = when (j <= i) $ do
let jq = j + q
pokeElemOff p j =<< peekElemOff pp (if jq <= i
then jq else jq - i - 1)
go' (j+1)
go' 0
go (i-1) r
go (n-1) idx
f p count
main = do
n <- fmap (read.head) getArgs
let fact = product [1..n]
let bk = fact `quot` 4
vars <- forM [0,bk..fact-1] $ \ ix -> do
var <- newEmptyMVar
forkIO (unrank ix n $ \ p -> genPermutations n ix (min fact (ix +
bk)) p >=> putMVar var)
return var
F chksm mflops <- liftM mconcat (mapM takeMVar vars)
putStrLn $ (show chksm) ++ "\nPfannkuchen(" ++ (show n) ++ ") = " ++
(show $ mflops)
{-
wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ghc --make
-XBangPatterns -O -threaded -fllvm -rtsopts fannkuchredux.ghc-3.hs
[1 of 1] Compiling Main ( fannkuchredux.ghc-3.hs,
fannkuchredux.ghc-3.o )
Linking fannkuchredux.ghc-3 ...
wurmli at noah-nofen:~/hpw/haskell/work/fannkuch$ ./fannkuchredux.ghc-3 +RTS
-N4 -sstderr -RTS 12
3968050
Pfannkuchen(12) = 65
10,538,122,952 bytes allocated in the heap
359,512 bytes copied during GC
47,184 bytes maximum residency (2 sample(s))
51,120 bytes maximum slop
4 MB total memory in use (1 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max
pause
Gen 0 6053 colls, 6053 par 0.16s 0.04s 0.0000s
0.0001s
Gen 1 2 colls, 1 par 0.00s 0.00s 0.0001s
0.0001s
Parallel GC work balance: 40.82% (serial 0%, perfect 100%)
TASKS: 6 (1 bound, 5 peak workers (5 total), using -N4)
SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled)
INIT time 0.00s ( 0.00s elapsed)
MUT time 44.73s ( 11.51s elapsed)
GC time 0.16s ( 0.04s elapsed)
EXIT time 0.00s ( 0.00s elapsed)
Total time 44.89s ( 11.55s elapsed)
Alloc rate 235,589,887 bytes per MUT second
Productivity 99.6% of total user, 387.3% of total elapsed
gc_alloc_block_sync: 31024
whitehole_spin: 0
gen[0].sync: 0
gen[1].sync: 0
-}
}}}
--
--
Ticket URL: <http://ghc.haskell.org/trac/ghc/ticket/7367#comment:21>
GHC <http://www.haskell.org/ghc/>
The Glasgow Haskell Compiler
More information about the ghc-tickets
mailing list