[Haskell-cafe] Options to reduce memory use (Shootout Benchmark)
Thorsten Hater
th at tp1.rub.de
Tue May 17 17:18:59 CEST 2011
Hello,
over the weekend I played a little with the fasta benchmark at the
shootout site.
I tried to understand the current submission and decided to build my own
version.
Astonishingly, after some tuning its quite fast (<1sec on my machine,
when the current
entry use ~11sec) but uses quite some memory (~11MB).
Basically you have to use a custom pseudo random number generator to build
strings of randomized DNA by choosing from a given probability distribution.
Is there some way to reduce memory consumption? 11MB seems quite a lot.
Thorsten
{-# LANGUAGE BangPatterns #-}
module Main where
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack, take)
import qualified Data.ByteString as S
data P = P !Char !Float
type LUT = [P]
alu = C.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
\GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
\CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
\ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
\GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
\AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
\AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
iubs, homs :: LUT
iubs = cdf [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]
homs = cdf [('a',0.3029549426680),('c',0.1979883004921)
,('g',0.1975473066391),('t',0.3015094502008)]
-- compile LUT from assoc list
cdf :: [(Char,Float)] -> LUT
cdf ls = reverse $! cdf' [] 0 ls
where cdf' acc _ [] = acc
cdf' acc c ((v,k):ls) = cdf' ((P v c'):acc) c' ls
where !c' = k + c
-- extract Char from List by Key
choose :: LUT -> Float -> Char
choose lut !f = choose' lut
where choose' ((P v k):ls)| f <= k = v
| otherwise = choose' ls
-- PRNG
im, ia, ic :: Int
im = 139968
ia = 3877
ic = 29573
data R = R !Float !Int
imd :: Float
imd = fromIntegral im
rand :: Int -> R
rand seed = R newran newseed
where
newseed = (seed * ia + ic) `rem` im
newran = (fromIntegral newseed) / imd
-- /PRNG
-- Cache all possible results
cache :: LUT -> L.ByteString
cache ls = C.pack $! reverse $! go [v] s
where (R !f !s) = rand 42 -- if we arrive at Seed 42,
we completed one cycle, STOP
!v = choose ls f
go :: String -> Int -> String
go acc 42 = acc
go acc !q = go (v':acc) s'
where (R !f' !s') = rand q
!v' = choose ls f'
-- FASTA writer from current entry
fasta n s = do
let (t:ts) = L.toChunks s
go ts t n
where
go ss s n
| n == 0 = return ()
| l60 && n60 = S.putStrLn l >> go ss r (n-60)
| n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n-60)
| n <= ln = S.putStrLn (S.take n s)
| otherwise = S.putStr s >> S.putStrLn (S.take (n-ln) (head ss))
where
ln = S.length s
l60 = ln >= 60
n60 = n >= 60
(l,r) = S.splitAt 60 s
(a,b) = S.splitAt (60-ln) (head ss)
main = do n <- getArgs >>= readIO . head
putStrLn ">ONE Homo sapiens alu"
fasta (n*2) $! L.cycle alu
putStrLn ">TWO IUB ambiguity codes"
fasta (n*3) (L.cycle $! cache $! iubs)
putStrLn ">THREE Homo sapiens frequency"
fasta (n*5) (L.drop (fromIntegral (n*3) `mod` 139968) $!
L.cycle $! cache homs)
More information about the Haskell-Cafe
mailing list