[Haskell-cafe] Options to reduce memory use (Shootout Benchmark)

Tue May 17 17:18:59 CEST 2011

Hello,

over the weekend I played a little with the fasta benchmark at the 
shootout site.
I tried to understand the current submission and decided to build my own 
version.
Astonishingly, after some tuning its quite fast (<1sec on my machine, 
when the current
entry use ~11sec) but uses quite some memory (~11MB).
Basically you have to use a custom pseudo random number generator to build
strings of randomized DNA by choosing from a given probability distribution.
Is there some way to reduce memory consumption? 11MB seems quite a lot.

Thorsten

{-# LANGUAGE BangPatterns #-}
module Main where

import System.Environment

import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack, take)
import qualified Data.ByteString as S

data P = P !Char !Float
type LUT = [P]

alu =  C.pack "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
               \GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
               \CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
               \ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
               \GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
               \AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
               \AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"

iubs, homs :: LUT
iubs = cdf [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
            ,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
            ,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]

homs = cdf [('a',0.3029549426680),('c',0.1979883004921)
            ,('g',0.1975473066391),('t',0.3015094502008)]

-- compile LUT from assoc list
cdf :: [(Char,Float)] -> LUT
cdf ls = reverse $! cdf' [] 0 ls
     where cdf' acc _ [] = acc
           cdf' acc c ((v,k):ls) = cdf' ((P v c'):acc) c' ls
               where !c'  = k + c

-- extract Char from List by Key
choose :: LUT -> Float -> Char
choose lut !f = choose' lut
     where choose' ((P v k):ls)| f <= k = v
                               | otherwise = choose' ls

-- PRNG
im, ia, ic :: Int
im  = 139968
ia  = 3877
ic  = 29573

data R = R !Float !Int

imd :: Float
imd = fromIntegral im

rand :: Int -> R
rand seed = R newran newseed
     where
         newseed = (seed * ia + ic) `rem` im
         newran  = (fromIntegral newseed) / imd
-- /PRNG

-- Cache all possible results
cache :: LUT -> L.ByteString
cache ls = C.pack $! reverse $! go [v] s
     where (R !f !s) = rand 42              -- if we arrive at Seed 42, 
we completed one cycle, STOP
           !v = choose ls f
           go :: String -> Int -> String
           go acc 42 = acc
           go acc !q  = go (v':acc) s'
               where (R !f' !s') = rand q
                     !v' = choose ls f'

-- FASTA writer from current entry
fasta n s = do
     let (t:ts) = L.toChunks s
     go ts t n
   where
      go ss s n
         | n == 0     = return ()
         | l60 && n60 = S.putStrLn l >> go ss        r (n-60)
         |        n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n-60)
         | n <= ln    = S.putStrLn (S.take n s)
         | otherwise  = S.putStr s >> S.putStrLn (S.take (n-ln) (head ss))
         where
             ln   = S.length s
             l60  = ln >= 60
             n60  = n >= 60
             (l,r) = S.splitAt 60 s
             (a,b) = S.splitAt (60-ln) (head ss)

main = do n <- getArgs >>= readIO . head
           putStrLn  ">ONE Homo sapiens alu"
           fasta (n*2) $! L.cycle alu
           putStrLn ">TWO IUB ambiguity codes"
           fasta (n*3)  (L.cycle $! cache $! iubs)
           putStrLn ">THREE Homo sapiens frequency"
           fasta (n*5) (L.drop  (fromIntegral (n*3) `mod` 139968) $! 
L.cycle $! cache homs)