[Haskell-cafe] maybe IO doesn't suck, but my code does...
Frédéric Gobry
frederic.gobry at epfl.ch
Fri Dec 3 05:33:34 EST 2004
Hello,
I'm a haskell beginner, and I'm struggling with the following problem:
I've started writing a simple apache log file analyzer, but I cannot
get rid of important memory usage problems (in fact, at each attempt, I
fear I won't be able to unlock my box as my linux 2.6.9 kernel is on its
knees, which reminds me on my early days writing C on MMU-less
processors... not because of the language of course :-))
Enclosed is a sample code, which aborts on a large (> 100000 lines)
file.
I tried different variations (readFile $ lines, openFile, openFile +
IORef,...) but with no success...
So, if the enclosed version is not too far, please give me a hint.
Alternatively, if I took the wrong direction, please refocus my search
.-)
Thanks in advance,
Frédéric
--
Frédéric Gobry Infoscience
DIT-KIS / EPFL
Tel: +41216932288
http://people.epfl.ch/frederic.gobry
-------------- next part --------------
module Hit (parseHit, Hit, ip, time, query, status, epoch, CalendarTime) where
import Text.Regex
import System.Time
import Data.FiniteMap
type IP = String
data HTTPStatus = Ok | Error deriving Show
data Hit = Hit { ip :: IP,
time :: CalendarTime,
query :: String,
status :: HTTPStatus
}
instance Show Hit where
show hit = "Hit " ++ (show $ ip hit) ++ " " ++ (calendarTimeToString $ time hit) ++ " " ++ (show $ query hit)
-- Handling of log timestamps
_months = listToFM [ ("Jan", January),
("Feb", February),
("Mar", March),
("Apr", April),
("May", May),
("Jun", June),
("Jul", July),
("Aug", August),
("Sep", September),
("Oct", October),
("Nov", November),
("Dec", December) ]
_mkMonth month = case lookupFM _months month of
Nothing -> error ("invalid month " ++ month)
Just m -> m
_mkTZ (sign:tz)
= let (q, r) = divMod (read tz :: Int) 100
s = if (sign == '-') then 1 else -1
in s * (60 * q + r) * 60
_timeRe = mkRegex "([0-9]+)/([a-zA-Z]+)/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.[0-9]+)"
_timeSplit t = ( read year :: Int,
_mkMonth month,
read day :: Int,
read h :: Int,
read m :: Int,
read s :: Int,
_mkTZ tz
)
where [day, month, year, h, m, s, tz]
= case matchRegex _timeRe t of
Nothing -> error ("invalid time: " ++ t)
Just parts -> parts
-- Parse a log timestamp into a calendar time
timeParse :: String -> CalendarTime
timeParse t
= let (year, month, day, hour, min, sec, tz) = _timeSplit t
in toUTCTime $ toClockTime (CalendarTime { ctYear = year,
ctMonth = month,
ctDay = day,
ctHour = hour,
ctMin = min,
ctSec = sec,
ctTZ = tz,
ctPicosec = 0,
ctWDay = undefined,
ctYDay = undefined,
ctTZName = undefined,
ctIsDST = False
})
-- Convenience "minimal value" for the time stamps
epoch = toUTCTime $ toClockTime (CalendarTime { ctYear = 1970,
ctMonth = January,
ctDay = 1,
ctHour = 0,
ctMin = 0,
ctSec = 0,
ctTZ = 0,
ctPicosec = 0,
ctWDay = undefined,
ctYDay = undefined,
ctTZName = undefined,
ctIsDST = False
})
_short = "^([0-9.]+) - - \\[([^]]+)\\] \"([^\"]+)\" ([0-9]+) ([0-9]+|-)"
_lineRe = mkRegex (_short ++ " \"([^\"]+)\" \"([^\"]+)\"$")
_shortRe = mkRegex (_short ++ "$")
_mkStatus status
= let v = read status :: Int
in if (v == 200) then Ok else Error
parseHit line = case (matchRegex _lineRe line) of
Nothing -> case (matchRegex _shortRe line) of
Nothing -> error ("invalid log: " ++ line)
Just [client, stamp, query, status, size]
-> Hit client (timeParse stamp) query (_mkStatus status)
Just [client, stamp, query, status, size, referrer, agent]
-> Hit client (timeParse stamp) query (_mkStatus status)
-------------- next part --------------
import IO
import System
import ParseLog
import Control.Monad.State
main = do
args <- getArgs
fh <- openFile (args !! 0) ReadMode
parseLog fh
-------------- next part --------------
module ParseLog (parseLog) where
import IO
import Data.IORef
import Hit
data Stat = Stat { latest :: CalendarTime, total :: Int }
_doParse line stat = let hit = parseHit line
in Stat (max (time hit) (latest stat)) (total stat + 1)
_parseLine fh state = do { line <- hGetLine fh ;
value <- readIORef state ;
writeIORef state (_doParse line value) ;
}
_parseLines fh state = do { end <- hIsEOF fh ;
if end then
return ()
else _parseLine fh state >> _parseLines fh state
}
parseLog fh
= do initial <- newIORef (Stat epoch 0)
_parseLines fh initial
final <- readIORef initial
putStrLn $ "fini " ++ (show $ total final)
-- putStrLn ("Total: " ++ (show $ total final) ++ " ending at " ++ (show $ latest final))
More information about the Haskell-Cafe
mailing list