[Haskell-cafe] maybe IO doesn't suck, but my code does...

Frédéric Gobry frederic.gobry at epfl.ch
Fri Dec 3 05:33:34 EST 2004


Hello,

I'm a haskell beginner, and I'm struggling with the following problem:
I've started writing a simple apache log file analyzer, but I cannot
get rid of important memory usage problems (in fact, at each attempt, I
fear I won't be able to unlock my box as my linux 2.6.9 kernel is on its
knees, which reminds me on my early days writing C on MMU-less
processors... not because of the language of course :-))

Enclosed is a sample code, which aborts on a large (> 100000 lines)
file.

I tried different variations (readFile $ lines, openFile, openFile +
IORef,...) but with no success... 

So, if the enclosed version is not too far, please give me a hint.
Alternatively, if I took the wrong direction, please refocus my search
.-)

Thanks in advance,

Frédéric

-- 
Frédéric Gobry    Infoscience
                  DIT-KIS / EPFL
                  Tel: +41216932288
http://people.epfl.ch/frederic.gobry

-------------- next part --------------
module Hit (parseHit, Hit, ip, time, query, status, epoch, CalendarTime) where

import Text.Regex
import System.Time


import Data.FiniteMap


type IP = String

data HTTPStatus = Ok | Error deriving Show

data Hit = Hit { ip     :: IP,
		 time   :: CalendarTime,
		 query  :: String,
		 status :: HTTPStatus
	       }

instance Show Hit where
    show hit = "Hit " ++ (show $ ip hit) ++ " " ++ (calendarTimeToString $ time hit) ++ " " ++ (show $ query hit)


-- Handling of log timestamps

_months = listToFM [ ("Jan", January),
		     ("Feb", February),
		     ("Mar", March),
		     ("Apr", April),
		     ("May", May),
		     ("Jun", June),
		     ("Jul", July),
		     ("Aug", August),
		     ("Sep", September),
		     ("Oct", October),
		     ("Nov", November),
		     ("Dec", December) ]

_mkMonth month = case lookupFM _months month of
		 Nothing -> error ("invalid month " ++ month)
		 Just m  -> m

_mkTZ (sign:tz)
    = let (q, r) = divMod (read tz :: Int) 100
	  s      = if (sign == '-') then 1 else -1
      in s * (60 * q + r) * 60

_timeRe = mkRegex "([0-9]+)/([a-zA-Z]+)/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.[0-9]+)"

_timeSplit t = ( read year :: Int,
		 _mkMonth month,
		 read day :: Int,
		 read h :: Int,
		 read m :: Int,
		 read s :: Int,
		 _mkTZ tz
	       )
    where [day, month, year, h, m, s, tz] 
	      = case matchRegex _timeRe t of
		Nothing    -> error ("invalid time: " ++ t)
		Just parts -> parts


-- Parse a log timestamp into a calendar time
timeParse :: String -> CalendarTime

timeParse t 
    = let (year, month, day, hour, min, sec, tz) = _timeSplit t
	  in toUTCTime $ toClockTime (CalendarTime { ctYear  = year,
						     ctMonth = month,
						     ctDay   = day,
						     ctHour  = hour,
						     ctMin   = min,
						     ctSec   = sec,
						     ctTZ    = tz,
						     ctPicosec = 0,
						     ctWDay = undefined, 
						     ctYDay = undefined, 
						     ctTZName = undefined, 
						     ctIsDST = False
						   })

-- Convenience "minimal value" for the time stamps
epoch = toUTCTime $ toClockTime (CalendarTime { ctYear  = 1970,
						ctMonth = January,
						ctDay   = 1,
						ctHour  = 0,
						ctMin   = 0,
						ctSec   = 0,
						ctTZ    = 0,
						ctPicosec = 0,
						ctWDay = undefined, 
						ctYDay = undefined, 
						ctTZName = undefined, 
						ctIsDST = False
					      })




_short = "^([0-9.]+) - - \\[([^]]+)\\] \"([^\"]+)\" ([0-9]+) ([0-9]+|-)"

_lineRe  = mkRegex (_short ++ " \"([^\"]+)\" \"([^\"]+)\"$")
_shortRe = mkRegex (_short ++ "$")

_mkStatus status 
    = let v = read status :: Int
      in if (v == 200) then Ok else Error
      

parseHit line = case (matchRegex _lineRe line) of
		Nothing     -> case (matchRegex _shortRe line) of 
			       Nothing -> error ("invalid log: " ++ line)
			       Just [client, stamp, query, status, size] 
				   -> Hit client (timeParse stamp) query (_mkStatus status)
		Just [client, stamp, query, status, size, referrer, agent] 
		    -> Hit client (timeParse stamp) query (_mkStatus status)
-------------- next part --------------
import IO
import System
import ParseLog
import Control.Monad.State

main = do
       args  <- getArgs
       fh    <- openFile (args !! 0) ReadMode
       parseLog fh
-------------- next part --------------
module ParseLog (parseLog) where

import IO
import Data.IORef

import Hit


data Stat = Stat { latest :: CalendarTime, total :: Int }

_doParse line stat = let hit = parseHit line
			 in Stat (max (time hit) (latest stat)) (total stat + 1)


_parseLine fh state = do { line <- hGetLine fh ; 
			   value <- readIORef state ; 
			   writeIORef state (_doParse line value) ; 
			 }

_parseLines fh state = do { end <- hIsEOF fh ;
			    if end then 
			    return ()
			    else _parseLine fh state >> _parseLines fh state
			  }

parseLog fh
    = do initial <- newIORef (Stat epoch 0)
	 _parseLines fh initial
	 final   <- readIORef initial
	 putStrLn $ "fini " ++ (show $ total final)

--	 putStrLn ("Total: " ++ (show $ total final) ++ " ending at " ++ (show $ latest final))




More information about the Haskell-Cafe mailing list