[Haskell-cafe] Space leak in hexpat-0.20.3/List-0.5.1
wren ng thornton
wren at freegeek.org
Sun Apr 28 23:14:58 CEST 2013
Hello all,
So I'm processing a large XML file which is a database of about 170k
entries, each of which is a reasonable enough size on its own, and I only
need streaming access to the database (basically printing out summary data
for each entry). Excellent, sounds like a job for SAX.
However, after whipping up a simplified version of the program using
hexpat, there's a space leak. Near as I can tell, it's not a problem with
my code, it's a problem with Data.List.Class (or hexpat's use thereof).
The simplified code follows, just compile it for profiling and use hp2ps
to see what I mean. The file I'm running it on can be found at:
ftp://ftp.monash.edu.au/pub/nihongo/JMdict.gz
Any ideas on what the problem really is, or how to fix it?
----------------------------------------------------------------
----------------------------------------------------------------
module JMdict (main) where
import Text.XML.Expat.SAX (SAXEvent(..))
import qualified Text.XML.Expat.SAX as SAX
import Text.XML.Expat.Tree (NodeG(..))
import qualified Text.XML.Expat.Tree as DOM
import qualified Text.XML.Expat.Proc as Proc
import qualified Text.XML.Expat.Internal.NodeClass as Node
import qualified Data.ByteString.Lazy as BL
import Data.Char (isSpace)
import Data.Text.IO as TIO
import qualified Data.Text as T
import Control.Applicative ((<$>))
import Control.Monad (forM_)
import qualified System.IO as IO
import qualified System.Environment as Sys (getArgs)
import qualified System.Exit as Sys (exitFailure)
import qualified System.Directory as Sys
(doesFileExist, getPermissions, readable)
----------------------------------------------------------------
-- | A variant of 'Control.Monad.unless' for when the boolean is
-- also monadic.
unlessM :: Monad m => m Bool -> m () -> m ()
unlessM mb handle = do
b <- mb
if b then return () else handle
-- | If the file does not exist or is not readable, then crash the
-- program.
assertFileExistsReadable :: FilePath -> IO ()
assertFileExistsReadable file = do
unlessM (Sys.doesFileExist file) $ do
IO.hPutStrLn IO.stderr $ "No such file: "++file
Sys.exitFailure
unlessM (Sys.readable <$> Sys.getPermissions file) $ do
IO.hPutStrLn IO.stderr $ "File not readable: "++file
Sys.exitFailure
main :: IO ()
main = do
files <- Sys.getArgs
forM_ files $ \file -> do
assertFileExistsReadable file
countElements 0
. filter (not . isWhitespace)
. dropPreamble
. SAX.parse SAX.defaultParseOptions
=<< BL.readFile file
where
dropPreamble (StartElement t _ : xs) | t == T.pack "JMdict" = xs
dropPreamble (_:xs) = dropPreamble xs
dropPreamble [] = []
isWhitespace (CharacterData c) | T.all isSpace c = True
isWhitespace _ = False
countElements :: Int -> [SAXEvent T.Text T.Text] -> IO ()
countElements n [] = print n
countElements n xs =
case anyElement xs of
(Left err, xs') -> fail $ show err ++": "++ show (take 3 xs')
(Right ell, xs') -> do
print (n+1)
(countElements $! n+1) xs'
----------------------------------------------------------------
data ElementError
= EmptyStream
| NoStartElement
| EndOfStream
| InvalidXML
deriving (Read, Show, Eq)
-- | Split an event stream into an initial element and the remainder
-- of the stream. Use 'DOM.saxToTree' to convert the element to a
-- tree.
anyElement
:: (Eq tag)
=> [SAXEvent tag text]
-> (Either ElementError [SAXEvent tag text], [SAXEvent tag text])
anyElement = start
where
start [] = (Left EmptyStream, [])
start xxs@(x:xs) =
case x of
StartElement t _ -> loop [t] (x:) xs
_ -> (Left NoStartElement, xxs)
loop _ _ [] = (Left EndOfStream, [])
loop [] k xs = (Right (k []), xs)
loop tts@(t:ts) k xxs@(x:xs) = step (k . (x:)) xs
where
step =
case x of
StartElement t' _ -> loop (t':tts)
EndElement t'
| t' == t -> loop ts
| otherwise -> \_ _ -> (Left InvalidXML, xxs)
_ -> loop tts
----------------------------------------------------------------
----------------------------------------------------------- fin.
--
Live well,
~wren
More information about the Haskell-Cafe
mailing list