[Haskell-cafe] Slow Text.JSON parser
Don Stewart
dons at galois.com
Sat Jan 17 17:50:00 EST 2009
It occurs to me you could also use attoparsec, which is specifically
optimised for bytestring processing.
sjoerd:
> Hi,
>
> Somebody told me about Parsec 3, which uses a Stream type class so it
> can parse any data type. This sounded like the right way to do
> encoding independent parsing, so I decided to see how it would work to
> parse UTF8 JSON.
>
> Sadly I could not use Text.JSON.Parsec directly, because it uses the
> old Parsec CharParser type. So I copied to code, and also replaced
> p_number with the "floating" parser from Text.Parsec.Token, because
> Text.JSON.Parsec uses readFloat (a dirty hack imho) which works only
> on String.
>
> If Text.JSON.Parsec was written for Parsec 3, the only thing to write
> to get UTF8 JSON parsing would be:
>
> instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8 string)
> m Char where
> uncons = return . U.uncons
>
> I did not do any performance measuring yet, I was glad I got it
> working. Any comments on the code is appreciated!
>
> greetings,
> Sjoerd Visscher
>
> {-# LANGUAGE FlexibleInstances, MultiParamTypeClasses,
> UndecidableInstances #-}
> import qualified Data.String.UTF8 as U
> import qualified Data.ByteString as B
>
> import Text.Parsec hiding (many, optional, (<|>))
> import Control.Applicative
>
> import Text.JSON.Types
> import Control.Monad
> import Data.Char
> import Numeric
>
> instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8 string)
> m Char where
> uncons = return . U.uncons
>
> type CharParser st = Parsec (U.UTF8 B.ByteString) st
>
> parseFile :: FilePath -> IO (Either ParseError JSValue)
> parseFile fileName = do
> bs <- B.readFile fileName
> return $ runParser json () fileName (U.fromRep bs)
>
> parseString :: String -> Either ParseError JSValue
> parseString s = runParser json () "(unknown)" (U.fromString s)
>
> json :: CharParser () JSValue
> json = spaces *> p_value
>
> tok :: CharParser () a -> CharParser () a
> tok p = p <* spaces
>
> p_value :: CharParser () JSValue
> p_value = (JSNull <$ p_null)
> <|> (JSBool <$> p_boolean)
> <|> (JSArray <$> p_array)
> <|> (JSString <$> p_js_string)
> <|> (JSObject <$> p_js_object)
> <|> (JSRational False <$> p_number)
> <?> "JSON value"
>
> p_null :: CharParser () ()
> p_null = tok (string "null") >> return ()
>
> p_boolean :: CharParser () Bool
> p_boolean = tok
> ( (True <$ string "true")
> <|> (False <$ string "false")
> )
>
> p_array :: CharParser () [JSValue]
> p_array = between (tok (char '[')) (tok (char ']'))
> $ p_value `sepBy` tok (char ',')
>
> p_string :: CharParser () String
> p_string = between (tok (char '"')) (char '"') (many p_char)
> where p_char = (char '\\' >> p_esc)
> <|> (satisfy (\x -> x /= '"' && x /= '\\'))
>
> p_esc = ('"' <$ char '"')
> <|> ('\\' <$ char '\\')
> <|> ('/' <$ char '/')
> <|> ('\b' <$ char 'b')
> <|> ('\f' <$ char 'f')
> <|> ('\n' <$ char 'n')
> <|> ('\r' <$ char 'r')
> <|> ('\t' <$ char 't')
> <|> (char 'u' *> p_uni)
> <?> "escape character"
>
> p_uni = check =<< count 4 (satisfy isHexDigit)
> where check x | code <= max_char = pure (toEnum code)
> | otherwise = empty
> where code = fst $ head $ readHex x
> max_char = fromEnum (maxBound :: Char)
>
> p_object :: CharParser () [(String,JSValue)]
> p_object = between (tok (char '{')) (tok (char '}'))
> $ p_field `sepBy` tok (char ',')
> where p_field = (,) <$> (p_string <* tok (char ':')) <*> p_value
>
> p_number :: CharParser () Rational
> p_number = tok floating where
>
> floating :: CharParser () Rational
> floating = do{ n <- decimal
> ; fract <- option 0 fraction
> ; expo <- option 1 exponent'
> ; return ((fromInteger n + fract)*expo)
> }
>
> fraction = do{ char '.'
> ; digits <- many1 digit <?> "fraction"
> ; return (foldr op 0 digits)
> }
> <?> "fraction"
> where
> op d f = (f + fromIntegral (digitToInt d))/10
>
> exponent' = do{ oneOf "eE"
> ; f <- sign
> ; e <- decimal <?> "exponent"
> ; return (power (f e))
> }
> <?> "exponent"
> where
> power e | e < 0 = 1/power(-e)
> | otherwise = fromInteger (10^e)
>
> sign = (char '-' >> return negate)
> <|> (char '+' >> return id)
> <|> return id
>
> decimal = number 10 digit
>
> number base baseDigit
> = do{ digits <- many1 baseDigit
> ; let n = foldl (\x d -> base*x + toInteger (digitToInt
> d)) 0 digits
> ; seq n (return n)
> }
>
>
> p_js_string :: CharParser () JSString
> p_js_string = toJSString <$> p_string
>
> p_js_object :: CharParser () (JSObject JSValue)
> p_js_object = toJSObject <$> p_object
>
>
> _______________________________________________
> Haskell-Cafe mailing list
> Haskell-Cafe at haskell.org
> http://www.haskell.org/mailman/listinfo/haskell-cafe
More information about the Haskell-Cafe
mailing list