[Haskell-cafe] Slow Text.JSON parser
Sjoerd Visscher
sjoerd at w3future.com
Sat Jan 17 18:35:14 EST 2009
Attoparsec does not have something like the Stream class, so I do not
see how I could do UTF8 parsing easily.
On Jan 17, 2009, at 11:50 PM, Don Stewart wrote:
> It occurs to me you could also use attoparsec, which is specifically
> optimised for bytestring processing.
>
> sjoerd:
>> Hi,
>>
>> Somebody told me about Parsec 3, which uses a Stream type class so it
>> can parse any data type. This sounded like the right way to do
>> encoding independent parsing, so I decided to see how it would work
>> to
>> parse UTF8 JSON.
>>
>> Sadly I could not use Text.JSON.Parsec directly, because it uses the
>> old Parsec CharParser type. So I copied to code, and also replaced
>> p_number with the "floating" parser from Text.Parsec.Token, because
>> Text.JSON.Parsec uses readFloat (a dirty hack imho) which works only
>> on String.
>>
>> If Text.JSON.Parsec was written for Parsec 3, the only thing to write
>> to get UTF8 JSON parsing would be:
>>
>> instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8
>> string)
>> m Char where
>> uncons = return . U.uncons
>>
>> I did not do any performance measuring yet, I was glad I got it
>> working. Any comments on the code is appreciated!
>>
>> greetings,
>> Sjoerd Visscher
>>
>> {-# LANGUAGE FlexibleInstances, MultiParamTypeClasses,
>> UndecidableInstances #-}
>> import qualified Data.String.UTF8 as U
>> import qualified Data.ByteString as B
>>
>> import Text.Parsec hiding (many, optional, (<|>))
>> import Control.Applicative
>>
>> import Text.JSON.Types
>> import Control.Monad
>> import Data.Char
>> import Numeric
>>
>> instance (Monad m, U.UTF8Bytes string index) => Stream (U.UTF8
>> string)
>> m Char where
>> uncons = return . U.uncons
>>
>> type CharParser st = Parsec (U.UTF8 B.ByteString) st
>>
>> parseFile :: FilePath -> IO (Either ParseError JSValue)
>> parseFile fileName = do
>> bs <- B.readFile fileName
>> return $ runParser json () fileName (U.fromRep bs)
>>
>> parseString :: String -> Either ParseError JSValue
>> parseString s = runParser json () "(unknown)" (U.fromString s)
>>
>> json :: CharParser () JSValue
>> json = spaces *> p_value
>>
>> tok :: CharParser () a -> CharParser () a
>> tok p = p <* spaces
>>
>> p_value :: CharParser () JSValue
>> p_value = (JSNull <$ p_null)
>> <|> (JSBool <$> p_boolean)
>> <|> (JSArray <$> p_array)
>> <|> (JSString <$> p_js_string)
>> <|> (JSObject <$> p_js_object)
>> <|> (JSRational False <$> p_number)
>> <?> "JSON value"
>>
>> p_null :: CharParser () ()
>> p_null = tok (string "null") >> return ()
>>
>> p_boolean :: CharParser () Bool
>> p_boolean = tok
>> ( (True <$ string "true")
>> <|> (False <$ string "false")
>> )
>>
>> p_array :: CharParser () [JSValue]
>> p_array = between (tok (char '[')) (tok (char ']'))
>> $ p_value `sepBy` tok (char ',')
>>
>> p_string :: CharParser () String
>> p_string = between (tok (char '"')) (char '"') (many p_char)
>> where p_char = (char '\\' >> p_esc)
>> <|> (satisfy (\x -> x /= '"' && x /= '\\'))
>>
>> p_esc = ('"' <$ char '"')
>> <|> ('\\' <$ char '\\')
>> <|> ('/' <$ char '/')
>> <|> ('\b' <$ char 'b')
>> <|> ('\f' <$ char 'f')
>> <|> ('\n' <$ char 'n')
>> <|> ('\r' <$ char 'r')
>> <|> ('\t' <$ char 't')
>> <|> (char 'u' *> p_uni)
>> <?> "escape character"
>>
>> p_uni = check =<< count 4 (satisfy isHexDigit)
>> where check x | code <= max_char = pure (toEnum code)
>> | otherwise = empty
>> where code = fst $ head $ readHex x
>> max_char = fromEnum (maxBound :: Char)
>>
>> p_object :: CharParser () [(String,JSValue)]
>> p_object = between (tok (char '{')) (tok (char '}'))
>> $ p_field `sepBy` tok (char ',')
>> where p_field = (,) <$> (p_string <* tok (char ':')) <*> p_value
>>
>> p_number :: CharParser () Rational
>> p_number = tok floating where
>>
>> floating :: CharParser () Rational
>> floating = do{ n <- decimal
>> ; fract <- option 0 fraction
>> ; expo <- option 1 exponent'
>> ; return ((fromInteger n + fract)*expo)
>> }
>>
>> fraction = do{ char '.'
>> ; digits <- many1 digit <?> "fraction"
>> ; return (foldr op 0 digits)
>> }
>> <?> "fraction"
>> where
>> op d f = (f + fromIntegral (digitToInt d))/10
>>
>> exponent' = do{ oneOf "eE"
>> ; f <- sign
>> ; e <- decimal <?> "exponent"
>> ; return (power (f e))
>> }
>> <?> "exponent"
>> where
>> power e | e < 0 = 1/power(-e)
>> | otherwise = fromInteger (10^e)
>>
>> sign = (char '-' >> return negate)
>> <|> (char '+' >> return id)
>> <|> return id
>>
>> decimal = number 10 digit
>>
>> number base baseDigit
>> = do{ digits <- many1 baseDigit
>> ; let n = foldl (\x d -> base*x + toInteger (digitToInt
>> d)) 0 digits
>> ; seq n (return n)
>> }
>>
>>
>> p_js_string :: CharParser () JSString
>> p_js_string = toJSString <$> p_string
>>
>> p_js_object :: CharParser () (JSObject JSValue)
>> p_js_object = toJSObject <$> p_object
>>
>>
>> _______________________________________________
>> Haskell-Cafe mailing list
>> Haskell-Cafe at haskell.org
>> http://www.haskell.org/mailman/listinfo/haskell-cafe
--
Sjoerd Visscher
sjoerd at w3future.com
More information about the Haskell-Cafe
mailing list