[Haskell-cafe] Regular Expressions without GADTs

oleg at pobox.com oleg at pobox.com
Mon Oct 17 22:00:07 EDT 2005

Conor McBride wrote:
> Inspired by Ralf's post, I thought I'd just GADTize a dependently typed 
> program I wrote in 2001.

Equally inspired, I thought of deGADTizing that code. The code below
also uses no existentials, and no local type annotations. The code is
more general in that the parser works in an arbitrary MonadPlus. If
that happens to be the List monad, we can see alternative parses. The
tests illustrate alternative parses and parsing of a list of integers, 
for example. The code seems a bit simpler, too.

{-# OPTIONS -fglasgow-exts #-}

module RX where

import Control.Monad

-- If monadPlus is List, we can get alternative parses

class RegExp t tok a | t tok -> a where
   parse :: MonadPlus m => t -> [tok] -> m a

data Zero = Zero
data One = One
newtype Check tok = Check (tok -> Bool)
data Plus r1 r2 = Plus r1 r2
data Mult r1 r2 = Mult r1 r2
data Push tok r = Push tok r
newtype Star r = Star r

data Void = Void

-- Parser Zero fails everywhere
instance RegExp Zero tok Void where
    parse _ _ = mzero

-- Parser One recognizes the empty string
instance RegExp One tok () where
    parse _ [] = return ()
    parse _ _  = mzero

-- Parser (Check p) recognizes a single-element stream [tok]
-- provided tok satisfies the predicate p
instance RegExp (Check tok) tok tok where
    parse (Check p) [t] | p t = return t
    parse _ _ = mzero

-- Parser (Push tok r) prepends tok to the stream and gets parser
-- r to recognize the result
instance RegExp r tok a => RegExp (Push tok r) tok a where
    parse (Push tok r) ts = parse r (tok:ts)

-- Non-deterministic choice of parsers r1 and r2, applied to the same stream
instance (RegExp r1 tok a, RegExp r2 tok b)
    => RegExp (Plus r1 r2) tok (Either a b) where
   parse (Plus r1 r2) ts = (liftM Left   $ parse r1 ts) `mplus`
			   (liftM Right  $ parse r2 ts)

-- Mult r1 r2: A sequence of parsers r1 and r2.
-- Note the order: we search for the longest prefix of the input stream
-- that satisfies the parser r1
instance (RegExp r1 tok a, RegExp r2 tok b)
    => RegExp (Mult r1 r2) tok (a, b) where
   parse (Mult r1 r2) ts@[] = liftM2 (,) (parse r1 ts) (parse r2 ts)
   parse (Mult r1 r2) (t:ts) = 
       parse (Mult (Push t r1) r2) ts `mplus`
       liftM2 (,) (parse r1 ([] `asTypeOf` ts)) (parse r2 (t:ts))

-- Kleene's star
instance RegExp r tok a => RegExp (Star r) tok [a] where
    parse (Star r) [] = return []
    parse (Star r) ts = parse (Mult r (Star r)) ts >>=
			(\ (x,xs) -> return $ x:xs)

p1 = (Mult (Star (Check (== 'a'))) (Star (Check (== 'b'))))

asMayBe :: Maybe a -> Maybe a
asMayBe = id

asList :: [a] -> [a]
asList = id

testp = asMayBe $
	parse (Star (Mult (Star (Check (== 'a'))) (Star (Check (== 'b')))))

-- *RX> testp
-- Just [("a","b"),("aa","b"),("aaa","bbbb")]

-- see alternative parses

testp1 = take 3 $ asList $
	parse (Star (Mult (Star (Check (== 'a'))) (Star (Check (== 'b')))))

-- Parsing the list of integers

ieven = even :: Int->Bool
iodd  = odd  :: Int->Bool
p2 = Plus (Mult (Check iodd) (Mult (Check iodd) (Star (Check ieven))))
	  (Mult (Check ieven) (Star (Check iodd)))

-- the parser is ambiguous. We can see the alternatives
test2 = take 3 $ asList $ parse (Star p2) [1::Int,1,2,3,3,4]

More information about the Haskell-Cafe mailing list