Regex parser for Dfa
Graham Klyne
GK at ninebynine.org
Fri Oct 24 15:15:37 EDT 2003
At 21:58 23/10/03 -0400, ajb at spamcop.net wrote:
> > A function to construct an (Re Char) from a simple textual representation
> > would be handy. Maybe I'll tackle that.
>
>By all means. Do you have a sourceforge account? If so, I can easily
>arrange checkin rights. Better to call it something other than Dfa.
I tackled this as an exercise in writing a lightweight Parsec parser. A
copy is attached.
I do have a sourceforge count (GrahamK), but I'm not familiar with the
procedure for submitting a file. (I do use CVS locally on my network, but
not through SSH.)
#g
------------
Graham Klyne
For email:
http://www.ninebynine.org/#Contact-------------- next part --------------
--------------------------------------------------------------------------------
-- $Id: RegexParser.hs,v 1.2 2003/10/24 15:37:38 graham Exp $
--
-- Copyright (c) 2003, G. KLYNE. All rights reserved.
-- See end of this file for licence information.
--------------------------------------------------------------------------------
-- |
-- Module : RegexParser
-- Copyright : (c) 2003, Graham Klyne
-- License : GPL V2
--
-- Maintainer : Graham Klyne
-- Stability : provisional
-- Portability : H98
--
-- This Module implements a simple regular expression parser, yielding
-- a regular expression matcher based on the Dfa library by Andrew Bromage
-- (cf. http://cvs.sourceforge.net/viewcvs.py/haskell-libs/libs/text/)
--
-- Uses the Parsec monadic parser library (but not the Parsec tokenizer)
--
--------------------------------------------------------------------------------
module RegexParser
( parseDfaFromString
)
where
import Parsec
import Dfa
( Re(..)
, matchRe
)
----------------------------------------------------------------------
-- Define parser state and helper functions
----------------------------------------------------------------------
-- N3 parser state
type RegexState = ()
{-
data RegexState = RegexState
{ foo :: () -- Null state?
}
-}
----------------------------------------------------------------------
-- Define top-level parser function:
-- accepts a string and returns a graph or error
----------------------------------------------------------------------
-- |Parse a string representation of a regular expression, returning
-- Either:
-- Left -> a string describing a syntax error in theregular expression,
-- Right -> a Dfa regular expression matcher
--
-- To use the regular expression matcher, apply function matchRe
-- (or matchRe2) from the Dfa module.
--
-- Regular expression syntaxd recognized:
--
-- regex = reterm*
--
-- reterm = reprim reterm1
--
-- reterm1 = "|" reprim
-- | reprim "*"
-- | reprim "+"
-- | reprim "?"
-- | ()
--
-- reprim = "(" regex ")"
-- | "[" crange* "]"
-- | cmatch
--
-- crange = cmatch crange1
--
-- crange1 = "-" cmatch
-- | ()
--
-- cmatch = "\" ch
-- | anych
-- | ch
--
-- anych = "."
--
-- ch = any character not including "(", ")", "[", "]", "\", "."
--
-- The syntax is interpreted such that special treatment of
-- characters takes precendence over the ch production.
--
-- The parsed regex is returned as a value of type Re Char,
-- where:
--
-- data Re t = ReOr [Re t]
-- | ReCat [Re t]
-- | ReStar (Re t)
-- | RePlus (Re t)
-- | ReOpt (Re t)
-- | ReTerm [t]
--
parseDfaFromString :: String -> (Either String (Re Char))
parseDfaFromString input =
let
pstate = ()
result = runParser regex pstate "" input
in
case result of
Left er -> Left (show er)
Right re -> Right re
----------------------------------------------------------------------
-- Syntax productions
----------------------------------------------------------------------
type RegexParser a = GenParser Char RegexState a
regex :: RegexParser (Re Char)
regex =
do { rs <- many reterm
; return $ ReCat rs
}
reterm :: RegexParser (Re Char)
reterm =
do { r1 <- reprim
; rt <- reterm1 r1
; return rt
}
reterm1 :: (Re Char) -> RegexParser (Re Char)
reterm1 r1 =
do { char '|'
; r2 <- reprim
; return $ ReOr [r1,r2]
}
<|> do { char '*'
; return $ ReStar r1
}
<|> do { char '+'
; return $ RePlus r1
}
<|> do { char '?'
; return $ ReOpt r1
}
<|> return r1
<?> "reterm (Regular expression term)"
reprim :: RegexParser (Re Char)
reprim =
do { char '('
; rp <- regex
; char ')'
; return rp
}
<|> do { char '['
; cr <- many crange
; char ']'
; return $ ReOr (concat cr)
}
<|> do { c1 <- cmatch
; return $ ReTerm c1
}
<?> "reprim (Regular expression primary value)"
crange :: RegexParser [(Re Char)]
crange =
do { c1 <- cmatch
; cr <- crange1 c1
; return cr
}
crange1 :: [Char] -> RegexParser [(Re Char)]
crange1 c1 =
do { char '-'
; c2 <- cmatch
; return $ map ReTerm $ map (:[]) [head c1..head c2]
}
<|> return (map ReTerm [c1])
<?> "crange (Regular expression character or character range)"
cmatch :: RegexParser [Char]
cmatch =
do { char '\\'
; c <- satisfy (const True)
; return [c]
}
<|> do { char '.'
; return ['\032'..'\254'] -- mumble
}
<|> do { c <- satisfy $ not . (`elem` "()[]\\.")
; return [c]
}
<?> "cmatch (escape, '.' or any character other than '(', ')', '[' or ']')"
{- Test cases, not exhaustive
(Right rei) = parseDfaFromString "(-|+)?[0123456789]+"
testrei1 = matchRe rei "123" -- True
testrei2 = matchRe rei "+456" -- True
testrei3 = matchRe rei "-789" -- True
testrei4 = matchRe rei "00123" -- True
testrei5 = matchRe rei "+00456" -- True
testrei6 = matchRe rei "-00789" -- True
testrei7 = matchRe rei " 123" -- False
testrei8 = matchRe rei "123 " -- False
testrei9 = matchRe rei "123x" -- False
testrei = and
[ testrei1, testrei2, testrei3, testrei4, testrei5
, testrei6
, not testrei7, not testrei8, not testrei9
]
(Right ref) = parseDfaFromString
"(-|+)?[0123456789]*(\\.[0123456789]+)?([eE](-|+)?[0123456789]+)?"
testref1 = matchRe ref "123"
testref2 = matchRe ref "+456"
testref3 = matchRe ref "-789"
testref4 = matchRe ref "123.456"
testref5 = matchRe ref "+456.789E23"
testref6 = matchRe ref "-00789.3400e+12"
testref7 = matchRe ref "-00789.3400e-12"
testref8 = matchRe ref ".1234"
testref9 = matchRe ref "123." -- False
testref = and
[ testref1, testref2, testref3, testref4, testref5
, testref6, testref7, testref8
, not testref9
]
-}
--------------------------------------------------------------------------------
--
-- Copyright (c) 2003, G. KLYNE. All rights reserved.
--
-- This is free software; you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation; either version 2 of the License, or
-- (at your option) any later version.
--
-- Swish is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with Swish; if not, write to:
-- The Free Software Foundation, Inc.,
-- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--
--------------------------------------------------------------------------------
-- $Source: /file/cvsdev/HaskellRDF/RegexParser.hs,v $
-- $Author: graham $
-- $Revision: 1.2 $
-- $Log: RegexParser.hs,v $
-- Revision 1.2 2003/10/24 15:37:38 graham
-- Tidy up and add some more test cases
--
-- Revision 1.1 2003/10/24 15:21:25 graham
-- Add Dfa regular expression parser
--
More information about the Haskell-Cafe
mailing list