[Haskell-cafe] Irrefutable pattern love and new regex engine.
Michael Speer
knomenet at gmail.com
Tue Jan 22 11:55:45 EST 2008
I've been using the creation of a regular expression engine as an
ongoing project to learn Haskell. Last night I created the newest
iteration.
My love for the irrefutable pattern can be found in the definition of
the rexn ( apply repetition to extracted nodes ) function below.
/snip
rexn ns pps = let ( ~( xs , rps ) ,
~( ~( nxs ) ,
~( rxs , rrps ) ) ) = ( exn nxs pps ,
case rps of
('?':'?':rr)
-> ( ( ns ) ,
( ns ++ xs , rr ) )
('?':rr)
-> ( ( ns ) ,
( xs ++ ns , rr ) )
/snip
Beautiful, beautiful lazy execution and irrefutable pattern matching.
The data flow is able to easily twist in and out of functions, even as
an argument to the function call that creates it.
Is this kind of usage normal?
Any and all comments, suggestions, refutations, and criticisms are welcome.
-- regular expression engine -- (c) 2008 michael speer
import Char ( isSpace )
-- import Debug.Trace ( trace )
xor :: Bool -> Bool -> Bool
xor True a = not a
xor False a = a
data RxToken = RxStart -- matchable start of target string
| RxChar Char -- a literal character to match
| RxBound -- inserted wherever alphanums touch whitespace
| RxEnd -- matchable end of target string
| RxEOF -- an additional token to push through to
catch anything trying for RxEnd
-- RxEOF is never matched.
deriving ( Show )
rxTokenize tts = RxStart : case tts of
[] -> RxEnd : RxEOF : []
tts@(t:_) -> case not $ isSpace t of
True -> RxBound : rxt tts
False -> rxt tts
where
rxt (t:[]) | not $ isSpace t = RxChar t : RxBound : RxEnd : RxEOF : []
| otherwise = RxChar t : RxEnd : RxEOF : []
rxt (t:ts@(t':_)) | isSpace t `xor` isSpace t' = RxChar t :
RxBound : rxt ts
| otherwise = RxChar t : rxt ts
data RxTransform = RxTransform ( RxNode -> RxToken -> [ RxNode ] )
| RxNullTransform
data RxNode = RxActive { rxTransforms :: [RxTransform] ,
rxMatched :: String ,
rxNumSubs :: Integer ,
rxSubExprs :: [ String ] }
| RxComplete { rxMatched :: String ,
rxNumSubs :: Integer ,
rxSubExprs :: [ String ] }
instance Show RxNode where
show (RxComplete matched _ _) = "<rx|matched:[" ++ matched ++ "]>"
data RxDepth = RxTop | RxSub deriving ( Show )
rxCompile pps = let ( xs , rps ) = oexn [success] RxTop pps
in case length rps of
0 -> RxActive { rxTransforms = xs ,
rxMatched = [] ,
rxNumSubs = 0 ,
rxSubExprs = [] }
_ -> error $ "Not all of pattern consumed :
remains : " ++ rps
where
-- or together different expression sections -- (a|b|c)
oexn ns RxTop [] = ( ns , [] )
oexn _ RxSub [] = error "Pattern ended while still in sub expression"
oexn ns d pps = let ( ~( xs , rps ) ,
~( nxs , nrps ) ) = ( aexn ns pps ,
case rps of
('|':rr) ->
let ( inxs , irps ) = oexn ns d rr
in
( xs ++ inxs , irps )
(')':rr) -> case d of
RxTop -> error "Erroneous close parenthesis in pattern "
RxSub -> ( xs , rr )
[] -> case d of
RxTop -> ( xs , [] )
RxSub -> error "End of pattern while still in sub expression" )
in ( nxs , nrps )
-- and together extracted nodes in a given expression segment -- abd?dfs
aexn ns pps = let ( ~( xs , rps ) ,
~( nxs , nrps ) ) = ( rexn nxs pps ,
case rps of
('|':_) -> ( ns , rps )
(')':_) -> ( ns , rps )
[] -> ( ns , rps )
_ -> aexn ns rps )
in ( xs , nrps )
-- replication application - weee!
rexn ns pps = let ( ~( xs , rps ) ,
~( ~( nxs ) ,
~( rxs , rrps ) ) ) = ( exn nxs pps ,
case rps of
('?':'?':rr)
-> ( ( ns ) ,
( ns ++ xs , rr ) )
('?':rr)
-> ( ( ns ) ,
( xs ++ ns , rr ) )
('*':'?':rr)
-> ( ( ns ++ xs ) ,
( ns ++ xs , rr ) )
('*':rr)
-> ( ( xs ++ ns ) ,
( xs ++ ns , rr ) )
('+':'?':rr)
-> ( ( ns ++ xs ) ,
( xs , rr ) )
('+':rr)
-> ( ( xs ++ ns ) ,
( xs , rr ) )
_
-> ( ( ns ) ,
( xs , rps ) )
)
in
( rxs , rrps )
-- extract node ( including an entire subexpression as a single node )
exn _ ('?':_) = error "Bad question mark operator"
exn _ ('*':_) = error "Bad splat operator"
exn _ ('+':_) = error "Bad plus sign operator"
exn ns ('(':ps) = oexn ns RxSub ps
exn ns (p:ps) = ( [ RxTransform ( \rxn k -> case k of
(RxChar c) -> if c == p
then
[ RxActive { rxTransforms = ns ,
rxMatched = c : rxMatched rxn ,
rxNumSubs = rxNumSubs rxn ,
rxSubExprs = rxSubExprs rxn } ]
else
[]
(RxStart) -> [rxn]
(RxBound) -> [rxn]
_ -> []
) ] ,
ps )
exn ns [] = error "can this be reached?"
success = RxTransform ( \ rxn k -> [ RxComplete { rxMatched =
reverse $ rxMatched rxn ,
rxNumSubs =
rxNumSubs rxn ,
rxSubExprs =
map reverse $ rxSubExprs rxn } ] )
rxExec n tts = iexec [n] (rxTokenize tts)
where
iexec (win@(RxComplete _ _ _ ):_) _ = Just win
iexec [] _ = Nothing
iexec nns (k:ks) = iexec ( concatMap (
\n -> case n of
(RxComplete _ _ _) -> [n]
a@(RxActive _ _ _ _) -> concatMap (\xf -> case xf of
(RxTransform f) ->
f a k
(RxNullTransform) ->
[]
) (rxTransforms a) ) nns ) ks
main = do
print $ rxTokenize "this is a test"
print $ rxExec (rxCompile "hello|world") "hello"
print $ rxExec (rxCompile "hello|world") "world"
print $ rxExec (rxCompile "abcde|ab") "abcd"
print $ rxExec (rxCompile "ab?c") "ac"
print $ rxExec (rxCompile "ab?c") "abc"
print $ rxExec (rxCompile "ab*c") "ac"
print $ rxExec (rxCompile "ab*c") "abc"
print $ rxExec (rxCompile "ab*c") "abbbc"
print $ rxExec (rxCompile "ab+c") "ac"
print $ rxExec (rxCompile "ab+c") "abc"
print $ rxExec (rxCompile "ab+c") "abbbbbc"
print $ rxExec (rxCompile "(a|b)+") "aaabbbabaaababaaabbbabbaba"
print $ rxExec (rxCompile "abc|") "zyx"
More information about the Haskell-Cafe
mailing list