[Haskell-cafe] Re: Knuth Morris Pratt for Lazy Bytestrings
implementation
Daniel Fischer
daniel.is.fischer at web.de
Sun Aug 5 10:39:39 EDT 2007
Am Mittwoch, 1. August 2007 22:54 schrieb ChrisK:
> My optimized (and fixed) version of the code is attached.
I adapted my KMP implementation from one and a half years ago to the problem
at hand (no longer search and replace, only find index of first match, and
change from Strings to ByteStrings), on my computer, for the few tests I
performed, it's about 30-40% faster than Chris' (depending on the input).
Chris, could you check whether this holds for your benchmark?
If so, any polishing and further optimisations are welcome; if that should be
the basis of an addition to the ByteString lib, I'd feel very honoured (in
other words, if you consider it useful code, you're welcome to use it).
Cheers,
Daniel
-- KMP algorithm for lazy ByteStrings
{-# OPTIONS_GHC -fbang-patterns #-}
module KMP (kmpMatch) where
import qualified Data.Array.Base as Base (unsafeAt)
import Data.Array.Unboxed (UArray, listArray)
import qualified Data.Array as A (listArray, (!), elems)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString as S
import qualified Data.ByteString.Base as B (unsafeHead, unsafeTail,
unsafeDrop, unsafeIndex)
import Data.Int (Int64)
import Data.Word (Word8)
kmpMatch :: B.ByteString -> B.ByteString -> Int64
kmpMatch patLazy search
| B.null patLazy = 0
| otherwise = kmp 0 0 search
where
pat :: S.ByteString
pat = S.concat (B.toChunks patLazy)
patLen = S.length pat
sym :: Int -> Word8
sym = B.unsafeIndex pat
bord = A.listArray (0,patLen) $
(-1):0:[getS (sym i) i + 1 | i <- [1 .. patLen - 1]]
where
getS s n
| m < 0 || s == sym m = m
| otherwise = getS s m
where
m = bord A.! n
borders :: UArray Int Int
borders = listArray (0,patLen) $ A.elems bord
(?) :: UArray Int Int -> Int -> Int
(?) = Base.unsafeAt
getShift :: Word8 -> Int -> Int
getShift s n = help n
where
help k
| m < 0 || sym m == s = m
| otherwise = help m
where
m = borders ? k
kmp :: Int64 -> Int -> B.ByteString -> Int64
kmp !idx !match !srch
| patLen == match = idx - fromIntegral match
| B.null srch = -1
| sym match == B.head srch = kmp (idx+1) (match+1) (B.tail srch)
| match == 0 = kmp (idx+1) 0 (B.tail srch)
| otherwise =
case getShift (B.head srch) match of
-1 -> kmp idx 0 srch
0 -> kmp (idx+1) 1 (B.tail srch)
shft -> kmp (idx + fromIntegral shft) (shft+1) (B.tail
srch)
More information about the Haskell-Cafe
mailing list