[Haskell-cafe] Re: Knuth Morris Pratt for Lazy Bytestrings implementation

Daniel Fischer daniel.is.fischer at web.de
Sun Aug 5 10:39:39 EDT 2007


Am Mittwoch, 1. August 2007 22:54 schrieb ChrisK:
> My optimized (and fixed) version of the code is attached. 

I adapted my KMP implementation from one and a half years ago to the problem 
at hand (no longer search and replace, only find index of first match, and 
change from Strings to ByteStrings), on my computer, for the few tests I 
performed, it's about 30-40% faster than Chris' (depending on the input). 
Chris, could you check whether this holds for your benchmark?
If so, any polishing and further optimisations are welcome; if that should be 
the basis of an addition to the ByteString lib, I'd feel very honoured (in 
other words, if you consider it useful code, you're welcome to use it).

Cheers,
Daniel

-- KMP algorithm for lazy ByteStrings
{-# OPTIONS_GHC -fbang-patterns #-}
module KMP (kmpMatch) where

import qualified Data.Array.Base as Base (unsafeAt)
import Data.Array.Unboxed (UArray, listArray)
import qualified Data.Array as A (listArray, (!), elems)

import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString as S
import qualified Data.ByteString.Base as B (unsafeHead, unsafeTail, 
unsafeDrop, unsafeIndex)
import Data.Int (Int64)
import Data.Word (Word8)

kmpMatch :: B.ByteString -> B.ByteString -> Int64
kmpMatch patLazy search
    | B.null patLazy    = 0
    | otherwise         = kmp 0 0 search
      where
        pat :: S.ByteString
        pat = S.concat (B.toChunks patLazy)
        patLen = S.length pat
        sym :: Int -> Word8
        sym = B.unsafeIndex pat
        bord = A.listArray (0,patLen) $
                    (-1):0:[getS (sym i) i + 1 | i <- [1 .. patLen - 1]]
               where
                getS s n
                    | m < 0 || s == sym m   = m
                    | otherwise             = getS s m
                      where
                        m = bord A.! n
        borders :: UArray Int Int
        borders = listArray (0,patLen) $ A.elems bord
        (?) :: UArray Int Int -> Int -> Int
        (?) = Base.unsafeAt
        getShift :: Word8 -> Int -> Int
        getShift s n = help n
            where
              help k
                | m < 0 || sym m == s   = m
                | otherwise             = help m
                  where
                    m = borders ? k
        kmp :: Int64 -> Int -> B.ByteString -> Int64
        kmp !idx !match !srch
            | patLen == match           = idx - fromIntegral match
            | B.null srch               = -1
            | sym match == B.head srch  = kmp (idx+1) (match+1) (B.tail srch)
            | match == 0                = kmp (idx+1) 0 (B.tail srch)
            | otherwise                 =
              case getShift (B.head srch) match of
                -1      -> kmp idx 0 srch
                0       -> kmp (idx+1) 1 (B.tail srch)
                shft    -> kmp (idx + fromIntegral shft) (shft+1) (B.tail 
srch)



More information about the Haskell-Cafe mailing list