[Haskell-cafe] Re: Knuth Morris Pratt for Lazy Bytestrings
implementation
ChrisK
haskell at list.mightyreason.com
Wed Aug 1 16:54:56 EDT 2007
My optimized (and fixed) version of the code is attached. I benchmarked it with:
> module Main(main) where
> import KMPSeq
> import qualified Data.ByteString.Lazy as B
> import qualified Data.ByteString.Lazy.Char8 as BC
> infile = "endo.dna"
Modified by one character from the original copied from the endo.dna
The leading "IIIII" is often overlapping with the suffix of a prefix of
substring so KMP should help here
> substring1 = BC.pack
"IIIIIPIIIPIIIIIPIIIPFFICCPIIIPFFFFFPIIIPFFFFFPIIIPIIFIIPIIIPIIIIIPIIIPIIIIIPIIIPIIIIIPIIIPCCCCCPI"
original
"IIIIIPIIIPIIIIIPIIIPFFICCPIIIPFFFFFPIIIPFFFFFPIIIPIIIIIPIIIPIIIIIPIIIPIIIIIPIIIPIIIIIPIIIPCCCCCPI"
> getSearch = do
> B.readFile infile
> main :: IO ()
> main = do
> print . kmpMatch substring1 =<< getSearch
The bug fix is actually a big speed win for this benchmark.
The long commentary I wrote has been sent to Justin Bailey with each step I took
and each measurement I made. (800 lines with all the pasted profiles).
The speedup was from 18.5 seconds to 2.6 seconds on the above benchmark.
Cheers,
Chris Kuklewicz
>From Martin Amis' short story "The Janitor on Mars":
"...The War with the Scythers of the Orion Spur was hotly prosecuted for just
over a billion years. Who won? We did. They're still there, the Scythers. Their
planet is still there. The nature of war changed, during that trillennium. It
was no longer nuclear or quantum-gravitational. It was neurological.
Informational. Life goes on for the Scythers, but its quality has been subtly
reduced. We fixed it so that they think they're simulations in a deterministic
computer universe. It is believed that this is the maximum suffering you can
visit on a type-v world."
http://www.metafilter.com/mefi/18176
-------------- next part --------------
-- Code by Justin Bailey with optimizations by Chris Kuklewicz
{-# OPTIONS_GHC -fbang-patterns #-}
module KMPSeq (kmpMatch)
where
import qualified Data.Array.Base as Base (unsafeAt)
import qualified Data.Array.Unboxed as Unboxed (UArray)
import qualified Data.Array.IArray as IArray ((!), Array, array)
import Data.List as List (map, filter, length, null, take, maximum, foldr, all, drop)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString as S
import qualified Data.ByteString.Base as B (unsafeHead,unsafeTail,unsafeDrop,unsafeIndex)
import GHC.Int (Int64)
import Test.QuickCheck
import Data.ByteString as S (isSubstringOf, pack, ByteString) -- Only used during testing
import Debug.Trace (trace)
import System.IO.Unsafe
import System.IO
{-
Returns the first index for which the pattern given is found
in the string given, or -1 if the pattern does not appear.
-}
kmpMatch :: B.ByteString -- ^ The pattern
-> B.ByteString -- ^ The string to search
-> Int64 -- ^ Result
kmpMatch patLazy search
| B.null patLazy = 0 -- Empty pattern matches right away.
| otherwise =
let pat :: S.ByteString
pat = S.concat (B.toChunks patLazy)
lookupTable = computeLookupS pat
patLen = fromIntegral $ S.length pat
-- currPatIdx is our position in the pattern - sometimes we don't
-- have to match the whole pattern again.
--
-- currIdx is our position in the string. We B.drop elements from str
-- on each iteration so have to track our position separately.
kmpMatch' :: B.ByteString -> Int64 -> Int -> Int64
kmpMatch' !str !currIdx !patShift
| B.null str = -1
| otherwise =
-- Find length of prefix in our pattern which matches string (possibly zero)
let matchLength :: Int
matchLength =
let matchLength' :: B.ByteString -> Int -> Int
matchLength' !str !cnt | cnt == patLen = patLen
| B.null str = 0
| B.unsafeIndex pat cnt == B.head str =
matchLength' (B.tail str) (succ cnt)
| otherwise = cnt
in matchLength' str patShift
shiftAmt = Base.unsafeAt lookupTable matchLength
{-# INLINE matchLen64 #-}
matchLen64 = fromIntegral (matchLength-patShift)
in
case matchLength of
_ | matchLength == patLen -> currIdx - fromIntegral patShift -- found complete match
| matchLength == patShift -> kmpMatch' (B.tail str) (currIdx + 1) 0
| shiftAmt > 0 -> kmpMatch' (B.drop matchLen64 str) (currIdx + matchLen64) shiftAmt
| otherwise -> kmpMatch' (B.drop matchLen64 str) (currIdx + matchLen64) patShift
in
kmpMatch' search 0 0
-- These tests have failed in the past
-- kmpMatch (toLazyBS "ccb") (toLazyBS "abcdcccb") == 5
-- kmpMatch (toLazyBS "bbaa") (toLazyBS "bdcdbbbbaa") == 6
-- kmpMatch (toLazyBS "cccadadc") (toLazyBS "adaccccadadc") == 4
-- This next one has caused an infinite loop previously.
-- kmpMatch (toLazyBS "bbbb") (toLazyBS "dddcaaaddaaabdacbcccabbada")
{-|
Given our pattern, get all the prefixes of the pattern. For each of those
prefixes, find the longest prefix from the original pattern that is also a
suffix of the prefix segment being considered, and is not equal to it. The
argument given to overlap is the length of the prefix matched so far, and the
length of the longest prefix, which is a suffix and is not equal to it, is the
value overlap returns.
If a given prefix has no possible overlap, it is mapped to -1.
-}
overlap :: S.ByteString -> [(Int, Int)]
overlap pat =
let patternLength = S.length pat
-- Necessary to reverse prefixes since 'isSuffixOf' not implemented for lazy bytestrings.
prefixes = List.map S.reverse $ List.take (fromIntegral patternLength) $ S.inits pat
longestPreSuffix prefix =
-- Find prefixes that are a prefix of this suffix, but
-- are not equal to it.
let suffixes = List.filter (\p -> p `S.isPrefixOf` prefix && p /= prefix) prefixes
in
if S.null prefix || List.null suffixes
then 0
else List.maximum (List.map S.length suffixes)
in
List.map (\prefix -> (fromIntegral $ S.length prefix, fromIntegral $longestPreSuffix prefix)) prefixes
{- |
Given a string representing a search pattern, this function
returns a function which represents, for each prefix of that
pattern, the maximally long prefix of the pattern which is a suffix
of the indicated pattern segment.
If there is no such prefix, 0 is returned.
-}
computeLookupS :: S.ByteString -> Unboxed.UArray Int Int
computeLookupS pat =
let patLen = fromIntegral $ S.length pat
table :: Unboxed.UArray Int Int
table = {-# SCC "computeLookup_table" #-} IArray.array (0, patLen - 1) (overlap pat)
in table
computeLookup :: B.ByteString -> (Int -> Int)
computeLookup pat =
let patLen = fromIntegral $ B.length pat
table :: Unboxed.UArray Int Int
table = {-# SCC "computeLookup_table" #-} IArray.array (0, patLen - 1) (overlap . S.concat . B.toChunks $ pat)
in
Base.unsafeAt table
-- Types, instances and utility functions for testing purposes.
newtype PatternChar = PatternChar Char
deriving Show
instance Arbitrary PatternChar where
arbitrary = oneof (List.map (return . PatternChar) ['a', 'b', 'c', 'd'])
patternsToString :: [PatternChar] -> B.ByteString
patternsToString chars = B.pack $ List.foldr (\(PatternChar char) str -> (toEnum $ fromEnum char) : str) [] chars
patternsToStrictString :: [PatternChar] -> S.ByteString
patternsToStrictString chars = S.pack $ List.foldr (\(PatternChar char) str -> (toEnum $ fromEnum char) : str) [] chars
toLazyBS :: String -> B.ByteString
toLazyBS = B.pack . List.map (toEnum . fromEnum)
toStrictBS :: String -> S.ByteString
toStrictBS = S.pack . List.map (toEnum . fromEnum)
-- Test that 0 and 1 element always return 0, if present.
prop_testZero :: [PatternChar] -> Property
prop_testZero pat =
let lookup = computeLookup (patternsToString pat)
in
not (List.null pat) ==>
if List.length pat > 1
then lookup 0 == 0 && lookup 1 == 0
else lookup 0 == 0
-- Test that all overlaps found are actually prefixes of the
-- pattern string
prop_testSubset :: [PatternChar] -> Property
prop_testSubset pat =
let patStr = patternsToString pat
lookup = computeLookup patStr
prefix len = B.take (fromIntegral len)
testPrefix len =
if lookup len == 0
then True
else (prefix (lookup len) patStr) `B.isPrefixOf` (prefix len patStr)
in
not (List.null pat) ==>
trivial (B.null patStr) $
List.all testPrefix [0 .. List.length pat - 1]
-- Test that the prefix given is the maximal prefix. That is,
-- add one more character makes it either equal to the string
-- or not a prefix.
prop_testCorrectPrefix :: [PatternChar] -> Property
prop_testCorrectPrefix pat =
let patStr = patternsToString pat
lookup = computeLookup patStr
isNeverSuffix len =
let origPrefix = prefix len patStr
-- Drop 1 to remove empty list
allPrefixes = List.drop 1 $ B.inits origPrefix
in
List.all (\p -> B.null p || p == origPrefix || not ((B.reverse p) `B.isPrefixOf` (B.reverse origPrefix))) allPrefixes
prefix len = B.take (fromIntegral len)
-- True if the prefix returned from lookup for the length given produces
-- a string which is a suffix of the original prefix.
isRealSuffix len = (B.reverse (prefix (lookup len) patStr)) `B.isPrefixOf` (B.reverse $ prefix len patStr)
isLongestSuffix len =
let prefixPlus = prefix ((lookup len) + 1) patStr
inputPrefix = prefix len patStr
in
prefixPlus == inputPrefix ||
not ((B.reverse prefixPlus) `B.isPrefixOf` (B.reverse inputPrefix))
testTable len =
if lookup len == 0
then isNeverSuffix len
else isRealSuffix len &&
isLongestSuffix len
in
not (List.null pat) ==>
List.all testTable [0 .. List.length pat - 1]
-- Verify that, if a match is found, it is where it's supposed to be in
-- the string and it can be independently found by other means.
prop_testMatch :: [PatternChar] -> [PatternChar] -> Property
prop_testMatch pat search =
let patStr = patternsToString pat
searchStr = patternsToString search
strictStr = patternsToStrictString search
strictPat = patternsToStrictString pat
patLen = B.length patStr
searchLen = B.length searchStr
matchIdx = kmpMatch patStr searchStr
verify =
if matchIdx > -1
then (B.take patLen $ B.drop matchIdx $ searchStr) == patStr && strictPat `S.isSubstringOf` strictStr
else (B.null patStr && B.null searchStr) || not (strictPat `S.isSubstringOf` strictStr)
in
not (List.null pat) ==>
trivial (B.null searchStr) $
classify (patLen > searchLen) "Bigger pattern than search" $
classify (patLen < searchLen) "Smaller pattern than search" $
classify (patLen == searchLen) "Equal pattern and search" $
classify (matchIdx > -1) "Match Exists" $
classify (matchIdx == -1) "Match Doesn't Exist" $
verify
-- Test a pattern that was known to fail.
prop_testBadPat :: [PatternChar] -> Property
prop_testBadPat search =
let patStr = toLazyBS "bbc"
patLen = B.length patStr
searchStr = patternsToString search
matchIdx = kmpMatch patStr searchStr
strictStr = patternsToStrictString search
strictPat = toStrictBS "bbc"
verify =
if matchIdx > -1
then (B.take patLen $ B.drop matchIdx $ searchStr) == patStr && strictPat `S.isSubstringOf` strictStr
else (B.null patStr && B.null searchStr) || not (strictPat `S.isSubstringOf` strictStr)
in
trivial (List.null search) $
verify
-- Test that a pattern on the end of the string is found OK.
prop_testEndPattern :: [PatternChar] -> [PatternChar] -> Property
prop_testEndPattern pat search =
let patStr = patternsToString pat
searchStr = patternsToString (search ++ pat)
matchIdx = kmpMatch patStr searchStr
strictStr = patternsToStrictString (search ++ pat)
strictPat = patternsToStrictString pat
patLen = B.length patStr
verify =
if matchIdx > -1
then (B.take patLen $ B.drop matchIdx $ searchStr) == patStr && strictPat `S.isSubstringOf` strictStr
else (B.null patStr && B.null searchStr) || not (strictPat `S.isSubstringOf` strictStr)
in
not (List.null pat) ==>
trivial (B.null searchStr) $
classify (matchIdx > -1) "Match Exists" $
classify (matchIdx == -1) "Match Doesn't Exists" $
verify
props1 = [ prop_testZero
, prop_testSubset
, prop_testCorrectPrefix
, prop_testBadPat
]
props2 = [ prop_testMatch
, prop_testEndPattern
]
allTests = do
mapM_ quickCheck props1
mapM_ quickCheck props2
{-
*KMPSeq> allTests
OK, passed 100 tests.
OK, passed 100 tests.
OK, passed 100 tests.
OK, passed 100 tests (15% trivial).
OK, passed 100 tests.
41% Bigger pattern than search, Match Doesn't Exist.
24% Smaller pattern than search, Match Doesn't Exist.
12% trivial, Bigger pattern than search, Match Doesn't Exist.
11% Smaller pattern than search, Match Exists.
9% Equal pattern and search, Match Doesn't Exist.
3% Equal pattern and search, Match Exists.
OK, passed 100 tests (100% Match Exists).
-}
More information about the Haskell-Cafe
mailing list