[commit: packages/bytestring] master, revert-46-patch-1, wip/nix-local-build: Implement Rabin-Karp substring search. (2160e09)

git at git.haskell.org git at git.haskell.org
Tue May 3 22:42:47 UTC 2016


Repository : ssh://git@git.haskell.org/bytestring

On branches: master,revert-46-patch-1,wip/nix-local-build
Link       : http://git.haskell.org/packages/bytestring.git/commitdiff/2160e091e215fecc9177d55a37cd50fc253ba86a

>---------------------------------------------------------------

commit 2160e091e215fecc9177d55a37cd50fc253ba86a
Author: Sean <burton.seanr at gmail.com>
Date:   Fri Sep 18 09:14:01 2015 +0100

    Implement Rabin-Karp substring search.
    
    Conflicts:
    	Data/ByteString.hs


>---------------------------------------------------------------

2160e091e215fecc9177d55a37cd50fc253ba86a
 Data/ByteString.hs | 58 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/Data/ByteString.hs b/Data/ByteString.hs
index 42263a6..90eb301 100644
--- a/Data/ByteString.hs
+++ b/Data/ByteString.hs
@@ -1327,16 +1327,38 @@ isInfixOf p s = isJust (findSubstring p s)
 --
 -- > fst (breakSubstring x y)
 --
+-- Note that calling `breakSubstring x` does some preprocessing work, so
+-- you should avoid uneccesarily duplicating breakSubstring calls with the same
+-- pattern.
+
 breakSubstring :: ByteString -- ^ String to search for
                -> ByteString -- ^ String to search in
                -> (ByteString,ByteString) -- ^ Head and tail of string broken at substring
-
-breakSubstring pat src = search 0 src
+breakSubstring pat =
+  case lp of
+    0 -> \src -> (empty,src)
+    1 -> breakByte (unsafeHead pat)
+    _ -> karpRabin
   where
-    search !n !s
-        | null s             = (src,empty)      -- not found
-        | pat `isPrefixOf` s = (take n src,s)
-        | otherwise          = search (n+1) (unsafeTail s)
+    lp          = length pat
+    k           = 2891336453 :: Word32
+    rollingHash = foldl' (\h b -> h * k + fromIntegral b) 0
+    hp          = rollingHash pat
+    m           = k ^ lp
+    karpRabin src
+      | length src < lp = (src,empty)
+      | otherwise = search (rollingHash $ unsafeTake lp src) 0
+      where
+        search !hs !n
+            | hp == hs && pat `isPrefixOf` s = (unsafeTake n src,s)
+            | length src - n <= lp           = (src,empty) -- not found
+            | otherwise                      = search hs' (n+1)
+          where
+            get = fromIntegral . unsafeIndex src
+            s   = unsafeDrop n src
+            hs' = hs * k +
+                  get (n + lp) -
+                  m * get n
 
 -- | Get the first index of a substring in another string,
 --   or 'Nothing' if the string is not found.
@@ -1344,7 +1366,11 @@ breakSubstring pat src = search 0 src
 findSubstring :: ByteString -- ^ String to search for.
               -> ByteString -- ^ String to seach in.
               -> Maybe Int
-findSubstring f i = listToMaybe (findSubstrings f i)
+findSubstring pat src
+    | null pat && null src = Just 0
+    | null b = Nothing
+    | otherwise = Just (length a)
+  where (a, b) = breakSubstring pat src
 
 {-# DEPRECATED findSubstring "findSubstring is deprecated in favour of breakSubstring." #-}
 
@@ -1354,14 +1380,16 @@ findSubstring f i = listToMaybe (findSubstrings f i)
 findSubstrings :: ByteString -- ^ String to search for.
                -> ByteString -- ^ String to seach in.
                -> [Int]
-findSubstrings pat str
-    | null pat         = [0 .. length str]
-    | otherwise        = search 0 str
-  where
-    search !n !s
-        | null s             = []
-        | pat `isPrefixOf` s = n : search (n+1) (unsafeTail s)
-        | otherwise          =     search (n+1) (unsafeTail s)
+findSubstrings pat src
+    | null pat        = [0 .. ls]
+    | otherwise       = search 0
+  where lp = length pat
+        ls = length src
+        search !n
+            | (n > ls - lp) || null b = []
+            | otherwise = let k = n + length a
+                          in  k : search (k + lp)
+          where (a, b) = breakSubstring pat (unsafeDrop n src)
 
 {-# DEPRECATED findSubstrings "findSubstrings is deprecated in favour of breakSubstring." #-}
 



More information about the ghc-commits mailing list