From: Dmitry Shubin <dmitry.sh@gmail.com>
Date: Thu, 15 Apr 2010 23:49:55 +0000 (+0400)
Subject: Add: Boyer-Moore string search algorithm
X-Git-Tag: 0.97~4718^2~8^2~1
X-Git-Url: https://gitweb.factorcode.org/gitweb.cgi?p=factor.git;a=commitdiff_plain;h=38ef5919e892e952c1e505cd12d5d3020f655468

Add: Boyer-Moore string search algorithm
---

diff --git a/extra/boyer-moore/authors.txt b/extra/boyer-moore/authors.txt
new file mode 100644
index 0000000000..e1702c7130
--- /dev/null
+++ b/extra/boyer-moore/authors.txt
@@ -0,0 +1 @@
+Dmitry Shubin
diff --git a/extra/boyer-moore/boyer-moore-docs.factor b/extra/boyer-moore/boyer-moore-docs.factor
new file mode 100644
index 0000000000..994971e208
--- /dev/null
+++ b/extra/boyer-moore/boyer-moore-docs.factor
@@ -0,0 +1,59 @@
+! Copyright (C) 2010 Dmitry Shubin.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.markup help.syntax kernel sequences ;
+IN: boyer-moore
+
+HELP: <boyer-moore>
+{ $values
+  { "pat" sequence } { "bm" object }
+}
+{ $description
+  "Given a pattern performs pattern preprocessing and returns "
+  "results as an (opaque) object that is reusable across "
+  "searches in different sequences via " { $link search-from }
+  " generic word."
+} ;
+
+HELP: search-from
+{ $values
+  { "seq" sequence }
+  { "from" "a non-negative integer" }
+  { "obj" object }
+  { "i/f" "the index of first match or " { $link f }  }
+}
+{ $description "Performs an attempt to find the first "
+  "occurence of pattern in " { $snippet "seq" }
+  " starting from " { $snippet "from" } " using "
+  "Boyer-Moore search algorithm. Output is the index "
+  "if the attempt was  succeessful and " { $link f }
+  " otherwise."
+} ;
+
+HELP: search
+{ $values
+  { "seq" sequence }
+  { "obj" object }
+  { "i/f" "the index of first match or " { $link f } }
+}
+{ $description "A simpler variant of " { $link search-from }
+  " that starts searching from the beginning of the sequence."
+} ;
+
+ARTICLE: "boyer-moore" "The Boyer-Moore algorithm"
+{ $heading "Summary" }
+"The " { $vocab-link "boyer-moore" } " vocabulary "
+"implements a Boyer-Moore string search algorithm with "
+"so-called 'strong good suffix shift rule'. Since agorithm is "
+"alphabet-independent it is applicable to searching in any "
+"collection that implements " { $links "sequence-protocol" } "."
+
+{ $heading "Complexity" }
+"Let " { $snippet "n" } " and " { $snippet "m" } " be lengths "
+"of the sequences being searched " { $emphasis "in" } " and "
+{ $emphasis "for" } " respectively. Then searching runs in "
+{ $snippet "O(n)" } " time in it's worst case using additional "
+{ $snippet "O(m)" } " space. Preprocessing phase runs in "
+{ $snippet "O(m)" } " time."
+;
+
+ABOUT: "boyer-moore"
diff --git a/extra/boyer-moore/boyer-moore-tests.factor b/extra/boyer-moore/boyer-moore-tests.factor
new file mode 100644
index 0000000000..e444c35189
--- /dev/null
+++ b/extra/boyer-moore/boyer-moore-tests.factor
@@ -0,0 +1,10 @@
+! Copyright (C) 2010 Dmitry Shubin.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test boyer-moore ;
+IN: boyer-moore.tests
+
+[ 0 ] [ "qwerty" "" search ] unit-test
+[ 0 ] [ "" "" search ] unit-test
+[ f ] [ "qw" "qwerty" search ] unit-test
+[ 3 ] [ "qwerty" "r" search ] unit-test
+[ 8 ] [ "qwerasdfqwer" 2 "qwe" search-from ] unit-test
diff --git a/extra/boyer-moore/boyer-moore.factor b/extra/boyer-moore/boyer-moore.factor
new file mode 100644
index 0000000000..4de88c4a1d
--- /dev/null
+++ b/extra/boyer-moore/boyer-moore.factor
@@ -0,0 +1,76 @@
+! Copyright (C) 2010 Dmitry Shubin.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors arrays assocs kernel locals math math.order
+math.ranges sequences sequences.private z-algorithm ;
+IN: boyer-moore
+
+<PRIVATE
+
+:: (normal-suffixes) ( i zs ss -- )
+    i zs nth-unsafe ss
+    [ [ i ] unless* ] change-nth-unsafe ; inline
+
+: normal-suffixes ( zs -- ss )
+    [ length [ f <array> ] [ [1,b) ] bi ] keep pick
+    [ (normal-suffixes) ] 2curry each ; inline
+
+:: (partial-suffixes) ( len old elt i -- len old/new old )
+    len elt i 1 + = [ len elt - ] [ old ] if old ; inline
+
+: partial-suffixes ( zs -- ss )
+    [ length dup ] [ <reversed> ] bi
+    [ (partial-suffixes) ] map-index 2nip ; inline
+
+: <gs-table> ( seq -- table )
+    z-values [ partial-suffixes ] [ normal-suffixes ] bi
+    [ [ nip ] when* ] 2map reverse! ; inline
+
+: insert-bc-shift ( table elt len i -- table )
+    1 + swap - swap pick 2dup key?
+    [ 3drop ] [ set-at ] if ; inline
+
+: <bc-table> ( seq -- table )
+    H{ } clone swap [ length ] keep
+    [ insert-bc-shift ] with each-index ; inline
+
+TUPLE: boyer-moore pattern bc-table gs-table ;
+
+: gs-shift ( i c bm -- s ) nip gs-table>> nth-unsafe ; inline
+
+: bc-shift ( i c bm -- s ) bc-table>> at dup 1 ? + ; inline
+
+: do-shift ( pos i c bm -- newpos )
+    [ gs-shift ] [ bc-shift ] bi-curry 2bi max + ; inline
+
+: match? ( i1 s1 i2 s2 -- ? ) [ nth-unsafe ] 2bi@ = ; inline
+
+:: mismatch? ( s1 s2 pos len -- i/f )
+    len 1 - [ [ pos + s1 ] keep s2 match? not ]
+    find-last-integer ; inline
+
+:: (search-from) ( seq from bm -- i/f )
+    bm pattern>>      :> pat
+    pat length        :> plen
+    seq length plen - :> lim
+    from
+    [ dup lim <=
+      [ seq pat pick plen mismatch?
+        [ 2dup + seq nth-unsafe bm do-shift t ] [ f ] if*
+      ] [ drop f f ] if
+    ] loop ; inline
+
+PRIVATE>
+
+: <boyer-moore> ( pat -- bm )
+    dup <reversed> [ <bc-table> ] [ <gs-table> ] bi
+    boyer-moore boa ;
+
+GENERIC: search-from ( seq from obj -- i/f )
+
+M: sequence search-from
+    dup length zero?
+    [ 3drop 0 ] [ <boyer-moore> (search-from) ] if ;
+
+M: boyer-moore search-from (search-from) ;
+
+: search ( seq obj -- i/f ) [ 0 ] dip search-from ;
diff --git a/extra/boyer-moore/summary.txt b/extra/boyer-moore/summary.txt
new file mode 100644
index 0000000000..298fcc354b
--- /dev/null
+++ b/extra/boyer-moore/summary.txt
@@ -0,0 +1 @@
+Boyer-Moore string search algorithm
diff --git a/extra/boyer-moore/tags.txt b/extra/boyer-moore/tags.txt
new file mode 100644
index 0000000000..49b4f2328e
--- /dev/null
+++ b/extra/boyer-moore/tags.txt
@@ -0,0 +1 @@
+algorithms
diff --git a/extra/z-algorithm/authors.txt b/extra/z-algorithm/authors.txt
new file mode 100644
index 0000000000..e1702c7130
--- /dev/null
+++ b/extra/z-algorithm/authors.txt
@@ -0,0 +1 @@
+Dmitry Shubin
diff --git a/extra/z-algorithm/summary.txt b/extra/z-algorithm/summary.txt
new file mode 100644
index 0000000000..c7fadf9e81
--- /dev/null
+++ b/extra/z-algorithm/summary.txt
@@ -0,0 +1 @@
+Z algorithm for pattern preprocessing
diff --git a/extra/z-algorithm/tags.txt b/extra/z-algorithm/tags.txt
new file mode 100644
index 0000000000..49b4f2328e
--- /dev/null
+++ b/extra/z-algorithm/tags.txt
@@ -0,0 +1 @@
+algorithms
diff --git a/extra/z-algorithm/z-algorithm-docs.factor b/extra/z-algorithm/z-algorithm-docs.factor
new file mode 100644
index 0000000000..395dd4952d
--- /dev/null
+++ b/extra/z-algorithm/z-algorithm-docs.factor
@@ -0,0 +1,49 @@
+! Copyright (C) 2010 Dmitry Shubin.
+! See http://factorcode.org/license.txt for BSD license.
+USING: arrays help.markup help.syntax sequences ;
+IN: z-algorithm
+
+HELP: lcp
+{ $values
+  { "seq1" sequence } { "seq2" sequence }
+  { "n" "a non-negative integer" }
+}
+{ $description
+  "Outputs the length of longest common prefix of two sequences."
+} ;
+
+HELP: z-values
+{ $values
+  { "seq" sequence } { "Z" array }
+}
+{ $description
+  "Outputs an array of the same length as " { $snippet "seq" }
+  ", containing Z-values for given sequence. See "
+  { $link "z-algorithm" } " for details."
+} ;
+
+ARTICLE: "z-algorithm" "Z algorithm"
+{ $heading "Definition" }
+"Given the sequence " { $snippet "S" } " and the index "
+{ $snippet "i" } ", let " { $snippet "i" } "-th Z value of "
+{ $snippet "S" } " be the length of the longest subsequence of "
+{ $snippet "S" } " that starts at " { $snippet "i" }
+" and matches the prefix of " { $snippet "S" } "."
+
+{ $heading "Example" }
+"Here is an example for string " { $snippet "\"abababaca\"" } ":"
+{ $table
+  { { $snippet "i:" } "0" "1" "2" "3" "4" "5" "6" "7" "8" }
+  { { $snippet "S:" } "a" "b" "a" "b" "a" "b" "a" "c" "a" }
+  { { $snippet "Z:" } "9" "0" "5" "0" "3" "0" "1" "0" "1" }
+}
+
+{ $heading "Summary" }
+"The " { $vocab-link "z-algorithm" }
+" vocabulary implements algorithm for finding all Z values for sequence "
+{ $snippet "S" }
+" in linear time. In contrast to naive approach which takes "
+{ $snippet "Î(n^2)" } " time."
+;
+
+ABOUT: "z-algorithm"
diff --git a/extra/z-algorithm/z-algorithm-tests.factor b/extra/z-algorithm/z-algorithm-tests.factor
new file mode 100644
index 0000000000..8a8fd97480
--- /dev/null
+++ b/extra/z-algorithm/z-algorithm-tests.factor
@@ -0,0 +1,13 @@
+! Copyright (C) 2010 Dmitry Shubin.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test z-algorithm ;
+IN: z-algorithm.tests
+
+[ 0 ] [ "qwerty" "" lcp ] unit-test
+[ 0 ] [ "qwerty" "asdf" lcp ] unit-test
+[ 3 ] [ "qwerty" "qwe" lcp ] unit-test
+[ 3 ] [ "qwerty" "qwet" lcp ] unit-test
+
+[ { } ] [ "" z-values ] unit-test
+[ { 1 } ] [ "q" z-values ] unit-test
+[ { 9 0 5 0 3 0 1 0 1 } ] [ "abababaca" z-values ] unit-test
diff --git a/extra/z-algorithm/z-algorithm.factor b/extra/z-algorithm/z-algorithm.factor
new file mode 100644
index 0000000000..12306dc080
--- /dev/null
+++ b/extra/z-algorithm/z-algorithm.factor
@@ -0,0 +1,37 @@
+! Copyright (C) 2010 Dmitry Shubin.
+! See http://factorcode.org/license.txt for BSD license.
+USING: arrays combinators.smart kernel locals math math.ranges
+sequences sequences.private ;
+IN: z-algorithm
+
+: lcp ( seq1 seq2 -- n )
+    [ min-length ] 2keep mismatch [ nip ] when* ;
+
+<PRIVATE
+
+:: out-of-zbox ( seq Z l r k -- seq Z l r )
+    seq k tail-slice seq lcp :> Zk
+    Zk Z push seq Z
+    Zk 0 > [ k Zk k + 1 - ] [ l r ] if ; inline
+
+:: inside-zbox ( seq Z l r k -- seq Z l r )
+    k l - Z nth :> Zk'
+    r k - 1 +   :> b
+    seq Z Zk' b <
+    [ Zk' Z push l r ] ! still inside
+    [ seq r 1 + seq b [ tail-slice ] 2bi@ lcp :> q
+      q b + Z push k q r +
+    ] if ; inline
+
+: (z-value) ( seq Z l r k -- seq Z l r )
+    2dup < [ out-of-zbox ] [ inside-zbox ] if ; inline
+
+:: (z-values) ( seq -- Z )
+    V{ } clone 0 0 seq length :> ( Z l r len )
+    len Z push [ seq Z l r 1 len [a,b) [ (z-value) ] each ]
+    drop-outputs Z ; inline
+
+PRIVATE>
+
+: z-values ( seq -- Z )
+    dup length 0 > [ (z-values) ] when >array ;