From: Dmitry Shubin Date: Thu, 15 Apr 2010 23:49:55 +0000 (+0400) Subject: Add: Boyer-Moore string search algorithm X-Git-Tag: 0.97~4718^2~8^2~1 X-Git-Url: https://gitweb.factorcode.org/gitweb.cgi?p=factor.git;a=commitdiff_plain;h=38ef5919e892e952c1e505cd12d5d3020f655468 Add: Boyer-Moore string search algorithm --- diff --git a/extra/boyer-moore/authors.txt b/extra/boyer-moore/authors.txt new file mode 100644 index 0000000000..e1702c7130 --- /dev/null +++ b/extra/boyer-moore/authors.txt @@ -0,0 +1 @@ +Dmitry Shubin diff --git a/extra/boyer-moore/boyer-moore-docs.factor b/extra/boyer-moore/boyer-moore-docs.factor new file mode 100644 index 0000000000..994971e208 --- /dev/null +++ b/extra/boyer-moore/boyer-moore-docs.factor @@ -0,0 +1,59 @@ +! Copyright (C) 2010 Dmitry Shubin. +! See http://factorcode.org/license.txt for BSD license. +USING: help.markup help.syntax kernel sequences ; +IN: boyer-moore + +HELP: +{ $values + { "pat" sequence } { "bm" object } +} +{ $description + "Given a pattern performs pattern preprocessing and returns " + "results as an (opaque) object that is reusable across " + "searches in different sequences via " { $link search-from } + " generic word." +} ; + +HELP: search-from +{ $values + { "seq" sequence } + { "from" "a non-negative integer" } + { "obj" object } + { "i/f" "the index of first match or " { $link f } } +} +{ $description "Performs an attempt to find the first " + "occurence of pattern in " { $snippet "seq" } + " starting from " { $snippet "from" } " using " + "Boyer-Moore search algorithm. Output is the index " + "if the attempt was succeessful and " { $link f } + " otherwise." +} ; + +HELP: search +{ $values + { "seq" sequence } + { "obj" object } + { "i/f" "the index of first match or " { $link f } } +} +{ $description "A simpler variant of " { $link search-from } + " that starts searching from the beginning of the sequence." +} ; + +ARTICLE: "boyer-moore" "The Boyer-Moore algorithm" +{ $heading "Summary" } +"The " { $vocab-link "boyer-moore" } " vocabulary " +"implements a Boyer-Moore string search algorithm with " +"so-called 'strong good suffix shift rule'. Since agorithm is " +"alphabet-independent it is applicable to searching in any " +"collection that implements " { $links "sequence-protocol" } "." + +{ $heading "Complexity" } +"Let " { $snippet "n" } " and " { $snippet "m" } " be lengths " +"of the sequences being searched " { $emphasis "in" } " and " +{ $emphasis "for" } " respectively. Then searching runs in " +{ $snippet "O(n)" } " time in it's worst case using additional " +{ $snippet "O(m)" } " space. Preprocessing phase runs in " +{ $snippet "O(m)" } " time." +; + +ABOUT: "boyer-moore" diff --git a/extra/boyer-moore/boyer-moore-tests.factor b/extra/boyer-moore/boyer-moore-tests.factor new file mode 100644 index 0000000000..e444c35189 --- /dev/null +++ b/extra/boyer-moore/boyer-moore-tests.factor @@ -0,0 +1,10 @@ +! Copyright (C) 2010 Dmitry Shubin. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test boyer-moore ; +IN: boyer-moore.tests + +[ 0 ] [ "qwerty" "" search ] unit-test +[ 0 ] [ "" "" search ] unit-test +[ f ] [ "qw" "qwerty" search ] unit-test +[ 3 ] [ "qwerty" "r" search ] unit-test +[ 8 ] [ "qwerasdfqwer" 2 "qwe" search-from ] unit-test diff --git a/extra/boyer-moore/boyer-moore.factor b/extra/boyer-moore/boyer-moore.factor new file mode 100644 index 0000000000..4de88c4a1d --- /dev/null +++ b/extra/boyer-moore/boyer-moore.factor @@ -0,0 +1,76 @@ +! Copyright (C) 2010 Dmitry Shubin. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs kernel locals math math.order +math.ranges sequences sequences.private z-algorithm ; +IN: boyer-moore + + ] [ [1,b) ] bi ] keep pick + [ (normal-suffixes) ] 2curry each ; inline + +:: (partial-suffixes) ( len old elt i -- len old/new old ) + len elt i 1 + = [ len elt - ] [ old ] if old ; inline + +: partial-suffixes ( zs -- ss ) + [ length dup ] [ ] bi + [ (partial-suffixes) ] map-index 2nip ; inline + +: ( seq -- table ) + z-values [ partial-suffixes ] [ normal-suffixes ] bi + [ [ nip ] when* ] 2map reverse! ; inline + +: insert-bc-shift ( table elt len i -- table ) + 1 + swap - swap pick 2dup key? + [ 3drop ] [ set-at ] if ; inline + +: ( seq -- table ) + H{ } clone swap [ length ] keep + [ insert-bc-shift ] with each-index ; inline + +TUPLE: boyer-moore pattern bc-table gs-table ; + +: gs-shift ( i c bm -- s ) nip gs-table>> nth-unsafe ; inline + +: bc-shift ( i c bm -- s ) bc-table>> at dup 1 ? + ; inline + +: do-shift ( pos i c bm -- newpos ) + [ gs-shift ] [ bc-shift ] bi-curry 2bi max + ; inline + +: match? ( i1 s1 i2 s2 -- ? ) [ nth-unsafe ] 2bi@ = ; inline + +:: mismatch? ( s1 s2 pos len -- i/f ) + len 1 - [ [ pos + s1 ] keep s2 match? not ] + find-last-integer ; inline + +:: (search-from) ( seq from bm -- i/f ) + bm pattern>> :> pat + pat length :> plen + seq length plen - :> lim + from + [ dup lim <= + [ seq pat pick plen mismatch? + [ 2dup + seq nth-unsafe bm do-shift t ] [ f ] if* + ] [ drop f f ] if + ] loop ; inline + +PRIVATE> + +: ( pat -- bm ) + dup [ ] [ ] bi + boyer-moore boa ; + +GENERIC: search-from ( seq from obj -- i/f ) + +M: sequence search-from + dup length zero? + [ 3drop 0 ] [ (search-from) ] if ; + +M: boyer-moore search-from (search-from) ; + +: search ( seq obj -- i/f ) [ 0 ] dip search-from ; diff --git a/extra/boyer-moore/summary.txt b/extra/boyer-moore/summary.txt new file mode 100644 index 0000000000..298fcc354b --- /dev/null +++ b/extra/boyer-moore/summary.txt @@ -0,0 +1 @@ +Boyer-Moore string search algorithm diff --git a/extra/boyer-moore/tags.txt b/extra/boyer-moore/tags.txt new file mode 100644 index 0000000000..49b4f2328e --- /dev/null +++ b/extra/boyer-moore/tags.txt @@ -0,0 +1 @@ +algorithms diff --git a/extra/z-algorithm/authors.txt b/extra/z-algorithm/authors.txt new file mode 100644 index 0000000000..e1702c7130 --- /dev/null +++ b/extra/z-algorithm/authors.txt @@ -0,0 +1 @@ +Dmitry Shubin diff --git a/extra/z-algorithm/summary.txt b/extra/z-algorithm/summary.txt new file mode 100644 index 0000000000..c7fadf9e81 --- /dev/null +++ b/extra/z-algorithm/summary.txt @@ -0,0 +1 @@ +Z algorithm for pattern preprocessing diff --git a/extra/z-algorithm/tags.txt b/extra/z-algorithm/tags.txt new file mode 100644 index 0000000000..49b4f2328e --- /dev/null +++ b/extra/z-algorithm/tags.txt @@ -0,0 +1 @@ +algorithms diff --git a/extra/z-algorithm/z-algorithm-docs.factor b/extra/z-algorithm/z-algorithm-docs.factor new file mode 100644 index 0000000000..395dd4952d --- /dev/null +++ b/extra/z-algorithm/z-algorithm-docs.factor @@ -0,0 +1,49 @@ +! Copyright (C) 2010 Dmitry Shubin. +! See http://factorcode.org/license.txt for BSD license. +USING: arrays help.markup help.syntax sequences ; +IN: z-algorithm + +HELP: lcp +{ $values + { "seq1" sequence } { "seq2" sequence } + { "n" "a non-negative integer" } +} +{ $description + "Outputs the length of longest common prefix of two sequences." +} ; + +HELP: z-values +{ $values + { "seq" sequence } { "Z" array } +} +{ $description + "Outputs an array of the same length as " { $snippet "seq" } + ", containing Z-values for given sequence. See " + { $link "z-algorithm" } " for details." +} ; + +ARTICLE: "z-algorithm" "Z algorithm" +{ $heading "Definition" } +"Given the sequence " { $snippet "S" } " and the index " +{ $snippet "i" } ", let " { $snippet "i" } "-th Z value of " +{ $snippet "S" } " be the length of the longest subsequence of " +{ $snippet "S" } " that starts at " { $snippet "i" } +" and matches the prefix of " { $snippet "S" } "." + +{ $heading "Example" } +"Here is an example for string " { $snippet "\"abababaca\"" } ":" +{ $table + { { $snippet "i:" } "0" "1" "2" "3" "4" "5" "6" "7" "8" } + { { $snippet "S:" } "a" "b" "a" "b" "a" "b" "a" "c" "a" } + { { $snippet "Z:" } "9" "0" "5" "0" "3" "0" "1" "0" "1" } +} + +{ $heading "Summary" } +"The " { $vocab-link "z-algorithm" } +" vocabulary implements algorithm for finding all Z values for sequence " +{ $snippet "S" } +" in linear time. In contrast to naive approach which takes " +{ $snippet "Θ(n^2)" } " time." +; + +ABOUT: "z-algorithm" diff --git a/extra/z-algorithm/z-algorithm-tests.factor b/extra/z-algorithm/z-algorithm-tests.factor new file mode 100644 index 0000000000..8a8fd97480 --- /dev/null +++ b/extra/z-algorithm/z-algorithm-tests.factor @@ -0,0 +1,13 @@ +! Copyright (C) 2010 Dmitry Shubin. +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test z-algorithm ; +IN: z-algorithm.tests + +[ 0 ] [ "qwerty" "" lcp ] unit-test +[ 0 ] [ "qwerty" "asdf" lcp ] unit-test +[ 3 ] [ "qwerty" "qwe" lcp ] unit-test +[ 3 ] [ "qwerty" "qwet" lcp ] unit-test + +[ { } ] [ "" z-values ] unit-test +[ { 1 } ] [ "q" z-values ] unit-test +[ { 9 0 5 0 3 0 1 0 1 } ] [ "abababaca" z-values ] unit-test diff --git a/extra/z-algorithm/z-algorithm.factor b/extra/z-algorithm/z-algorithm.factor new file mode 100644 index 0000000000..12306dc080 --- /dev/null +++ b/extra/z-algorithm/z-algorithm.factor @@ -0,0 +1,37 @@ +! Copyright (C) 2010 Dmitry Shubin. +! See http://factorcode.org/license.txt for BSD license. +USING: arrays combinators.smart kernel locals math math.ranges +sequences sequences.private ; +IN: z-algorithm + +: lcp ( seq1 seq2 -- n ) + [ min-length ] 2keep mismatch [ nip ] when* ; + + Zk + Zk Z push seq Z + Zk 0 > [ k Zk k + 1 - ] [ l r ] if ; inline + +:: inside-zbox ( seq Z l r k -- seq Z l r ) + k l - Z nth :> Zk' + r k - 1 + :> b + seq Z Zk' b < + [ Zk' Z push l r ] ! still inside + [ seq r 1 + seq b [ tail-slice ] 2bi@ lcp :> q + q b + Z push k q r + + ] if ; inline + +: (z-value) ( seq Z l r k -- seq Z l r ) + 2dup < [ out-of-zbox ] [ inside-zbox ] if ; inline + +:: (z-values) ( seq -- Z ) + V{ } clone 0 0 seq length :> ( Z l r len ) + len Z push [ seq Z l r 1 len [a,b) [ (z-value) ] each ] + drop-outputs Z ; inline + +PRIVATE> + +: z-values ( seq -- Z ) + dup length 0 > [ (z-values) ] when >array ;