--- /dev/null
+Alec Berryman
--- /dev/null
+USING: help.markup help.syntax kernel math ;
+IN: bloom-filters
+
+HELP: <bloom-filter>
+{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
+ { "capacity" "The expected number of objects in the set. A positive " { $link integer } "." }
+ { "bloom-filter" bloom-filter } }
+{ $description "Creates an empty Bloom filter." }
+{ $errors "Throws a " { $link invalid-size } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-capacity } " when input is invalid." } ;
+
+
+HELP: bloom-filter-insert
+{ $values { "object" object }
+ { "bloom-filter" bloom-filter } }
+{ $description "Records the item as a member of the filter." }
+{ $side-effects "bloom-filter" } ;
+
+HELP: bloom-filter-member?
+{ $values { "object" object }
+ { "bloom-filter" bloom-filter }
+ { "?" boolean } }
+{ $description "Returns " { $link t } " if the object may be a member of the Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ;
+
+HELP: bloom-filter
+{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ;
+
+ARTICLE: "bloom-filters" "Bloom filters"
+"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
+$nl
+"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report that an item is a member of the set."
+$nl
+"Bloom filters cannot be resized and do not support removal."
+$nl
+{ $subsections
+ <bloom-filter>
+ bloom-filter-insert
+ bloom-filter-member?
+} ;
+
+ABOUT: "bloom-filters"
--- /dev/null
+USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
+math random sequences tools.test ;
+IN: bloom-filters.tests
+
+{ { 200 5 } } [ { 100 7 } { 200 5 } smaller-second ] unit-test
+{ { 200 5 } } [ { 200 5 } { 100 7 } smaller-second ] unit-test
+
+! The sizing information was generated using the subroutine
+! calculate_shortest_filter_length from
+! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
+
+! Test bloom-filter creation
+{ 47965 } [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
+{ 7 47965 } [ 0.01 5000 size-bloom-filter ] unit-test
+{ 7 } [ 0.01 5000 <bloom-filter> #hashes>> ] unit-test
+{ 47965 } [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
+{ 5000 } [ 0.01 5000 <bloom-filter> capacity>> ] unit-test
+{ 0 } [ 0.01 5000 <bloom-filter> count>> ] unit-test
+
+! Should return the fewest hashes to satisfy the bits requested, not the most.
+{ 32 } [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
+{ 32 } [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
+{ 4 32 } [ 0.05 5 size-bloom-filter ] unit-test
+
+! This is a lot of bits.
+[ 0.00000001 max-array-capacity size-bloom-filter ] [ invalid-size? ] must-fail-with
+
+! Other error conditions.
+[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.5 0 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
+[ 0.5 -5 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
+
+! Should not generate bignum hash codes. Enhanced double hashing may generate a
+! lot of hash codes, and it's better to do this earlier than later.
+{ t } [ 10000 <iota> [ double-hashcodes [ fixnum? ] both? ] all? ] unit-test
+
+: empty-bloom-filter ( -- bloom-filter )
+ 0.01 2000 <bloom-filter> ;
+
+{ 1 } [ empty-bloom-filter dup increment-count count>> ] unit-test
+
+: basic-insert-test-setup ( -- bloom-filter )
+ 1 empty-bloom-filter [ bloom-filter-insert ] keep ;
+
+! Basic tests that insert does something
+{ t } [ basic-insert-test-setup bits>> [ ] any? ] unit-test
+{ 1 } [ basic-insert-test-setup count>> ] unit-test
+
+: non-empty-bloom-filter ( -- bloom-filter )
+ 1000 <iota>
+ empty-bloom-filter
+ [ [ bloom-filter-insert ] curry each ] keep ;
+
+: full-bloom-filter ( -- bloom-filter )
+ 2000 <iota>
+ empty-bloom-filter
+ [ [ bloom-filter-insert ] curry each ] keep ;
+
+! Should find what we put in there.
+{ t } [ 2000 <iota>
+ full-bloom-filter
+ [ bloom-filter-member? ] curry map
+ [ ] all?
+] unit-test
+
+! We shouldn't have more than 0.01 false-positive rate.
+{ t } [ 1000 <iota> [ drop most-positive-fixnum random 1000 + ] map
+ full-bloom-filter
+ [ bloom-filter-member? ] curry map
+ [ ] count
+ ! TODO: This should be 10, but the false positive rate is currently very
+ ! high. 300 is large enough not to prevent builds from succeeding.
+ 300 <=
+] unit-test
--- /dev/null
+! Copyright (C) 2009 Alec Berryman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors arrays bit-arrays fry kernel kernel.private
+layouts locals math math.functions math.order math.private
+multiline sequences sequences.private typed ;
+FROM: math.ranges => [1,b] ;
+
+IN: bloom-filters
+
+/*
+
+TODO:
+
+- The false positive rate is 10x what it should be, based on
+ informal testing. Better object hashes or a better method of
+ generating extra hash codes would help. Another way is to
+ increase the number of bits used.
+
+ - Try something smarter than the bitwise complement for a
+ second hash code.
+
+ - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
+ makes a case for http://murmurhash.googlepages.com/ instead
+ of enhanced double-hashing.
+
+ - Be sure to adjust the test that asserts the number of false
+ positives isn't unreasonable.
+
+- Could round bits up to next power of two and use wrap instead
+ of mod. This would cost a lot of bits on 32-bit platforms,
+ though, and limit the bit-array to 8MB.
+
+- Should allow user to specify the hash codes, either as inputs
+ to enhanced double hashing or for direct use.
+
+- Support for serialization.
+
+- Wrappers for combining filters.
+
+- Should we signal an error when inserting past the number of
+ objects the filter is sized for? The filter will continue to
+ work, just not very well.
+
+*/
+
+TUPLE: bloom-filter
+{ #hashes fixnum read-only }
+{ bits bit-array read-only }
+{ capacity fixnum read-only }
+{ count fixnum } ;
+
+ERROR: invalid-size size ;
+ERROR: invalid-error-rate error-rate ;
+ERROR: invalid-capacity capacity ;
+
+<PRIVATE
+
+:: bits-to-satisfy-error-rate ( hashes error objects -- size )
+ objects hashes * neg error hashes recip ^ 1 swap - log /
+ ceiling >integer ;
+
+! 100 hashes ought to be enough for anybody.
+: #hashes-range ( -- range )
+ 100 [1,b] ;
+
+! { #hashes #bits }
+: identity-configuration ( -- 2seq )
+ 0 max-array-capacity 2array ;
+
+: smaller-second ( 2seq 2seq -- 2seq )
+ [ [ second ] bi@ <= ] most ;
+
+! If the number of hashes isn't positive, we haven't found
+! anything smaller than the identity configuration.
+: check-hashes ( 2seq -- 2seq )
+ dup first 0 <= [ invalid-size ] when ;
+
+! The consensus on the tradeoff between increasing the number of
+! bits and increasing the number of hash functions seems to be
+! "go for the smallest number of bits", probably because most
+! implementations just generate one hash value and cheaply
+! mangle it into the number of hashes they need. I have not
+! seen any usage studies from the implementations that made this
+! tradeoff to support it, and I haven't done my own, but we'll
+! go with it anyway.
+: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
+ [ #hashes-range identity-configuration ] 2dip '[
+ dup _ _ bits-to-satisfy-error-rate
+ 2array smaller-second
+ ] reduce check-hashes first2 ;
+
+: check-capacity ( capacity -- capacity )
+ dup 0 <= [ invalid-capacity ] when ;
+
+: check-error-rate ( error-rate -- error-rate )
+ dup [ 0 after? ] [ 1 before? ] bi and
+ [ invalid-error-rate ] unless ;
+
+PRIVATE>
+
+: <bloom-filter> ( error-rate capacity -- bloom-filter )
+ [ check-error-rate ] [ check-capacity ] bi*
+ [ size-bloom-filter <bit-array> ] keep
+ 0 ! initially empty
+ bloom-filter boa ;
+
+<PRIVATE
+
+! See "Bloom Filters in Probabilistic Verification" by Peter C.
+! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
+! Double Hashing":
+! http://www.ccs.neu.edu/home/pete/research/bloom-filters-verification.html
+! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
+: combine-hashcodes ( index hash0 hash1 -- hash )
+ { fixnum fixnum fixnum } declare
+ [ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
+ [ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
+
+: double-hashcodes ( object -- hash0 hash1 )
+ hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
+
+: increment-count ( bloom-filter -- )
+ [ 1 + ] change-count drop ; inline
+
+: #hashes-and-length ( bloom-filter -- #hashes length )
+ [ #hashes>> ] [ bits>> length ] bi ; inline
+
+: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
+ [ double-hashcodes ] [ #hashes-and-length ] bi*
+ -rotd '[ _ _ combine-hashcodes _ mod ] ; inline
+
+PRIVATE>
+
+TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
+ [ increment-count ]
+ [ relevant-indices ]
+ [ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
+ tri compose each-integer ;
+
+TYPED: bloom-filter-member? ( object bloom-filter: bloom-filter -- ? )
+ [ relevant-indices ]
+ [ bits>> [ nth-unsafe ] curry ]
+ bi compose all-integers? ;
--- /dev/null
+Bloom filters
--- /dev/null
+collections
--- /dev/null
+John Benediktsson
--- /dev/null
+USING: byte-arrays checksums help.markup help.syntax kernel ;
+IN: cuckoo-filters
+
+HELP: cuckoo-insert
+{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
+{ $description "Insert the data into the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data was inserted." }
+{ $notes "Attempting to insert data twice will result in the hashed fingerprint of the data appearing twice and the " { $link cuckoo-filter } " size being incremented twice." } ;
+
+HELP: cuckoo-lookup
+{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
+{ $description "Lookup the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be a member. This is a probabilistic test, meaning there is a possibility of false positives." } ;
+
+HELP: cuckoo-delete
+{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
+{ $description "Remove the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be removed." } ;
+
+ARTICLE: "cuckoo-filters" "Cuckoo Filters"
+"Cuckoo Filters are probabilistic data structures similar to Bloom Filters that provide support for removing elements without significantly degrading space and performance."
+$nl
+"Instead of storing the elements themselves, it stores a fingerprint obtained by using a " { $link checksum } ". This allows for item removal without false negatives (assuming you do not try to remove an item not contained in the filter)."
+$nl
+"For applications that store many items and target low false-positive rates, Cuckoo Filters can have a lower space overhead than Bloom Filters."
+$nl
+"More information is available in the paper by Fan, Andersen, Kaminsky, and Mitzenmacher titled \"Cuckoo Filter: Practically Better Than Bloom\":"
+$nl
+{ $url "http://www.pdl.cmu.edu/PDL-FTP/FS/cuckoo-conext2014.pdf" } ;
+
+ABOUT: "cuckoo-filters"
--- /dev/null
+USING: accessors combinators combinators.short-circuit
+cuckoo-filters kernel math.parser sequences tools.test ;
+
+{ t 1 t t f 0 } [
+ "factor" 100 <cuckoo-filter> {
+ [ cuckoo-insert ]
+ [ nip size>> ]
+ [ cuckoo-lookup ]
+ [ cuckoo-delete ]
+ [ cuckoo-lookup ]
+ [ nip size>> ]
+ } 2cleave
+] unit-test
+
+{ 250,000 250,000 0 } [
+ 250,000 <cuckoo-filter>
+ 250,000 [ number>string ] { } map-integers
+ [
+ [
+ {
+ [ over cuckoo-lookup not ]
+ [ over cuckoo-insert ]
+ } 1&&
+ ] count swap
+ ]
+ [ [ over cuckoo-lookup ] count swap ]
+ [ [ over cuckoo-delete drop ] each ] tri
+ size>>
+] unit-test
--- /dev/null
+! Copyright (C) 2016 John Benediktsson
+! See http://factorcode.org/license.txt for BSD license
+
+USING: accessors alien alien.c-types alien.data arrays checksums
+checksums.sha combinators.short-circuit kernel locals math
+math.bitwise random sequences ;
+
+IN: cuckoo-filters
+
+<PRIVATE
+
+! The maximum number of times we kick down items/displace from
+! their buckets
+CONSTANT: max-cuckoo-count 500
+
+! The maximum load factor we allow before growing the capacity
+CONSTANT: max-load-factor 0.96
+
+! The number of fingerprints to store in each bucket
+CONSTANT: bucket-size 4
+
+: #buckets ( capacity -- #buckets )
+ [ bucket-size /i next-power-of-2 ] keep
+ over / bucket-size / max-load-factor > [ 2 * ] when ;
+
+: <cuckoo-buckets> ( capacity -- buckets )
+ #buckets [ bucket-size f <array> ] replicate ;
+
+: hash-index ( hash -- fingerprint index )
+ 4 over <displaced-alien> [ uint deref ] bi@ ;
+
+: alt-index ( fingerprint index -- alt-index )
+ [ 0x5bd1e995 w* ] [ bitxor ] bi* ;
+
+: hash-indices ( bytes cuckoo-filter -- fingerprint index alt-index )
+ checksum>> checksum-bytes hash-index 2dup alt-index ;
+
+: bucket-lookup ( fingerprint bucket -- ? )
+ member? ;
+
+: bucket-insert ( fingerprint bucket -- ? )
+ dup [ not ] find drop [ swap set-nth t ] [ 2drop f ] if* ;
+
+: bucket-delete ( fingerprint bucket -- ? )
+ [ f ] 2dip [ index ] keep over [ set-nth t ] [ 3drop f ] if ;
+
+: bucket-swap ( fingerprint bucket -- fingerprint' )
+ [ length random ] keep [ swap ] change-nth ;
+
+PRIVATE>
+
+TUPLE: cuckoo-filter buckets checksum size ;
+
+: <cuckoo-filter> ( capacity -- cuckoo-filter )
+ <cuckoo-buckets> sha1 0 cuckoo-filter boa ;
+
+:: cuckoo-insert ( bytes cuckoo-filter -- ? )
+ bytes cuckoo-filter hash-indices :> ( fp! i1 i2 )
+ cuckoo-filter buckets>> :> buckets
+ buckets length :> n
+ {
+ [ fp i1 n mod buckets nth bucket-insert ]
+ [ fp i2 n mod buckets nth bucket-insert ]
+ } 0|| [
+ t
+ ] [
+ 2 random zero? i1 i2 ? :> i!
+ max-cuckoo-count [
+ drop
+ fp i n mod buckets nth bucket-swap fp!
+ fp i alt-index i!
+
+ fp i n mod buckets nth bucket-insert
+ ] find-integer >boolean
+ ] if
+ dup [ cuckoo-filter [ 1 + ] change-size drop ] when ;
+
+:: cuckoo-lookup ( bytes cuckoo-filter -- ? )
+ bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
+ cuckoo-filter buckets>> :> buckets
+ buckets length :> n
+ {
+ [ fp i1 n mod buckets nth bucket-lookup ]
+ [ fp i2 n mod buckets nth bucket-lookup ]
+ } 0|| ;
+
+:: cuckoo-delete ( bytes cuckoo-filter -- ? )
+ bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
+ cuckoo-filter buckets>> :> buckets
+ buckets length :> n
+ {
+ [ fp i1 n mod buckets nth bucket-delete ]
+ [ fp i2 n mod buckets nth bucket-delete ]
+ } 0||
+ dup [ cuckoo-filter [ 1 - ] change-size drop ] when ;
--- /dev/null
+Cuckoo filters
--- /dev/null
+collections
+++ /dev/null
-Alec Berryman
+++ /dev/null
-USING: help.markup help.syntax kernel math ;
-IN: bloom-filters
-
-HELP: <bloom-filter>
-{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
- { "capacity" "The expected number of object in the set. A positive " { $link integer } "." }
- { "bloom-filter" bloom-filter } }
-{ $description "Creates an empty Bloom filter." }
-{ $errors "Throws a " { $link invalid-size } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-capacity } " when input is invalid." } ;
-
-
-HELP: bloom-filter-insert
-{ $values { "object" object }
- { "bloom-filter" bloom-filter } }
-{ $description "Records the item as a member of the filter." }
-{ $side-effects "bloom-filter" } ;
-
-HELP: bloom-filter-member?
-{ $values { "object" object }
- { "bloom-filter" bloom-filter }
- { "?" boolean } }
-{ $description "Returns " { $link t } " if the object may be a member of Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ;
-
-HELP: bloom-filter
-{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ;
-
-ARTICLE: "bloom-filters" "Bloom filters"
-"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
-$nl
-"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report than an item is a member of the set."
-$nl
-"Bloom filters cannot be resized and do not support removal."
-$nl
-{ $subsections
- <bloom-filter>
- bloom-filter-insert
- bloom-filter-member?
-} ;
-
-ABOUT: "bloom-filters"
+++ /dev/null
-USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
-math random sequences tools.test ;
-IN: bloom-filters.tests
-
-{ { 200 5 } } [ { 100 7 } { 200 5 } smaller-second ] unit-test
-{ { 200 5 } } [ { 200 5 } { 100 7 } smaller-second ] unit-test
-
-! The sizing information was generated using the subroutine
-! calculate_shortest_filter_length from
-! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
-
-! Test bloom-filter creation
-{ 47965 } [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
-{ 7 47965 } [ 0.01 5000 size-bloom-filter ] unit-test
-{ 7 } [ 0.01 5000 <bloom-filter> #hashes>> ] unit-test
-{ 47965 } [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
-{ 5000 } [ 0.01 5000 <bloom-filter> capacity>> ] unit-test
-{ 0 } [ 0.01 5000 <bloom-filter> count>> ] unit-test
-
-! Should return the fewest hashes to satisfy the bits requested, not the most.
-{ 32 } [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
-{ 32 } [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
-{ 4 32 } [ 0.05 5 size-bloom-filter ] unit-test
-
-! This is a lot of bits.
-[ 0.00000001 max-array-capacity size-bloom-filter ] [ invalid-size? ] must-fail-with
-
-! Other error conditions.
-[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ 0.5 0 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
-[ 0.5 -5 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
-
-! Should not generate bignum hash codes. Enhanced double hashing may generate a
-! lot of hash codes, and it's better to do this earlier than later.
-{ t } [ 10000 <iota> [ double-hashcodes [ fixnum? ] both? ] all? ] unit-test
-
-: empty-bloom-filter ( -- bloom-filter )
- 0.01 2000 <bloom-filter> ;
-
-{ 1 } [ empty-bloom-filter dup increment-count count>> ] unit-test
-
-: basic-insert-test-setup ( -- bloom-filter )
- 1 empty-bloom-filter [ bloom-filter-insert ] keep ;
-
-! Basic tests that insert does something
-{ t } [ basic-insert-test-setup bits>> [ ] any? ] unit-test
-{ 1 } [ basic-insert-test-setup count>> ] unit-test
-
-: non-empty-bloom-filter ( -- bloom-filter )
- 1000 <iota>
- empty-bloom-filter
- [ [ bloom-filter-insert ] curry each ] keep ;
-
-: full-bloom-filter ( -- bloom-filter )
- 2000 <iota>
- empty-bloom-filter
- [ [ bloom-filter-insert ] curry each ] keep ;
-
-! Should find what we put in there.
-{ t } [ 2000 <iota>
- full-bloom-filter
- [ bloom-filter-member? ] curry map
- [ ] all?
-] unit-test
-
-! We shouldn't have more than 0.01 false-positive rate.
-{ t } [ 1000 <iota> [ drop most-positive-fixnum random 1000 + ] map
- full-bloom-filter
- [ bloom-filter-member? ] curry map
- [ ] count
- ! TODO: This should be 10, but the false positive rate is currently very
- ! high. 300 is large enough not to prevent builds from succeeding.
- 300 <=
-] unit-test
+++ /dev/null
-! Copyright (C) 2009 Alec Berryman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays bit-arrays fry kernel kernel.private
-layouts locals math math.functions math.order math.private
-multiline sequences sequences.private typed ;
-FROM: math.ranges => [1,b] ;
-
-IN: bloom-filters
-
-/*
-
-TODO:
-
-- The false positive rate is 10x what it should be, based on
- informal testing. Better object hashes or a better method of
- generating extra hash codes would help. Another way is to
- increase the number of bits used.
-
- - Try something smarter than the bitwise complement for a
- second hash code.
-
- - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
- makes a case for http://murmurhash.googlepages.com/ instead
- of enhanced double-hashing.
-
- - Be sure to adjust the test that asserts the number of false
- positives isn't unreasonable.
-
-- Could round bits up to next power of two and use wrap instead
- of mod. This would cost a lot of bits on 32-bit platforms,
- though, and limit the bit-array to 8MB.
-
-- Should allow user to specify the hash codes, either as inputs
- to enhanced double hashing or for direct use.
-
-- Support for serialization.
-
-- Wrappers for combining filters.
-
-- Should we signal an error when inserting past the number of
- objects the filter is sized for? The filter will continue to
- work, just not very well.
-
-*/
-
-TUPLE: bloom-filter
-{ #hashes fixnum read-only }
-{ bits bit-array read-only }
-{ capacity fixnum read-only }
-{ count fixnum } ;
-
-ERROR: invalid-size size ;
-ERROR: invalid-error-rate error-rate ;
-ERROR: invalid-capacity capacity ;
-
-<PRIVATE
-
-:: bits-to-satisfy-error-rate ( hashes error objects -- size )
- objects hashes * neg error hashes recip ^ 1 swap - log /
- ceiling >integer ;
-
-! 100 hashes ought to be enough for anybody.
-: #hashes-range ( -- range )
- 100 [1,b] ;
-
-! { #hashes #bits }
-: identity-configuration ( -- 2seq )
- 0 max-array-capacity 2array ;
-
-: smaller-second ( 2seq 2seq -- 2seq )
- [ [ second ] bi@ <= ] most ;
-
-! If the number of hashes isn't positive, we haven't found
-! anything smaller than the identity configuration.
-: check-hashes ( 2seq -- 2seq )
- dup first 0 <= [ invalid-size ] when ;
-
-! The consensus on the tradeoff between increasing the number of
-! bits and increasing the number of hash functions seems to be
-! "go for the smallest number of bits", probably because most
-! implementations just generate one hash value and cheaply
-! mangle it into the number of hashes they need. I have not
-! seen any usage studies from the implementations that made this
-! tradeoff to support it, and I haven't done my own, but we'll
-! go with it anyway.
-: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
- [ #hashes-range identity-configuration ] 2dip '[
- dup _ _ bits-to-satisfy-error-rate
- 2array smaller-second
- ] reduce check-hashes first2 ;
-
-: check-capacity ( capacity -- capacity )
- dup 0 <= [ invalid-capacity ] when ;
-
-: check-error-rate ( error-rate -- error-rate )
- dup [ 0 after? ] [ 1 before? ] bi and
- [ invalid-error-rate ] unless ;
-
-PRIVATE>
-
-: <bloom-filter> ( error-rate capacity -- bloom-filter )
- [ check-error-rate ] [ check-capacity ] bi*
- [ size-bloom-filter <bit-array> ] keep
- 0 ! initially empty
- bloom-filter boa ;
-
-<PRIVATE
-
-! See "Bloom Filters in Probabilistic Verification" by Peter C.
-! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
-! Double Hashing":
-! http://www.ccs.neu.edu/home/pete/research/bloom-filters-verification.html
-! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
-: combine-hashcodes ( index hash0 hash1 -- hash )
- { fixnum fixnum fixnum } declare
- [ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
- [ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
-
-: double-hashcodes ( object -- hash0 hash1 )
- hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
-
-: increment-count ( bloom-filter -- )
- [ 1 + ] change-count drop ; inline
-
-: #hashes-and-length ( bloom-filter -- #hashes length )
- [ #hashes>> ] [ bits>> length ] bi ; inline
-
-: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
- [ double-hashcodes ] [ #hashes-and-length ] bi*
- -rotd '[ _ _ combine-hashcodes _ mod ] ; inline
-
-PRIVATE>
-
-TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
- [ increment-count ]
- [ relevant-indices ]
- [ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
- tri compose each-integer ;
-
-TYPED: bloom-filter-member? ( object bloom-filter: bloom-filter -- ? )
- [ relevant-indices ]
- [ bits>> [ nth-unsafe ] curry ]
- bi compose all-integers? ;
+++ /dev/null
-Bloom filters
+++ /dev/null
-collections
+++ /dev/null
-John Benediktsson
+++ /dev/null
-USING: byte-arrays checksums help.markup help.syntax kernel ;
-IN: cuckoo-filters
-
-HELP: cuckoo-insert
-{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
-{ $description "Insert the data into the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data was inserted." }
-{ $notes "Attempting to insert data twice will result in the hashed fingerprint of the data appearing twice and the " { $link cuckoo-filter } " size being incremented twice." } ;
-
-HELP: cuckoo-lookup
-{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
-{ $description "Lookup the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be a member. This is a probabilistic test, meaning there is a possibility of false positives." } ;
-
-HELP: cuckoo-delete
-{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
-{ $description "Remove the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be removed." } ;
-
-ARTICLE: "cuckoo-filters" "Cuckoo Filters"
-"Cuckoo Filters are probabilistic data structures similar to Bloom Filters that provides support for removing elements without significantly degrading space and performance."
-$nl
-"Instead of storing the elements themselves, it stores a fingerprint obtained by using a " { $link checksum } ". This allows for item removal without false negatives (assuming you do not try and remove an item not contained in the filter."
-$nl
-"For applications that store many items and target low false-positive rates, Cuckoo Filters can have a lower space overhead than Bloom Filters."
-$nl
-"More information is available in the paper by Andersen, Kaminsky, and Mitzenmacher titled \"Cuckoo Filter: Practically Better Than Bloom\":"
-$nl
-{ $url "http://www.pdl.cmu.edu/PDL-FTP/FS/cuckoo-conext2014.pdf" } ;
-
-ABOUT: "cuckoo-filters"
+++ /dev/null
-USING: accessors combinators combinators.short-circuit
-cuckoo-filters kernel math.parser sequences tools.test ;
-
-{ t 1 t t f 0 } [
- "factor" 100 <cuckoo-filter> {
- [ cuckoo-insert ]
- [ nip size>> ]
- [ cuckoo-lookup ]
- [ cuckoo-delete ]
- [ cuckoo-lookup ]
- [ nip size>> ]
- } 2cleave
-] unit-test
-
-{ 250,000 250,000 0 } [
- 250,000 <cuckoo-filter>
- 250,000 [ number>string ] { } map-integers
- [
- [
- {
- [ over cuckoo-lookup not ]
- [ over cuckoo-insert ]
- } 1&&
- ] count swap
- ]
- [ [ over cuckoo-lookup ] count swap ]
- [ [ over cuckoo-delete drop ] each ] tri
- size>>
-] unit-test
+++ /dev/null
-! Copyright (C) 2016 John Benediktsson
-! See http://factorcode.org/license.txt for BSD license
-
-USING: accessors alien alien.c-types alien.data arrays checksums
-checksums.sha combinators.short-circuit kernel locals math
-math.bitwise random sequences ;
-
-IN: cuckoo-filters
-
-<PRIVATE
-
-! The maximum number of times we kick down items/displace from
-! their buckets
-CONSTANT: max-cuckoo-count 500
-
-! The maximum load factor we allow before growing the capacity
-CONSTANT: max-load-factor 0.96
-
-! The number of fingerprint to store in each bucket
-CONSTANT: bucket-size 4
-
-: #buckets ( capacity -- #buckets )
- [ bucket-size /i next-power-of-2 ] keep
- over / bucket-size / max-load-factor > [ 2 * ] when ;
-
-: <cuckoo-buckets> ( capacity -- buckets )
- #buckets [ bucket-size f <array> ] replicate ;
-
-: hash-index ( hash -- fingerprint index )
- 4 over <displaced-alien> [ uint deref ] bi@ ;
-
-: alt-index ( fingerprint index -- alt-index )
- [ 0x5bd1e995 w* ] [ bitxor ] bi* ;
-
-: hash-indices ( bytes cuckoo-filter -- fingerprint index alt-index )
- checksum>> checksum-bytes hash-index 2dup alt-index ;
-
-: bucket-lookup ( fingerprint bucket -- ? )
- member? ;
-
-: bucket-insert ( fingerprint bucket -- ? )
- dup [ not ] find drop [ swap set-nth t ] [ 2drop f ] if* ;
-
-: bucket-delete ( fingerprint bucket -- ? )
- [ f ] 2dip [ index ] keep over [ set-nth t ] [ 3drop f ] if ;
-
-: bucket-swap ( fingerprint bucket -- fingerprint' )
- [ length random ] keep [ swap ] change-nth ;
-
-PRIVATE>
-
-TUPLE: cuckoo-filter buckets checksum size ;
-
-: <cuckoo-filter> ( capacity -- cuckoo-filter )
- <cuckoo-buckets> sha1 0 cuckoo-filter boa ;
-
-:: cuckoo-insert ( bytes cuckoo-filter -- ? )
- bytes cuckoo-filter hash-indices :> ( fp! i1 i2 )
- cuckoo-filter buckets>> :> buckets
- buckets length :> n
- {
- [ fp i1 n mod buckets nth bucket-insert ]
- [ fp i2 n mod buckets nth bucket-insert ]
- } 0|| [
- t
- ] [
- 2 random zero? i1 i2 ? :> i!
- max-cuckoo-count [
- drop
- fp i n mod buckets nth bucket-swap fp!
- fp i alt-index i!
-
- fp i n mod buckets nth bucket-insert
- ] find-integer >boolean
- ] if
- dup [ cuckoo-filter [ 1 + ] change-size drop ] when ;
-
-:: cuckoo-lookup ( bytes cuckoo-filter -- ? )
- bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
- cuckoo-filter buckets>> :> buckets
- buckets length :> n
- {
- [ fp i1 n mod buckets nth bucket-lookup ]
- [ fp i2 n mod buckets nth bucket-lookup ]
- } 0|| ;
-
-:: cuckoo-delete ( bytes cuckoo-filter -- ? )
- bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
- cuckoo-filter buckets>> :> buckets
- buckets length :> n
- {
- [ fp i1 n mod buckets nth bucket-delete ]
- [ fp i2 n mod buckets nth bucket-delete ]
- } 0||
- dup [ cuckoo-filter [ 1 - ] change-size drop ] when ;
+++ /dev/null
-Cuckoo filters
+++ /dev/null
-collections