--- /dev/null
+Alec Berryman
--- /dev/null
+USING: help.markup help.syntax kernel math ;
+IN: bloom-filters
+
+HELP: <bloom-filter>
+{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
+ { "capacity" "The expected number of objects in the set. A positive " { $link integer } "." }
+ { "bloom-filter" bloom-filter } }
+{ $description "Creates an empty Bloom filter." }
+{ $errors "Throws a " { $link invalid-size } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-capacity } " when input is invalid." } ;
+
+
+HELP: bloom-filter-insert
+{ $values { "object" object }
+ { "bloom-filter" bloom-filter } }
+{ $description "Records the item as a member of the filter." }
+{ $side-effects "bloom-filter" } ;
+
+HELP: bloom-filter-member?
+{ $values { "object" object }
+ { "bloom-filter" bloom-filter }
+ { "?" boolean } }
+{ $description "Returns " { $link t } " if the object may be a member of the Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ;
+
+HELP: bloom-filter
+{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ;
+
+ARTICLE: "bloom-filters" "Bloom filters"
+"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
+$nl
+"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report that an item is a member of the set."
+$nl
+"Bloom filters cannot be resized and do not support removal."
+$nl
+{ $subsections
+ <bloom-filter>
+ bloom-filter-insert
+ bloom-filter-member?
+} ;
+
+ABOUT: "bloom-filters"
--- /dev/null
+USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
+math random sequences tools.test ;
+IN: bloom-filters.tests
+
+{ { 200 5 } } [ { 100 7 } { 200 5 } smaller-second ] unit-test
+{ { 200 5 } } [ { 200 5 } { 100 7 } smaller-second ] unit-test
+
+! The sizing information was generated using the subroutine
+! calculate_shortest_filter_length from
+! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
+
+! Test bloom-filter creation
+{ 47965 } [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
+{ 7 47965 } [ 0.01 5000 size-bloom-filter ] unit-test
+{ 7 } [ 0.01 5000 <bloom-filter> #hashes>> ] unit-test
+{ 47965 } [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
+{ 5000 } [ 0.01 5000 <bloom-filter> capacity>> ] unit-test
+{ 0 } [ 0.01 5000 <bloom-filter> count>> ] unit-test
+
+! Should return the fewest hashes to satisfy the bits requested, not the most.
+{ 32 } [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
+{ 32 } [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
+{ 4 32 } [ 0.05 5 size-bloom-filter ] unit-test
+
+! This is a lot of bits.
+[ 0.00000001 max-array-capacity size-bloom-filter ] [ invalid-size? ] must-fail-with
+
+! Other error conditions.
+[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.5 0 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
+[ 0.5 -5 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
+
+! Should not generate bignum hash codes. Enhanced double hashing may generate a
+! lot of hash codes, and it's better to do this earlier than later.
+{ t } [ 10000 <iota> [ double-hashcodes [ fixnum? ] both? ] all? ] unit-test
+
+: empty-bloom-filter ( -- bloom-filter )
+ 0.01 2000 <bloom-filter> ;
+
+{ 1 } [ empty-bloom-filter dup increment-count count>> ] unit-test
+
+: basic-insert-test-setup ( -- bloom-filter )
+ 1 empty-bloom-filter [ bloom-filter-insert ] keep ;
+
+! Basic tests that insert does something
+{ t } [ basic-insert-test-setup bits>> [ ] any? ] unit-test
+{ 1 } [ basic-insert-test-setup count>> ] unit-test
+
+: non-empty-bloom-filter ( -- bloom-filter )
+ 1000 <iota>
+ empty-bloom-filter
+ [ [ bloom-filter-insert ] curry each ] keep ;
+
+: full-bloom-filter ( -- bloom-filter )
+ 2000 <iota>
+ empty-bloom-filter
+ [ [ bloom-filter-insert ] curry each ] keep ;
+
+! Should find what we put in there.
+{ t } [ 2000 <iota>
+ full-bloom-filter
+ [ bloom-filter-member? ] curry map
+ [ ] all?
+] unit-test
+
+! We shouldn't have more than 0.01 false-positive rate.
+{ t } [ 1000 <iota> [ drop most-positive-fixnum random 1000 + ] map
+ full-bloom-filter
+ [ bloom-filter-member? ] curry map
+ [ ] count
+ ! TODO: This should be 10, but the false positive rate is currently very
+ ! high. 300 is large enough not to prevent builds from succeeding.
+ 300 <=
+] unit-test
--- /dev/null
+! Copyright (C) 2009 Alec Berryman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors arrays bit-arrays fry kernel kernel.private
+layouts locals math math.functions math.order math.private
+multiline sequences sequences.private typed ;
+FROM: math.ranges => [1,b] ;
+
+IN: bloom-filters
+
+/*
+
+TODO:
+
+- The false positive rate is 10x what it should be, based on
+ informal testing. Better object hashes or a better method of
+ generating extra hash codes would help. Another way is to
+ increase the number of bits used.
+
+ - Try something smarter than the bitwise complement for a
+ second hash code.
+
+ - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
+ makes a case for http://murmurhash.googlepages.com/ instead
+ of enhanced double-hashing.
+
+ - Be sure to adjust the test that asserts the number of false
+ positives isn't unreasonable.
+
+- Could round bits up to next power of two and use wrap instead
+ of mod. This would cost a lot of bits on 32-bit platforms,
+ though, and limit the bit-array to 8MB.
+
+- Should allow user to specify the hash codes, either as inputs
+ to enhanced double hashing or for direct use.
+
+- Support for serialization.
+
+- Wrappers for combining filters.
+
+- Should we signal an error when inserting past the number of
+ objects the filter is sized for? The filter will continue to
+ work, just not very well.
+
+*/
+
+TUPLE: bloom-filter
+{ #hashes fixnum read-only }
+{ bits bit-array read-only }
+{ capacity fixnum read-only }
+{ count fixnum } ;
+
+ERROR: invalid-size size ;
+ERROR: invalid-error-rate error-rate ;
+ERROR: invalid-capacity capacity ;
+
+<PRIVATE
+
+:: bits-to-satisfy-error-rate ( hashes error objects -- size )
+ objects hashes * neg error hashes recip ^ 1 swap - log /
+ ceiling >integer ;
+
+! 100 hashes ought to be enough for anybody.
+: #hashes-range ( -- range )
+ 100 [1,b] ;
+
+! { #hashes #bits }
+: identity-configuration ( -- 2seq )
+ 0 max-array-capacity 2array ;
+
+: smaller-second ( 2seq 2seq -- 2seq )
+ [ [ second ] bi@ <= ] most ;
+
+! If the number of hashes isn't positive, we haven't found
+! anything smaller than the identity configuration.
+: check-hashes ( 2seq -- 2seq )
+ dup first 0 <= [ invalid-size ] when ;
+
+! The consensus on the tradeoff between increasing the number of
+! bits and increasing the number of hash functions seems to be
+! "go for the smallest number of bits", probably because most
+! implementations just generate one hash value and cheaply
+! mangle it into the number of hashes they need. I have not
+! seen any usage studies from the implementations that made this
+! tradeoff to support it, and I haven't done my own, but we'll
+! go with it anyway.
+: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
+ [ #hashes-range identity-configuration ] 2dip '[
+ dup _ _ bits-to-satisfy-error-rate
+ 2array smaller-second
+ ] reduce check-hashes first2 ;
+
+: check-capacity ( capacity -- capacity )
+ dup 0 <= [ invalid-capacity ] when ;
+
+: check-error-rate ( error-rate -- error-rate )
+ dup [ 0 after? ] [ 1 before? ] bi and
+ [ invalid-error-rate ] unless ;
+
+PRIVATE>
+
+: <bloom-filter> ( error-rate capacity -- bloom-filter )
+ [ check-error-rate ] [ check-capacity ] bi*
+ [ size-bloom-filter <bit-array> ] keep
+ 0 ! initially empty
+ bloom-filter boa ;
+
+<PRIVATE
+
+! See "Bloom Filters in Probabilistic Verification" by Peter C.
+! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
+! Double Hashing":
+! http://www.ccs.neu.edu/home/pete/research/bloom-filters-verification.html
+! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
+: combine-hashcodes ( index hash0 hash1 -- hash )
+ { fixnum fixnum fixnum } declare
+ [ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
+ [ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
+
+: double-hashcodes ( object -- hash0 hash1 )
+ hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
+
+: increment-count ( bloom-filter -- )
+ [ 1 + ] change-count drop ; inline
+
+: #hashes-and-length ( bloom-filter -- #hashes length )
+ [ #hashes>> ] [ bits>> length ] bi ; inline
+
+: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
+ [ double-hashcodes ] [ #hashes-and-length ] bi*
+ -rotd '[ _ _ combine-hashcodes _ mod ] ; inline
+
+PRIVATE>
+
+TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
+ [ increment-count ]
+ [ relevant-indices ]
+ [ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
+ tri compose each-integer ;
+
+TYPED: bloom-filter-member? ( object bloom-filter: bloom-filter -- ? )
+ [ relevant-indices ]
+ [ bits>> [ nth-unsafe ] curry ]
+ bi compose all-integers? ;
--- /dev/null
+Bloom filters
--- /dev/null
+collections
--- /dev/null
+John Benediktsson
--- /dev/null
+USING: byte-arrays checksums help.markup help.syntax kernel ;
+IN: cuckoo-filters
+
+HELP: cuckoo-insert
+{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
+{ $description "Insert the data into the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data was inserted." }
+{ $notes "Attempting to insert data twice will result in the hashed fingerprint of the data appearing twice and the " { $link cuckoo-filter } " size being incremented twice." } ;
+
+HELP: cuckoo-lookup
+{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
+{ $description "Lookup the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be a member. This is a probabilistic test, meaning there is a possibility of false positives." } ;
+
+HELP: cuckoo-delete
+{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
+{ $description "Remove the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be removed." } ;
+
+ARTICLE: "cuckoo-filters" "Cuckoo Filters"
+"Cuckoo Filters are probabilistic data structures similar to Bloom Filters that provide support for removing elements without significantly degrading space and performance."
+$nl
+"Instead of storing the elements themselves, it stores a fingerprint obtained by using a " { $link checksum } ". This allows for item removal without false negatives (assuming you do not try to remove an item not contained in the filter)."
+$nl
+"For applications that store many items and target low false-positive rates, Cuckoo Filters can have a lower space overhead than Bloom Filters."
+$nl
+"More information is available in the paper by Fan, Andersen, Kaminsky, and Mitzenmacher titled \"Cuckoo Filter: Practically Better Than Bloom\":"
+$nl
+{ $url "http://www.pdl.cmu.edu/PDL-FTP/FS/cuckoo-conext2014.pdf" } ;
+
+ABOUT: "cuckoo-filters"
--- /dev/null
+USING: accessors combinators combinators.short-circuit
+cuckoo-filters kernel math.parser sequences tools.test ;
+
+{ t 1 t t f 0 } [
+ "factor" 100 <cuckoo-filter> {
+ [ cuckoo-insert ]
+ [ nip size>> ]
+ [ cuckoo-lookup ]
+ [ cuckoo-delete ]
+ [ cuckoo-lookup ]
+ [ nip size>> ]
+ } 2cleave
+] unit-test
+
+{ 250,000 250,000 0 } [
+ 250,000 <cuckoo-filter>
+ 250,000 [ number>string ] { } map-integers
+ [
+ [
+ {
+ [ over cuckoo-lookup not ]
+ [ over cuckoo-insert ]
+ } 1&&
+ ] count swap
+ ]
+ [ [ over cuckoo-lookup ] count swap ]
+ [ [ over cuckoo-delete drop ] each ] tri
+ size>>
+] unit-test
--- /dev/null
+! Copyright (C) 2016 John Benediktsson
+! See http://factorcode.org/license.txt for BSD license
+
+USING: accessors alien alien.c-types alien.data arrays checksums
+checksums.sha combinators.short-circuit kernel locals math
+math.bitwise random sequences ;
+
+IN: cuckoo-filters
+
+<PRIVATE
+
+! The maximum number of times we kick down items/displace from
+! their buckets
+CONSTANT: max-cuckoo-count 500
+
+! The maximum load factor we allow before growing the capacity
+CONSTANT: max-load-factor 0.96
+
+! The number of fingerprints to store in each bucket
+CONSTANT: bucket-size 4
+
+: #buckets ( capacity -- #buckets )
+ [ bucket-size /i next-power-of-2 ] keep
+ over / bucket-size / max-load-factor > [ 2 * ] when ;
+
+: <cuckoo-buckets> ( capacity -- buckets )
+ #buckets [ bucket-size f <array> ] replicate ;
+
+: hash-index ( hash -- fingerprint index )
+ 4 over <displaced-alien> [ uint deref ] bi@ ;
+
+: alt-index ( fingerprint index -- alt-index )
+ [ 0x5bd1e995 w* ] [ bitxor ] bi* ;
+
+: hash-indices ( bytes cuckoo-filter -- fingerprint index alt-index )
+ checksum>> checksum-bytes hash-index 2dup alt-index ;
+
+: bucket-lookup ( fingerprint bucket -- ? )
+ member? ;
+
+: bucket-insert ( fingerprint bucket -- ? )
+ dup [ not ] find drop [ swap set-nth t ] [ 2drop f ] if* ;
+
+: bucket-delete ( fingerprint bucket -- ? )
+ [ f ] 2dip [ index ] keep over [ set-nth t ] [ 3drop f ] if ;
+
+: bucket-swap ( fingerprint bucket -- fingerprint' )
+ [ length random ] keep [ swap ] change-nth ;
+
+PRIVATE>
+
+TUPLE: cuckoo-filter buckets checksum size ;
+
+: <cuckoo-filter> ( capacity -- cuckoo-filter )
+ <cuckoo-buckets> sha1 0 cuckoo-filter boa ;
+
+:: cuckoo-insert ( bytes cuckoo-filter -- ? )
+ bytes cuckoo-filter hash-indices :> ( fp! i1 i2 )
+ cuckoo-filter buckets>> :> buckets
+ buckets length :> n
+ {
+ [ fp i1 n mod buckets nth bucket-insert ]
+ [ fp i2 n mod buckets nth bucket-insert ]
+ } 0|| [
+ t
+ ] [
+ 2 random zero? i1 i2 ? :> i!
+ max-cuckoo-count [
+ drop
+ fp i n mod buckets nth bucket-swap fp!
+ fp i alt-index i!
+
+ fp i n mod buckets nth bucket-insert
+ ] find-integer >boolean
+ ] if
+ dup [ cuckoo-filter [ 1 + ] change-size drop ] when ;
+
+:: cuckoo-lookup ( bytes cuckoo-filter -- ? )
+ bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
+ cuckoo-filter buckets>> :> buckets
+ buckets length :> n
+ {
+ [ fp i1 n mod buckets nth bucket-lookup ]
+ [ fp i2 n mod buckets nth bucket-lookup ]
+ } 0|| ;
+
+:: cuckoo-delete ( bytes cuckoo-filter -- ? )
+ bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
+ cuckoo-filter buckets>> :> buckets
+ buckets length :> n
+ {
+ [ fp i1 n mod buckets nth bucket-delete ]
+ [ fp i2 n mod buckets nth bucket-delete ]
+ } 0||
+ dup [ cuckoo-filter [ 1 - ] change-size drop ] when ;
--- /dev/null
+Cuckoo filters
--- /dev/null
+collections
+++ /dev/null
-Alec Berryman
+++ /dev/null
-USING: help.markup help.syntax kernel math ;
-IN: bloom-filters
-
-HELP: <bloom-filter>
-{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
- { "capacity" "The expected number of object in the set. A positive " { $link integer } "." }
- { "bloom-filter" bloom-filter } }
-{ $description "Creates an empty Bloom filter." }
-{ $errors "Throws a " { $link invalid-size } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-capacity } " when input is invalid." } ;
-
-
-HELP: bloom-filter-insert
-{ $values { "object" object }
- { "bloom-filter" bloom-filter } }
-{ $description "Records the item as a member of the filter." }
-{ $side-effects "bloom-filter" } ;
-
-HELP: bloom-filter-member?
-{ $values { "object" object }
- { "bloom-filter" bloom-filter }
- { "?" boolean } }
-{ $description "Returns " { $link t } " if the object may be a member of Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ;
-
-HELP: bloom-filter
-{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ;
-
-ARTICLE: "bloom-filters" "Bloom filters"
-"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
-$nl
-"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report than an item is a member of the set."
-$nl
-"Bloom filters cannot be resized and do not support removal."
-$nl
-{ $subsections
- <bloom-filter>
- bloom-filter-insert
- bloom-filter-member?
-} ;
-
-ABOUT: "bloom-filters"
+++ /dev/null
-USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
-math random sequences tools.test ;
-IN: bloom-filters.tests
-
-{ { 200 5 } } [ { 100 7 } { 200 5 } smaller-second ] unit-test
-{ { 200 5 } } [ { 200 5 } { 100 7 } smaller-second ] unit-test
-
-! The sizing information was generated using the subroutine
-! calculate_shortest_filter_length from
-! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
-
-! Test bloom-filter creation
-{ 47965 } [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
-{ 7 47965 } [ 0.01 5000 size-bloom-filter ] unit-test
-{ 7 } [ 0.01 5000 <bloom-filter> #hashes>> ] unit-test
-{ 47965 } [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
-{ 5000 } [ 0.01 5000 <bloom-filter> capacity>> ] unit-test
-{ 0 } [ 0.01 5000 <bloom-filter> count>> ] unit-test
-
-! Should return the fewest hashes to satisfy the bits requested, not the most.
-{ 32 } [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
-{ 32 } [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
-{ 4 32 } [ 0.05 5 size-bloom-filter ] unit-test
-
-! This is a lot of bits.
-[ 0.00000001 max-array-capacity size-bloom-filter ] [ invalid-size? ] must-fail-with
-
-! Other error conditions.
-[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
-[ 0.5 0 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
-[ 0.5 -5 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
-
-! Should not generate bignum hash codes. Enhanced double hashing may generate a
-! lot of hash codes, and it's better to do this earlier than later.
-{ t } [ 10000 <iota> [ double-hashcodes [ fixnum? ] both? ] all? ] unit-test
-
-: empty-bloom-filter ( -- bloom-filter )
- 0.01 2000 <bloom-filter> ;
-
-{ 1 } [ empty-bloom-filter dup increment-count count>> ] unit-test
-
-: basic-insert-test-setup ( -- bloom-filter )
- 1 empty-bloom-filter [ bloom-filter-insert ] keep ;
-
-! Basic tests that insert does something
-{ t } [ basic-insert-test-setup bits>> [ ] any? ] unit-test
-{ 1 } [ basic-insert-test-setup count>> ] unit-test
-
-: non-empty-bloom-filter ( -- bloom-filter )
- 1000 <iota>
- empty-bloom-filter
- [ [ bloom-filter-insert ] curry each ] keep ;
-
-: full-bloom-filter ( -- bloom-filter )
- 2000 <iota>
- empty-bloom-filter
- [ [ bloom-filter-insert ] curry each ] keep ;
-
-! Should find what we put in there.
-{ t } [ 2000 <iota>
- full-bloom-filter
- [ bloom-filter-member? ] curry map
- [ ] all?
-] unit-test
-
-! We shouldn't have more than 0.01 false-positive rate.
-{ t } [ 1000 <iota> [ drop most-positive-fixnum random 1000 + ] map
- full-bloom-filter
- [ bloom-filter-member? ] curry map
- [ ] count
- ! TODO: This should be 10, but the false positive rate is currently very
- ! high. 300 is large enough not to prevent builds from succeeding.
- 300 <=
-] unit-test
+++ /dev/null
-! Copyright (C) 2009 Alec Berryman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays bit-arrays fry kernel kernel.private
-layouts locals math math.functions math.order math.private
-multiline sequences sequences.private typed ;
-FROM: math.ranges => [1,b] ;
-
-IN: bloom-filters
-
-/*
-
-TODO:
-
-- The false positive rate is 10x what it should be, based on
- informal testing. Better object hashes or a better method of
- generating extra hash codes would help. Another way is to
- increase the number of bits used.
-
- - Try something smarter than the bitwise complement for a
- second hash code.
-
- - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
- makes a case for http://murmurhash.googlepages.com/ instead
- of enhanced double-hashing.
-
- - Be sure to adjust the test that asserts the number of false
- positives isn't unreasonable.
-
-- Could round bits up to next power of two and use wrap instead
- of mod. This would cost a lot of bits on 32-bit platforms,
- though, and limit the bit-array to 8MB.
-
-- Should allow user to specify the hash codes, either as inputs
- to enhanced double hashing or for direct use.
-
-- Support for serialization.
-
-- Wrappers for combining filters.
-
-- Should we signal an error when inserting past the number of
- objects the filter is sized for? The filter will continue to
- work, just not very well.
-
-*/
-
-TUPLE: bloom-filter
-{ #hashes fixnum read-only }
-{ bits bit-array read-only }
-{ capacity fixnum read-only }
-{ count fixnum } ;
-
-ERROR: invalid-size size ;
-ERROR: invalid-error-rate error-rate ;
-ERROR: invalid-capacity capacity ;
-
-<PRIVATE
-
-:: bits-to-satisfy-error-rate ( hashes error objects -- size )
- objects hashes * neg error hashes recip ^ 1 swap - log /
- ceiling >integer ;
-
-! 100 hashes ought to be enough for anybody.
-: #hashes-range ( -- range )
- 100 [1,b] ;
-
-! { #hashes #bits }
-: identity-configuration ( -- 2seq )
- 0 max-array-capacity 2array ;
-
-: smaller-second ( 2seq 2seq -- 2seq )
- [ [ second ] bi@ <= ] most ;
-
-! If the number of hashes isn't positive, we haven't found
-! anything smaller than the identity configuration.
-: check-hashes ( 2seq -- 2seq )
- dup first 0 <= [ invalid-size ] when ;
-
-! The consensus on the tradeoff between increasing the number of
-! bits and increasing the number of hash functions seems to be
-! "go for the smallest number of bits", probably because most
-! implementations just generate one hash value and cheaply
-! mangle it into the number of hashes they need. I have not
-! seen any usage studies from the implementations that made this
-! tradeoff to support it, and I haven't done my own, but we'll
-! go with it anyway.
-: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
- [ #hashes-range identity-configuration ] 2dip '[
- dup _ _ bits-to-satisfy-error-rate
- 2array smaller-second
- ] reduce check-hashes first2 ;
-
-: check-capacity ( capacity -- capacity )
- dup 0 <= [ invalid-capacity ] when ;
-
-: check-error-rate ( error-rate -- error-rate )
- dup [ 0 after? ] [ 1 before? ] bi and
- [ invalid-error-rate ] unless ;
-
-PRIVATE>
-
-: <bloom-filter> ( error-rate capacity -- bloom-filter )
- [ check-error-rate ] [ check-capacity ] bi*
- [ size-bloom-filter <bit-array> ] keep
- 0 ! initially empty
- bloom-filter boa ;
-
-<PRIVATE
-
-! See "Bloom Filters in Probabilistic Verification" by Peter C.
-! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
-! Double Hashing":
-! http://www.ccs.neu.edu/home/pete/research/bloom-filters-verification.html
-! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
-: combine-hashcodes ( index hash0 hash1 -- hash )
- { fixnum fixnum fixnum } declare
- [ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
- [ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
-
-: double-hashcodes ( object -- hash0 hash1 )
- hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
-
-: increment-count ( bloom-filter -- )
- [ 1 + ] change-count drop ; inline
-
-: #hashes-and-length ( bloom-filter -- #hashes length )
- [ #hashes>> ] [ bits>> length ] bi ; inline
-
-: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
- [ double-hashcodes ] [ #hashes-and-length ] bi*
- -rotd '[ _ _ combine-hashcodes _ mod ] ; inline
-
-PRIVATE>
-
-TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
- [ increment-count ]
- [ relevant-indices ]
- [ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
- tri compose each-integer ;
-
-TYPED: bloom-filter-member? ( object bloom-filter: bloom-filter -- ? )
- [ relevant-indices ]
- [ bits>> [ nth-unsafe ] curry ]
- bi compose all-integers? ;
+++ /dev/null
-Bloom filters
+++ /dev/null
-collections
+++ /dev/null
-John Benediktsson
+++ /dev/null
-USING: byte-arrays checksums help.markup help.syntax kernel ;
-IN: cuckoo-filters
-
-HELP: cuckoo-insert
-{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
-{ $description "Insert the data into the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data was inserted." }
-{ $notes "Attempting to insert data twice will result in the hashed fingerprint of the data appearing twice and the " { $link cuckoo-filter } " size being incremented twice." } ;
-
-HELP: cuckoo-lookup
-{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
-{ $description "Lookup the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be a member. This is a probabilistic test, meaning there is a possibility of false positives." } ;
-
-HELP: cuckoo-delete
-{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } }
-{ $description "Remove the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be removed." } ;
-
-ARTICLE: "cuckoo-filters" "Cuckoo Filters"
-"Cuckoo Filters are probabilistic data structures similar to Bloom Filters that provides support for removing elements without significantly degrading space and performance."
-$nl
-"Instead of storing the elements themselves, it stores a fingerprint obtained by using a " { $link checksum } ". This allows for item removal without false negatives (assuming you do not try and remove an item not contained in the filter."
-$nl
-"For applications that store many items and target low false-positive rates, Cuckoo Filters can have a lower space overhead than Bloom Filters."
-$nl
-"More information is available in the paper by Andersen, Kaminsky, and Mitzenmacher titled \"Cuckoo Filter: Practically Better Than Bloom\":"
-$nl
-{ $url "http://www.pdl.cmu.edu/PDL-FTP/FS/cuckoo-conext2014.pdf" } ;
-
-ABOUT: "cuckoo-filters"
+++ /dev/null
-USING: accessors combinators combinators.short-circuit
-cuckoo-filters kernel math.parser sequences tools.test ;
-
-{ t 1 t t f 0 } [
- "factor" 100 <cuckoo-filter> {
- [ cuckoo-insert ]
- [ nip size>> ]
- [ cuckoo-lookup ]
- [ cuckoo-delete ]
- [ cuckoo-lookup ]
- [ nip size>> ]
- } 2cleave
-] unit-test
-
-{ 250,000 250,000 0 } [
- 250,000 <cuckoo-filter>
- 250,000 [ number>string ] { } map-integers
- [
- [
- {
- [ over cuckoo-lookup not ]
- [ over cuckoo-insert ]
- } 1&&
- ] count swap
- ]
- [ [ over cuckoo-lookup ] count swap ]
- [ [ over cuckoo-delete drop ] each ] tri
- size>>
-] unit-test
+++ /dev/null
-! Copyright (C) 2016 John Benediktsson
-! See http://factorcode.org/license.txt for BSD license
-
-USING: accessors alien alien.c-types alien.data arrays checksums
-checksums.sha combinators.short-circuit kernel locals math
-math.bitwise random sequences ;
-
-IN: cuckoo-filters
-
-<PRIVATE
-
-! The maximum number of times we kick down items/displace from
-! their buckets
-CONSTANT: max-cuckoo-count 500
-
-! The maximum load factor we allow before growing the capacity
-CONSTANT: max-load-factor 0.96
-
-! The number of fingerprint to store in each bucket
-CONSTANT: bucket-size 4
-
-: #buckets ( capacity -- #buckets )
- [ bucket-size /i next-power-of-2 ] keep
- over / bucket-size / max-load-factor > [ 2 * ] when ;
-
-: <cuckoo-buckets> ( capacity -- buckets )
- #buckets [ bucket-size f <array> ] replicate ;
-
-: hash-index ( hash -- fingerprint index )
- 4 over <displaced-alien> [ uint deref ] bi@ ;
-
-: alt-index ( fingerprint index -- alt-index )
- [ 0x5bd1e995 w* ] [ bitxor ] bi* ;
-
-: hash-indices ( bytes cuckoo-filter -- fingerprint index alt-index )
- checksum>> checksum-bytes hash-index 2dup alt-index ;
-
-: bucket-lookup ( fingerprint bucket -- ? )
- member? ;
-
-: bucket-insert ( fingerprint bucket -- ? )
- dup [ not ] find drop [ swap set-nth t ] [ 2drop f ] if* ;
-
-: bucket-delete ( fingerprint bucket -- ? )
- [ f ] 2dip [ index ] keep over [ set-nth t ] [ 3drop f ] if ;
-
-: bucket-swap ( fingerprint bucket -- fingerprint' )
- [ length random ] keep [ swap ] change-nth ;
-
-PRIVATE>
-
-TUPLE: cuckoo-filter buckets checksum size ;
-
-: <cuckoo-filter> ( capacity -- cuckoo-filter )
- <cuckoo-buckets> sha1 0 cuckoo-filter boa ;
-
-:: cuckoo-insert ( bytes cuckoo-filter -- ? )
- bytes cuckoo-filter hash-indices :> ( fp! i1 i2 )
- cuckoo-filter buckets>> :> buckets
- buckets length :> n
- {
- [ fp i1 n mod buckets nth bucket-insert ]
- [ fp i2 n mod buckets nth bucket-insert ]
- } 0|| [
- t
- ] [
- 2 random zero? i1 i2 ? :> i!
- max-cuckoo-count [
- drop
- fp i n mod buckets nth bucket-swap fp!
- fp i alt-index i!
-
- fp i n mod buckets nth bucket-insert
- ] find-integer >boolean
- ] if
- dup [ cuckoo-filter [ 1 + ] change-size drop ] when ;
-
-:: cuckoo-lookup ( bytes cuckoo-filter -- ? )
- bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
- cuckoo-filter buckets>> :> buckets
- buckets length :> n
- {
- [ fp i1 n mod buckets nth bucket-lookup ]
- [ fp i2 n mod buckets nth bucket-lookup ]
- } 0|| ;
-
-:: cuckoo-delete ( bytes cuckoo-filter -- ? )
- bytes cuckoo-filter hash-indices :> ( fp i1 i2 )
- cuckoo-filter buckets>> :> buckets
- buckets length :> n
- {
- [ fp i1 n mod buckets nth bucket-delete ]
- [ fp i2 n mod buckets nth bucket-delete ]
- } 0||
- dup [ cuckoo-filter [ 1 - ] change-size drop ] when ;
+++ /dev/null
-Cuckoo filters
+++ /dev/null
-collections