extra/bloom-filters/bloom-filters.factor

   1 ! Copyright (C) 2009 Alec Berryman.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors arrays bit-arrays fry infix kernel layouts locals math
   4 math.functions multiline sequences ;
   5 IN: bloom-filters
   6
   7 FROM: math.ranges => [1,b] [0,b) ;
   8 FROM: math.intervals => (a,b) interval-contains? ;
   9
  10 /*
  11
  12 TODO:
  13
  14 - The false positive rate is 10x what it should be, based on informal testing.
  15   Better object hashes or a better method of generating extra hash codes would
  16   help.  Another way is to increase the number of bits used.
  17
  18   - Try something smarter than the bitwise complement for a second hash code.
  19
  20   - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
  21     makes a case for http://murmurhash.googlepages.com/ instead of enhanced
  22     double-hashing.
  23
  24   - Be sure to adjust the test that asserts the number of false positives isn't
  25     unreasonable.
  26
  27 - Could round bits up to next power of two and use wrap instead of mod.  This
  28   would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
  29   to 8MB.
  30
  31 - Should allow user to specify the hash codes, either as inputs to enhanced
  32   double hashing or for direct use.
  33
  34 - Support for serialization.
  35
  36 - Wrappers for combining filters.
  37
  38 - Should we signal an error when inserting past the number of objects the filter
  39   is sized for?  The filter will continue to work, just not very well.
  40
  41 */
  42
  43 TUPLE: bloom-filter
  44 { n-hashes fixnum read-only }
  45 { bits bit-array read-only }
  46 { maximum-n-objects fixnum read-only }
  47 { current-n-objects fixnum } ;
  48
  49 ERROR: capacity-error ;
  50 ERROR: invalid-error-rate ;
  51 ERROR: invalid-n-objects ;
  52
  53 <PRIVATE
  54
  55 ! infix doesn't like ^
  56 : pow ( x y -- z )
  57     ^ ; inline
  58
  59 :: bits-to-satisfy-error-rate ( hashes error objects -- size )
  60     [infix -(objects * hashes) / log(1 - pow(error, (1/hashes))) infix]
  61     ceiling >integer ;
  62
  63 ! 100 hashes ought to be enough for anybody.
  64 : n-hashes-range ( -- range )
  65     100 [1,b] ;
  66
  67 ! { n-hashes n-bits }
  68 : identity-configuration ( -- 2seq )
  69     0 max-array-capacity 2array ;
  70
  71 : smaller-second ( 2seq 2seq -- 2seq )
  72     [ [ second ] bi@ <= ] most ;
  73
  74 ! If the number of hashes isn't positive, we haven't found anything smaller than the
  75 ! identity configuration.
  76 : validate-sizes ( 2seq -- )
  77     first 0 <= [ capacity-error ] when ;
  78
  79 ! The consensus on the tradeoff between increasing the number of bits and
  80 ! increasing the number of hash functions seems to be "go for the smallest
  81 ! number of bits", probably because most implementations just generate one hash
  82 ! value and cheaply mangle it into the number of hashes they need.  I have not
  83 ! seen any usage studies from the implementations that made this tradeoff to
  84 ! support it, and I haven't done my own, but we'll go with it anyway.
  85 !
  86 : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
  87     [ n-hashes-range identity-configuration ] 2dip
  88     '[ dup [ _ _ bits-to-satisfy-error-rate ]
  89        call 2array smaller-second ]
  90     reduce
  91     dup validate-sizes
  92     first2 ;
  93
  94 : validate-n-objects ( n-objects -- )
  95     0 <= [ invalid-n-objects ] when ;
  96
  97 : valid-error-rate-interval ( -- interval )
  98     0 1 (a,b) ;
  99
 100 : validate-error-rate ( error-rate -- )
 101     valid-error-rate-interval interval-contains?
 102     [ invalid-error-rate ] unless ;
 103
 104 : validate-constraints ( error-rate n-objects -- )
 105     validate-n-objects validate-error-rate ;
 106
 107 PRIVATE>
 108
 109 : <bloom-filter> ( error-rate number-objects -- bloom-filter )
 110     [ validate-constraints ] 2keep
 111     [ size-bloom-filter <bit-array> ] keep
 112     0 ! initially empty
 113     bloom-filter boa ;
 114
 115 <PRIVATE
 116
 117 ! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
 118 ! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
 119 ! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
 120 :: enhanced-double-hash ( index hash0 hash1 -- hash )
 121     [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] ;
 122
 123 : enhanced-double-hashes ( hash0 hash1 n -- seq )
 124     [0,b)
 125     [ '[ _ _ enhanced-double-hash ] ] dip
 126     swap map ;
 127
 128 ! Make sure it's a fixnum here to speed up double-hashing.
 129 : hashcodes-from-hashcode ( n -- n n )
 130     dup most-positive-fixnum bitxor ;
 131
 132 : hashcodes-from-object ( obj -- n n )
 133     hashcode abs hashcodes-from-hashcode ;
 134
 135 : set-indices ( indices bit-array -- )
 136     [ [ drop t ] change-nth ] curry each ;
 137
 138 : increment-n-objects ( bloom-filter -- )
 139     [ 1 + ] change-current-n-objects drop ;
 140
 141 : n-hashes-and-length ( bloom-filter -- n-hashes length )
 142     [ n-hashes>> ] [ bits>> length ] bi ;
 143
 144 : relevant-indices ( value bloom-filter -- indices )
 145     [ hashcodes-from-object ] [ n-hashes-and-length ] bi*
 146     [ enhanced-double-hashes ] dip '[ _ mod ] map ;
 147
 148 PRIVATE>
 149
 150 : bloom-filter-insert ( object bloom-filter -- )
 151     [ increment-n-objects ]
 152     [ relevant-indices ]
 153     [ bits>> set-indices ]
 154     tri ;
 155
 156 : bloom-filter-member? ( object bloom-filter -- ? )
 157     [ relevant-indices ] keep
 158     bits>> nths [ ] all? ;