]> gitweb.factorcode.org Git - factor.git/commitdiff
math.cardinality: adding a cardinality estimator.
authorJohn Benediktsson <mrjbq7@gmail.com>
Sat, 8 Sep 2012 21:00:36 +0000 (14:00 -0700)
committerJohn Benediktsson <mrjbq7@gmail.com>
Sat, 8 Sep 2012 21:00:36 +0000 (14:00 -0700)
extra/math/cardinality/authors.txt [new file with mode: 0644]
extra/math/cardinality/cardinality-docs.factor [new file with mode: 0644]
extra/math/cardinality/cardinality.factor [new file with mode: 0644]

diff --git a/extra/math/cardinality/authors.txt b/extra/math/cardinality/authors.txt
new file mode 100644 (file)
index 0000000..e091bb8
--- /dev/null
@@ -0,0 +1 @@
+John Benediktsson
diff --git a/extra/math/cardinality/cardinality-docs.factor b/extra/math/cardinality/cardinality-docs.factor
new file mode 100644 (file)
index 0000000..7cf4731
--- /dev/null
@@ -0,0 +1,12 @@
+! Copyright (C) 2012 John Benediktsson
+! See http://factorcode.org/license.txt for BSD license
+USING: help.markup help.syntax layouts math sequences ;
+IN: math.cardinality
+
+HELP: trailing-zeros
+{ $values { "m" number } { "n" number } }
+{ $description "Counts the number of trailing 0 bits in " { $snippet "m" } ", returning " { $link fixnum-bits } " if the number is zero." } ;
+
+HELP: estimate-cardinality
+{ $values { "seq" sequence } { "k" number } { "n" number } }
+{ $description "Estimates the number of unique elements in " { $snippet "seq" } "." $nl "The number " { $snippet "k" } " controls how many bits of hash to use, creating " { $snippet "2^k" } " buckets." } ;
diff --git a/extra/math/cardinality/cardinality.factor b/extra/math/cardinality/cardinality.factor
new file mode 100644 (file)
index 0000000..3976c46
--- /dev/null
@@ -0,0 +1,25 @@
+! Copyright (C) 2012 John Benediktsson
+! See http://factorcode.org/license.txt for BSD license
+USING: arrays kernel layouts locals math math.functions
+math.order math.statistics sequences ;
+IN: math.cardinality
+
+GENERIC: trailing-zeros ( m -- n )
+
+M: fixnum trailing-zeros
+    [ fixnum-bits ] [
+        0 [ over even? ] [ [ 2/ ] [ 1 + ] bi* ] while nip
+    ] if-zero ;
+
+:: estimate-cardinality ( seq k -- n )
+    k 2^                         :> num_buckets
+    num_buckets 0 <array>        :> max_zeros
+    seq [
+        hashcode >fixnum         :> h
+        h num_buckets 1 - bitand :> bucket
+        h k neg shift            :> bucket_hash
+        bucket max_zeros [
+            bucket_hash trailing-zeros max
+        ] change-nth
+    ] each
+    max_zeros [ mean 2 swap ^ ] [ length * ] bi 0.79402 * ;