]> gitweb.factorcode.org Git - factor.git/commitdiff
reservoir-sampling: Sample k elements from an unknown number of elements with equal...
authorDoug Coleman <doug.coleman@gmail.com>
Sun, 27 Dec 2020 14:41:06 +0000 (08:41 -0600)
committerDoug Coleman <doug.coleman@gmail.com>
Sun, 27 Dec 2020 14:41:06 +0000 (08:41 -0600)
basis/reservoir-sampling/authors.txt [new file with mode: 0644]
basis/reservoir-sampling/reservoir-sampling-docs.factor [new file with mode: 0644]
basis/reservoir-sampling/reservoir-sampling-tests.factor [new file with mode: 0644]
basis/reservoir-sampling/reservoir-sampling.factor [new file with mode: 0644]

diff --git a/basis/reservoir-sampling/authors.txt b/basis/reservoir-sampling/authors.txt
new file mode 100644 (file)
index 0000000..7c1b2f2
--- /dev/null
@@ -0,0 +1 @@
+Doug Coleman
diff --git a/basis/reservoir-sampling/reservoir-sampling-docs.factor b/basis/reservoir-sampling/reservoir-sampling-docs.factor
new file mode 100644 (file)
index 0000000..47ab38d
--- /dev/null
@@ -0,0 +1,51 @@
+! Copyright (C) 2020 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.markup help.syntax kernel math multiline vectors ;
+IN: reservoir-sampling
+
+HELP: <reservoir-sampler>
+{ $values k: integer sampler: object }
+{ $description Creates an object that will hold k samples from everything it sees with equal probability. To show a reservoir-sampler an object, call \ reservoir-sample . } ;
+
+HELP: reservoir-sample
+{ $values obj: object sampler: object }
+{ $description Feeds a sample to a \ reservoir-sampler which will maintain a vector of samples with equal probability. This word is especially useful when you do not know how many objects will appear but wish to sample them with equal probability, such as in a stream with unknown length. }
+{ $unchecked-example
+    [=[ 
+        USING: prettyprint io strings math reservoir-sampling
+        kernel accessors io.streams.string ;
+        
+        "Nothing will fundamentally change." [
+            10 <reservoir-sampler>
+            [ [ read1 dup ] swap '[ dup 1string . _ reservoir-sample ] while ] keep nip sampled>> >string .
+        ] with-string-reader
+        ""
+    ]=]
+} ;
+
+HELP: reservoir-sample-iteration
+{ $values iteration: integer k: integer obj: object sampled: vector sampled': vector }
+{ $description Sample with equal probabilty without using a \ reservoir-sampler object. } ;
+
+HELP: reservoir-sampler
+{ $class-description The class of a reservoir sampler object. Create one with \ <reservoir-sampler> . } ;
+
+ARTICLE: "reservoir-sampling" "Reservoir Sampling"
+The { $vocab-link "reservoir-sampling" } vocabulary is a way to take k samples with equal probability from all the objects shown to the sampler. This means that you do not have to know how many objects the sampler will eventually see, and that the probability will still be equivalent.
+
+Create a sampler:
+{ $subsections
+    <reservoir-sampler>
+}
+
+Show it samples:
+{ $subsections
+    reservoir-sample
+}
+
+Reservoir sampling without an object: 
+{ $subsections
+    reservoir-sample-iteration
+} ;
+
+ABOUT: "reservoir-sampling"
diff --git a/basis/reservoir-sampling/reservoir-sampling-tests.factor b/basis/reservoir-sampling/reservoir-sampling-tests.factor
new file mode 100644 (file)
index 0000000..5c6c97d
--- /dev/null
@@ -0,0 +1,26 @@
+! Copyright (C) 2020 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors kernel random random.mersenne-twister
+reservoir-sampling tools.test ;
+IN: reservoir-sampling.tests
+
+{
+    T{ reservoir-sampler
+        { iteration 11 }
+        { k 4 }
+        { sampled V{ 1 2 1005 1004 } }
+    }
+} [
+    123 <mersenne-twister> [
+        4 <reservoir-sampler>
+        V{ 1 2 3 4 } clone >>sampled
+        4 >>iteration
+        1001 over reservoir-sample
+        1002 over reservoir-sample
+        1003 over reservoir-sample
+        1004 over reservoir-sample
+        1005 over reservoir-sample
+        1006 over reservoir-sample
+        1007 over reservoir-sample
+    ] with-random
+] unit-test
\ No newline at end of file
diff --git a/basis/reservoir-sampling/reservoir-sampling.factor b/basis/reservoir-sampling/reservoir-sampling.factor
new file mode 100644 (file)
index 0000000..a646fda
--- /dev/null
@@ -0,0 +1,33 @@
+! Copyright (C) 2020 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors kernel math math.functions random sequences ;
+IN: reservoir-sampling
+
+: reservoir-sample-iteration ( iteration k obj sampled -- sampled' )
+    pick over length > [
+        [ push ] keep 2nip
+    ] [
+        roll random roll dupd < [
+            swap [ set-nth ] keep
+        ] [
+           drop nip
+        ] if
+    ] if ;
+
+TUPLE: reservoir-sampler iteration k sampled ;
+: <reservoir-sampler> ( k -- sampler )
+    reservoir-sampler new
+        V{ } clone >>sampled
+        0 >>iteration
+        swap >>k ; inline
+
+: reservoir-sample ( obj sampler -- )
+    [ sampled>> length ] [ k>> ] [ [ 1 + ] change-iteration -rot ] tri < [
+        sampled>> push
+    ] [
+        [ ] [ iteration>> random dup ] [ k>> ] tri < [
+            swap sampled>> set-nth
+        ] [
+            3drop
+        ] if
+    ] if ;
\ No newline at end of file