]> gitweb.factorcode.org Git - factor.git/commitdiff
machine-learning.rebalancing: Add a way to rebalance an X,y dataset to N samples...
authorDoug Coleman <doug.coleman@gmail.com>
Sat, 17 Nov 2012 20:38:52 +0000 (12:38 -0800)
committerDoug Coleman <doug.coleman@gmail.com>
Sat, 17 Nov 2012 20:39:54 +0000 (12:39 -0800)
extra/machine-learning/rebalancing/authors.txt [new file with mode: 0644]
extra/machine-learning/rebalancing/rebalancing-tests.factor [new file with mode: 0644]
extra/machine-learning/rebalancing/rebalancing.factor [new file with mode: 0644]

diff --git a/extra/machine-learning/rebalancing/authors.txt b/extra/machine-learning/rebalancing/authors.txt
new file mode 100644 (file)
index 0000000..7c1b2f2
--- /dev/null
@@ -0,0 +1 @@
+Doug Coleman
diff --git a/extra/machine-learning/rebalancing/rebalancing-tests.factor b/extra/machine-learning/rebalancing/rebalancing-tests.factor
new file mode 100644 (file)
index 0000000..bc02dce
--- /dev/null
@@ -0,0 +1,19 @@
+! Copyright (C) 2012 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: assocs kernel machine-learning.rebalancing math
+math.statistics math.text.english sequences tools.test
+math.vectors ;
+IN: machine-learning.rebalancing.tests
+
+[ t ] [
+    { 1 1 1 2 } [ [ number>text ] map ] [ ] bi
+    100,000 balance-labels nip
+    histogram values first2 - abs 3,000 <
+] unit-test
+
+
+[ t ] [
+    { 1 1 1 2 } [ [ number>text ] map ] [ ] bi
+    { 1/10 9/10 } 100,000 skew-labels nip
+    histogram values { 10,000 90,000 } -.05 v~
+] unit-test
diff --git a/extra/machine-learning/rebalancing/rebalancing.factor b/extra/machine-learning/rebalancing/rebalancing.factor
new file mode 100644 (file)
index 0000000..c8c77f6
--- /dev/null
@@ -0,0 +1,42 @@
+! Copyright (C) 2012 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: arrays assocs fry kernel math math.functions
+math.statistics memoize random sequences sorting ;
+IN: machine-learning.rebalancing
+
+ERROR: probability-sum-not-one seq ;
+
+: check-probabilities ( seq -- seq )
+    dup sum 1.0 .00000000001 ~ [ probability-sum-not-one ] unless ;
+
+: equal-probabilities ( n -- array )
+    dup recip <array> ; inline
+
+MEMO: probabilities-seq ( seq -- seq' )
+    check-probabilities [ >float ] map cum-sum ;
+
+: probabilities-quot ( seq -- quot )
+    probabilities-seq
+    '[ _ random-unit '[ _ > ] find drop ] ; inline
+
+: stratified-sample ( stratified-sequences probability-sequence -- elt )
+    probabilities-quot call swap nth random ; inline
+
+: balance-labels ( X y n -- X' y' )
+    [
+        dup [ ] collect-index-by
+        values dup length equal-probabilities
+        '[
+            _ _ _ _ stratified-sample
+            '[ _ swap nth ] bi@ 2array
+        ]
+    ] dip swap replicate [ keys ] [ values ] bi ;
+
+: skew-labels ( X y probs n -- X' y' )
+    [
+        [ dup [ ] collect-index-by sort-keys values ] dip
+        '[
+            _ _ _ _ stratified-sample
+            '[ _ swap nth ] bi@ 2array
+        ]
+    ] dip swap replicate [ keys ] [ values ] bi ;