--- /dev/null
+! Copyright (C) 2012 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: assocs kernel machine-learning.rebalancing math
+math.statistics math.text.english sequences tools.test
+math.vectors ;
+IN: machine-learning.rebalancing.tests
+
+[ t ] [
+ { 1 1 1 2 } [ [ number>text ] map ] [ ] bi
+ 100,000 balance-labels nip
+ histogram values first2 - abs 3,000 <
+] unit-test
+
+
+[ t ] [
+ { 1 1 1 2 } [ [ number>text ] map ] [ ] bi
+ { 1/10 9/10 } 100,000 skew-labels nip
+ histogram values { 10,000 90,000 } -.05 v~
+] unit-test
--- /dev/null
+! Copyright (C) 2012 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: arrays assocs fry kernel math math.functions
+math.statistics memoize random sequences sorting ;
+IN: machine-learning.rebalancing
+
+ERROR: probability-sum-not-one seq ;
+
+: check-probabilities ( seq -- seq )
+ dup sum 1.0 .00000000001 ~ [ probability-sum-not-one ] unless ;
+
+: equal-probabilities ( n -- array )
+ dup recip <array> ; inline
+
+MEMO: probabilities-seq ( seq -- seq' )
+ check-probabilities [ >float ] map cum-sum ;
+
+: probabilities-quot ( seq -- quot )
+ probabilities-seq
+ '[ _ random-unit '[ _ > ] find drop ] ; inline
+
+: stratified-sample ( stratified-sequences probability-sequence -- elt )
+ probabilities-quot call swap nth random ; inline
+
+: balance-labels ( X y n -- X' y' )
+ [
+ dup [ ] collect-index-by
+ values dup length equal-probabilities
+ '[
+ _ _ _ _ stratified-sample
+ '[ _ swap nth ] bi@ 2array
+ ]
+ ] dip swap replicate [ keys ] [ values ] bi ;
+
+: skew-labels ( X y probs n -- X' y' )
+ [
+ [ dup [ ] collect-index-by sort-keys values ] dip
+ '[
+ _ _ _ _ stratified-sample
+ '[ _ swap nth ] bi@ 2array
+ ]
+ ] dip swap replicate [ keys ] [ values ] bi ;