1 ! Copyright (C) 2012 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: arrays assocs fry kernel math math.functions
4 math.statistics memoize random sequences sorting ;
5 IN: machine-learning.rebalancing
7 ERROR: probability-sum-not-one seq ;
9 : check-probabilities ( seq -- seq )
10 dup sum 1.0 .00000000001 ~ [ throw-probability-sum-not-one ] unless ;
12 : equal-probabilities ( n -- array )
13 dup recip <array> ; inline
15 MEMO: probabilities-seq ( seq -- seq' )
16 check-probabilities [ >float ] map cum-sum ;
18 : probabilities-quot ( seq -- quot )
20 '[ _ random-unit '[ _ > ] find drop ] ; inline
22 : stratified-sample ( stratified-sequences probability-sequence -- elt )
23 probabilities-quot call swap nth random ; inline
25 : stratified-samples ( stratified-sequences probability-sequence n -- elt )
26 [ '[ _ _ stratified-sample ] ] dip swap replicate ;
28 : equal-stratified-sample ( stratified-sequences -- elt )
29 random random ; inline
31 : balance-labels ( X y n -- X' y' )
33 dup [ ] collect-index-by
35 _ _ _ equal-stratified-sample
36 '[ _ swap nth ] bi@ 2array
38 ] dip swap replicate [ keys ] [ values ] bi ;
40 : skew-labels ( X y probs n -- X' y' )
42 [ dup [ ] collect-index-by sort-keys values ] dip
44 _ _ _ _ stratified-sample
45 '[ _ swap nth ] bi@ 2array
47 ] dip swap replicate [ keys ] [ values ] bi ;