-! Copyright (C) 2008 Doug Coleman, Michael Judge.
+! Copyright (C) 2008 Doug Coleman, Michael Judge, Loryn Jenkins.
! See http://factorcode.org/license.txt for BSD license.
-USING: assocs combinators generalizations kernel locals math
-math.functions math.order math.vectors sequences
-sequences.private sorting fry arrays grouping sets ;
+USING: accessors arrays assocs combinators
+combinators.short-circuit fry generalizations grouping kernel
+locals math math.functions math.order ranges math.vectors
+sequences sequences.private sorting ;
IN: math.statistics
: power-mean ( seq p -- x )
- [ '[ _ ^ ] map-sum ] [ [ length / ] [ recip ^ ] bi* ] 2bi ;
+ [ '[ _ ^ ] map-sum ] [ [ length / ] [ recip ^ ] bi* ] 2bi ; inline
+
+! Delta in degrees-of-freedom
+: mean-ddof ( seq ddof -- x )
+ [ [ sum ] [ length ] bi ] dip -
+ [ drop 0 ] [ / ] if-zero ; inline
: mean ( seq -- x )
- [ sum ] [ length ] bi / ;
+ 0 mean-ddof ; inline
+
+: meanest ( seq -- x )
+ [ mean ] keep [ - abs ] with infimum-by ;
+
+GENERIC: sum-of-squares ( seq -- x )
+M: object sum-of-squares [ sq ] map-sum ;
+M: iota sum-of-squares
+ n>> 1 - [ ] [ 1 + ] [ 1/2 + ] tri * * 3 / ;
+M: ranges:range sum-of-squares
+ dup { [ step>> 1 = ] [ from>> integer? ] } 1&& [
+ [ from>> ] [ length>> ] bi dupd +
+ [ <iota> sum-of-squares ] bi@ swap -
+ ] [ call-next-method ] if ;
+
+GENERIC: sum-of-cubes ( seq -- x )
+M: object sum-of-cubes [ 3 ^ ] map-sum ;
+M: iota sum-of-cubes sum sq ;
+M: ranges:range sum-of-cubes
+ dup { [ step>> 1 = ] [ from>> integer? ] } 1&& [
+ [ from>> ] [ length>> ] bi dupd +
+ [ <iota> sum-of-cubes ] bi@ swap -
+ ] [ call-next-method ] if ;
+
+GENERIC: sum-of-quads ( seq -- x )
+M: object sum-of-quads [ 4 ^ ] map-sum ;
+M: iota sum-of-quads
+ [let n>> 1 - :> n
+ n 0 > [
+ n
+ n 1 +
+ n 2 * 1 +
+ n sq 3 * n 3 * + 1 -
+ * * * 30 /
+ ] [ 0 ] if
+ ] ;
+M: ranges:range sum-of-quads
+ dup { [ step>> 1 = ] [ from>> integer? ] } 1&& [
+ [ from>> ] [ length>> ] bi dupd +
+ [ <iota> sum-of-quads ] bi@ swap -
+ ] [ call-next-method ] if ;
+
+: sum-of-squared-errors ( seq -- x )
+ [ mean ] keep [ - sq ] with map-sum ; inline
+
+: sum-of-absolute-errors ( seq -- x )
+ [ mean ] keep [ - ] with map-sum ; inline
: quadratic-mean ( seq -- x ) ! root-mean-square
- [ [ sq ] map-sum ] [ length ] bi / sqrt ;
+ [ sum-of-squares ] [ length ] bi / sqrt ; inline
: geometric-mean ( seq -- x )
- [ length ] [ product ] bi nth-root ;
+ [ [ log ] map-sum ] [ length ] bi /f e^ ; inline
: harmonic-mean ( seq -- x )
- [ recip ] map-sum recip ;
+ [ [ recip ] map-sum ] [ length swap / ] bi ; inline
: contraharmonic-mean ( seq -- x )
- [ [ sq ] map-sum ] [ sum ] bi / ;
+ [ sum-of-squares ] [ sum ] bi / ; inline
+
+<PRIVATE
+
+: trim-points ( p seq -- from to seq )
+ [ length [ * >integer ] keep over - ] keep ;
+
+PRIVATE>
+
+: trimmed-mean ( seq p -- x )
+ swap natural-sort trim-points <slice> mean ;
+
+: winsorized-mean ( seq p -- x )
+ swap natural-sort trim-points
+ [ <slice> ]
+ [ nip dupd nth <array> ]
+ [ [ 1 - ] dip nth <array> ] 3tri
+ surround mean ;
<PRIVATE
-:: ((kth-object)) ( seq k nth-quot exchange-quot quot: ( x y -- ? ) -- elt )
- #! Wirth's method, Algorithm's + Data structues = Programs p. 84
- #! The algorithm modifiers seq, so we clone it
+:: kth-object-impl ( seq k nth-quot exchange-quot quot: ( x y -- ? ) -- elt )
+ ! Wirth's method, Algorithm's + Data structues = Programs p. 84
k seq bounds-check 2drop
0 :> i!
0 :> j!
seq length 1 - :> m!
[ l m < ]
[
- k seq nth x!
+ k seq nth-unsafe x!
l i!
m j!
[ i j <= ]
j k < [ i l! ] when
k i < [ j m! ] when
] while
- k seq nth ; inline
+ k seq nth-unsafe ; inline
: (kth-object) ( seq k nth-quot exchange-quot quot: ( x y -- ? ) -- elt )
- [ clone ] 4dip ((kth-object)) ; inline
+ ! The algorithm modifies seq, so we clone it
+ [ >array ] 4dip kth-object-impl ; inline
: kth-object-unsafe ( seq k quot: ( x y -- ? ) -- elt )
[ [ nth-unsafe ] [ exchange-unsafe ] ] dip (kth-object) ; inline
: kth-objects-unsafe ( seq kths quot: ( x y -- ? ) -- elts )
- [ clone ] 2dip
- '[ [ nth-unsafe ] [ exchange-unsafe ] _ ((kth-object)) ] with map ; inline
+ '[ _ kth-object-unsafe ] with map ; inline
PRIVATE>
: kth-object ( seq k quot: ( x y -- ? ) -- elt )
- [ [ nth ] [ exchange ] ] dip (kth-object) ; inline
+ [ [ nth ] [ exchange ] ] dip (kth-object) ; inline
: kth-objects ( seq kths quot: ( x y -- ? ) -- elts )
- [ clone ] 2dip
- '[ [ nth ] [ exchange ] _ ((kth-object)) ] with map ; inline
+ '[ _ kth-object ] with map ; inline
: kth-smallests ( seq kths -- elts ) [ < ] kth-objects-unsafe ;
} case
] each ;
-: lower-median-index ( seq -- n )
+: lower-median-index ( seq -- n )
[ midpoint@ ]
[ length odd? [ 1 - ] unless ] bi ;
bi kth-smallests first2 ;
: median ( seq -- x )
- dup length odd?
- [ lower-median ] [ medians + 2 / ] if ;
+ dup length odd? [ lower-median ] [ medians + 2 / ] if ;
! quantile can be any n-tile. quartile is n = 4, percentile is n = 100
! a,b,c,d parameters, N - number of samples, q is quantile (1/2 for median, 1/4 for 1st quartile)
: frac ( x -- x' )
>fraction [ /mod nip ] keep / ; inline
-:: quantile-indices ( seq qs a b c d -- seq )
+:: quantile-indices ( seq qs a b -- seq )
qs [ [ a b seq length ] dip quantile-x ] map ;
:: qabcd ( y-floor y-ceiling x c d -- qabcd )
y-floor y-ceiling y-floor - c d x frac * + * + ;
:: quantile-abcd ( seq qs a b c d -- quantile )
- seq qs a b c d quantile-indices :> indices
- indices [ [ floor ] [ ceiling ] bi 2array ] map
+ seq qs a b quantile-indices :> indices
+ indices [ [ floor 0 max ] [ ceiling seq length 1 - min ] bi 2array ] map
concat :> index-pairs
seq index-pairs kth-smallests
: quartile ( seq -- seq' )
{ 1/4 1/2 3/4 } quantile5 ;
-<PRIVATE
-
-: (sequence>assoc) ( seq map-quot: ( x -- ..y ) insert-quot: ( ..y assoc -- ) assoc -- assoc )
- [ swap curry compose each ] keep ; inline
+: interquartile ( seq -- q1 q3 )
+ quartile [ first ] [ last ] bi ;
-PRIVATE>
+: interquartile-range ( seq -- n )
+ interquartile - ;
-: sequence>assoc! ( assoc seq map-quot: ( x -- ..y ) insert-quot: ( ..y assoc -- ) -- assoc )
- 4 nrot (sequence>assoc) ; inline
+: midhinge ( seq -- n )
+ interquartile + 2 / ;
-: sequence>assoc ( seq map-quot: ( x -- ..y ) insert-quot: ( ..y assoc -- ) exemplar -- assoc )
- clone (sequence>assoc) ; inline
+: trimean ( seq -- x )
+ quartile first3 [ 2 * ] dip + + 4 / ;
-: sequence>hashtable ( seq map-quot: ( x -- ..y ) insert-quot: ( ..y assoc -- ) -- hashtable )
- H{ } sequence>assoc ; inline
+: histogram-by! ( assoc seq quot: ( x -- bin ) -- hashtable )
+ rot [ '[ @ _ inc-at ] each ] keep ; inline
: histogram! ( hashtable seq -- hashtable )
- [ ] [ inc-at ] sequence>assoc! ;
+ [ ] histogram-by! ; inline
: histogram-by ( seq quot: ( x -- bin ) -- hashtable )
- [ inc-at ] sequence>hashtable ; inline
+ [ H{ } clone ] 2dip histogram-by! ; inline
: histogram ( seq -- hashtable )
[ ] histogram-by ;
: sorted-histogram ( seq -- alist )
histogram sort-values ;
-: collect-pairs ( seq quot: ( x -- v k ) -- hashtable )
- [ push-at ] sequence>hashtable ; inline
+: normalized-histogram ( seq -- alist )
+ [ histogram ] [ length ] bi '[ _ / ] assoc-map ;
-: collect-by ( seq quot: ( x -- x' ) -- hashtable )
- [ dup ] prepose collect-pairs ; inline
+: equal-probabilities ( n -- array )
+ dup recip <array> ; inline
: mode ( seq -- x )
- histogram >alist
- [ ] [ [ [ second ] bi@ > ] most ] map-reduce first ;
-
-ERROR: empty-sequence ;
+ histogram >alist [ second ] supremum-by first ;
: minmax ( seq -- min max )
- [
- empty-sequence
- ] [
- [ first dup ] keep [ [ min ] [ max ] bi-curry bi* ] each
- ] if-empty ;
+ [ first dup ] keep [ [ min ] [ max ] bi-curry bi* ] 1 each-from ;
: range ( seq -- x )
minmax swap - ;
-: sample-var ( seq -- x )
- #! normalize by N-1
- dup length 1 <= [
- drop 0
- ] [
- [ [ mean ] keep [ - sq ] with map-sum ]
- [ length 1 - ] bi /
- ] if ;
+: fivenum ( seq -- seq' )
+ [ quartile ] [ minmax ] bi [ prefix ] [ suffix ] bi* ;
-: full-var ( seq -- x )
- dup length 1 <= [
- drop 0
+: var-ddof ( seq n -- x )
+ 2dup [ length ] dip - 0 <= [
+ 2drop 0
] [
- [ [ mean ] keep [ - sq ] with map-sum ]
- [ length ] bi /
- ] if ;
+ [ [ sum-of-squared-errors ] [ length ] bi ] dip - /
+ ] if ; inline
+
+: population-var ( seq -- x ) 0 var-ddof ; inline
+
+: sample-var ( seq -- x ) 1 var-ddof ; inline
-ALIAS: var sample-var
+: std-ddof ( seq n -- x ) var-ddof sqrt ; inline
-: sample-std ( seq -- x ) sample-var sqrt ;
+: population-std ( seq -- x ) 0 std-ddof ; inline
-: full-std ( seq -- x ) full-var sqrt ;
+: sample-std ( seq -- x ) 1 std-ddof ; inline
ALIAS: std sample-std
-: mean-dev ( seq -- x ) dup mean v-n vabs mean ;
+: signal-to-noise ( seq -- x ) [ mean ] [ population-std ] bi / ;
-: median-dev ( seq -- x ) dup median v-n vabs mean ;
+: demean ( seq -- seq' ) dup mean v-n ;
-: sample-ste ( seq -- x ) [ sample-std ] [ length ] bi sqrt / ;
+: mean-dev ( seq -- x ) demean vabs mean ;
-: full-ste ( seq -- x ) [ full-std ] [ length ] bi sqrt / ;
+: demedian ( seq -- seq' ) dup median v-n ;
-ALIAS: ste sample-ste
+: median-dev ( seq -- x ) demedian vabs mean ;
-: ((r)) ( mean(x) mean(y) {x} {y} -- (r) )
- ! finds sigma((xi-mean(x))(yi-mean(y))
- 0 [ [ [ pick ] dip swap - ] bi@ * + ] 2reduce 2nip ;
+: ste-ddof ( seq n -- x ) '[ _ std-ddof ] [ length ] bi sqrt / ;
-: (r) ( mean(x) mean(y) {x} {y} sx sy -- r )
- * recip [ [ ((r)) ] keep length 1 - / ] dip * ;
+: population-ste ( seq -- x ) 0 ste-ddof ;
-: [r] ( {{x,y}...} -- mean(x) mean(y) {x} {y} sx sy )
- first2 [ [ [ mean ] bi@ ] 2keep ] 2keep [ std ] bi@ ;
+: sample-ste ( seq -- x ) 1 ste-ddof ;
-: r ( {{x,y}...} -- r )
- [r] (r) ;
+<PRIVATE
+: r-sum-diffs ( x-mean y-mean x-seq y-seq -- (r) )
+ ! finds sigma((xi-mean(x))(yi-mean(y))
+ 0 [ [ reach - ] bi@ * + ] 2reduce 2nip ;
+
+: (r) ( x-mean y-mean x-seq y-seq x-std y-std -- r )
+ * recip [ [ r-sum-diffs ] keep length 1 - / ] dip * ;
-: r^2 ( {{x,y}...} -- r )
- r sq ;
+: r-stats ( xy-pairs -- x-mean y-mean x-seq y-seq x-std y-std )
+ first2 [ [ [ mean ] bi@ ] 2keep ] 2keep [ population-std ] bi@ ;
+PRIVATE>
-: least-squares ( {{x,y}...} -- alpha beta )
- [r] { [ 2dup ] [ ] [ ] [ ] [ ] } spread
- ! stack is mean(x) mean(y) mean(x) mean(y) {x} {y} sx sy
+: pearson-r ( xy-pairs -- r ) r-stats (r) ;
+
+: least-squares ( xy-pairs -- alpha beta )
+ r-stats [ 2dup ] 4dip
+ ! stack is x-mean y-mean x-mean y-mean x-seq y-seq x-std y-std
[ (r) ] 2keep ! stack is mean(x) mean(y) r sx sy
swap / * ! stack is mean(x) mean(y) beta
[ swapd * - ] keep ;
-: cov ( {x} {y} -- cov )
- [ dup mean v-n ] bi@ v* mean ;
+: cov-ddof ( x-seq y-seq ddof -- cov )
+ [ [ demean ] bi@ v* ] dip mean-ddof ;
+
+: population-cov ( x-seq y-seq -- cov ) 0 cov-ddof ; inline
-: sample-corr ( {x} {y} -- corr )
- [ cov ] [ [ sample-var ] bi@ * sqrt ] 2bi / ;
+: sample-cov ( x-seq y-seq -- cov ) 1 cov-ddof ; inline
-: full-corr ( {x} {y} -- corr )
- [ cov ] [ [ full-var ] bi@ * sqrt ] 2bi / ;
+: corr-ddof ( x-seq y-seq n -- corr )
+ [ [ population-cov ] ] dip
+ '[ [ _ var-ddof ] bi@ * sqrt ] 2bi / ;
-ALIAS: corr sample-corr
+: population-corr ( x-seq y-seq -- corr ) 0 corr-ddof ; inline
+
+: sample-corr ( x-seq y-seq -- corr ) 1 corr-ddof ; inline
: cum-sum ( seq -- seq' )
- 0 swap [ + dup ] map nip ;
+ 0 [ + ] accumulate* ;
+
+: cum-sum0 ( seq -- seq' )
+ 0 [ + ] accumulate nip ;
: cum-product ( seq -- seq' )
- 1 swap [ * dup ] map nip ;
+ 1 [ * ] accumulate* ;
+
+: cum-product1 ( seq -- seq' )
+ 1 [ * ] accumulate nip ;
+
+: cum-mean ( seq -- seq' )
+ 0 swap [ [ + dup ] dip 1 + / ] map-index nip ;
+
+: cum-count ( seq quot: ( elt -- ? ) -- seq' )
+ [ 0 ] dip '[ @ [ 1 + ] when ] accumulate* ; inline
: cum-min ( seq -- seq' )
- [ ?first ] keep [ min dup ] map nip ;
+ dup ?first [ min ] accumulate* ;
: cum-max ( seq -- seq' )
- [ ?first ] keep [ max dup ] map nip ;
+ dup ?first [ max ] accumulate* ;
-: entropy ( seq -- n )
- dup members [ [ = ] curry count ] with map
- dup sum v/n dup [ log ] map v* sum neg ;
+: entropy ( probabilities -- n )
+ dup sum '[ _ / dup log * ] map-sum neg ;
+
+: maximum-entropy ( probabilities -- n )
+ length log ;
+
+: normalized-entropy ( probabilities -- n )
+ [ entropy ] [ maximum-entropy ] bi / ;
: binary-entropy ( p -- h )
[ dup log * ] [ 1 swap - dup log * ] bi + neg 2 log / ;
: standardize ( u -- v )
- [ dup mean v-n ] [ std ] bi v/n ;
+ [ demean ] [ sample-std ] bi [ v/n ] unless-zero ;
+
+: standardize-2d ( u -- v )
+ flip [ standardize ] map flip ;
: differences ( u -- v )
- [ 1 tail-slice ] keep [ - ] 2map ;
+ [ rest-slice ] keep v- ;
: rescale ( u -- v )
dup minmax over - [ v-n ] [ v/n ] bi* ;
+
+: rankings ( histogram -- assoc )
+ sort-keys 0 swap [ rot [ + ] keep swapd ] H{ } assoc-map-as nip ;
+
+: rank-values ( seq -- seq' )
+ dup histogram rankings '[ _ at ] map ;
+
+: z-score ( seq -- n )
+ [ demean ] [ sample-std ] bi v/n ;
+
+: dcg ( scores -- dcg )
+ dup length 1 + 2 swap [a..b] [ log 2 log /f ] map v/ sum ;
+
+: ndcg ( scores -- ndcg )
+ [ 0.0 ] [
+ dup dcg [
+ drop 0.0
+ ] [
+ swap natural-sort <reversed> dcg /f
+ ] if-zero
+ ] if-empty ;