1 ! Copyright (C) 2018 Björn Lindqvist
2 ! See https://factorcode.org/license.txt for BSD license
3 USING: accessors assocs grouping.extras kernel math
4 math.functions math.statistics sequences sequences.extras
6 IN: machine-learning.decision-trees
! entropy from math.statistics uses the natural logarithm;
! dividing by ( 2 log ) rebases it to base 2, giving the Shannon
! entropy in bits, the conventional unit for decision trees.
: entropy2 ( seq -- e )
    normalized-histogram values entropy 2 log / ;
! Sorts seq by the key produced by quot ( [ sort-with ] keep
! retains quot for the next word ), then groups the now-adjacent
! equal-key elements with group-by.
: group-by-sorted ( seq quot: ( elt -- key ) -- groups )
    [ sort-with ] keep group-by ; inline
! Sum of each subset's entropy weighted by its size. data-target
! is a sequence of { features target } pairs. The caller divides
! the result by the total sample count to get the weighted
! average. NOTE: map-sum leaves a single number, so the effect
! output is named n (the previous name, seq, was misleading).
: subsets-weighted-entropy ( data-target idx -- n )
    ! Group the pairs by the value of the feature at idx.
    '[ first _ swap nth ] group-by-sorted
    ! Then unpack the partitioned groups of targets
    '[ [ second ] map ] assoc-map values
    ! Finally, calculate the weighted entropy for each group
    [ [ entropy2 ] [ length ] bi * ] map-sum ; inline
! Information gain of splitting dataset on feature idx: the
! entropy of all targets minus the size-weighted mean entropy of
! the target subsets induced by the split.
:: average-gain ( dataset idx -- gain )
    dataset targets>> :> targets
    dataset features>> :> features
    ! Pair each feature vector with its target label.
    features targets zip :> features-targets
    features-targets idx subsets-weighted-entropy :> weighted
    ! weighted is a sum of |group| * entropy(group); dividing by
    ! the sample count turns it into the weighted average.
    targets entropy2 weighted features length / - ;
31 : highest-gain-index ( dataset -- idx )
32 dup feature-names>> length <iota> [