extra/machine-learning/decision-trees/decision-trees.factor

   1 ! Copyright (C) 2018 Björn Lindqvist
   2 ! See https://factorcode.org/license.txt for BSD license
   3 USING: accessors assocs grouping.extras kernel math
   4 math.functions math.statistics sequences sequences.extras
   5 sorting ;
   6 IN: machine-learning.decision-trees
   7
   ! Entropy of the distribution of values found in seq, rescaled to
   ! base 2. Dividing the result of "entropy" by "2 log" (the natural
   ! logarithm of 2) is a change of logarithm base, so the result is
   ! measured in bits, the usual unit for information gain in decision
   ! trees.
   : entropy2 ( seq -- e )
       normalized-histogram values   ! relative frequency of each distinct value
       entropy                       ! entropy of those frequencies
       2 log / ;                     ! change of base: divide by ln 2
  11
  ! Sort seq by the key that quot computes for each element, then
  ! group the sorted elements with the same quot. group-by only
  ! merges neighbouring elements, so sorting first puts all elements
  ! with an equal key into a single group.
  :: group-by-sorted ( seq quot: ( elt -- key ) -- groups )
      seq quot sort-by
      quot group-by ; inline
  14
  ! Partition data-target by the value of the feature at position idx
  ! and return the sum, over all partitions, of
  ! (entropy2 of the partition's targets) * (size of the partition).
  !
  ! data-target: sequence of { features target } pairs, where
  !              "features" is itself a sequence indexable by idx.
  ! idx:         position of the feature to split on.
  ! weighted-entropy: a single number, not a sequence. The caller is
  !              expected to divide it by the total number of samples
  !              to obtain the size-weighted average entropy.
  : subsets-weighted-entropy ( data-target idx -- weighted-entropy )
      ! Group the pairs according to the value of the chosen feature.
      '[ first _ swap nth ] group-by-sorted
      ! The grouping keys are no longer needed; keep the groups only.
      values
      ! For every group, extract the targets, take their entropy and
      ! weight it by the group size ([ second ] map preserves length).
      [ [ [ second ] map entropy2 ] [ length ] bi * ] map-sum ; inline
  22
  ! Information gain obtained by splitting dataset on the feature at
  ! position idx: the entropy of all targets minus the size-weighted
  ! average entropy of the targets inside each partition.
  !
  ! dataset: object with features>> and targets>> slots holding
  !          parallel sequences (one feature sequence per target).
  ! idx:     position of the feature to evaluate.
  :: average-gain ( dataset idx -- gain )
      dataset [ features>> ] [ targets>> ] bi :> ( rows labels )

      ! Sum of (entropy * size) over the partitions, normalised by the
      ! total number of samples.
      rows labels zip idx subsets-weighted-entropy
      rows length / :> remainder

      labels entropy2 remainder - ;
  30
  ! Index of the feature with the largest average-gain in dataset.
  ! One candidate index is evaluated per entry of feature-names>>.
  :: highest-gain-index ( dataset -- idx )
      dataset feature-names>> length <iota>
      [ dataset swap average-gain ] map
      arg-max ;