extra/machine-learning/data-sets/data-sets.factor

   1 ! Copyright (C) 2012 John Benediktsson, Doug Coleman
   2 ! See http://factorcode.org/license.txt for BSD license
   3
   4 USING: accessors arrays ascii assocs byte-arrays combinators
   5 combinators.short-circuit concurrency.combinators csv grouping
   6 http.client images images.viewer io io.directories
   7 io.encodings.binary io.encodings.utf8 io.files io.launcher
   8 io.pathnames kernel math math.parser namespaces sequences
   9 splitting ui.gadgets.panes ;
  10 IN: machine-learning.data-sets
  11
  12 TUPLE: data-set
  13     features targets
  14     feature-names target-names
  15     description ;
  16
  17 C: <data-set> data-set
  18
  19 <PRIVATE
  20
  21 : load-file ( name -- contents )
  22     "resource:extra/machine-learning/data-sets/" prepend
  23     utf8 file-contents ;
  24
  25 : load-tabular-file ( name -- lines )
  26     load-file [ blank? ] trim lines
  27     [ [ blank? ] split-when harvest ] map harvest ;
  28
  29 : numerify ( table -- data names )
  30     unclip [ [ [ string>number ] map ] map ] dip ;
  31
  32 : load-table ( name -- data names )
  33     load-tabular-file numerify ;
  34
  35 : load-table-csv ( name -- data names )
  36     load-file string>csv numerify ;
  37
  38 PRIVATE>
  39
  40 : load-monks ( name -- data-set )
  41     load-tabular-file
  42     ! Omits the identifiers which are not so interesting.
  43     [ but-last [ string>number ] map ] map
  44     [ [ rest ] map ] [ [ first ] map ] bi
  45     { "a1" "a2" "a3" "a4" "a5" "a6" }
  46     { "no" "yes" }
  47     "monks.names" load-file
  48     <data-set> ;
  49
  50 : load-iris ( -- data-set )
  51     "iris.csv" load-table-csv
  52     [ [ unclip-last ] { } map>assoc unzip ] [ 2 tail ] bi*
  53     {
  54         "sepal length (cm)" "sepal width (cm)"
  55         "petal length (cm)" "petal width (cm)"
  56     } swap
  57     "iris.rst" load-file
  58     <data-set> ;
  59
  60 : load-linnerud ( -- data-set )
  61     data-set new
  62         "linnerud_exercise.csv" load-table
  63         [ >>features ] [ >>feature-names ] bi*
  64         "linnerud_physiological.csv" load-table
  65         [ >>targets ] [ >>target-names ] bi*
  66         "linnerud.rst" load-file >>description ;
  67
  68 : download-to-directory ( url directory -- )
  69     dup make-directories
  70     [
  71         dup { [ download-name file-exists? ] [ file-stem file-exists? ] } 1|| [
  72             drop
  73         ] [
  74             download
  75         ] if
  76     ] with-directory ;
  77
  78 : gzip-decompress-file ( path -- )
  79     { "gzip" "-d" } swap suffix run-process drop ;
  80
  81 : mnist-data>array ( bytes -- seq )
  82     16 tail-slice 28 28 * <groups> [
  83         >byte-array <image>
  84             swap >>bitmap
  85             { 28 28 } >>dim
  86             L >>component-order
  87             ubyte-components >>component-type
  88     ] map ;
  89
  90 : mnist-labels>array ( bytes -- seq )
  91     8 tail-slice >array ;
  92
  93 : image-grid. ( image-seq -- )
  94     [
  95         [
  96             <image-gadget> output-stream get write-gadget
  97         ] each
  98         output-stream get stream-nl
  99     ] each ;
 100
 101 : load-mnist ( -- data-set )
 102     "resource:datasets" dup make-directories [
 103         {
 104             "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
 105             "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"
 106             "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
 107             "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
 108         }
 109         [ [ "resource:datasets/" download-to-directory ] parallel-each ]
 110         [ [ dup file-stem file-exists? [ drop ] [ file-name gzip-decompress-file ] if ] each ]
 111         [ [ file-stem binary file-contents ] map ] tri
 112         first4 {
 113             [ mnist-data>array ]
 114             [ mnist-labels>array ]
 115             [ mnist-data>array ]
 116             [ mnist-labels>array ]
 117         } spread 4array
 118     ] with-directory ;