]> gitweb.factorcode.org Git - factor.git/blob - extra/machine-learning/data-sets/data-sets.factor
http.download: fix for new changes
[factor.git] / extra / machine-learning / data-sets / data-sets.factor
1 ! Copyright (C) 2012 John Benediktsson, Doug Coleman
2 ! See https://factorcode.org/license.txt for BSD license
3
4 USING: accessors arrays ascii assocs byte-arrays combinators
5 concurrency.combinators csv grouping http.download images
6 images.viewer io io.directories io.encodings.binary
7 io.encodings.utf8 io.files io.launcher io.pathnames kernel math
8 math.parser namespaces sequences splitting ui.gadgets.panes ;
9
10 IN: machine-learning.data-sets
11
! A generic container for a machine-learning data set:
!   features      - sequence of per-sample feature vectors
!   targets       - sequence of target values, parallel to features
!   feature-names - names describing each feature column
!   target-names  - names for the possible target values/classes
!   description   - human-readable description of the data set
TUPLE: data-set
    features targets
    feature-names target-names
    description ;

! Boa constructor taking the slots in declaration order:
! ( features targets feature-names target-names description -- data-set )
C: <data-set> data-set
18
19 <PRIVATE
20
! Read one of the data files bundled with this vocabulary as a
! UTF-8 string.
: load-file ( name -- contents )
    [ "resource:extra/machine-learning/data-sets/" ] dip append
    utf8 file-contents ;
24
! Parse a whitespace-delimited data file into rows of non-empty
! string tokens, discarding blank lines.
: load-tabular-file ( name -- lines )
    load-file [ blank? ] trim split-lines
    [ [ blank? ] split-when ] map
    [ harvest ] map harvest ;
28
! Split a table into its data rows, parsed as numbers, and its
! leading header row of column names.
: numerify ( table -- data names )
    [ rest [ [ string>number ] map ] map ] [ first ] bi ;
31
! Load a whitespace-delimited file, returning its numeric data
! rows and its header row of column names.
: load-table ( name -- data names )
    load-tabular-file
    [ rest [ [ string>number ] map ] map ] [ first ] bi ;
34
! Load a CSV file, returning its numeric data rows and its
! header row of column names.
: load-table-csv ( name -- data names )
    load-file string>csv
    unclip [ [ [ string>number ] map ] map ] dip ;
37
38 PRIVATE>
39
! Load one of the MONK's problems data sets. Each row begins
! with the class value followed by the six attributes; the
! trailing identifier column is dropped as uninteresting.
: load-monks ( name -- data-set )
    load-tabular-file
    [ but-last [ string>number ] map ] map
    [ [ rest ] [ first ] bi ] { } map>assoc unzip
    { "a1" "a2" "a3" "a4" "a5" "a6" }
    { "no" "yes" }
    "monks.names" load-file <data-set> ;
49
! Load Fisher's iris data set from CSV. Each data row is the
! four measurements followed by the class value; the class
! names are taken from the header row after its two leading
! fields.
: load-iris ( -- data-set )
    "iris.csv" load-table-csv [
        [ unclip-last ] { } map>assoc unzip
    ] [
        2 tail {
            "sepal length (cm)" "sepal width (cm)"
            "petal length (cm)" "petal width (cm)"
        } swap
    ] bi*
    "iris.rst" load-file <data-set> ;
59
! Load the Linnerud data set: the exercise measurements are the
! features and the physiological measurements are the targets.
: load-linnerud ( -- data-set )
    "linnerud_exercise.csv" load-table
    "linnerud_physiological.csv" load-table
    ! ( features feature-names targets target-names )
    ! -> ( features targets feature-names target-names )
    [ swap ] dip
    "linnerud.rst" load-file <data-set> ;
67
! Decompress the file at path in place by shelling out to
! "gzip -d"; try-process throws if the command fails.
: gzip-decompress-file ( path -- )
    [ "gzip" "-d" ] dip 3array try-process ;
70
! Convert raw IDX-format image bytes into a sequence of 28x28
! single-component image objects, skipping the 16-byte header.
: mnist-data>array ( bytes -- seq )
    16 tail-slice 784 <groups> [ ! 784 = 28 * 28 pixels per image
        <image>
            { 28 28 } >>dim
            L >>component-order
            ubyte-components >>component-type
            swap >byte-array >>bitmap
    ] map ;
79
! Convert raw IDX-format label bytes into an array of integer
! labels, skipping the 8-byte header.
: mnist-labels>array ( bytes -- seq )
    8 tail >array ;
82
! Render a two-dimensional grid of images to the current output
! stream: one row of image gadgets per inner sequence, each row
! followed by a newline.
: image-grid. ( image-seq -- )
    [
        [ <image-gadget> output-stream get write-gadget ] each nl
    ] each ;
90
91 CONSTANT: datasets-path "resource:datasets/"
92
! Download (once), decompress, and load the MNIST handwritten
! digit data set.
! NOTE(review): despite the ( -- data-set ) effect name, this
! returns a plain 4-element array
! { train-images train-labels test-images test-labels },
! not a data-set tuple — confirm against callers.
: load-mnist ( -- data-set )
    datasets-path dup make-directories [
        {
            "https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/train-images-idx3-ubyte.gz"
            "https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/train-labels-idx1-ubyte.gz"
            "https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-images-idx3-ubyte.gz"
            "https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/t10k-labels-idx1-ubyte.gz"
        }
        ! 1) fetch each .gz archive concurrently unless cached
        [ [ download-once ] parallel-each ]
        ! 2) decompress, skipping files already unpacked
        ! (file-stem drops the .gz suffix)
        [ [ dup file-stem file-exists? [ drop ] [ file-name gzip-decompress-file ] if ] each ]
        ! 3) read each unpacked file as raw bytes
        [ [ file-stem binary file-contents ] map ] tri
        ! decode the four byte buffers pairwise: images, labels
        first4 {
            [ mnist-data>array ]
            [ mnist-labels>array ]
            [ mnist-data>array ]
            [ mnist-labels>array ]
        } spread 4array
    ] with-directory ;