1 ! Copyright (C) 2012 John Benediktsson, Doug Coleman
2 ! See http://factorcode.org/license.txt for BSD license
4 USING: accessors arrays ascii assocs byte-arrays combinators
5 combinators.short-circuit concurrency.combinators csv grouping
6 http.client images images.viewer io io.directories
7 io.encodings.binary io.encodings.utf8 io.files io.launcher
8 io.pathnames kernel math math.parser namespaces sequences
9 splitting ui.gadgets.panes ;
10 IN: machine-learning.data-sets
14 feature-names target-names
! Declares <data-set> as the boa constructor for the data-set tuple:
! slot values are taken from the stack in slot-declaration order.
17 C: <data-set> data-set
! Loads one of the files shipped with this vocabulary.
! name is a file name relative to the vocabulary's data directory; the
! "resource:extra/machine-learning/data-sets/" prefix is prepended to it to
! form the full path. Per the stack effect, the result is the file contents.
21 : load-file ( name -- contents )
22 "resource:extra/machine-learning/data-sets/" prepend
! Reads the data file called name (via load-file) and tokenizes it as a
! whitespace-delimited table.
! Leading and trailing blank characters are trimmed from the contents, the
! text is split into lines, every line is split on blank characters with
! empty fields discarded, and finally lines left with no fields are dropped.
! Result: a sequence of rows, each row a sequence of field strings.
: load-tabular-file ( name -- lines )
    load-file
    [ blank? ] trim
    lines
    [ [ blank? ] split-when [ empty? ] reject ] map
    [ empty? ] reject ;
! Separates a table of strings into numeric data and its header row.
! table: a sequence of rows whose first row holds the column names.
! data:  every remaining row with each cell run through string>number
!        (cells that do not parse as numbers become f).
! names: the first row, left untouched on top of the stack.
: numerify ( table -- data names )
    [ rest [ [ string>number ] map ] map ]
    [ first ] bi ;
! Loads the whitespace-delimited table called name with load-tabular-file
! and hands it to numerify, leaving the numeric data rows and, on top of
! the stack, the header row of column names.
32 : load-table ( name -- data names )
33 load-tabular-file numerify ;
! CSV counterpart of load-table: reads the file called name with load-file,
! parses the text with string>csv and hands the rows to numerify, leaving
! the numeric data rows and, on top of the stack, the header row.
35 : load-table-csv ( name -- data names )
36 load-file string>csv numerify ;
! Builds a data-set from the MONK's problem file called name.
! Every row has its last field (the identifier) dropped and the remaining
! fields parsed with string>number; the rows are then split into two
! parallel sequences: the rest of each row and the first number of each row.
! NOTE(review): presumably the rest-of-row values are the six features
! "a1".."a6" and the first column is the target - confirm.
! "monks.names" is read with load-file while building the result.
40 : load-monks ( name -- data-set )
42 ! Omits the identifiers which are not so interesting.
43 [ but-last [ string>number ] map ] map
44 [ [ rest ] map ] [ [ first ] map ] bi
45 { "a1" "a2" "a3" "a4" "a5" "a6" }
47 "monks.names" load-file
! Builds the iris data-set from "iris.csv" (read with load-table-csv).
! Each data row is split with unclip-last into (all-but-last, last) and the
! resulting pairs are unzipped into two parallel sequences; the header row
! has its first two entries dropped by "2 tail".
! NOTE(review): presumably the four measurement strings below are the
! feature names and the last column of each row is the class target -
! confirm.
50 : load-iris ( -- data-set )
51 "iris.csv" load-table-csv
52 [ [ unclip-last ] { } map>assoc unzip ] [ 2 tail ] bi*
54 "sepal length (cm)" "sepal width (cm)"
55 "petal length (cm)" "petal width (cm)"
! Builds the Linnerud data-set.
! features / feature-names come from "linnerud_exercise.csv" and
! targets / target-names from "linnerud_physiological.csv"; both files are
! read with load-table, whose data and names outputs are stored pairwise
! with bi*. The description slot is filled with the text of "linnerud.rst".
60 : load-linnerud ( -- data-set )
62 "linnerud_exercise.csv" load-table
63 [ >>features ] [ >>feature-names ] bi*
64 "linnerud_physiological.csv" load-table
65 [ >>targets ] [ >>target-names ] bi*
66 "linnerud.rst" load-file >>description ;
! Downloads url into directory.
! The guard tests whether a file named after the URL's download name, or
! after its file-stem (the name with the extension removed), already exists.
! NOTE(review): presumably the download is skipped when either file is
! already present - confirm.
68 : download-to-directory ( url directory -- )
71 dup { [ download-name file-exists? ] [ file-stem file-exists? ] } 1|| [
! Decompresses the gzip archive at path by running the external command
! "gzip -d <path>". The arguments are passed as an array, so path is handed
! to gzip as a single argument.
! try-process is used instead of "run-process drop" so that a non-zero exit
! status from gzip raises an error here rather than being silently ignored
! and surfacing later as a missing decompressed file.
: gzip-decompress-file ( path -- )
    { "gzip" "-d" } swap suffix try-process ;
! Converts the raw bytes of an MNIST image file into a sequence.
! The first 16 bytes are skipped (NOTE(review): presumably the IDX file
! header - confirm) and the remainder is cut into groups of 28 * 28 bytes;
! the object built for each group has its component-type set to
! ubyte-components.
81 : mnist-data>array ( bytes -- seq )
82 16 tail-slice 28 28 * <groups> [
87 ubyte-components >>component-type
90 : mnist-labels>array ( bytes -- seq )
! Prints a sequence of images to the current output stream.
! An image is wrapped with <image-gadget> and written to the stream held in
! the output-stream variable via write-gadget; stream-nl emits a line break
! on that same stream.
! NOTE(review): the name suggests rows of images separated by line breaks -
! confirm the row grouping.
93 : image-grid. ( image-seq -- )
96 <image-gadget> output-stream get write-gadget
98 output-stream get stream-nl
! Builds the MNIST data-set.
! Ensures the "resource:datasets" directory exists, then, for the four
! archive URLs listed below:
!   1. downloads them concurrently into "resource:datasets/" using
!      parallel-each and download-to-directory;
!   2. runs gzip-decompress-file on each archive whose decompressed file
!      (its file-stem) is not already present;
!   3. reads each decompressed file as binary.
! The label files are decoded with mnist-labels>array.
! NOTE(review): the first call appears to need network access to
! yann.lecun.com and an external gzip executable - confirm.
101 : load-mnist ( -- data-set )
102 "resource:datasets" dup make-directories [
104 "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"
105 "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"
106 "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
107 "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
109 [ [ "resource:datasets/" download-to-directory ] parallel-each ]
110 [ [ dup file-stem file-exists? [ drop ] [ file-name gzip-decompress-file ] if ] each ]
111 [ [ file-stem binary file-contents ] map ] tri
114 [ mnist-labels>array ]
116 [ mnist-labels>array ]