gitweb.factorcode.org Git - factor.git/commitdiff
machine-learning.data-sets: load commonly used test data sets.
author John Benediktsson <mrjbq7@gmail.com>
Tue, 4 Dec 2012 18:00:02 +0000 (10:00 -0800)
committer John Benediktsson <mrjbq7@gmail.com>
Tue, 4 Dec 2012 18:00:02 +0000 (10:00 -0800)
extra/machine-learning/data-sets/data-sets.factor [new file with mode: 0644]
extra/machine-learning/data-sets/iris.csv [new file with mode: 0644]
extra/machine-learning/data-sets/iris.rst [new file with mode: 0644]

diff --git a/extra/machine-learning/data-sets/data-sets.factor b/extra/machine-learning/data-sets/data-sets.factor
new file mode 100644 (file)
index 0000000..d7172e9
--- /dev/null
@@ -0,0 +1,34 @@
+! Copyright (C) 2012 John Benediktsson
+! See http://factorcode.org/license.txt for BSD license
+
+USING: assocs csv io.encodings.utf8 io.files kernel math.parser
+sequences ;
+
+IN: machine-learning.data-sets
+
+! A labelled data set, laid out the way load-iris fills it in:
+!   data           one sequence of numeric feature values per sample
+!   target         numeric class label of each sample
+!   target-names   class names taken from the CSV header row
+!   description    free-form text describing the data set
+!   feature-names  name of each feature column
+TUPLE: data-set
+    data target target-names description feature-names ;
+
+! Positional constructor: arguments are taken in slot order.
+C: <data-set> data-set
+
+<PRIVATE
+
+! Read one of the data files bundled with this vocabulary and return
+! its whole text, decoded as UTF-8.
+!   name      file name relative to the vocabulary directory
+!   contents  the file's text as a string
+! The "vocab:" prefix resolves against the vocabulary roots, so the
+! lookup does not depend on the vocabulary living under extra/.
+: load-file ( name -- contents )
+    "vocab:machine-learning/data-sets/" prepend
+    utf8 file-contents ;
+
+PRIVATE>
+
+! Build the iris data-set from the bundled iris.csv and iris.rst files.
+! The first CSV row is a header whose fields after the first two are
+! the class names; each later row is a sample whose last field is the
+! class label and whose other fields are the feature values.
+: load-iris ( -- data-set )
+    "iris.csv" load-file string>csv unclip
+    ! Stack: sample-rows header-row
+    [
+        ! Parse every field, then peel the label off the end so each
+        ! row becomes a features/label pair.
+        [ [ string>number ] map unclip-last ] { } map>assoc unzip
+    ] dip
+    2 tail                          ! drop the two leading header fields
+    "iris.rst" load-file
+    {
+        "sepal length (cm)"
+        "sepal width (cm)"
+        "petal length (cm)"
+        "petal width (cm)"
+    } <data-set> ;
diff --git a/extra/machine-learning/data-sets/iris.csv b/extra/machine-learning/data-sets/iris.csv
new file mode 100644 (file)
index 0000000..93fca4d
--- /dev/null
@@ -0,0 +1,151 @@
+150,4,setosa,versicolor,virginica
+5.1,3.5,1.4,0.2,0
+4.9,3.0,1.4,0.2,0
+4.7,3.2,1.3,0.2,0
+4.6,3.1,1.5,0.2,0
+5.0,3.6,1.4,0.2,0
+5.4,3.9,1.7,0.4,0
+4.6,3.4,1.4,0.3,0
+5.0,3.4,1.5,0.2,0
+4.4,2.9,1.4,0.2,0
+4.9,3.1,1.5,0.1,0
+5.4,3.7,1.5,0.2,0
+4.8,3.4,1.6,0.2,0
+4.8,3.0,1.4,0.1,0
+4.3,3.0,1.1,0.1,0
+5.8,4.0,1.2,0.2,0
+5.7,4.4,1.5,0.4,0
+5.4,3.9,1.3,0.4,0
+5.1,3.5,1.4,0.3,0
+5.7,3.8,1.7,0.3,0
+5.1,3.8,1.5,0.3,0
+5.4,3.4,1.7,0.2,0
+5.1,3.7,1.5,0.4,0
+4.6,3.6,1.0,0.2,0
+5.1,3.3,1.7,0.5,0
+4.8,3.4,1.9,0.2,0
+5.0,3.0,1.6,0.2,0
+5.0,3.4,1.6,0.4,0
+5.2,3.5,1.5,0.2,0
+5.2,3.4,1.4,0.2,0
+4.7,3.2,1.6,0.2,0
+4.8,3.1,1.6,0.2,0
+5.4,3.4,1.5,0.4,0
+5.2,4.1,1.5,0.1,0
+5.5,4.2,1.4,0.2,0
+4.9,3.1,1.5,0.1,0
+5.0,3.2,1.2,0.2,0
+5.5,3.5,1.3,0.2,0
+4.9,3.1,1.5,0.1,0
+4.4,3.0,1.3,0.2,0
+5.1,3.4,1.5,0.2,0
+5.0,3.5,1.3,0.3,0
+4.5,2.3,1.3,0.3,0
+4.4,3.2,1.3,0.2,0
+5.0,3.5,1.6,0.6,0
+5.1,3.8,1.9,0.4,0
+4.8,3.0,1.4,0.3,0
+5.1,3.8,1.6,0.2,0
+4.6,3.2,1.4,0.2,0
+5.3,3.7,1.5,0.2,0
+5.0,3.3,1.4,0.2,0
+7.0,3.2,4.7,1.4,1
+6.4,3.2,4.5,1.5,1
+6.9,3.1,4.9,1.5,1
+5.5,2.3,4.0,1.3,1
+6.5,2.8,4.6,1.5,1
+5.7,2.8,4.5,1.3,1
+6.3,3.3,4.7,1.6,1
+4.9,2.4,3.3,1.0,1
+6.6,2.9,4.6,1.3,1
+5.2,2.7,3.9,1.4,1
+5.0,2.0,3.5,1.0,1
+5.9,3.0,4.2,1.5,1
+6.0,2.2,4.0,1.0,1
+6.1,2.9,4.7,1.4,1
+5.6,2.9,3.6,1.3,1
+6.7,3.1,4.4,1.4,1
+5.6,3.0,4.5,1.5,1
+5.8,2.7,4.1,1.0,1
+6.2,2.2,4.5,1.5,1
+5.6,2.5,3.9,1.1,1
+5.9,3.2,4.8,1.8,1
+6.1,2.8,4.0,1.3,1
+6.3,2.5,4.9,1.5,1
+6.1,2.8,4.7,1.2,1
+6.4,2.9,4.3,1.3,1
+6.6,3.0,4.4,1.4,1
+6.8,2.8,4.8,1.4,1
+6.7,3.0,5.0,1.7,1
+6.0,2.9,4.5,1.5,1
+5.7,2.6,3.5,1.0,1
+5.5,2.4,3.8,1.1,1
+5.5,2.4,3.7,1.0,1
+5.8,2.7,3.9,1.2,1
+6.0,2.7,5.1,1.6,1
+5.4,3.0,4.5,1.5,1
+6.0,3.4,4.5,1.6,1
+6.7,3.1,4.7,1.5,1
+6.3,2.3,4.4,1.3,1
+5.6,3.0,4.1,1.3,1
+5.5,2.5,4.0,1.3,1
+5.5,2.6,4.4,1.2,1
+6.1,3.0,4.6,1.4,1
+5.8,2.6,4.0,1.2,1
+5.0,2.3,3.3,1.0,1
+5.6,2.7,4.2,1.3,1
+5.7,3.0,4.2,1.2,1
+5.7,2.9,4.2,1.3,1
+6.2,2.9,4.3,1.3,1
+5.1,2.5,3.0,1.1,1
+5.7,2.8,4.1,1.3,1
+6.3,3.3,6.0,2.5,2
+5.8,2.7,5.1,1.9,2
+7.1,3.0,5.9,2.1,2
+6.3,2.9,5.6,1.8,2
+6.5,3.0,5.8,2.2,2
+7.6,3.0,6.6,2.1,2
+4.9,2.5,4.5,1.7,2
+7.3,2.9,6.3,1.8,2
+6.7,2.5,5.8,1.8,2
+7.2,3.6,6.1,2.5,2
+6.5,3.2,5.1,2.0,2
+6.4,2.7,5.3,1.9,2
+6.8,3.0,5.5,2.1,2
+5.7,2.5,5.0,2.0,2
+5.8,2.8,5.1,2.4,2
+6.4,3.2,5.3,2.3,2
+6.5,3.0,5.5,1.8,2
+7.7,3.8,6.7,2.2,2
+7.7,2.6,6.9,2.3,2
+6.0,2.2,5.0,1.5,2
+6.9,3.2,5.7,2.3,2
+5.6,2.8,4.9,2.0,2
+7.7,2.8,6.7,2.0,2
+6.3,2.7,4.9,1.8,2
+6.7,3.3,5.7,2.1,2
+7.2,3.2,6.0,1.8,2
+6.2,2.8,4.8,1.8,2
+6.1,3.0,4.9,1.8,2
+6.4,2.8,5.6,2.1,2
+7.2,3.0,5.8,1.6,2
+7.4,2.8,6.1,1.9,2
+7.9,3.8,6.4,2.0,2
+6.4,2.8,5.6,2.2,2
+6.3,2.8,5.1,1.5,2
+6.1,2.6,5.6,1.4,2
+7.7,3.0,6.1,2.3,2
+6.3,3.4,5.6,2.4,2
+6.4,3.1,5.5,1.8,2
+6.0,3.0,4.8,1.8,2
+6.9,3.1,5.4,2.1,2
+6.7,3.1,5.6,2.4,2
+6.9,3.1,5.1,2.3,2
+5.8,2.7,5.1,1.9,2
+6.8,3.2,5.9,2.3,2
+6.7,3.3,5.7,2.5,2
+6.7,3.0,5.2,2.3,2
+6.3,2.5,5.0,1.9,2
+6.5,3.0,5.2,2.0,2
+6.2,3.4,5.4,2.3,2
+5.9,3.0,5.1,1.8,2
diff --git a/extra/machine-learning/data-sets/iris.rst b/extra/machine-learning/data-sets/iris.rst
new file mode 100644 (file)
index 0000000..0e918f8
--- /dev/null
@@ -0,0 +1,59 @@
+Iris Plants Database
+
+Notes
+-----
+Data Set Characteristics:
+    :Number of Instances: 150 (50 in each of three classes)
+    :Number of Attributes: 4 numeric, predictive attributes and the class
+    :Attribute Information:
+        - sepal length in cm
+        - sepal width in cm
+        - petal length in cm
+        - petal width in cm
+        - class:
+                - Iris-Setosa
+                - Iris-Versicolor
+                - Iris-Virginica
+    :Summary Statistics:
+    ============== ==== ==== ======= ===== ====================
+                    Min  Max   Mean    SD   Class Correlation
+    ============== ==== ==== ======= ===== ====================
+    sepal length:   4.3  7.9   5.84   0.83    0.7826
+    sepal width:    2.0  4.4   3.05   0.43   -0.4194
+    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
+    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
+    ============== ==== ==== ======= ===== ====================
+    :Missing Attribute Values: None
+    :Class Distribution: 33.3% for each of 3 classes.
+    :Creator: R.A. Fisher
+    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
+    :Date: July, 1988
+
+This is a copy of the UCI ML iris data set.
+http://archive.ics.uci.edu/ml/datasets/Iris
+
+The famous Iris database, first used by Sir R.A. Fisher.
+
+This is perhaps the best known database to be found in the
+pattern recognition literature.  Fisher's paper is a classic in the field and
+is referenced frequently to this day.  (See Duda & Hart, for example.)  The
+data set contains 3 classes of 50 instances each, where each class refers to a
+type of iris plant.  One class is linearly separable from the other 2; the
+latter are NOT linearly separable from each other.
+
+References
+----------
+   - Fisher,R.A. "The use of multiple measurements in taxonomic problems"
+     Annals of Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
+     Mathematical Statistics" (John Wiley, NY, 1950).
+   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
+     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
+   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
+     Structure and Classification Rule for Recognition in Partially Exposed
+     Environments".  IEEE Transactions on Pattern Analysis and Machine
+     Intelligence, Vol. PAMI-2, No. 1, 67-71.
+   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
+     on Information Theory, May 1972, 431-433.
+   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al.'s AUTOCLASS II
+     conceptual clustering system finds 3 classes in the data.
+   - Many, many more ...