├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── examples ├── classification_task.rs └── regression_task.rs └── src ├── baseline ├── mod.rs ├── naive_bayes_classifier.rs └── naive_linear_regression.rs ├── dataset.rs ├── error.rs ├── lib.rs ├── measure_accumulator.rs ├── openml_api ├── api_types.rs ├── file_lock.rs ├── impls_from_json.rs ├── impls_from_openml.rs ├── mod.rs └── web_access.rs ├── prelude.rs ├── procedures ├── frozen_sets.rs └── mod.rs └── tasks ├── mod.rs ├── supervised_classification.rs └── supervised_regression.rs /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target 3 | **/*.rs.bk 4 | Cargo.lock 5 | 6 | .idea 7 | 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | - beta 5 | - nightly 6 | matrix: 7 | allow_failures: 8 | - rust: nightly 9 | fast_finish: true 10 | 11 | cache: cargo 12 | 13 | # Taken from Trust 14 | before_cache: 15 | # Travis can't cache files that are not readable by "others" 16 | - chmod -R a+r $HOME/.cargo 17 | 18 | # Only check that the project builds. Do not run tests to avoid 19 | # straining the openml server 20 | script: 21 | - cargo build --verbose --all 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "openml" 3 | version = "0.1.2" 4 | authors = ["Martin Billinger "] 5 | 6 | description = "A rust interface to [OpenML](http://openml.org/)." 
7 | keywords = ["machine-learning", "openml", "data", "dataset"] 8 | categories = ["science"] 9 | 10 | repository = "https://github.com/mbillingr/openml-rust" 11 | readme = "README.md" 12 | 13 | license = "MIT/Apache-2.0" 14 | 15 | [badges] 16 | travis-ci = { repository = "mbillingr/openml-rust" } 17 | 18 | [dev-dependencies] 19 | simple_logger = "0.5" 20 | time = "0.1" 21 | 22 | [dependencies] 23 | app_dirs = "1.2.1" 24 | arff = "0.3" 25 | fs2 = "0.4.3" 26 | futures = "0.1" 27 | hyper = "0.11" 28 | hyper-tls = "0.1" 29 | log = "0.4" 30 | num-traits = "0.2" 31 | serde = "1.0" 32 | serde_derive = "1.0" 33 | serde_json = "1.0" 34 | tokio-core = "0.1" 35 | 36 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Martin Billinger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # openml-rust 2 | A rust interface to [OpenML](http://openml.org/). 3 | 4 | The aim of this crate is to give rust code access to Machine Learning data hosted by OpenML. 5 | Thus, Machine Learning algorithms developed in Rust can be easily applied to state-of-the-art 6 | data sets and their performance compared to existing implementations in a reproducable way. 
7 | 8 | ## Example 9 | 10 | ```rust 11 | extern crate openml; 12 | 13 | use openml::prelude::*; 14 | use openml::{PredictiveAccuracy, SupervisedClassification}; 15 | use openml::baseline::NaiveBayesClassifier; 16 | 17 | fn main() { 18 | // Load "Supervised Classification on iris" task (https://www.openml.org/t/59) 19 | let task = SupervisedClassification::from_openml(59).unwrap(); 20 | 21 | println!("Task: {}", task.name()); 22 | 23 | // run the task 24 | let result: PredictiveAccuracy<_> = task.run(|train, test| { 25 | // train classifier 26 | let nbc: NaiveBayesClassifier = train 27 | .map(|(x, y)| (x, y)) 28 | .collect(); 29 | 30 | // test classifier 31 | let y_out: Vec<_> = test 32 | .map(|x| nbc.predict(x)) 33 | .collect(); 34 | 35 | Box::new(y_out.into_iter()) 36 | }); 37 | 38 | println!("Classification Accuracy: {}", result.result()); 39 | } 40 | ``` 41 | 42 | ## Goals 43 | - [x] get data sets 44 | - [x] get tasks 45 | - Runtime check panics if the wrong task type is loaded (`SupervisedRegression` attempts to load a Clustering Task) 46 | - [x] get split sets 47 | - [ ] task types 48 | - [x] Supervised Classification 49 | - [x] Supervised Regression 50 | - [ ] Learning Curve 51 | - [ ] Clustering 52 | - [x] run tasks 53 | - runner takes a closure where the user defines learning and prediction 54 | - [ ] make openml.org optional (manual construction of tasks) 55 | 56 | 57 | ## Future Maybe-Goals 58 | - flow support 59 | - run support 60 | - full OpenML API support 61 | - authentication 62 | - more tasks 63 | - Supervised Datastream Classification 64 | - Machine Learning Challenge 65 | - Survival Analysis 66 | - Subgroup Discovery 67 | 68 | ## Non-Goals 69 | - implementations of machine learning algorithms 70 | -------------------------------------------------------------------------------- /examples/classification_task.rs: -------------------------------------------------------------------------------- 1 | extern crate openml; 2 | 3 | use openml::prelude::*; 4 
| use openml::{PredictiveAccuracy, SupervisedClassification}; 5 | use openml::baseline::NaiveBayesClassifier; 6 | 7 | fn main() { 8 | // Load "Supervised Classification on iris" task (https://www.openml.org/t/59) 9 | let task = SupervisedClassification::from_openml(59).unwrap(); 10 | 11 | println!("Task: {}", task.name()); 12 | 13 | // run the task 14 | let result: PredictiveAccuracy<_> = task.run(|train, test| { 15 | // train classifier 16 | let nbc: NaiveBayesClassifier = train 17 | .map(|(x, y)| (x, y)) 18 | .collect(); 19 | 20 | // test classifier 21 | let y_out: Vec<_> = test 22 | .map(|x| nbc.predict(x)) 23 | .collect(); 24 | 25 | Box::new(y_out.into_iter()) 26 | }); 27 | 28 | println!("Classification Accuracy: {}", result.result()); 29 | } 30 | -------------------------------------------------------------------------------- /examples/regression_task.rs: -------------------------------------------------------------------------------- 1 | extern crate openml; 2 | 3 | use openml::prelude::*; 4 | use openml::{RootMeanSquaredError, SupervisedRegression}; 5 | use openml::baseline::NaiveLinearRegression; 6 | 7 | fn main() { 8 | // Load "Supervised Regression on liver-disorders" task (https://www.openml.org/t/52948) 9 | let task = SupervisedRegression::from_openml(52948).unwrap(); 10 | 11 | println!("Task: {}", task.name()); 12 | 13 | // run the task 14 | let result: RootMeanSquaredError<_> = task.run(|train, test| { 15 | // train model 16 | let model: NaiveLinearRegression = train 17 | .map(|(x, y)| (x, y)) 18 | .collect(); 19 | 20 | // test model 21 | let y_out: Vec<_> = test 22 | .map(|x| model.predict(x)) 23 | .collect(); 24 | 25 | Box::new(y_out.into_iter()) 26 | }); 27 | 28 | println!("Root Mean Squared Error: {}", result.result()); 29 | } 30 | -------------------------------------------------------------------------------- /src/baseline/mod.rs: -------------------------------------------------------------------------------- 1 | //! 
Implementation of simple baseline models, used for testing and demonstration. 2 | 3 | mod naive_bayes_classifier; 4 | mod naive_linear_regression; 5 | 6 | pub use self::naive_bayes_classifier::NaiveBayesClassifier; 7 | pub use self::naive_linear_regression::NaiveLinearRegression; -------------------------------------------------------------------------------- /src/baseline/naive_bayes_classifier.rs: -------------------------------------------------------------------------------- 1 | //! Implementation of a Gaussian Naive Bayes Classifier 2 | 3 | use std::cmp::Ordering; 4 | use std::collections::HashMap; 5 | use std::f64; 6 | use std::fmt; 7 | use std::hash::Hash; 8 | use std::iter::FromIterator; 9 | 10 | /// A Gaussian Naive Bayes Classifier 11 | /// 12 | /// The classifier is trained by consuming an iterator over the training data: 13 | /// ``` 14 | /// # use openml::baseline::NaiveBayesClassifier; 15 | /// # let data: Vec<(&[f64], &u8)> = vec![]; 16 | /// let nbc: NaiveBayesClassifier<_> = data 17 | /// .into_iter() 18 | /// .collect(); 19 | /// ``` 20 | #[derive(Debug)] 21 | pub struct NaiveBayesClassifier 22 | where C: Eq + Hash 23 | { 24 | class_distributions: HashMap, 25 | } 26 | 27 | /// Distribution of each feature column 28 | #[derive(Debug, Clone)] 29 | struct FeatureDistribution { 30 | distributions: Vec 31 | } 32 | 33 | /// Univariate Normal Distribution 34 | #[derive(Copy, Clone)] 35 | struct NormalDistribution { 36 | sum: f64, 37 | sqsum: f64, 38 | n: usize 39 | } 40 | 41 | impl<'a, C: 'a, J> FromIterator<(J, &'a C)> for NaiveBayesClassifier 42 | where 43 | J: IntoIterator, 44 | C: Eq + Hash + Copy, 45 | { 46 | fn from_iter>(iter: I) -> Self { 47 | let mut class_distributions = HashMap::new(); 48 | 49 | for (x, &y) in iter { 50 | let distributions = &mut class_distributions 51 | .entry(y) 52 | .or_insert(FeatureDistribution::new()) 53 | .distributions; 54 | 55 | for (i, &xi) in x.into_iter().enumerate() { 56 | if i >= distributions.len() { 57 | 
distributions.resize(1 + i, NormalDistribution::new()); 58 | } 59 | 60 | distributions[i].update(xi); 61 | } 62 | } 63 | 64 | NaiveBayesClassifier { 65 | class_distributions 66 | } 67 | } 68 | } 69 | 70 | impl NaiveBayesClassifier 71 | where C: Eq + Hash + Copy, 72 | { 73 | /// predict target class for a single feature vector 74 | pub fn predict(&self, x: &[f64]) -> C { 75 | self.class_distributions 76 | .iter() 77 | .map(|(c, dists)| { 78 | let mut lnprob = 0.0; 79 | for (&xi, dist) in x.iter().zip(dists.distributions.iter()) { 80 | lnprob += dist.lnprob(xi); 81 | } 82 | (c, lnprob) 83 | }) 84 | .max_by(|(_, lnp1), (_, lnp2)| { 85 | if lnp1 > lnp2 { 86 | Ordering::Greater 87 | } else if lnp1 == lnp2 { 88 | Ordering::Equal 89 | } else { 90 | Ordering::Less 91 | } 92 | }) 93 | .map(|(&c, _)| c) 94 | .unwrap() 95 | } 96 | } 97 | 98 | impl FeatureDistribution { 99 | fn new() -> Self { 100 | FeatureDistribution { 101 | distributions: Vec::new() 102 | } 103 | } 104 | } 105 | 106 | impl NormalDistribution { 107 | fn new() -> Self { 108 | NormalDistribution { 109 | sum: 0.0, 110 | sqsum: 0.0, 111 | n: 0 112 | } 113 | } 114 | 115 | fn update(&mut self, x: f64) { 116 | self.sum += x; 117 | self.sqsum += x * x; 118 | self.n += 1; 119 | } 120 | 121 | fn mean(&self) -> f64 { 122 | self.sum / self.n as f64 123 | } 124 | 125 | fn variance(&self) -> f64 { 126 | (self.sqsum - (self.sum * self.sum) / self.n as f64) / (self.n as f64 - 1.0) 127 | } 128 | 129 | fn lnprob(&self, x: f64) -> f64 { 130 | let v = self.variance(); 131 | let xm = x - self.mean(); 132 | 133 | 0.5 * ((1.0 / (2.0 * f64::consts::PI * v)).ln() - (xm * xm) / v) 134 | 135 | } 136 | } 137 | 138 | impl fmt::Debug for NormalDistribution { 139 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 140 | write!(f, "N{{{}; {}}}", self.mean(), self.variance()) 141 | } 142 | } 143 | 144 | #[test] 145 | fn nbc() { 146 | let data = vec![(vec![1.0, 2.0], 'A'), 147 | (vec![2.0, 1.0], 'A'), 148 | (vec![1.0, 5.0], 'B'), 149 | 
(vec![2.0, 6.0], 'B')]; 150 | 151 | let nbc: NaiveBayesClassifier<_> = data 152 | .iter() 153 | .map(|(x, y)| (x, y)) 154 | .collect(); 155 | 156 | assert_eq!(nbc.predict(&[1.5, 1.5]), 'A'); 157 | assert_eq!(nbc.predict(&[5.5, 1.5]), 'A'); 158 | assert_eq!(nbc.predict(&[1.5, 5.5]), 'B'); 159 | assert_eq!(nbc.predict(&[5.5, 5.5]), 'B'); 160 | } 161 | -------------------------------------------------------------------------------- /src/baseline/naive_linear_regression.rs: -------------------------------------------------------------------------------- 1 | //! Implementation of a Naive Linear Regression model 2 | 3 | use std::f64; 4 | use std::iter::FromIterator; 5 | 6 | /// A Naive Linear Regression model 7 | /// 8 | /// This is univariate regression on a single feature. During training the best feature is selected. 9 | /// The model is trained by consuming an iterator over the training data: 10 | /// ``` 11 | /// # use openml::baseline::NaiveLinearRegression; 12 | /// # let data: Vec<(&[f64], &f64)> = vec![]; 13 | /// let model: NaiveLinearRegression = data 14 | /// .into_iter() 15 | /// .collect(); 16 | /// ``` 17 | #[derive(Debug)] 18 | pub struct NaiveLinearRegression 19 | { 20 | slope: f64, 21 | intercept: f64, 22 | feature: usize, 23 | } 24 | 25 | impl<'a, J> FromIterator<(J, &'a f64)> for NaiveLinearRegression 26 | where 27 | J: IntoIterator, 28 | { 29 | fn from_iter>(iter: I) -> Self { 30 | let mut feature_columns = Vec::new(); 31 | let mut target_column = Vec::new(); 32 | 33 | for (x, &y) in iter { 34 | target_column.push(y); 35 | for (i, &xi) in x.into_iter().enumerate() { 36 | if i >= feature_columns.len() { 37 | feature_columns.push(Vec::new()); 38 | } 39 | 40 | feature_columns[i].push(xi); 41 | } 42 | } 43 | 44 | let mut y_mean = 0.0; 45 | for y in &target_column { 46 | y_mean += *y; 47 | } 48 | y_mean /= target_column.len() as f64; 49 | 50 | let mut best_err = f64::INFINITY; 51 | let mut best_slope = f64::NAN; 52 | let mut best_intercept = f64::NAN; 53 
| let mut best_feature = 0; 54 | 55 | for (i, feature) in feature_columns.iter().enumerate() { 56 | let mut x_mean = 0.0; 57 | for x in feature { 58 | x_mean += *x; 59 | } 60 | x_mean /= feature.len() as f64; 61 | 62 | let mut x_var = 0.0; 63 | let mut covar = 0.0; 64 | for (x, y) in feature.iter().zip(target_column.iter()) { 65 | let x = *x - x_mean; 66 | let y = *y - y_mean; 67 | 68 | x_var += x * x; 69 | covar += x * y; 70 | } 71 | 72 | let slope = covar / x_var; 73 | let intercept = y_mean - slope * x_mean; 74 | 75 | let err: f64 = feature.iter() 76 | .zip(target_column.iter()) 77 | .map(|(&x, &y)| intercept + slope * x - y) 78 | .map(|r| r * r) 79 | .sum(); 80 | 81 | if err < best_err { 82 | best_err = err; 83 | best_slope = slope; 84 | best_intercept = intercept; 85 | best_feature = i; 86 | } 87 | } 88 | 89 | NaiveLinearRegression { 90 | slope: best_slope, 91 | intercept: best_intercept, 92 | feature: best_feature, 93 | } 94 | } 95 | } 96 | 97 | impl NaiveLinearRegression 98 | { 99 | /// predict target value for a single feature vector 100 | pub fn predict(&self, x: &[f64]) -> f64 { 101 | self.intercept + x[self.feature] * self.slope 102 | } 103 | } 104 | 105 | #[test] 106 | fn nbc_flat() { 107 | let data = vec![(vec![1.0, 2.0], 3.0), 108 | (vec![2.0, 1.0], 3.0), 109 | (vec![1.0, 5.0], 3.0), 110 | (vec![2.0, 6.0], 3.0)]; 111 | 112 | let nlr: NaiveLinearRegression = data 113 | .iter() 114 | .map(|(x, y)| (x, y)) 115 | .collect(); 116 | 117 | assert_eq!(nlr.predict(&[1.5, 1.5]), 3.0); 118 | assert_eq!(nlr.predict(&[5.5, 1.5]), 3.0); 119 | assert_eq!(nlr.predict(&[1.5, 5.5]), 3.0); 120 | assert_eq!(nlr.predict(&[5.5, 5.5]), 3.0); 121 | } 122 | 123 | #[test] 124 | fn nbc_slope() { 125 | let data = vec![(vec![1.0, 2.0], 8.0), 126 | (vec![2.0, 1.0], 9.0), 127 | (vec![1.0, 5.0], 5.0), 128 | (vec![2.0, 6.0], 4.0)]; 129 | 130 | let nlr: NaiveLinearRegression = data 131 | .iter() 132 | .map(|(x, y)| (x, y)) 133 | .collect(); 134 | 135 | assert_eq!(nlr.predict(&[1.5, 
1.5]), 8.5); 136 | assert_eq!(nlr.predict(&[5.5, 1.5]), 8.5); 137 | assert_eq!(nlr.predict(&[1.5, 5.5]), 4.5); 138 | assert_eq!(nlr.predict(&[5.5, 5.5]), 4.5); 139 | } 140 | -------------------------------------------------------------------------------- /src/dataset.rs: -------------------------------------------------------------------------------- 1 | use arff::dynamic::DataSet as ArffDataSet; 2 | 3 | /// An arbitrary data set 4 | #[derive(Debug)] 5 | pub(crate) struct DataSet { 6 | pub(crate) arff: ArffDataSet, 7 | pub(crate) target: Option, 8 | } 9 | 10 | impl DataSet { 11 | /// return two `ArffDataSet`s; one containing the features and the other containing the target 12 | /// variable. 13 | pub(crate) fn clone_split(&self) -> Option<(ArffDataSet, ArffDataSet)> { 14 | match self.target { 15 | None => None, 16 | Some(ref col) => { 17 | let data = self.arff.clone(); 18 | Some(data.split_one(col)) 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::io::Error as IoError; 2 | use std::result::Result as StdResult; 3 | use std::string::FromUtf8Error; 4 | 5 | use app_dirs::AppDirsError; 6 | use arff::Error as ArffError; 7 | use hyper::Error as HyperError; 8 | use hyper::error::UriError; 9 | use hyper_tls::Error as TlsError; 10 | use serde_json::Error as JsonError; 11 | 12 | pub type Result = StdResult; 13 | 14 | #[derive(Debug)] 15 | pub enum Error { 16 | IoError(IoError), 17 | Utf8Error(FromUtf8Error), 18 | HyperError(HyperError), 19 | HyperUriError(UriError), 20 | HyperTlsError(TlsError), 21 | JsonError(JsonError), 22 | ArffError(ArffError), 23 | AppDirsError(AppDirsError), 24 | } 25 | 26 | impl From for Error { 27 | fn from(e: IoError) -> Self { 28 | Error::IoError(e) 29 | } 30 | } 31 | 32 | impl From for Error { 33 | fn from(e: FromUtf8Error) -> Self { 34 | Error::Utf8Error(e) 35 | } 36 | } 37 | 
38 | impl From for Error { 39 | fn from(e: HyperError) -> Self { 40 | Error::HyperError(e) 41 | } 42 | } 43 | 44 | impl From for Error { 45 | fn from(e: UriError) -> Self { 46 | Error::HyperUriError(e) 47 | } 48 | } 49 | 50 | impl From for Error { 51 | fn from(e: TlsError) -> Self { 52 | Error::HyperTlsError(e) 53 | } 54 | } 55 | 56 | impl From for Error { 57 | fn from(e: JsonError) -> Self { 58 | Error::JsonError(e) 59 | } 60 | } 61 | 62 | impl From for Error { 63 | fn from(e: ArffError) -> Self { 64 | Error::ArffError(e) 65 | } 66 | } 67 | 68 | impl From for Error { 69 | fn from(e: AppDirsError) -> Self { 70 | match e { 71 | AppDirsError::Io(e) => Error::IoError(e), 72 | _ => Error::AppDirsError(e) 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # openml-rust 2 | //! 3 | //! The openml crate provides functions to fetch tasks and data sets from https://openml.org, and 4 | //! run them with machine learning models. 5 | //! 6 | //! ## Example 7 | //! 8 | //! ```rust 9 | //!extern crate openml; 10 | //! 11 | //!use openml::prelude::*; 12 | //!use openml::{PredictiveAccuracy, SupervisedClassification}; 13 | //!use openml::baseline::NaiveBayesClassifier; 14 | //! 15 | //!fn main() { 16 | //! // Load "Supervised Classification on iris" task (https://www.openml.org/t/59) 17 | //! let task = SupervisedClassification::from_openml(59).unwrap(); 18 | //! 19 | //! println!("Task: {}", task.name()); 20 | //! 21 | //! // run the task 22 | //! let result: PredictiveAccuracy<_> = task.run(|train, test| { 23 | //! // train classifier 24 | //! let nbc: NaiveBayesClassifier = train 25 | //! .map(|(x, y)| (x, y)) 26 | //! .collect(); 27 | //! 28 | //! // test classifier 29 | //! let y_out: Vec<_> = test 30 | //! .map(|x| nbc.predict(x)) 31 | //! .collect(); 32 | //! 33 | //! Box::new(y_out.into_iter()) 34 | //! }); 35 | //! 
36 | //! println!("Classification Accuracy: {}", result.result()); 37 | //!} 38 | //! ``` 39 | 40 | extern crate app_dirs; 41 | extern crate arff; 42 | extern crate fs2; 43 | extern crate futures; 44 | extern crate hyper; 45 | extern crate hyper_tls; 46 | #[macro_use] 47 | extern crate log; 48 | extern crate num_traits; 49 | extern crate serde; 50 | #[macro_use] 51 | extern crate serde_derive; 52 | extern crate serde_json; 53 | #[cfg(test)] 54 | extern crate simple_logger; 55 | #[cfg(test)] 56 | extern crate time; 57 | extern crate tokio_core; 58 | 59 | pub mod baseline; 60 | mod dataset; 61 | mod error; 62 | mod measure_accumulator; 63 | mod openml_api; 64 | pub mod prelude; 65 | mod procedures; 66 | mod tasks; 67 | 68 | pub use measure_accumulator::{ 69 | MeasureAccumulator, 70 | PredictiveAccuracy, 71 | RootMeanSquaredError 72 | }; 73 | 74 | pub use tasks::{ 75 | SupervisedClassification, 76 | SupervisedRegression, 77 | Task 78 | }; 79 | 80 | #[cfg(test)] 81 | mod tests { 82 | use log::Level; 83 | use time::PreciseTime; 84 | 85 | use baseline::NaiveBayesClassifier; 86 | use measure_accumulator::PredictiveAccuracy; 87 | 88 | use super::*; 89 | 90 | #[test] 91 | fn apidev() { 92 | let task = SupervisedClassification::from_openml(59).unwrap(); 93 | 94 | println!("{}", task.name()); 95 | 96 | let result: PredictiveAccuracy<_> = task.run_static(|_train, test| { 97 | let y_out: Vec<_> = test.map(|_row: &[f64; 4]| 0).collect(); 98 | Box::new(y_out.into_iter()) 99 | }); 100 | 101 | println!("{:#?}", result); 102 | 103 | #[allow(dead_code)] 104 | #[derive(Deserialize)] 105 | struct Row { 106 | sepallength: f32, 107 | sepalwidth: f32, 108 | petallength: f32, 109 | petalwidth: f32, 110 | } 111 | 112 | let result: PredictiveAccuracy<_> = task.run_static(|train, test| { 113 | let (_x_train, _y_train): (Vec<&Row>, Vec) = train.unzip(); 114 | let y_out: Vec<_> = test.map(|_row: &Row| 0).collect(); 115 | Box::new(y_out.into_iter()) 116 | }); 117 | 118 | println!("{:#?}", 
result); 119 | 120 | let result: PredictiveAccuracy<_> = task.run(|train, test| { 121 | // train classifier 122 | let nbc: NaiveBayesClassifier = train 123 | .map(|(x, y)| (x, y)) 124 | .collect(); 125 | 126 | // test classifier 127 | let y_out: Vec<_> = test 128 | .map(|x| nbc.predict(x)) 129 | .collect(); 130 | 131 | Box::new(y_out.into_iter()) 132 | }); 133 | 134 | println!("{:#?}", result); 135 | } 136 | 137 | #[test] 138 | fn apidev2() { 139 | use simple_logger; 140 | simple_logger::init_with_level(Level::Info).unwrap(); 141 | 142 | let start = PreciseTime::now(); 143 | 144 | let task = SupervisedClassification::from_openml(146825).unwrap(); 145 | //let task = SupervisedClassification::from_openml(167147).unwrap(); 146 | 147 | let end = PreciseTime::now(); 148 | 149 | let result: PredictiveAccuracy<_> = task.run(|_train, test| { 150 | let y_out: Vec<_> = test.map(|_row: &[u8]| 0).collect(); 151 | Box::new(y_out.into_iter()) 152 | }); 153 | 154 | println!("{:#?}", result); 155 | 156 | println!("loading took {} seconds.", start.to(end)); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/measure_accumulator.rs: -------------------------------------------------------------------------------- 1 | //! Measure accumulators are summaries of model performance, such as classification accuracy or 2 | //! regression error. 
3 | 4 | use std::cmp::Eq; 5 | use std::collections::HashMap; 6 | use std::hash::Hash; 7 | use std::marker::PhantomData; 8 | 9 | use num_traits::AsPrimitive; 10 | 11 | /// Trait implemented by performance measures 12 | pub trait MeasureAccumulator { 13 | /// initialize new measure 14 | fn new() -> Self; 15 | 16 | /// update with one prediction 17 | fn update_one(&mut self, known: &T, pred: &T); 18 | 19 | /// get resulting performance 20 | fn result(&self) -> f64; 21 | 22 | /// update with multiple predictions 23 | fn update>(&mut self, known: I, predicted: I) { 24 | for (k, p) in known.zip(predicted) { 25 | self.update_one(&k, &p) 26 | } 27 | } 28 | } 29 | 30 | /// Classification Accuracy: relative amount of correctly classified labels 31 | #[derive(Debug)] 32 | pub struct PredictiveAccuracy { 33 | n_correct: usize, 34 | n_wrong: usize, 35 | _t: PhantomData, 36 | } 37 | 38 | impl MeasureAccumulator for PredictiveAccuracy 39 | where 40 | T: PartialEq, 41 | { 42 | fn new() -> Self { 43 | PredictiveAccuracy { 44 | n_correct: 0, 45 | n_wrong: 0, 46 | _t: PhantomData, 47 | } 48 | } 49 | 50 | fn update_one(&mut self, known: &T, pred: &T) { 51 | if known == pred { 52 | self.n_correct += 1; 53 | } else { 54 | self.n_wrong += 1; 55 | } 56 | } 57 | 58 | fn result(&self) -> f64 { 59 | self.n_correct as f64 / (self.n_correct + self.n_wrong) as f64 60 | } 61 | } 62 | 63 | /// Root Mean Squared Error 64 | #[derive(Debug)] 65 | pub struct RootMeanSquaredError { 66 | sum_of_squares: f64, 67 | n: usize, 68 | _t: PhantomData, 69 | } 70 | 71 | impl MeasureAccumulator for RootMeanSquaredError 72 | where 73 | T: AsPrimitive, 74 | { 75 | fn new() -> Self { 76 | RootMeanSquaredError { 77 | sum_of_squares: 0.0, 78 | n: 0, 79 | _t: PhantomData, 80 | } 81 | } 82 | 83 | fn update_one(&mut self, known: &T, pred: &T) { 84 | let diff = known.as_() - pred.as_(); 85 | self.sum_of_squares += diff * diff; 86 | self.n += 1; 87 | } 88 | 89 | fn result(&self) -> f64 { 90 | (self.sum_of_squares / self.n 
as f64).sqrt() 91 | } 92 | } 93 | 94 | 95 | /// Adjusted Rand Index 96 | #[derive(Debug)] 97 | pub struct AdjustedRandIndex 98 | where T: Eq + Hash, 99 | { 100 | contingency_table: HashMap<(T, T), usize> 101 | } 102 | 103 | 104 | 105 | impl MeasureAccumulator for AdjustedRandIndex 106 | where T: Eq + Hash + Clone, 107 | { 108 | fn new() -> Self { 109 | AdjustedRandIndex { 110 | contingency_table: HashMap::new() 111 | } 112 | } 113 | 114 | fn update_one(&mut self, known: &T, pred: &T) { 115 | let n = self.contingency_table 116 | .entry((known.clone(), pred.clone())) 117 | .or_insert(0); 118 | *n += 1; 119 | } 120 | 121 | fn result(&self) -> f64 { 122 | let mut a = HashMap::new(); 123 | let mut b = HashMap::new(); 124 | 125 | let mut ri = 0usize; 126 | let mut n_tot = 0usize; 127 | 128 | for ((ak, bk), &n) in self.contingency_table.iter() { 129 | n_tot += n; 130 | ri += combinations(n); 131 | 132 | *a.entry(ak).or_insert(0usize) += n; 133 | *b.entry(bk).or_insert(0usize) += n; 134 | } 135 | 136 | let a_sum: usize = a.iter().map(|(_, &n)| combinations(n)).sum(); 137 | let b_sum: usize = b.iter().map(|(_, &n)| combinations(n)).sum(); 138 | 139 | let expected_ri = (a_sum as f64) * (b_sum as f64) / combinations(n_tot) as f64; 140 | let max_ri = (a_sum + b_sum) as f64 / 2.0; 141 | 142 | (ri as f64 - expected_ri) / (max_ri - expected_ri) 143 | } 144 | } 145 | 146 | fn combinations(n: usize) -> usize { 147 | if n % 2 == 0 { 148 | (n - 1) * (n / 2) 149 | } else { 150 | n * ((n - 1) / 2) 151 | } 152 | } 153 | 154 | #[test] 155 | fn ari() { 156 | let labels_true = [0, 0, 0, 1, 1, 1]; 157 | let labels_pred = [0, 0, 1, 1, 2, 2]; 158 | 159 | let mut ari = AdjustedRandIndex::new(); 160 | ari.update(labels_true.iter(), labels_pred.iter()); 161 | 162 | assert_eq!(ari.result(), 0.24242424242424246); 163 | } 164 | -------------------------------------------------------------------------------- /src/openml_api/api_types.rs: 
--------------------------------------------------------------------------------
use serde_json;

/// Generic JSON response as returned by the OpenML API
#[derive(Debug, Serialize, Deserialize)]
pub struct GenericResponse(serde_json::Value);

impl GenericResponse {
    /// look up a value by JSON pointer (e.g. "/task/task_id")
    #[inline(always)]
    pub fn look_up<'a>(&'a self, p: &str) -> Option<&'a serde_json::Value> {
        self.0.pointer(p)
    }
}

/// A row in a split file
#[derive(Debug, Deserialize)]
pub(crate) struct CrossValItem {
    // whether the row belongs to the train or the test set
    #[serde(rename = "type")]
    pub purpose: TrainTest,

    // row index into the data set
    pub rowid: usize,

    // cross-validation repetition this row belongs to
    pub repeat: usize,

    // cross-validation fold this row belongs to
    pub fold: usize,
}

#[derive(Debug, Deserialize)]
pub(crate) enum TrainTest {
    #[serde(rename = "TRAIN")]
    Train,

    #[serde(rename = "TEST")]
    Test,
}

/// Cost matrix, used by some classification tasks - currently UNIMPLEMENTED
#[derive(Debug)]
pub(crate) enum CostMatrix {
    None,
}

impl<'a> From<&'a serde_json::Value> for CostMatrix {
    fn from(item: &serde_json::Value) -> Self {
        let v = &item["cost_matrix"];
        match v.as_array() {
            // fixed typo in panic message ("cots" -> "cost")
            None => panic!("invalid cost matrix"),
            Some(c) if c.is_empty() => CostMatrix::None,
            Some(_) => unimplemented!("cost matrix"),
        }
    }
}
--------------------------------------------------------------------------------
/src/openml_api/file_lock.rs:
--------------------------------------------------------------------------------
//!
file locking mechanisms 2 | 3 | use std::fs::File; 4 | use std::io::{self, Read, Write}; 5 | 6 | use fs2::FileExt; 7 | 8 | /// A scoped exclusive lock for use by file writers 9 | pub struct ExclusiveLock { 10 | file: File, 11 | } 12 | 13 | impl ExclusiveLock { 14 | /// acquire locked file 15 | pub fn new(file: File) -> io::Result { 16 | file.lock_exclusive()?; 17 | Ok(ExclusiveLock { file }) 18 | } 19 | } 20 | 21 | impl Drop for ExclusiveLock { 22 | /// release locked file 23 | fn drop(&mut self) { 24 | self.file.unlock().unwrap(); 25 | } 26 | } 27 | 28 | impl Read for ExclusiveLock { 29 | /// read from locked file 30 | #[inline(always)] 31 | fn read(&mut self, data: &mut [u8]) -> io::Result { 32 | self.file.read(data) 33 | } 34 | } 35 | 36 | impl Write for ExclusiveLock { 37 | /// write to locked file 38 | #[inline(always)] 39 | fn write(&mut self, data: &[u8]) -> io::Result { 40 | self.file.write(data) 41 | } 42 | 43 | /// flush buffer of locked file 44 | #[inline(always)] 45 | fn flush(&mut self) -> io::Result<()> { 46 | self.file.flush() 47 | } 48 | } 49 | 50 | pub struct SharedLock { 51 | file: File, 52 | } 53 | 54 | /// A scoped shared lock for use by file readers 55 | impl SharedLock { 56 | /// acquire locked file 57 | pub fn new(file: File) -> io::Result { 58 | file.lock_shared()?; 59 | Ok(SharedLock { file }) 60 | } 61 | } 62 | 63 | impl Drop for SharedLock { 64 | /// release locked file 65 | fn drop(&mut self) { 66 | self.file.unlock().unwrap(); 67 | } 68 | } 69 | 70 | impl Read for SharedLock { 71 | /// read from locked file 72 | #[inline(always)] 73 | fn read(&mut self, data: &mut [u8]) -> io::Result { 74 | self.file.read(data) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/openml_api/impls_from_json.rs: -------------------------------------------------------------------------------- 1 | //! 
implementations to convert the API's JSON responses into corresponding Rust structures 2 | use arff; 3 | use arff::dynamic::DataSet as ArffDataSet; 4 | use serde_json; 5 | 6 | use dataset::DataSet; 7 | use error::Result; 8 | use procedures::{Fold, FrozenSets}; 9 | use tasks::{SupervisedClassification, SupervisedRegression}; 10 | 11 | use super::api_types::{CrossValItem, GenericResponse, TrainTest}; 12 | use super::web_access::get_cached; 13 | 14 | impl DataSet { 15 | fn from_json(item: &serde_json::Value) -> Self { 16 | let v = &item["data_set"]; 17 | let id = v["data_set_id"].as_str().unwrap(); 18 | let target = v["target_feature"].as_str(); 19 | 20 | let info_url = format!("https://www.openml.org/api/v1/json/data/{}", id); 21 | let info: GenericResponse = serde_json::from_str(&get_cached(&info_url).unwrap()).unwrap(); 22 | 23 | let default_target = info.look_up("/data_set_description/default_target_attribute") 24 | .and_then(|v| v.as_str()); 25 | 26 | let target = match (default_target, target) { 27 | (Some(s), None) | (_, Some(s)) => Some(s.to_owned()), 28 | (None, None) => None, 29 | }; 30 | 31 | let dset_url = info.look_up("/data_set_description/url") 32 | .unwrap() 33 | .as_str() 34 | .unwrap(); 35 | let dset_str = get_cached(&dset_url).unwrap(); 36 | let dset = ArffDataSet::from_str(&dset_str).unwrap(); 37 | 38 | DataSet { arff: dset, target } 39 | } 40 | } 41 | 42 | impl SupervisedClassification { 43 | pub fn from_json(task_json: &serde_json::Value) -> Self { 44 | let mut source_data = None; 45 | let mut estimation_procedure = None; 46 | //let mut cost_matrix = None; 47 | 48 | for input_item in task_json["input"].as_array().unwrap() { 49 | match input_item["name"].as_str() { 50 | Some("source_data") => source_data = Some(DataSet::from_json(input_item)), 51 | Some("estimation_procedure") => { 52 | estimation_procedure = Some(Box::new(FrozenSets::from_json(input_item))) 53 | } 54 | //Some("cost_matrix") => cost_matrix = Some(input_item.into()), 55 | Some(_) 
=> {} 56 | None => panic!("/task/input/name is not a string"), 57 | } 58 | } 59 | 60 | SupervisedClassification { 61 | id: task_json["task_id"].as_str().unwrap().to_owned(), 62 | name: task_json["task_name"].as_str().unwrap().to_owned(), 63 | source_data: source_data.unwrap(), 64 | estimation_procedure: estimation_procedure.unwrap(), 65 | //cost_matrix: cost_matrix.unwrap(), 66 | } 67 | } 68 | } 69 | 70 | impl SupervisedRegression { 71 | pub fn from_json(task_json: &serde_json::Value) -> Self { 72 | let mut source_data = None; 73 | let mut estimation_procedure = None; 74 | 75 | for input_item in task_json["input"].as_array().unwrap() { 76 | match input_item["name"].as_str() { 77 | Some("source_data") => source_data = Some(DataSet::from_json(input_item)), 78 | Some("estimation_procedure") => { 79 | estimation_procedure = Some(Box::new(FrozenSets::from_json(input_item))) 80 | } 81 | Some(_) => {} 82 | None => panic!("/task/input/name is not a string"), 83 | } 84 | } 85 | 86 | SupervisedRegression { 87 | id: task_json["task_id"].as_str().unwrap().to_owned(), 88 | name: task_json["task_name"].as_str().unwrap().to_owned(), 89 | source_data: source_data.unwrap(), 90 | estimation_procedure: estimation_procedure.unwrap(), 91 | } 92 | } 93 | } 94 | 95 | impl FrozenSets { 96 | fn from_json(item: &serde_json::Value) -> Self { 97 | let v = &item["estimation_procedure"]; 98 | let typ = v["type"].as_str(); 99 | let splits = v["data_splits_url"].as_str(); 100 | 101 | match (typ, splits) { 102 | (_, Some(url)) => FrozenSets::from_url(url).unwrap(), 103 | _ => unimplemented!(), 104 | } 105 | } 106 | 107 | fn from_url(url: &str) -> Result { 108 | let raw = get_cached(url)?; 109 | let data: Vec = arff::from_str(&raw)?; 110 | 111 | let mut folds = vec![]; 112 | for item in data { 113 | if item.repeat >= folds.len() { 114 | folds.resize(item.repeat + 1, vec![]); 115 | } 116 | let mut rep = &mut folds[item.repeat]; 117 | 118 | if item.fold >= rep.len() { 119 | rep.resize(item.fold + 1, 
Fold::new()); 120 | } 121 | let mut fold = &mut rep[item.fold]; 122 | 123 | match item.purpose { 124 | TrainTest::Train => fold.trainset.push(item.rowid), 125 | TrainTest::Test => fold.testset.push(item.rowid), 126 | } 127 | } 128 | 129 | Ok(FrozenSets { folds }) 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/openml_api/impls_from_openml.rs: -------------------------------------------------------------------------------- 1 | //! implementations to load tasks from the OpenML API. 2 | use serde_json; 3 | 4 | use error::Result; 5 | use tasks::{SupervisedClassification, SupervisedRegression}; 6 | 7 | use super::Id; 8 | use super::api_types::GenericResponse; 9 | use super::web_access::get_cached; 10 | 11 | impl SupervisedClassification { 12 | pub fn from_openml<'a, T: Id>(id: T) -> Result { 13 | let url = format!("https://www.openml.org/api/v1/json/task/{}", id.as_string()); 14 | let raw_task = get_cached(&url)?; 15 | let response: GenericResponse = serde_json::from_str(&raw_task)?; 16 | 17 | let task = response.look_up("/task").unwrap(); 18 | 19 | match response.look_up("/task/task_type_id").unwrap().as_str() { 20 | Some("1") => Ok(SupervisedClassification::from_json(task)), 21 | Some(id) => panic!("Wrong task type ID. Expected \"1\" but got \"{}\"", id), 22 | None => panic!("Invalid task type ID") 23 | } 24 | } 25 | } 26 | 27 | impl SupervisedRegression { 28 | pub fn from_openml<'a, T: Id>(id: T) -> Result { 29 | let url = format!("https://www.openml.org/api/v1/json/task/{}", id.as_string()); 30 | let raw_task = get_cached(&url)?; 31 | let response: GenericResponse = serde_json::from_str(&raw_task)?; 32 | 33 | let task = response.look_up("/task").unwrap(); 34 | 35 | match response.look_up("/task/task_type_id").unwrap().as_str() { 36 | Some("2") => Ok(SupervisedRegression::from_json(task)), 37 | Some(id) => panic!("Wrong task type ID. 
Expected \"2\" but got \"{}\"", id), 38 | None => panic!("Invalid task type ID") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/openml_api/mod.rs: -------------------------------------------------------------------------------- 1 | //! Cached access to the OpenML REST API 2 | 3 | mod api_types; 4 | mod file_lock; 5 | mod impls_from_json; 6 | mod impls_from_openml; 7 | mod web_access; 8 | 9 | use std::borrow::Cow; 10 | 11 | pub trait Id { 12 | fn as_string(&self) -> Cow; 13 | fn as_u32(&self) -> u32; 14 | } 15 | 16 | impl Id for String { 17 | #[inline(always)] 18 | fn as_string(&self) -> Cow { 19 | Cow::from(self.as_str()) 20 | } 21 | 22 | #[inline(always)] 23 | fn as_u32(&self) -> u32 { 24 | self.parse().unwrap() 25 | } 26 | } 27 | 28 | impl<'a> Id for &'a str { 29 | #[inline(always)] 30 | fn as_string(&self) -> Cow { 31 | Cow::from(*self) 32 | } 33 | 34 | #[inline(always)] 35 | fn as_u32(&self) -> u32 { 36 | self.parse().unwrap() 37 | } 38 | } 39 | 40 | impl Id for u32 { 41 | #[inline(always)] 42 | fn as_string(&self) -> Cow { 43 | Cow::from(format!("{}", self)) 44 | } 45 | 46 | #[inline(always)] 47 | fn as_u32(&self) -> u32 { 48 | *self 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/openml_api/web_access.rs: -------------------------------------------------------------------------------- 1 | //! Access the OpenML REST API 2 | 3 | use std::fs::{File, OpenOptions}; 4 | use std::io::{self, Read, Write}; 5 | 6 | use app_dirs::{app_root, AppDataType, AppInfo}; 7 | use futures::{Future, Stream}; 8 | use hyper::Client; 9 | use hyper_tls::HttpsConnector; 10 | use tokio_core::reactor::Core; 11 | 12 | use error::Result; 13 | 14 | use super::file_lock::{ExclusiveLock, SharedLock}; 15 | 16 | const APP_INFO: AppInfo = AppInfo{name: "openml-rust", author: "openml-rust"}; 17 | 18 | /// Query a URL. 
If possible read the response from local cache 19 | pub fn get_cached(url: &str) -> Result { 20 | // todo: is there a potential race condition with a process locking the file for reading while 21 | // the writer has created but not yet locked the file? 22 | 23 | let mut path = app_root(AppDataType::UserCache, &APP_INFO)?; 24 | path.push(url_to_file(url)); 25 | 26 | loop { 27 | match File::open(&path) { 28 | Ok(f) => { 29 | info!("Loading cached {}", url); 30 | let mut file = SharedLock::new(f)?; 31 | let mut data = String::new(); 32 | file.read_to_string(&mut data)?; 33 | return Ok(data); 34 | } 35 | Err(_) => {} 36 | } 37 | 38 | match OpenOptions::new().create_new(true).write(true).open(&path) { 39 | Err(e) => { 40 | // todo: is this the correct io error raised if another thread has locked the file currently? 41 | if let io::ErrorKind::PermissionDenied = e.kind() { 42 | continue; 43 | } 44 | error!("Error while opening cache for writing: {:?}", e); 45 | return Err(e.into()); 46 | } 47 | Ok(f) => { 48 | info!("Downloading {}", url); 49 | let mut file = ExclusiveLock::new(f)?; 50 | let data = download(url)?; 51 | file.write_all(data.as_bytes())?; 52 | return Ok(data); 53 | } 54 | } 55 | } 56 | } 57 | 58 | /// Query a URL. 59 | fn download(url: &str) -> Result { 60 | let mut core = Core::new()?; 61 | let handle = core.handle(); 62 | let client = Client::configure() 63 | .connector(HttpsConnector::new(4, &handle)?) 64 | .build(&handle); 65 | 66 | let req = client.get(url.parse()?); 67 | 68 | let mut bytes = Vec::new(); 69 | { 70 | let work = req.and_then(|res| { 71 | res.body().for_each(|chunk| { 72 | bytes.extend_from_slice(&chunk); 73 | Ok(()) 74 | }) 75 | }); 76 | core.run(work)? 
77 | } 78 | let result = String::from_utf8(bytes)?; 79 | Ok(result) 80 | } 81 | 82 | /// Convert URL to file name for chching 83 | fn url_to_file(s: &str) -> String { 84 | s.replace('/', "_").replace(':', "") 85 | } 86 | -------------------------------------------------------------------------------- /src/prelude.rs: -------------------------------------------------------------------------------- 1 | pub use measure_accumulator::MeasureAccumulator; -------------------------------------------------------------------------------- /src/procedures/frozen_sets.rs: -------------------------------------------------------------------------------- 1 | use super::{Fold, Procedure}; 2 | 3 | /// pre-defined cross-validation 4 | #[derive(Debug)] 5 | pub(crate) struct FrozenSets { 6 | pub(crate) folds: Vec>, 7 | } 8 | 9 | impl Procedure for FrozenSets { 10 | fn iter<'a>(&'a self) -> Box<'a + Iterator> { 11 | let iter = self.folds.iter().flat_map(|inner| inner.iter()); 12 | Box::new(iter) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/procedures/mod.rs: -------------------------------------------------------------------------------- 1 | //! 
Validation procedures 2 | 3 | mod frozen_sets; 4 | 5 | pub(crate) use self::frozen_sets::FrozenSets; 6 | 7 | /// Validation procedures support iteration over cross-validation folds 8 | pub(crate) trait Procedure { 9 | fn iter<'a>(&'a self) -> Box<'a + Iterator>; 10 | } 11 | 12 | /// A single cross-validation fold, consisting of a training set and a testing set 13 | #[derive(Debug, Clone)] 14 | pub(crate) struct Fold { 15 | pub(crate) trainset: Vec, 16 | pub(crate) testset: Vec, 17 | } 18 | 19 | impl Fold { 20 | pub fn new() -> Self { 21 | Fold { 22 | trainset: Vec::new(), 23 | testset: Vec::new(), 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/tasks/mod.rs: -------------------------------------------------------------------------------- 1 | //! Implementations of specific OpenML task types 2 | 3 | mod supervised_classification; 4 | mod supervised_regression; 5 | 6 | use serde::de::DeserializeOwned; 7 | 8 | pub use self::supervised_classification::SupervisedClassification; 9 | pub use self::supervised_regression::SupervisedRegression; 10 | 11 | use measure_accumulator::MeasureAccumulator; 12 | 13 | pub trait Task { 14 | /// get task ID 15 | fn id(&self) -> &str; 16 | 17 | /// get task name 18 | fn name(&self) -> &str; 19 | 20 | /// run task, specifying the type of an entire feature column in `X`. This allows to run 21 | /// machine learning models that take features of different types, or named features in form 22 | /// of structs. 23 | fn run_static(&self, flow: F) -> M 24 | where 25 | F: Fn(&mut Iterator, &mut Iterator) -> Box>, 26 | X: DeserializeOwned, 27 | Y: DeserializeOwned, 28 | M: MeasureAccumulator; 29 | 30 | /// run task, specifying the feature type in `X`. This allows to run machine learning models 31 | /// that expect every feature to have the same type. 
32 | fn run(&self, flow: F) -> M 33 | where 34 | F: Fn(&mut Iterator, &mut Iterator) 35 | -> Box>, 36 | X: DeserializeOwned, 37 | Y: DeserializeOwned, 38 | M: MeasureAccumulator; 39 | } 40 | -------------------------------------------------------------------------------- /src/tasks/supervised_classification.rs: -------------------------------------------------------------------------------- 1 | use arff::dynamic::de::from_dataset; 2 | use serde::de::DeserializeOwned; 3 | 4 | use dataset::DataSet; 5 | use measure_accumulator::MeasureAccumulator; 6 | use procedures::Procedure; 7 | 8 | /// Classification task 9 | pub struct SupervisedClassification { 10 | pub(crate) id: String, 11 | pub(crate) name: String, 12 | pub(crate) source_data: DataSet, 13 | pub(crate) estimation_procedure: Box, 14 | //pub(crate) cost_matrix: CostMatrix, 15 | } 16 | 17 | impl SupervisedClassification { 18 | /// get task ID 19 | pub fn id(&self) -> &str { 20 | &self.id 21 | } 22 | 23 | /// get task name 24 | pub fn name(&self) -> &str { 25 | &self.name 26 | } 27 | 28 | /// run task, specifying the type of an entire feature column in `X`. This allows to run 29 | /// machine learning models that take features of different types, or named features in form 30 | /// of structs. 
31 | pub fn run_static(&self, flow: F) -> M 32 | where 33 | F: Fn(&mut Iterator, &mut Iterator) -> Box>, 34 | X: DeserializeOwned, 35 | Y: DeserializeOwned, 36 | M: MeasureAccumulator, 37 | { 38 | let (dx, dy) = self.source_data 39 | .clone_split() 40 | .expect("Supervised Classification requires a target column"); 41 | 42 | let x: Vec = from_dataset(&dx).unwrap(); 43 | let y: Vec = from_dataset(&dy).unwrap(); 44 | 45 | let mut measure = M::new(); 46 | 47 | for fold in self.estimation_procedure.iter() { 48 | let mut train = fold.trainset.iter().map(|&i| (&x[i], &y[i])); 49 | 50 | let mut test = fold.testset.iter().map(|&i| &x[i]); 51 | 52 | let predictit = flow(&mut train, &mut test); 53 | 54 | for (known, pred) in fold.testset.iter().map(|&i| &y[i]).zip(predictit) { 55 | measure.update_one(known, &pred); 56 | } 57 | } 58 | 59 | measure 60 | } 61 | 62 | /// run task, specifying the feature type in `X`. This allows to run machine learning models 63 | /// that expect every feature to have the same type. 
64 | pub fn run(&self, flow: F) -> M 65 | where 66 | F: Fn(&mut Iterator, &mut Iterator) 67 | -> Box>, 68 | X: DeserializeOwned, 69 | Y: DeserializeOwned, 70 | M: MeasureAccumulator, 71 | { 72 | let (dx, dy) = self.source_data 73 | .clone_split() 74 | .expect("Supervised Classification requires a target column"); 75 | 76 | let x: Vec = from_dataset(&dx).unwrap(); 77 | let y: Vec = from_dataset(&dy).unwrap(); 78 | 79 | let mut measure = M::new(); 80 | 81 | for fold in self.estimation_procedure.iter() { 82 | let mut train = fold.trainset 83 | .iter() 84 | .map(|&i| (&x[i * dx.n_cols()..(i + 1) * dx.n_cols()], &y[i])); 85 | 86 | let mut test = fold.testset 87 | .iter() 88 | .map(|&i| &x[i * dx.n_cols()..(i + 1) * dx.n_cols()]); 89 | 90 | let predictit = flow(&mut train, &mut test); 91 | 92 | for (known, pred) in fold.testset.iter().map(|&i| &y[i]).zip(predictit) { 93 | measure.update_one(known, &pred); 94 | } 95 | } 96 | 97 | measure 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/tasks/supervised_regression.rs: -------------------------------------------------------------------------------- 1 | use arff::dynamic::de::from_dataset; 2 | use serde::de::DeserializeOwned; 3 | 4 | use dataset::DataSet; 5 | use measure_accumulator::MeasureAccumulator; 6 | use procedures::Procedure; 7 | 8 | /// Regression task 9 | pub struct SupervisedRegression { 10 | pub(crate) id: String, 11 | pub(crate) name: String, 12 | pub(crate) source_data: DataSet, 13 | pub(crate) estimation_procedure: Box, 14 | } 15 | 16 | impl SupervisedRegression { 17 | /// get task ID 18 | pub fn id(&self) -> &str { 19 | &self.id 20 | } 21 | 22 | /// get task name 23 | pub fn name(&self) -> &str { 24 | &self.name 25 | } 26 | 27 | /// run task, specifying the type of an entire feature column in `X`. This allows to run 28 | /// machine learning models that take features of different types, or named features in form 29 | /// of structs. 
30 | pub fn run_static(&self, flow: F) -> M 31 | where 32 | F: Fn(&mut Iterator, &mut Iterator) -> Box>, 33 | X: DeserializeOwned, 34 | Y: DeserializeOwned, 35 | M: MeasureAccumulator, 36 | { 37 | let (dx, dy) = self.source_data 38 | .clone_split() 39 | .expect("Supervised Regression requires a target column"); 40 | 41 | let x: Vec = from_dataset(&dx).unwrap(); 42 | let y: Vec = from_dataset(&dy).unwrap(); 43 | 44 | let mut measure = M::new(); 45 | 46 | for fold in self.estimation_procedure.iter() { 47 | let mut train = fold.trainset.iter().map(|&i| (&x[i], &y[i])); 48 | 49 | let mut test = fold.testset.iter().map(|&i| &x[i]); 50 | 51 | let predictit = flow(&mut train, &mut test); 52 | 53 | for (known, pred) in fold.testset.iter().map(|&i| &y[i]).zip(predictit) { 54 | measure.update_one(known, &pred); 55 | } 56 | } 57 | 58 | measure 59 | } 60 | 61 | /// run task, specifying the feature type in `X`. This allows to run machine learning models 62 | /// that expect every feature to have the same type. 
63 | pub fn run(&self, flow: F) -> M 64 | where 65 | F: Fn(&mut Iterator, &mut Iterator) 66 | -> Box>, 67 | X: DeserializeOwned, 68 | Y: DeserializeOwned, 69 | M: MeasureAccumulator, 70 | { 71 | let (dx, dy) = self.source_data 72 | .clone_split() 73 | .expect("Supervised Regression requires a target column"); 74 | 75 | let x: Vec = from_dataset(&dx).unwrap(); 76 | let y: Vec = from_dataset(&dy).unwrap(); 77 | 78 | let mut measure = M::new(); 79 | 80 | for fold in self.estimation_procedure.iter() { 81 | let mut train = fold.trainset 82 | .iter() 83 | .map(|&i| (&x[i * dx.n_cols()..(i + 1) * dx.n_cols()], &y[i])); 84 | 85 | let mut test = fold.testset 86 | .iter() 87 | .map(|&i| &x[i * dx.n_cols()..(i + 1) * dx.n_cols()]); 88 | 89 | let predictit = flow(&mut train, &mut test); 90 | 91 | for (known, pred) in fold.testset.iter().map(|&i| &y[i]).zip(predictit) { 92 | measure.update_one(known, &pred); 93 | } 94 | } 95 | 96 | measure 97 | } 98 | } 99 | --------------------------------------------------------------------------------