├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src └── main ├── java └── com │ └── datumbox │ └── examples │ ├── Classification.java │ ├── Clustering.java │ ├── DataModeling.java │ ├── Regression.java │ └── TextClassification.java └── resources ├── datasets ├── diabetes │ ├── diabetes.tsv.gz │ └── source.txt ├── heart-desease │ ├── heart.csv │ └── source.txt ├── labor-statistics │ ├── longley.csv │ └── source.txt └── sentiment-analysis │ ├── rt-polarity.neg │ ├── rt-polarity.pos │ └── source.txt ├── datumbox.concurrencyconfiguration.properties ├── datumbox.configuration.properties ├── datumbox.inmemoryconfiguration.properties ├── datumbox.mapdbconfiguration.properties └── logback.xml /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.jar 3 | *.war 4 | *.ear 5 | *.iml 6 | 7 | target/ 8 | /.settings/ 9 | /.idea/ 10 | .classpath 11 | .project 12 | nbactions.xml 13 | nb-configuration.xml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2013 Vasilis Vryniotis 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Code Examples for Datumbox Machine Learning Framework 2 | ===================================================== 3 | 4 | [![Datumbox](http://www.datumbox.com/img/logo.png)](http://www.datumbox.com/) 5 | 6 | This project provides examples on how to use the [Datumbox Machine Learning Framework](https://github.com/datumbox/datumbox-framework/) v0.8.3-SNAPSHOT (Build 20201014). 7 | 8 | Copyright & License 9 | ------------------- 10 | 11 | Copyright (c) 2013-2020 [Vasilis Vryniotis](http://blog.datumbox.com/author/bbriniotis/). 12 | 13 | The code is licensed under the [Apache License, Version 2.0](./LICENSE). 14 | 15 | How to use 16 | ---------- 17 | 18 | The code uses Maven Project Structure and contains the following code examples: 19 | 20 | - [Classification.java](./src/main/java/com/datumbox/examples/Classification.java): Contains an example on how to perform Classification. 
21 | - [Clustering.java](./src/main/java/com/datumbox/examples/Clustering.java): It is an example that runs Cluster Analysis. 22 | - [Regression.java](./src/main/java/com/datumbox/examples/Regression.java): Shows how to run Regression Analysis. 23 | - [DataModeling.java](./src/main/java/com/datumbox/examples/DataModeling.java): Explains how to use the convenience Modeler class. 24 | - [TextClassification.java](./src/main/java/com/datumbox/examples/TextClassification.java): Uses the convenience TextClassifier class. 25 | 26 | All of the above files contain a main() method. To use them, just clone the project into your workspace and run any of the above files. 27 | 28 | The project also contains 5 configuration files in the resources folder: 29 | 30 | - [datumbox.configuration.properties](./src/main/resources/datumbox.configuration.properties): It defines the default storage engine (required). 31 | - [datumbox.concurrencyconfiguration.properties](./src/main/resources/datumbox.concurrencyconfiguration.properties): It controls the concurrency levels (required). 32 | - [datumbox.inmemoryconfiguration.properties](./src/main/resources/datumbox.inmemoryconfiguration.properties): It contains the configurations for the InMemory storage engine (required). 33 | - [datumbox.mapdbconfiguration.properties](./src/main/resources/datumbox.mapdbconfiguration.properties): It contains the configurations for the MapDB storage engine (optional). 34 | - [logback.xml](./src/main/resources/logback.xml): It contains the configuration file for the logger (optional). 35 | 36 | Finally, in the resources folder there are several [real world datasets](./src/main/resources/datasets/) which are used for testing. 
37 | 38 | Useful Links 39 | ------------ 40 | 41 | - [Datumbox Machine Learning Framework](https://github.com/datumbox/datumbox-framework/) 42 | - [Datumbox Zoo: Pre-trained models](https://github.com/datumbox/datumbox-framework-zoo/) 43 | - [Datumbox.com](http://www.datumbox.com/) 44 | - [Machine Learning Blog](http://blog.datumbox.com/) 45 | 46 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 19 | 20 | 4.0.0 21 | com.datumbox 22 | datumbox-framework-examples 23 | 0.8.3-SNAPSHOT 24 | 25 | jar 26 | 27 | Code Examples for Datumbox Machine Learning Framework 28 | Code examples on how to use the Datumbox Machine Learning Framework. 29 | https://github.com/datumbox/datumbox-framework-examples/ 30 | 31 | Datumbox 32 | http://www.datumbox.com/ 33 | 34 | 2013 35 | 36 | 37 | 38 | bbriniotis 39 | Vasilis Vryniotis 40 | bbriniotis@datumbox.com 41 | http://blog.datumbox.com/author/bbriniotis/ 42 | 43 | Project Lead Developer 44 | 45 | 46 | 47 | 48 | 49 | 50 | Eleftherios Bampaletakis 51 | lmpampaletakis@gmail.com 52 | http://gr.linkedin.com/pub/eleftherios-bampaletakis/39/875/551 53 | 54 | Java Consultant 55 | 56 | 57 | 58 | 59 | 60 | 61 | Apache License, Version 2.0 62 | http://www.apache.org/licenses/LICENSE-2.0.txt 63 | repo 64 | A business-friendly OSS license 65 | 66 | 67 | 68 | 69 | https://github.com/datumbox/datumbox-framework-examples/issues/ 70 | GitHub Issues 71 | 72 | 73 | 74 | scm:git:git@github.com:datumbox/datumbox-framework-examples.git 75 | scm:git:git@github.com:datumbox/datumbox-framework-examples.git 76 | scm:git:git@github.com:datumbox/datumbox-framework-examples.git 77 | 78 | 79 | 80 | 81 | 0.8.3-SNAPSHOT 82 | 1.3.12 83 | 84 | 85 | UTF-8 86 | 11 87 | 11 88 | 89 | 90 | 91 | 92 | com.datumbox 93 | datumbox-framework-lib 94 | ${datumbox-framework-lib-version} 95 | 96 | 97 | ch.qos.logback 98 | logback-classic 99 | 
${logback-classic-version} 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /src/main/java/com/datumbox/examples/Classification.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2013-2020 Vasilis Vryniotis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.datumbox.examples; 17 | 18 | import com.datumbox.framework.common.Configuration; 19 | import com.datumbox.framework.core.common.dataobjects.Dataframe; 20 | import com.datumbox.framework.core.common.dataobjects.Record; 21 | import com.datumbox.framework.common.dataobjects.TypeInference; 22 | import com.datumbox.framework.common.utilities.RandomGenerator; 23 | import com.datumbox.framework.core.machinelearning.MLBuilder; 24 | import com.datumbox.framework.core.machinelearning.classification.SoftMaxRegression; 25 | import com.datumbox.framework.core.machinelearning.featureselection.PCA; 26 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics; 27 | import com.datumbox.framework.core.machinelearning.modelselection.splitters.ShuffleSplitter; 28 | import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler; 29 | 30 | import java.io.*; 31 | import java.net.URISyntaxException; 32 | import java.nio.file.Paths; 33 | import java.util.LinkedHashMap; 34 | import java.util.Map; 35 | import 
java.util.zip.GZIPInputStream; 36 | 37 | /** 38 | * Classification example. 39 | * 40 | * @author Vasilis Vryniotis 41 | */ 42 | public class Classification { 43 | 44 | /** 45 | * Example of how to use directly the algorithms of the framework in order to 46 | * perform classification. A similar approach can be used to perform clustering, 47 | * regression, build recommender system or perform topic modeling and dimensionality 48 | * reduction. 49 | * 50 | * @param args the command line arguments 51 | */ 52 | public static void main(String[] args) { 53 | /** 54 | * There are 5 configuration files in the resources folder: 55 | * 56 | * - datumbox.configuration.properties: It defines for the default storage engine (required) 57 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required) 58 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required) 59 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional) 60 | * - logback.xml: It contains the configuration file for the logger (optional) 61 | */ 62 | 63 | //Initialization 64 | //-------------- 65 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects 66 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file 67 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default) 68 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine 69 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization 70 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level 71 | 72 | 73 | 74 | 75 | //Reading Data 76 | //------------ 77 | Dataframe data; 78 | try (Reader fileReader = new BufferedReader(new 
InputStreamReader(new GZIPInputStream(new FileInputStream(Paths.get(Classification.class.getClassLoader().getResource("datasets/diabetes/diabetes.tsv.gz").toURI()).toFile())), "UTF-8"))) { 79 | LinkedHashMap headerDataTypes = new LinkedHashMap<>(); 80 | headerDataTypes.put("pregnancies", TypeInference.DataType.NUMERICAL); 81 | headerDataTypes.put("plasma glucose", TypeInference.DataType.NUMERICAL); 82 | headerDataTypes.put("blood pressure", TypeInference.DataType.NUMERICAL); 83 | headerDataTypes.put("triceps thickness", TypeInference.DataType.NUMERICAL); 84 | headerDataTypes.put("serum insulin", TypeInference.DataType.NUMERICAL); 85 | headerDataTypes.put("bmi", TypeInference.DataType.NUMERICAL); 86 | headerDataTypes.put("dpf", TypeInference.DataType.NUMERICAL); 87 | headerDataTypes.put("age", TypeInference.DataType.NUMERICAL); 88 | headerDataTypes.put("test result", TypeInference.DataType.CATEGORICAL); 89 | 90 | data = Dataframe.Builder.parseCSVFile(fileReader, "test result", headerDataTypes, '\t', '"', "\r\n", null, null, configuration); 91 | } 92 | catch(UncheckedIOException | IOException | URISyntaxException ex) { 93 | throw new RuntimeException(ex); 94 | } 95 | 96 | //Spit into train and test datasets 97 | ShuffleSplitter.Split split = new ShuffleSplitter(0.8, 1).split(data).next(); 98 | Dataframe trainingDataframe = split.getTrain(); 99 | Dataframe testingDataframe = split.getTest(); 100 | 101 | 102 | //Transform Dataframe 103 | //----------------- 104 | 105 | //Scale continuous variables 106 | MinMaxScaler.TrainingParameters nsParams = new MinMaxScaler.TrainingParameters(); 107 | MinMaxScaler numericalScaler = MLBuilder.create(nsParams, configuration); 108 | 109 | numericalScaler.fit_transform(trainingDataframe); 110 | numericalScaler.save("Diabetes"); 111 | 112 | 113 | 114 | //Feature Selection 115 | //----------------- 116 | 117 | //Perform dimensionality reduction using PCA 118 | 119 | PCA.TrainingParameters featureSelectionParameters = new 
PCA.TrainingParameters(); 120 | featureSelectionParameters.setMaxDimensions(trainingDataframe.xColumnSize()-1); //remove one dimension 121 | featureSelectionParameters.setWhitened(false); 122 | featureSelectionParameters.setVariancePercentageThreshold(0.99999995); 123 | 124 | PCA featureSelection = MLBuilder.create(featureSelectionParameters, configuration); 125 | featureSelection.fit_transform(trainingDataframe); 126 | featureSelection.save("Diabetes"); 127 | 128 | 129 | 130 | //Fit the classifier 131 | //------------------ 132 | 133 | SoftMaxRegression.TrainingParameters param = new SoftMaxRegression.TrainingParameters(); 134 | param.setTotalIterations(200); 135 | param.setLearningRate(0.1); 136 | 137 | SoftMaxRegression classifier = MLBuilder.create(param, configuration); 138 | classifier.fit(trainingDataframe); 139 | classifier.save("Diabetes"); 140 | 141 | 142 | //Use the classifier 143 | //------------------ 144 | 145 | //Apply the same numerical scaling on testingDataframe 146 | numericalScaler.transform(testingDataframe); 147 | 148 | //Apply the same featureSelection transformations on testingDataframe 149 | featureSelection.transform(testingDataframe); 150 | 151 | //Use the classifier to make predictions on the testingDataframe 152 | classifier.predict(testingDataframe); 153 | 154 | //Get validation metrics on the test set 155 | ClassificationMetrics vm = new ClassificationMetrics(testingDataframe); 156 | 157 | System.out.println("Results:"); 158 | for(Map.Entry entry: testingDataframe.entries()) { 159 | Integer rId = entry.getKey(); 160 | Record r = entry.getValue(); 161 | System.out.println("Record "+rId+" - Real Y: "+r.getY()+", Predicted Y: "+r.getYPredicted()); 162 | } 163 | 164 | System.out.println("Classifier Accuracy: "+vm.getAccuracy()); 165 | 166 | 167 | 168 | //Clean up 169 | //-------- 170 | 171 | //Delete scaler, featureselector and classifier. 
172 | numericalScaler.delete(); 173 | featureSelection.delete(); 174 | classifier.delete(); 175 | 176 | //Close Dataframes. 177 | trainingDataframe.close(); 178 | testingDataframe.close(); 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/datumbox/examples/Clustering.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2013-2020 Vasilis Vryniotis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.datumbox.examples; 17 | 18 | import com.datumbox.framework.common.Configuration; 19 | import com.datumbox.framework.core.common.dataobjects.Dataframe; 20 | import com.datumbox.framework.core.common.dataobjects.Record; 21 | import com.datumbox.framework.common.dataobjects.TypeInference; 22 | import com.datumbox.framework.common.utilities.RandomGenerator; 23 | import com.datumbox.framework.core.machinelearning.MLBuilder; 24 | import com.datumbox.framework.core.machinelearning.clustering.Kmeans; 25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClusteringMetrics; 26 | import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder; 27 | import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler; 28 | 29 | import java.io.*; 30 | import java.net.URISyntaxException; 31 | import java.nio.file.Paths; 32 | import java.util.LinkedHashMap; 33 | import java.util.Map; 34 | 35 | /** 36 | * Clustering example. 37 | * 38 | * @author Vasilis Vryniotis 39 | */ 40 | public class Clustering { 41 | 42 | /** 43 | * Example of how to use directly the algorithms of the framework in order to 44 | * perform clustering. A similar approach can be used to perform classification, 45 | * regression, build recommender system or perform topic modeling and dimensionality 46 | * reduction. 
47 | * 48 | * @param args the command line arguments 49 | */ 50 | public static void main(String[] args) { 51 | /** 52 | * There are 5 configuration files in the resources folder: 53 | * 54 | * - datumbox.configuration.properties: It defines for the default storage engine (required) 55 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required) 56 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required) 57 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional) 58 | * - logback.xml: It contains the configuration file for the logger (optional) 59 | */ 60 | 61 | //Initialization 62 | //-------------- 63 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects 64 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file 65 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default) 66 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine 67 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization 68 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level 69 | 70 | 71 | 72 | //Reading Data 73 | //------------ 74 | Dataframe trainingDataframe; 75 | try (Reader fileReader = new InputStreamReader(new FileInputStream(Paths.get(Clustering.class.getClassLoader().getResource("datasets/heart-desease/heart.csv").toURI()).toFile()), "UTF-8")) { 76 | LinkedHashMap headerDataTypes = new LinkedHashMap<>(); 77 | headerDataTypes.put("Age", TypeInference.DataType.NUMERICAL); 78 | headerDataTypes.put("Sex", TypeInference.DataType.CATEGORICAL); 79 | headerDataTypes.put("ChestPain", TypeInference.DataType.CATEGORICAL); 80 | headerDataTypes.put("RestBP", 
TypeInference.DataType.NUMERICAL); 81 | headerDataTypes.put("Cholesterol", TypeInference.DataType.NUMERICAL); 82 | headerDataTypes.put("BloodSugar", TypeInference.DataType.BOOLEAN); 83 | headerDataTypes.put("ECG", TypeInference.DataType.CATEGORICAL); 84 | headerDataTypes.put("MaxHeartRate", TypeInference.DataType.NUMERICAL); 85 | headerDataTypes.put("Angina", TypeInference.DataType.BOOLEAN); 86 | headerDataTypes.put("OldPeak", TypeInference.DataType.NUMERICAL); 87 | headerDataTypes.put("STSlope", TypeInference.DataType.ORDINAL); 88 | headerDataTypes.put("Vessels", TypeInference.DataType.NUMERICAL); 89 | headerDataTypes.put("Thal", TypeInference.DataType.CATEGORICAL); 90 | headerDataTypes.put("Class", TypeInference.DataType.CATEGORICAL); 91 | 92 | trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Class", headerDataTypes, ',', '"', "\r\n", null, null, configuration); 93 | } 94 | catch(UncheckedIOException | IOException | URISyntaxException ex) { 95 | throw new RuntimeException(ex); 96 | } 97 | 98 | //Store data and load them back 99 | trainingDataframe.save("HeartDeseaseDataset"); 100 | Dataframe testingDataframe = Dataframe.Builder.load("HeartDeseaseDataset", configuration); 101 | 102 | 103 | //Transform Dataframe 104 | //----------------- 105 | 106 | //Convert Categorical variables to dummy variables (boolean) and scale continuous variables 107 | MinMaxScaler.TrainingParameters nsParams = new MinMaxScaler.TrainingParameters(); 108 | MinMaxScaler numericalScaler = MLBuilder.create(nsParams, configuration); 109 | 110 | numericalScaler.fit_transform(trainingDataframe); 111 | numericalScaler.save("HeartDesease"); 112 | 113 | OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters(); 114 | OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration); 115 | 116 | categoricalEncoder.fit_transform(trainingDataframe); 117 | categoricalEncoder.save("HeartDesease"); 118 | 119 | 120 | 121 | //Fit the clusterer 122 | 
//----------------- 123 | 124 | Kmeans.TrainingParameters param = new Kmeans.TrainingParameters(); 125 | param.setK(2); 126 | param.setMaxIterations(200); 127 | param.setInitializationMethod(Kmeans.TrainingParameters.Initialization.FORGY); 128 | param.setDistanceMethod(Kmeans.TrainingParameters.Distance.EUCLIDIAN); 129 | param.setWeighted(false); 130 | param.setCategoricalGamaMultiplier(1.0); 131 | param.setSubsetFurthestFirstcValue(2.0); 132 | 133 | Kmeans clusterer = MLBuilder.create(param, configuration); 134 | clusterer.fit(trainingDataframe); 135 | clusterer.save("HeartDesease"); 136 | 137 | 138 | //Use the clusterer 139 | //----------------- 140 | 141 | //Apply the same scaling and encoding on testingDataframe 142 | numericalScaler.transform(testingDataframe); 143 | categoricalEncoder.transform(testingDataframe); 144 | 145 | //Make predictions on the test set 146 | clusterer.predict(testingDataframe); 147 | 148 | //Get validation metrics on the test set 149 | ClusteringMetrics vm = new ClusteringMetrics(testingDataframe); 150 | 151 | System.out.println("Results:"); 152 | for(Map.Entry entry: testingDataframe.entries()) { 153 | Integer rId = entry.getKey(); 154 | Record r = entry.getValue(); 155 | System.out.println("Record "+rId+" - Original Y: "+r.getY()+", Predicted Cluster Id: "+r.getYPredicted()); 156 | } 157 | 158 | System.out.println("Clusterer Purity: "+vm.getPurity()); 159 | 160 | 161 | 162 | //Clean up 163 | //-------- 164 | 165 | //Delete scaler, encoder, clusterer. 166 | numericalScaler.delete(); 167 | categoricalEncoder.delete(); 168 | clusterer.delete(); 169 | 170 | //Delete the train and close the test Dataframe. 
171 | trainingDataframe.delete(); 172 | testingDataframe.close(); 173 | } 174 | 175 | } 176 | -------------------------------------------------------------------------------- /src/main/java/com/datumbox/examples/DataModeling.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2013-2020 Vasilis Vryniotis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.datumbox.examples; 17 | 18 | import com.datumbox.framework.applications.datamodeling.Modeler; 19 | import com.datumbox.framework.common.Configuration; 20 | import com.datumbox.framework.core.common.dataobjects.Dataframe; 21 | import com.datumbox.framework.core.common.dataobjects.Record; 22 | import com.datumbox.framework.common.dataobjects.TypeInference; 23 | import com.datumbox.framework.common.utilities.RandomGenerator; 24 | import com.datumbox.framework.core.machinelearning.MLBuilder; 25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.LinearRegressionMetrics; 26 | import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder; 27 | import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler; 28 | import com.datumbox.framework.core.machinelearning.regression.NLMS; 29 | 30 | import java.io.*; 31 | import java.net.URISyntaxException; 32 | import java.nio.file.Paths; 33 | import java.util.Arrays; 34 | import java.util.LinkedHashMap; 35 
| import java.util.Map; 36 | 37 | /** 38 | * DataModeling example. 39 | * 40 | * @author Vasilis Vryniotis 41 | */ 42 | public class DataModeling { 43 | 44 | /** 45 | * Example of how to use the Modeler class. 46 | * 47 | * @param args the command line arguments 48 | */ 49 | public static void main(String[] args) { 50 | /** 51 | * There are 5 configuration files in the resources folder: 52 | * 53 | * - datumbox.configuration.properties: It defines for the default storage engine (required) 54 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required) 55 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required) 56 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional) 57 | * - logback.xml: It contains the configuration file for the logger (optional) 58 | */ 59 | 60 | //Initialization 61 | //-------------- 62 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects 63 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file 64 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default) 65 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine 66 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization 67 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level 68 | 69 | 70 | 71 | //Reading Data 72 | //------------ 73 | Dataframe trainingDataframe; 74 | try (Reader fileReader = new InputStreamReader(new FileInputStream(Paths.get(Clustering.class.getClassLoader().getResource("datasets/labor-statistics/longley.csv").toURI()).toFile()), "UTF-8")) { 75 | LinkedHashMap headerDataTypes = new LinkedHashMap<>(); 76 | 
headerDataTypes.put("Employed", TypeInference.DataType.NUMERICAL); 77 | headerDataTypes.put("GNP.deflator", TypeInference.DataType.NUMERICAL); 78 | headerDataTypes.put("GNP", TypeInference.DataType.NUMERICAL); 79 | headerDataTypes.put("Unemployed", TypeInference.DataType.NUMERICAL); 80 | headerDataTypes.put("Armed.Forces", TypeInference.DataType.NUMERICAL); 81 | headerDataTypes.put("Population", TypeInference.DataType.NUMERICAL); 82 | headerDataTypes.put("Year", TypeInference.DataType.NUMERICAL); 83 | 84 | trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Employed", headerDataTypes, ',', '"', "\r\n", null, null, configuration); 85 | } 86 | catch(UncheckedIOException | IOException | URISyntaxException ex) { 87 | throw new RuntimeException(ex); 88 | } 89 | Dataframe testingDataframe = trainingDataframe.copy(); 90 | 91 | 92 | 93 | //Setup Training Parameters 94 | //------------------------- 95 | Modeler.TrainingParameters trainingParameters = new Modeler.TrainingParameters(); 96 | 97 | //numerical scaling configuration 98 | MinMaxScaler.TrainingParameters nsParams = new MinMaxScaler.TrainingParameters(); 99 | trainingParameters.setNumericalScalerTrainingParameters(nsParams); 100 | 101 | //categorical encoding configuration 102 | OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters(); 103 | trainingParameters.setCategoricalEncoderTrainingParameters(ceParams); 104 | 105 | //Set feature selection configuration 106 | trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList()); 107 | 108 | //Model Configuration 109 | trainingParameters.setModelerTrainingParameters(new NLMS.TrainingParameters()); 110 | 111 | 112 | 113 | //Fit the modeler 114 | //--------------- 115 | Modeler modeler = MLBuilder.create(trainingParameters, configuration); 116 | modeler.fit(trainingDataframe); 117 | modeler.save("LaborStatistics"); 118 | 119 | 120 | //Use the modeler 121 | //--------------- 122 | 123 | //Make predictions on the test 
set 124 | modeler.predict(testingDataframe); 125 | 126 | LinearRegressionMetrics vm = new LinearRegressionMetrics(testingDataframe); 127 | 128 | System.out.println("Test Results:"); 129 | for(Map.Entry entry: testingDataframe.entries()) { 130 | Integer rId = entry.getKey(); 131 | Record r = entry.getValue(); 132 | System.out.println("Record "+rId+" - Real Y: "+r.getY()+", Predicted Y: "+r.getYPredicted()); 133 | } 134 | 135 | System.out.println("Model Rsquare: "+vm.getRSquare()); 136 | 137 | 138 | 139 | //Clean up 140 | //-------- 141 | 142 | //Delete the modeler. This removes all files. 143 | modeler.delete(); 144 | 145 | //Close Dataframes. 146 | trainingDataframe.close(); 147 | testingDataframe.close(); 148 | } 149 | 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/com/datumbox/examples/Regression.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2013-2020 Vasilis Vryniotis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.datumbox.examples; 17 | 18 | import com.datumbox.framework.common.Configuration; 19 | import com.datumbox.framework.core.common.dataobjects.Dataframe; 20 | import com.datumbox.framework.core.common.dataobjects.Record; 21 | import com.datumbox.framework.common.dataobjects.TypeInference; 22 | import com.datumbox.framework.common.utilities.RandomGenerator; 23 | import com.datumbox.framework.core.machinelearning.MLBuilder; 24 | import com.datumbox.framework.core.machinelearning.featureselection.PCA; 25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.LinearRegressionMetrics; 26 | import com.datumbox.framework.core.machinelearning.preprocessing.StandardScaler; 27 | import com.datumbox.framework.core.machinelearning.regression.MatrixLinearRegression; 28 | 29 | import java.io.*; 30 | import java.net.URISyntaxException; 31 | import java.nio.file.Paths; 32 | import java.util.LinkedHashMap; 33 | import java.util.Map; 34 | 35 | /** 36 | * Regression example. 37 | * 38 | * @author Vasilis Vryniotis 39 | */ 40 | public class Regression { 41 | 42 | /** 43 | * Example of how to use directly the algorithms of the framework in order to 44 | * perform regression. A similar approach can be used to perform clustering, 45 | * classification, build recommender system or perform topic modeling and dimensionality 46 | * reduction. 
47 | * 48 | * @param args the command line arguments 49 | */ 50 | public static void main(String[] args) { 51 | /** 52 | * There are 5 configuration files in the resources folder: 53 | * 54 | * - datumbox.configuration.properties: It defines for the default storage engine (required) 55 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required) 56 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required) 57 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional) 58 | * - logback.xml: It contains the configuration file for the logger (optional) 59 | */ 60 | 61 | //Initialization 62 | //-------------- 63 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects 64 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file 65 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default) 66 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine 67 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization 68 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level 69 | 70 | 71 | 72 | //Reading Data 73 | //------------ 74 | Dataframe trainingDataframe; 75 | try (Reader fileReader = new InputStreamReader(new FileInputStream(Paths.get(Clustering.class.getClassLoader().getResource("datasets/labor-statistics/longley.csv").toURI()).toFile()), "UTF-8")) { 76 | LinkedHashMap headerDataTypes = new LinkedHashMap<>(); 77 | headerDataTypes.put("Employed", TypeInference.DataType.NUMERICAL); 78 | headerDataTypes.put("GNP.deflator", TypeInference.DataType.NUMERICAL); 79 | headerDataTypes.put("GNP", TypeInference.DataType.NUMERICAL); 80 | 
headerDataTypes.put("Unemployed", TypeInference.DataType.NUMERICAL); 81 | headerDataTypes.put("Armed.Forces", TypeInference.DataType.NUMERICAL); 82 | headerDataTypes.put("Population", TypeInference.DataType.NUMERICAL); 83 | headerDataTypes.put("Year", TypeInference.DataType.NUMERICAL); 84 | 85 | trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Employed", headerDataTypes, ',', '"', "\r\n", null, null, configuration); 86 | } 87 | catch(UncheckedIOException | IOException | URISyntaxException ex) { 88 | throw new RuntimeException(ex); 89 | } 90 | Dataframe testingDataframe = trainingDataframe.copy(); 91 | 92 | 93 | //Transform Dataframe 94 | //----------------- 95 | 96 | //Scale continuous variables 97 | StandardScaler.TrainingParameters nsParams = new StandardScaler.TrainingParameters(); 98 | nsParams.setScaleResponse(true); 99 | StandardScaler numericalScaler = MLBuilder.create(nsParams, configuration); 100 | 101 | numericalScaler.fit_transform(trainingDataframe); 102 | numericalScaler.save("LaborStatistics"); 103 | 104 | 105 | 106 | //Feature Selection 107 | //----------------- 108 | 109 | //Perform dimensionality reduction using PCA 110 | 111 | PCA.TrainingParameters featureSelectionParameters = new PCA.TrainingParameters(); 112 | featureSelectionParameters.setMaxDimensions(trainingDataframe.xColumnSize()-1); //remove one dimension 113 | featureSelectionParameters.setWhitened(false); 114 | featureSelectionParameters.setVariancePercentageThreshold(0.99999995); 115 | 116 | PCA featureSelection = MLBuilder.create(featureSelectionParameters, configuration); 117 | featureSelection.fit_transform(trainingDataframe); 118 | featureSelection.save("LaborStatistics"); 119 | 120 | 121 | 122 | //Fit the regressor 123 | //----------------- 124 | 125 | MatrixLinearRegression.TrainingParameters param = new MatrixLinearRegression.TrainingParameters(); 126 | 127 | MatrixLinearRegression regressor = MLBuilder.create(param, configuration); 128 | 
regressor.fit(trainingDataframe); 129 | regressor.save("LaborStatistics"); 130 | regressor.close(); //close the regressor, we will use it again later 131 | 132 | 133 | 134 | //Use the regressor 135 | //------------------ 136 | 137 | //Apply the same numerical scaling on testingDataframe 138 | numericalScaler.transform(testingDataframe); 139 | 140 | //Apply the same featureSelection transformations on testingDataframe 141 | featureSelection.transform(testingDataframe); 142 | 143 | //Load again the regressor 144 | regressor = MLBuilder.load(MatrixLinearRegression.class, "LaborStatistics", configuration); 145 | regressor.predict(testingDataframe); 146 | 147 | //Get validation metrics on the training set 148 | LinearRegressionMetrics vm = new LinearRegressionMetrics(testingDataframe); 149 | 150 | System.out.println("Results:"); 151 | for(Map.Entry entry: testingDataframe.entries()) { 152 | Integer rId = entry.getKey(); 153 | Record r = entry.getValue(); 154 | System.out.println("Record "+rId+" - Real Y: "+r.getY()+", Predicted Y: "+r.getYPredicted()); 155 | } 156 | 157 | System.out.println("Regressor Rsquare: "+vm.getRSquare()); 158 | 159 | 160 | 161 | //Clean up 162 | //-------- 163 | 164 | //Delete scaler, featureselector and regressor. 165 | numericalScaler.delete(); 166 | featureSelection.delete(); 167 | regressor.delete(); 168 | 169 | //Close Dataframes. 170 | trainingDataframe.close(); 171 | testingDataframe.close(); 172 | } 173 | 174 | } 175 | -------------------------------------------------------------------------------- /src/main/java/com/datumbox/examples/TextClassification.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2013-2020 Vasilis Vryniotis 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.datumbox.examples; 17 | 18 | import com.datumbox.framework.applications.nlp.TextClassifier; 19 | import com.datumbox.framework.common.Configuration; 20 | import com.datumbox.framework.core.common.dataobjects.Record; 21 | import com.datumbox.framework.common.utilities.RandomGenerator; 22 | import com.datumbox.framework.core.machinelearning.MLBuilder; 23 | import com.datumbox.framework.core.machinelearning.classification.MultinomialNaiveBayes; 24 | import com.datumbox.framework.core.machinelearning.featureselection.ChisquareSelect; 25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics; 26 | import com.datumbox.framework.core.common.text.extractors.NgramsExtractor; 27 | 28 | import java.net.URI; 29 | import java.net.URISyntaxException; 30 | import java.util.Arrays; 31 | import java.util.HashMap; 32 | import java.util.Map; 33 | 34 | 35 | /** 36 | * Text Classification example. 37 | * 38 | * @author Vasilis Vryniotis 39 | */ 40 | public class TextClassification { 41 | 42 | /** 43 | * Example of how to use the TextClassifier class. 
44 | * 45 | * @param args the command line arguments 46 | * @throws java.net.URISyntaxException 47 | */ 48 | public static void main(String[] args) throws URISyntaxException { 49 | /** 50 | * There are 5 configuration files in the resources folder: 51 | * 52 | * - datumbox.configuration.properties: It defines for the default storage engine (required) 53 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required) 54 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required) 55 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional) 56 | * - logback.xml: It contains the configuration file for the logger (optional) 57 | */ 58 | 59 | //Initialization 60 | //-------------- 61 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects 62 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file 63 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default) 64 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine 65 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization 66 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level 67 | 68 | 69 | 70 | //Reading Data 71 | //------------ 72 | Map datasets = new HashMap<>(); //The examples of each category are stored on the same file, one example per row. 
73 | datasets.put("positive", TextClassification.class.getClassLoader().getResource("datasets/sentiment-analysis/rt-polarity.pos").toURI()); 74 | datasets.put("negative", TextClassification.class.getClassLoader().getResource("datasets/sentiment-analysis/rt-polarity.neg").toURI()); 75 | 76 | 77 | 78 | //Setup Training Parameters 79 | //------------------------- 80 | TextClassifier.TrainingParameters trainingParameters = new TextClassifier.TrainingParameters(); 81 | 82 | //numerical scaling configuration 83 | trainingParameters.setNumericalScalerTrainingParameters(null); 84 | 85 | //Set feature selection configuration 86 | trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList(new ChisquareSelect.TrainingParameters())); 87 | 88 | //Set text extraction configuration 89 | trainingParameters.setTextExtractorParameters(new NgramsExtractor.Parameters()); 90 | 91 | //Classifier configuration 92 | trainingParameters.setModelerTrainingParameters(new MultinomialNaiveBayes.TrainingParameters()); 93 | 94 | 95 | 96 | //Fit the classifier 97 | //------------------ 98 | TextClassifier textClassifier = MLBuilder.create(trainingParameters, configuration); 99 | textClassifier.fit(datasets); 100 | textClassifier.save("SentimentAnalysis"); 101 | 102 | 103 | 104 | //Use the classifier 105 | //------------------ 106 | 107 | //Get validation metrics on the dataset 108 | ClassificationMetrics vm = textClassifier.validate(datasets); 109 | 110 | //Classify a single sentence 111 | String sentence = "Datumbox is amazing!"; 112 | Record r = textClassifier.predict(sentence); 113 | 114 | System.out.println("Classifing sentence: \""+sentence+"\""); 115 | System.out.println("Predicted class: "+r.getYPredicted()); 116 | System.out.println("Probability: "+r.getYPredictedProbabilities().get(r.getYPredicted())); 117 | 118 | System.out.println("Classifier Accuracy: "+vm.getAccuracy()); 119 | 120 | 121 | 122 | //Clean up 123 | //-------- 124 | 125 | //Delete the classifier. 
This removes all files. 126 | textClassifier.delete(); 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /src/main/resources/datasets/diabetes/diabetes.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumbox/datumbox-framework-examples/224908663718828f5769ffc9559cba8bca45ca37/src/main/resources/datasets/diabetes/diabetes.tsv.gz -------------------------------------------------------------------------------- /src/main/resources/datasets/diabetes/source.txt: -------------------------------------------------------------------------------- 1 | http://www.sgi.com/tech/mlc/db/diabetes.names 2 | http://www.sgi.com/tech/mlc/db/diabetes.all 3 | -------------------------------------------------------------------------------- /src/main/resources/datasets/heart-desease/heart.csv: -------------------------------------------------------------------------------- 1 | Age,Sex,ChestPain,RestBP,Cholesterol,BloodSugar,ECG,MaxHeartRate,Angina,OldPeak,STSlope,Vessels,Thal,Class 2 | 70,male,type4,130,322,FALSE,type2,109,FALSE,2.4,2,3,normal,not-healthy 3 | 67,female,type3,115,564,FALSE,type2,160,FALSE,1.6,2,0,reversable defect,healthy 4 | 57,male,type2,124,261,FALSE,type0,141,FALSE,0.3,1,0,reversable defect,not-healthy 5 | 64,male,type4,128,263,FALSE,type0,105,TRUE,0.2,2,1,reversable defect,healthy 6 | 74,female,type2,120,269,FALSE,type2,121,TRUE,0.2,1,1,normal,healthy 7 | 65,male,type4,120,177,FALSE,type0,140,FALSE,0.4,1,0,reversable defect,healthy 8 | 56,male,type3,130,256,TRUE,type2,142,TRUE,0.6,2,1,fixed defect,not-healthy 9 | 59,male,type4,110,239,FALSE,type2,142,TRUE,1.2,2,1,reversable defect,not-healthy 10 | 60,male,type4,140,293,FALSE,type2,170,FALSE,1.2,2,2,reversable defect,not-healthy 11 | 63,female,type4,150,407,FALSE,type2,154,FALSE,4,2,3,reversable defect,not-healthy 12 | 59,male,type4,135,234,FALSE,type0,161,FALSE,0.5,2,0,reversable 
defect,healthy 13 | 53,male,type4,142,226,FALSE,type2,111,TRUE,0,1,0,reversable defect,healthy 14 | 44,male,type3,140,235,FALSE,type2,180,FALSE,0,1,0,normal,healthy 15 | 61,male,type1,134,234,FALSE,type0,145,FALSE,2.6,2,2,normal,not-healthy 16 | 57,female,type4,128,303,FALSE,type2,159,FALSE,0,1,1,normal,healthy 17 | 71,female,type4,112,149,FALSE,type0,125,FALSE,1.6,2,0,normal,healthy 18 | 46,male,type4,140,311,FALSE,type0,120,TRUE,1.8,2,2,reversable defect,not-healthy 19 | 53,male,type4,140,203,TRUE,type2,155,TRUE,3.1,3,0,reversable defect,not-healthy 20 | 64,male,type1,110,211,FALSE,type2,144,TRUE,1.8,2,0,normal,healthy 21 | 40,male,type1,140,199,FALSE,type0,178,TRUE,1.4,1,0,reversable defect,healthy 22 | 67,male,type4,120,229,FALSE,type2,129,TRUE,2.6,2,2,reversable defect,not-healthy 23 | 48,male,type2,130,245,FALSE,type2,180,FALSE,0.2,2,0,normal,healthy 24 | 43,male,type4,115,303,FALSE,type0,181,FALSE,1.2,2,0,normal,healthy 25 | 47,male,type4,112,204,FALSE,type0,143,FALSE,0.1,1,0,normal,healthy 26 | 54,female,type2,132,288,TRUE,type2,159,TRUE,0,1,1,normal,healthy 27 | 48,female,type3,130,275,FALSE,type0,139,FALSE,0.2,1,0,normal,healthy 28 | 46,female,type4,138,243,FALSE,type2,152,TRUE,0,2,0,normal,healthy 29 | 51,female,type3,120,295,FALSE,type2,157,FALSE,0.6,1,0,normal,healthy 30 | 58,male,type3,112,230,FALSE,type2,165,FALSE,2.5,2,1,reversable defect,not-healthy 31 | 71,female,type3,110,265,TRUE,type2,130,FALSE,0,1,1,normal,healthy 32 | 57,male,type3,128,229,FALSE,type2,150,FALSE,0.4,2,1,reversable defect,not-healthy 33 | 66,male,type4,160,228,FALSE,type2,138,FALSE,2.3,1,0,fixed defect,healthy 34 | 37,female,type3,120,215,FALSE,type0,170,FALSE,0,1,0,normal,healthy 35 | 59,male,type4,170,326,FALSE,type2,140,TRUE,3.4,3,0,reversable defect,not-healthy 36 | 50,male,type4,144,200,FALSE,type2,126,TRUE,0.9,2,0,reversable defect,not-healthy 37 | 48,male,type4,130,256,TRUE,type2,150,TRUE,0,1,2,reversable defect,not-healthy 38 | 
61,male,type4,140,207,FALSE,type2,138,TRUE,1.9,1,1,reversable defect,not-healthy 39 | 59,male,type1,160,273,FALSE,type2,125,FALSE,0,1,0,normal,not-healthy 40 | 42,male,type3,130,180,FALSE,type0,150,FALSE,0,1,0,normal,healthy 41 | 48,male,type4,122,222,FALSE,type2,186,FALSE,0,1,0,normal,healthy 42 | 40,male,type4,152,223,FALSE,type0,181,FALSE,0,1,0,reversable defect,not-healthy 43 | 62,female,type4,124,209,FALSE,type0,163,FALSE,0,1,0,normal,healthy 44 | 44,male,type3,130,233,FALSE,type0,179,TRUE,0.4,1,0,normal,healthy 45 | 46,male,type2,101,197,TRUE,type0,156,FALSE,0,1,0,reversable defect,healthy 46 | 59,male,type3,126,218,TRUE,type0,134,FALSE,2.2,2,1,fixed defect,not-healthy 47 | 58,male,type3,140,211,TRUE,type2,165,FALSE,0,1,0,normal,healthy 48 | 49,male,type3,118,149,FALSE,type2,126,FALSE,0.8,1,3,normal,not-healthy 49 | 44,male,type4,110,197,FALSE,type2,177,FALSE,0,1,1,normal,not-healthy 50 | 66,male,type2,160,246,FALSE,type0,120,TRUE,0,2,3,fixed defect,not-healthy 51 | 65,female,type4,150,225,FALSE,type2,114,FALSE,1,2,3,reversable defect,not-healthy 52 | 42,male,type4,136,315,FALSE,type0,125,TRUE,1.8,2,0,fixed defect,not-healthy 53 | 52,male,type2,128,205,TRUE,type0,184,FALSE,0,1,0,normal,healthy 54 | 65,female,type3,140,417,TRUE,type2,157,FALSE,0.8,1,1,normal,healthy 55 | 63,female,type2,140,195,FALSE,type0,179,FALSE,0,1,2,normal,healthy 56 | 45,female,type2,130,234,FALSE,type2,175,FALSE,0.6,2,0,normal,healthy 57 | 41,female,type2,105,198,FALSE,type0,168,FALSE,0,1,1,normal,healthy 58 | 61,male,type4,138,166,FALSE,type2,125,TRUE,3.6,2,1,normal,not-healthy 59 | 60,female,type3,120,178,TRUE,type0,96,FALSE,0,1,0,normal,healthy 60 | 59,female,type4,174,249,FALSE,type0,143,TRUE,0,2,0,normal,not-healthy 61 | 62,male,type2,120,281,FALSE,type2,103,FALSE,1.4,2,1,reversable defect,not-healthy 62 | 57,male,type3,150,126,TRUE,type0,173,FALSE,0.2,1,1,reversable defect,healthy 63 | 51,female,type4,130,305,FALSE,type0,142,TRUE,1.2,2,0,reversable defect,not-healthy 64 | 
44,male,type3,120,226,FALSE,type0,169,FALSE,0,1,0,normal,healthy 65 | 60,female,type1,150,240,FALSE,type0,171,FALSE,0.9,1,0,normal,healthy 66 | 63,male,type1,145,233,TRUE,type2,150,FALSE,2.3,3,0,fixed defect,healthy 67 | 57,male,type4,150,276,FALSE,type2,112,TRUE,0.6,2,1,fixed defect,not-healthy 68 | 51,male,type4,140,261,FALSE,type2,186,TRUE,0,1,0,normal,healthy 69 | 58,female,type2,136,319,TRUE,type2,152,FALSE,0,1,2,normal,not-healthy 70 | 44,female,type3,118,242,FALSE,type0,149,FALSE,0.3,2,1,normal,healthy 71 | 47,male,type3,108,243,FALSE,type0,152,FALSE,0,1,0,normal,not-healthy 72 | 61,male,type4,120,260,FALSE,type0,140,TRUE,3.6,2,1,reversable defect,not-healthy 73 | 57,female,type4,120,354,FALSE,type0,163,TRUE,0.6,1,0,normal,healthy 74 | 70,male,type2,156,245,FALSE,type2,143,FALSE,0,1,0,normal,healthy 75 | 76,female,type3,140,197,FALSE,type1,116,FALSE,1.1,2,0,normal,healthy 76 | 67,female,type4,106,223,FALSE,type0,142,FALSE,0.3,1,2,normal,healthy 77 | 45,male,type4,142,309,FALSE,type2,147,TRUE,0,2,3,reversable defect,not-healthy 78 | 45,male,type4,104,208,FALSE,type2,148,TRUE,3,2,0,normal,healthy 79 | 39,female,type3,94,199,FALSE,type0,179,FALSE,0,1,0,normal,healthy 80 | 42,female,type3,120,209,FALSE,type0,173,FALSE,0,2,0,normal,healthy 81 | 56,male,type2,120,236,FALSE,type0,178,FALSE,0.8,1,0,normal,healthy 82 | 58,male,type4,146,218,FALSE,type0,105,FALSE,2,2,1,reversable defect,not-healthy 83 | 35,male,type4,120,198,FALSE,type0,130,TRUE,1.6,2,0,reversable defect,not-healthy 84 | 58,male,type4,150,270,FALSE,type2,111,TRUE,0.8,1,0,reversable defect,not-healthy 85 | 41,male,type3,130,214,FALSE,type2,168,FALSE,2,2,0,normal,healthy 86 | 57,male,type4,110,201,FALSE,type0,126,TRUE,1.5,2,0,fixed defect,healthy 87 | 42,male,type1,148,244,FALSE,type2,178,FALSE,0.8,1,2,normal,healthy 88 | 62,male,type2,128,208,TRUE,type2,140,FALSE,0,1,0,normal,healthy 89 | 59,male,type1,178,270,FALSE,type2,145,FALSE,4.2,3,0,reversable defect,healthy 90 | 
41,female,type2,126,306,FALSE,type0,163,FALSE,0,1,0,normal,healthy 91 | 50,male,type4,150,243,FALSE,type2,128,FALSE,2.6,2,0,reversable defect,not-healthy 92 | 59,male,type2,140,221,FALSE,type0,164,TRUE,0,1,0,normal,healthy 93 | 61,female,type4,130,330,FALSE,type2,169,FALSE,0,1,0,normal,not-healthy 94 | 54,male,type4,124,266,FALSE,type2,109,TRUE,2.2,2,1,reversable defect,not-healthy 95 | 54,male,type4,110,206,FALSE,type2,108,TRUE,0,2,1,normal,not-healthy 96 | 52,male,type4,125,212,FALSE,type0,168,FALSE,1,1,2,reversable defect,not-healthy 97 | 47,male,type4,110,275,FALSE,type2,118,TRUE,1,2,1,normal,not-healthy 98 | 66,male,type4,120,302,FALSE,type2,151,FALSE,0.4,2,0,normal,healthy 99 | 58,male,type4,100,234,FALSE,type0,156,FALSE,0.1,1,1,reversable defect,not-healthy 100 | 64,female,type3,140,313,FALSE,type0,133,FALSE,0.2,1,0,reversable defect,healthy 101 | 50,female,type2,120,244,FALSE,type0,162,FALSE,1.1,1,0,normal,healthy 102 | 44,female,type3,108,141,FALSE,type0,175,FALSE,0.6,2,0,normal,healthy 103 | 67,male,type4,120,237,FALSE,type0,71,FALSE,1,2,0,normal,not-healthy 104 | 49,female,type4,130,269,FALSE,type0,163,FALSE,0,1,0,normal,healthy 105 | 57,male,type4,165,289,TRUE,type2,124,FALSE,1,2,3,reversable defect,not-healthy 106 | 63,male,type4,130,254,FALSE,type2,147,FALSE,1.4,2,1,reversable defect,not-healthy 107 | 48,male,type4,124,274,FALSE,type2,166,FALSE,0.5,2,0,reversable defect,not-healthy 108 | 51,male,type3,100,222,FALSE,type0,143,TRUE,1.2,2,0,normal,healthy 109 | 60,female,type4,150,258,FALSE,type2,157,FALSE,2.6,2,2,reversable defect,not-healthy 110 | 59,male,type4,140,177,FALSE,type0,162,TRUE,0,1,1,reversable defect,not-healthy 111 | 45,female,type2,112,160,FALSE,type0,138,FALSE,0,2,0,normal,healthy 112 | 55,female,type4,180,327,FALSE,type1,117,TRUE,3.4,2,0,normal,not-healthy 113 | 41,male,type2,110,235,FALSE,type0,153,FALSE,0,1,0,normal,healthy 114 | 60,female,type4,158,305,FALSE,type2,161,FALSE,0,1,0,normal,not-healthy 115 | 
54,female,type3,135,304,TRUE,type0,170,FALSE,0,1,0,normal,healthy 116 | 42,male,type2,120,295,FALSE,type0,162,FALSE,0,1,0,normal,healthy 117 | 49,female,type2,134,271,FALSE,type0,162,FALSE,0,2,0,normal,healthy 118 | 46,male,type4,120,249,FALSE,type2,144,FALSE,0.8,1,0,reversable defect,not-healthy 119 | 56,female,type4,200,288,TRUE,type2,133,TRUE,4,3,2,reversable defect,not-healthy 120 | 66,female,type1,150,226,FALSE,type0,114,FALSE,2.6,3,0,normal,healthy 121 | 56,male,type4,130,283,TRUE,type2,103,TRUE,1.6,3,0,reversable defect,not-healthy 122 | 49,male,type3,120,188,FALSE,type0,139,FALSE,2,2,3,reversable defect,not-healthy 123 | 54,male,type4,122,286,FALSE,type2,116,TRUE,3.2,2,2,normal,not-healthy 124 | 57,male,type4,152,274,FALSE,type0,88,TRUE,1.2,2,1,reversable defect,not-healthy 125 | 65,female,type3,160,360,FALSE,type2,151,FALSE,0.8,1,0,normal,healthy 126 | 54,male,type3,125,273,FALSE,type2,152,FALSE,0.5,3,1,normal,healthy 127 | 54,female,type3,160,201,FALSE,type0,163,FALSE,0,1,1,normal,healthy 128 | 62,male,type4,120,267,FALSE,type0,99,TRUE,1.8,2,2,reversable defect,not-healthy 129 | 52,female,type3,136,196,FALSE,type2,169,FALSE,0.1,2,0,normal,healthy 130 | 52,male,type2,134,201,FALSE,type0,158,FALSE,0.8,1,1,normal,healthy 131 | 60,male,type4,117,230,TRUE,type0,160,TRUE,1.4,1,2,reversable defect,not-healthy 132 | 63,female,type4,108,269,FALSE,type0,169,TRUE,1.8,2,2,normal,not-healthy 133 | 66,male,type4,112,212,FALSE,type2,132,TRUE,0.1,1,1,normal,not-healthy 134 | 42,male,type4,140,226,FALSE,type0,178,FALSE,0,1,0,normal,healthy 135 | 64,male,type4,120,246,FALSE,type2,96,TRUE,2.2,3,1,normal,not-healthy 136 | 54,male,type3,150,232,FALSE,type2,165,FALSE,1.6,1,0,reversable defect,healthy 137 | 46,female,type3,142,177,FALSE,type2,160,TRUE,1.4,3,0,normal,healthy 138 | 67,female,type3,152,277,FALSE,type0,172,FALSE,0,1,1,normal,healthy 139 | 56,male,type4,125,249,TRUE,type2,144,TRUE,1.2,2,1,normal,not-healthy 140 | 
34,female,type2,118,210,FALSE,type0,192,FALSE,0.7,1,0,normal,healthy 141 | 57,male,type4,132,207,FALSE,type0,168,TRUE,0,1,0,reversable defect,healthy 142 | 64,male,type4,145,212,FALSE,type2,132,FALSE,2,2,2,fixed defect,not-healthy 143 | 59,male,type4,138,271,FALSE,type2,182,FALSE,0,1,0,normal,healthy 144 | 50,male,type3,140,233,FALSE,type0,163,FALSE,0.6,2,1,reversable defect,not-healthy 145 | 51,male,type1,125,213,FALSE,type2,125,TRUE,1.4,1,1,normal,healthy 146 | 54,male,type2,192,283,FALSE,type2,195,FALSE,0,1,1,reversable defect,not-healthy 147 | 53,male,type4,123,282,FALSE,type0,95,TRUE,2,2,2,reversable defect,not-healthy 148 | 52,male,type4,112,230,FALSE,type0,160,FALSE,0,1,1,normal,not-healthy 149 | 40,male,type4,110,167,FALSE,type2,114,TRUE,2,2,0,reversable defect,not-healthy 150 | 58,male,type3,132,224,FALSE,type2,173,FALSE,3.2,1,2,reversable defect,not-healthy 151 | 41,female,type3,112,268,FALSE,type2,172,TRUE,0,1,0,normal,healthy 152 | 41,male,type3,112,250,FALSE,type0,179,FALSE,0,1,0,normal,healthy 153 | 50,female,type3,120,219,FALSE,type0,158,FALSE,1.6,2,0,normal,healthy 154 | 54,female,type3,108,267,FALSE,type2,167,FALSE,0,1,0,normal,healthy 155 | 64,female,type4,130,303,FALSE,type0,122,FALSE,2,2,2,normal,healthy 156 | 51,female,type3,130,256,FALSE,type2,149,FALSE,0.5,1,0,normal,healthy 157 | 46,female,type2,105,204,FALSE,type0,172,FALSE,0,1,0,normal,healthy 158 | 55,male,type4,140,217,FALSE,type0,111,TRUE,5.6,3,0,reversable defect,not-healthy 159 | 45,male,type2,128,308,FALSE,type2,170,FALSE,0,1,0,normal,healthy 160 | 56,male,type1,120,193,FALSE,type2,162,FALSE,1.9,2,0,reversable defect,healthy 161 | 66,female,type4,178,228,TRUE,type0,165,TRUE,1,2,2,reversable defect,not-healthy 162 | 38,male,type1,120,231,FALSE,type0,182,TRUE,3.8,2,0,reversable defect,not-healthy 163 | 62,female,type4,150,244,FALSE,type0,154,TRUE,1.4,2,0,normal,not-healthy 164 | 55,male,type2,130,262,FALSE,type0,155,FALSE,0,1,0,normal,healthy 165 | 
58,male,type4,128,259,FALSE,type2,130,TRUE,3,2,2,reversable defect,not-healthy 166 | 43,male,type4,110,211,FALSE,type0,161,FALSE,0,1,0,reversable defect,healthy 167 | 64,female,type4,180,325,FALSE,type0,154,TRUE,0,1,0,normal,healthy 168 | 50,female,type4,110,254,FALSE,type2,159,FALSE,0,1,0,normal,healthy 169 | 53,male,type3,130,197,TRUE,type2,152,FALSE,1.2,3,0,normal,healthy 170 | 45,female,type4,138,236,FALSE,type2,152,TRUE,0.2,2,0,normal,healthy 171 | 65,male,type1,138,282,TRUE,type2,174,FALSE,1.4,2,1,normal,not-healthy 172 | 69,male,type1,160,234,TRUE,type2,131,FALSE,0.1,2,1,normal,healthy 173 | 69,male,type3,140,254,FALSE,type2,146,FALSE,2,2,3,reversable defect,not-healthy 174 | 67,male,type4,100,299,FALSE,type2,125,TRUE,0.9,2,2,normal,not-healthy 175 | 68,female,type3,120,211,FALSE,type2,115,FALSE,1.5,2,0,normal,healthy 176 | 34,male,type1,118,182,FALSE,type2,174,FALSE,0,1,0,normal,healthy 177 | 62,female,type4,138,294,TRUE,type0,106,FALSE,1.9,2,3,normal,not-healthy 178 | 51,male,type4,140,298,FALSE,type0,122,TRUE,4.2,2,3,reversable defect,not-healthy 179 | 46,male,type3,150,231,FALSE,type0,147,FALSE,3.6,2,0,normal,not-healthy 180 | 67,male,type4,125,254,TRUE,type0,163,FALSE,0.2,2,2,reversable defect,not-healthy 181 | 50,male,type3,129,196,FALSE,type0,163,FALSE,0,1,0,normal,healthy 182 | 42,male,type3,120,240,TRUE,type0,194,FALSE,0.8,3,0,reversable defect,healthy 183 | 56,female,type4,134,409,FALSE,type2,150,TRUE,1.9,2,2,reversable defect,not-healthy 184 | 41,male,type4,110,172,FALSE,type2,158,FALSE,0,1,0,reversable defect,not-healthy 185 | 42,female,type4,102,265,FALSE,type2,122,FALSE,0.6,2,0,normal,healthy 186 | 53,male,type3,130,246,TRUE,type2,173,FALSE,0,1,3,normal,healthy 187 | 43,male,type3,130,315,FALSE,type0,162,FALSE,1.9,1,1,normal,healthy 188 | 56,male,type4,132,184,FALSE,type2,105,TRUE,2.1,2,1,fixed defect,not-healthy 189 | 52,male,type4,108,233,TRUE,type0,147,FALSE,0.1,1,3,reversable defect,healthy 190 | 
62,female,type4,140,394,FALSE,type2,157,FALSE,1.2,2,0,normal,healthy 191 | 70,male,type3,160,269,FALSE,type0,112,TRUE,2.9,2,1,reversable defect,not-healthy 192 | 54,male,type4,140,239,FALSE,type0,160,FALSE,1.2,1,0,normal,healthy 193 | 70,male,type4,145,174,FALSE,type0,125,TRUE,2.6,3,0,reversable defect,not-healthy 194 | 54,male,type2,108,309,FALSE,type0,156,FALSE,0,1,0,reversable defect,healthy 195 | 35,male,type4,126,282,FALSE,type2,156,TRUE,0,1,0,reversable defect,not-healthy 196 | 48,male,type3,124,255,TRUE,type0,175,FALSE,0,1,2,normal,healthy 197 | 55,female,type2,135,250,FALSE,type2,161,FALSE,1.4,2,0,normal,healthy 198 | 58,female,type4,100,248,FALSE,type2,122,FALSE,1,2,0,normal,healthy 199 | 54,female,type3,110,214,FALSE,type0,158,FALSE,1.6,2,0,normal,healthy 200 | 69,female,type1,140,239,FALSE,type0,151,FALSE,1.8,1,2,normal,healthy 201 | 77,male,type4,125,304,FALSE,type2,162,TRUE,0,1,3,normal,not-healthy 202 | 68,male,type3,118,277,FALSE,type0,151,FALSE,1,1,1,reversable defect,healthy 203 | 58,male,type4,125,300,FALSE,type2,171,FALSE,0,1,2,reversable defect,not-healthy 204 | 60,male,type4,125,258,FALSE,type2,141,TRUE,2.8,2,1,reversable defect,not-healthy 205 | 51,male,type4,140,299,FALSE,type0,173,TRUE,1.6,1,0,reversable defect,not-healthy 206 | 55,male,type4,160,289,FALSE,type2,145,TRUE,0.8,2,1,reversable defect,not-healthy 207 | 52,male,type1,152,298,TRUE,type0,178,FALSE,1.2,2,0,reversable defect,healthy 208 | 60,female,type3,102,318,FALSE,type0,160,FALSE,0,1,1,normal,healthy 209 | 58,male,type3,105,240,FALSE,type2,154,TRUE,0.6,2,0,reversable defect,healthy 210 | 64,male,type3,125,309,FALSE,type0,131,TRUE,1.8,2,0,reversable defect,not-healthy 211 | 37,male,type3,130,250,FALSE,type0,187,FALSE,3.5,3,0,normal,healthy 212 | 59,male,type1,170,288,FALSE,type2,159,FALSE,0.2,2,0,reversable defect,not-healthy 213 | 51,male,type3,125,245,TRUE,type2,166,FALSE,2.4,2,0,normal,healthy 214 | 43,female,type3,122,213,FALSE,type0,165,FALSE,0.2,2,0,normal,healthy 215 | 
58,male,type4,128,216,FALSE,type2,131,TRUE,2.2,2,3,reversable defect,not-healthy 216 | 29,male,type2,130,204,FALSE,type2,202,FALSE,0,1,0,normal,healthy 217 | 41,female,type2,130,204,FALSE,type2,172,FALSE,1.4,1,0,normal,healthy 218 | 63,female,type3,135,252,FALSE,type2,172,FALSE,0,1,0,normal,healthy 219 | 51,male,type3,94,227,FALSE,type0,154,TRUE,0,1,1,reversable defect,healthy 220 | 54,male,type3,120,258,FALSE,type2,147,FALSE,0.4,2,0,reversable defect,healthy 221 | 44,male,type2,120,220,FALSE,type0,170,FALSE,0,1,0,normal,healthy 222 | 54,male,type4,110,239,FALSE,type0,126,TRUE,2.8,2,1,reversable defect,not-healthy 223 | 65,male,type4,135,254,FALSE,type2,127,FALSE,2.8,2,1,reversable defect,not-healthy 224 | 57,male,type3,150,168,FALSE,type0,174,FALSE,1.6,1,0,normal,healthy 225 | 63,male,type4,130,330,TRUE,type2,132,TRUE,1.8,1,3,reversable defect,not-healthy 226 | 35,female,type4,138,183,FALSE,type0,182,FALSE,1.4,1,0,normal,healthy 227 | 41,male,type2,135,203,FALSE,type0,132,FALSE,0,2,0,fixed defect,healthy 228 | 62,female,type3,130,263,FALSE,type0,97,FALSE,1.2,2,1,reversable defect,not-healthy 229 | 43,female,type4,132,341,TRUE,type2,136,TRUE,3,2,0,reversable defect,not-healthy 230 | 58,female,type1,150,283,TRUE,type2,162,FALSE,1,1,0,normal,healthy 231 | 52,male,type1,118,186,FALSE,type2,190,FALSE,0,2,0,fixed defect,healthy 232 | 61,female,type4,145,307,FALSE,type2,146,TRUE,1,2,0,reversable defect,not-healthy 233 | 39,male,type4,118,219,FALSE,type0,140,FALSE,1.2,2,0,reversable defect,not-healthy 234 | 45,male,type4,115,260,FALSE,type2,185,FALSE,0,1,0,normal,healthy 235 | 52,male,type4,128,255,FALSE,type0,161,TRUE,0,1,1,reversable defect,not-healthy 236 | 62,male,type3,130,231,FALSE,type0,146,FALSE,1.8,2,3,reversable defect,healthy 237 | 62,female,type4,160,164,FALSE,type2,145,FALSE,6.2,3,3,reversable defect,not-healthy 238 | 53,female,type4,138,234,FALSE,type2,160,FALSE,0,1,0,normal,healthy 239 | 43,male,type4,120,177,FALSE,type2,120,TRUE,2.5,2,0,reversable 
defect,not-healthy 240 | 47,male,type3,138,257,FALSE,type2,156,FALSE,0,1,0,normal,healthy 241 | 52,male,type2,120,325,FALSE,type0,172,FALSE,0.2,1,0,normal,healthy 242 | 68,male,type3,180,274,TRUE,type2,150,TRUE,1.6,2,0,reversable defect,not-healthy 243 | 39,male,type3,140,321,FALSE,type2,182,FALSE,0,1,0,normal,healthy 244 | 53,female,type4,130,264,FALSE,type2,143,FALSE,0.4,2,0,normal,healthy 245 | 62,female,type4,140,268,FALSE,type2,160,FALSE,3.6,3,2,normal,not-healthy 246 | 51,female,type3,140,308,FALSE,type2,142,FALSE,1.5,1,1,normal,healthy 247 | 60,male,type4,130,253,FALSE,type0,144,TRUE,1.4,1,1,reversable defect,not-healthy 248 | 65,male,type4,110,248,FALSE,type2,158,FALSE,0.6,1,2,fixed defect,not-healthy 249 | 65,female,type3,155,269,FALSE,type0,148,FALSE,0.8,1,0,normal,healthy 250 | 60,male,type3,140,185,FALSE,type2,155,FALSE,3,2,0,normal,not-healthy 251 | 60,male,type4,145,282,FALSE,type2,142,TRUE,2.8,2,2,reversable defect,not-healthy 252 | 54,male,type4,120,188,FALSE,type0,113,FALSE,1.4,2,1,reversable defect,not-healthy 253 | 44,male,type2,130,219,FALSE,type2,188,FALSE,0,1,0,normal,healthy 254 | 44,male,type4,112,290,FALSE,type2,153,FALSE,0,1,1,normal,not-healthy 255 | 51,male,type3,110,175,FALSE,type0,123,FALSE,0.6,1,0,normal,healthy 256 | 59,male,type3,150,212,TRUE,type0,157,FALSE,1.6,1,0,normal,healthy 257 | 71,female,type2,160,302,FALSE,type0,162,FALSE,0.4,1,2,normal,healthy 258 | 61,male,type3,150,243,TRUE,type0,137,TRUE,1,2,0,normal,healthy 259 | 55,male,type4,132,353,FALSE,type0,132,TRUE,1.2,2,1,reversable defect,not-healthy 260 | 64,male,type3,140,335,FALSE,type0,158,FALSE,0,1,0,normal,not-healthy 261 | 43,male,type4,150,247,FALSE,type0,171,FALSE,1.5,1,0,normal,healthy 262 | 58,female,type3,120,340,FALSE,type0,172,FALSE,0,1,0,normal,healthy 263 | 60,male,type4,130,206,FALSE,type2,132,TRUE,2.4,2,2,reversable defect,not-healthy 264 | 58,male,type2,120,284,FALSE,type2,160,FALSE,1.8,2,0,normal,not-healthy 265 | 
49,male,type2,130,266,FALSE,type0,171,FALSE,0.6,1,0,normal,healthy 266 | 48,male,type2,110,229,FALSE,type0,168,FALSE,1,3,0,reversable defect,not-healthy 267 | 52,male,type3,172,199,TRUE,type0,162,FALSE,0.5,1,0,reversable defect,healthy 268 | 44,male,type2,120,263,FALSE,type0,173,FALSE,0,1,0,reversable defect,healthy 269 | 56,female,type2,140,294,FALSE,type2,153,FALSE,1.3,2,0,normal,healthy 270 | 57,male,type4,140,192,FALSE,type0,148,FALSE,0.4,2,0,fixed defect,healthy 271 | 67,male,type4,160,286,FALSE,type2,108,TRUE,1.5,2,3,normal,not-healthy 272 | -------------------------------------------------------------------------------- /src/main/resources/datasets/heart-desease/source.txt: -------------------------------------------------------------------------------- 1 | http://www.sgi.com/tech/mlc/db/heart.names 2 | http://www.sgi.com/tech/mlc/db/heart.all 3 | -------------------------------------------------------------------------------- /src/main/resources/datasets/labor-statistics/longley.csv: -------------------------------------------------------------------------------- 1 | "GNP.deflator","GNP","Unemployed","Armed.Forces","Population","Year","Employed" 2 | 83,234.289,235.6,159,107.608,1947,60.323 3 | 88.5,259.426,232.5,145.6,108.632,1948,61.122 4 | 88.2,258.054,368.2,161.6,109.773,1949,60.171 5 | 89.5,284.599,335.1,165,110.929,1950,61.187 6 | 96.2,328.975,209.9,309.9,112.075,1951,63.221 7 | 98.1,346.999,193.2,359.4,113.27,1952,63.639 8 | 99,365.385,187,354.7,115.094,1953,64.989 9 | 100,363.112,357.8,335,116.219,1954,63.761 10 | 101.2,397.469,290.4,304.8,117.388,1955,66.019 11 | 104.6,419.18,282.2,285.7,118.734,1956,67.857 12 | 108.4,442.769,293.6,279.8,120.445,1957,68.169 13 | 110.8,444.546,468.1,263.7,121.95,1958,66.513 14 | 112.6,482.704,381.3,255.2,123.366,1959,68.655 15 | 114.2,502.601,393.1,251.4,125.368,1960,69.564 16 | 115.7,518.173,480.6,257.2,127.852,1961,69.331 17 | 116.9,554.894,400.7,282.7,130.081,1962,70.551 18 | 
-------------------------------------------------------------------------------- /src/main/resources/datasets/labor-statistics/source.txt: -------------------------------------------------------------------------------- 1 | http://www.itl.nist.gov/div898/strd/lls/data/Longley.shtml 2 | -------------------------------------------------------------------------------- /src/main/resources/datasets/sentiment-analysis/rt-polarity.neg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumbox/datumbox-framework-examples/224908663718828f5769ffc9559cba8bca45ca37/src/main/resources/datasets/sentiment-analysis/rt-polarity.neg -------------------------------------------------------------------------------- /src/main/resources/datasets/sentiment-analysis/rt-polarity.pos: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumbox/datumbox-framework-examples/224908663718828f5769ffc9559cba8bca45ca37/src/main/resources/datasets/sentiment-analysis/rt-polarity.pos -------------------------------------------------------------------------------- /src/main/resources/datasets/sentiment-analysis/source.txt: -------------------------------------------------------------------------------- 1 | http://www.cs.cornell.edu/people/pabo/movie-review-data/ -------------------------------------------------------------------------------- /src/main/resources/datumbox.concurrencyconfiguration.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013-2020 Vasilis Vryniotis 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Whether the concurrent execution of tasks is allowed (options: true/false): 18 | concurrencyConfiguration.parallelized=true 19 | 20 | # The maximum number of Threads that can be executed concurrently for each task: 21 | # - Use 0 to set it equal to the number of CPUs on the system. 22 | # - Use 1 to turn off concurrency (same as concurrencyConfiguration.parallelized=false). 23 | # - Any other positive value acts as a limit on the concurrency level, provided that concurrencyConfiguration.parallelized=true. 24 | concurrencyConfiguration.maxNumberOfThreadsPerTask=0 25 | -------------------------------------------------------------------------------- /src/main/resources/datumbox.configuration.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013-2020 Vasilis Vryniotis 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # The full package name of the Storage Engine.
This determines the default storage engine which is used for storing the models: 18 | configuration.storageConfiguration=com.datumbox.framework.storage.inmemory.InMemoryConfiguration 19 | -------------------------------------------------------------------------------- /src/main/resources/datumbox.inmemoryconfiguration.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013-2020 Vasilis Vryniotis 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # The relative or absolute path for the directory where the models are stored (if not specified the temporary directory is used): 18 | inMemoryConfiguration.directory= 19 | -------------------------------------------------------------------------------- /src/main/resources/datumbox.mapdbconfiguration.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013-2020 Vasilis Vryniotis 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # The relative or absolute path for the directory where the models are stored (if not specified the temporary directory is used): 18 | mapDBConfiguration.directory= 19 | 20 | # The number of records kept in each LRU cache. Setting it to 0 will disable caching (not recommended): 21 | mapDBConfiguration.cacheSize=10000 22 | 23 | # Whether compression will be used in storage (options: true/false): 24 | mapDBConfiguration.compressed=true 25 | 26 | # The hybridized mode enables small and important data to be stored directly In-Memory (options: true/false): 27 | mapDBConfiguration.hybridized=true 28 | 29 | # Whether the writes will be performed asynchronously (options: true/false): 30 | mapDBConfiguration.asynchronous=true 31 | -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{5} - %msg%n 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | --------------------------------------------------------------------------------