├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
    └── main
        ├── java
        │   └── com
        │       └── datumbox
        │           └── examples
        │               ├── Classification.java
        │               ├── Clustering.java
        │               ├── DataModeling.java
        │               ├── Regression.java
        │               └── TextClassification.java
        └── resources
            ├── datasets
            │   ├── diabetes
            │   │   ├── diabetes.tsv.gz
            │   │   └── source.txt
            │   ├── heart-desease
            │   │   ├── heart.csv
            │   │   └── source.txt
            │   ├── labor-statistics
            │   │   ├── longley.csv
            │   │   └── source.txt
            │   └── sentiment-analysis
            │       ├── rt-polarity.neg
            │       ├── rt-polarity.pos
            │       └── source.txt
            ├── datumbox.concurrencyconfiguration.properties
            ├── datumbox.configuration.properties
            ├── datumbox.inmemoryconfiguration.properties
            ├── datumbox.mapdbconfiguration.properties
            └── logback.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.jar
3 | *.war
4 | *.ear
5 | *.iml
6 |
7 | target/
8 | /.settings/
9 | /.idea/
10 | .classpath
11 | .project
12 | nbactions.xml
13 | nb-configuration.xml
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright 2013 Vasilis Vryniotis
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Code Examples for Datumbox Machine Learning Framework
2 | =====================================================
3 |
4 | [Datumbox](http://www.datumbox.com/)
5 |
6 | This project provides examples on how to use the [Datumbox Machine Learning Framework](https://github.com/datumbox/datumbox-framework/) v0.8.3-SNAPSHOT (Build 20201014).
7 |
8 | Copyright & License
9 | -------------------
10 |
11 | Copyright (c) 2013-2020 [Vasilis Vryniotis](http://blog.datumbox.com/author/bbriniotis/).
12 |
13 | The code is licensed under the [Apache License, Version 2.0](./LICENSE).
14 |
15 | How to use
16 | ----------
17 |
18 | The code uses Maven Project Structure and contains the following code examples:
19 |
20 | - [Classification.java](./src/main/java/com/datumbox/examples/Classification.java): Contains an example on how to perform Classification.
21 | - [Clustering.java](./src/main/java/com/datumbox/examples/Clustering.java): It is an example that runs Cluster Analysis.
22 | - [Regression.java](./src/main/java/com/datumbox/examples/Regression.java): Shows how to run Regression Analysis.
23 | - [DataModeling.java](./src/main/java/com/datumbox/examples/DataModeling.java): Explains how to use the convenience Modeler class.
24 | - [TextClassification.java](./src/main/java/com/datumbox/examples/TextClassification.java): Uses the convenience TextClassifier class.
25 |
26 | All of the above files contain a main() method. To use them, just clone the project into your workspace and run any of the above files (see the example commands below).
27 |
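For example, assuming Maven and a JDK (11+) are installed, one way to compile and launch an example from the command line is via the `exec-maven-plugin` (it is not declared in the pom, so Maven has to resolve the `exec` prefix itself); `Classification` is used here purely for illustration:

    mvn clean compile
    mvn exec:java -Dexec.mainClass="com.datumbox.examples.Classification"

Alternatively, import the project into your IDE as a Maven project and run the main() methods directly.
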
28 | The project also contains 5 configuration files in the resources folder (see the sketch below the list):
29 |
30 | - [datumbox.configuration.properties](./src/main/resources/datumbox.configuration.properties): It defines the default storage engine (required).
31 | - [datumbox.concurrencyconfiguration.properties](./src/main/resources/datumbox.concurrencyconfiguration.properties): It controls the concurrency levels (required).
32 | - [datumbox.inmemoryconfiguration.properties](./src/main/resources/datumbox.inmemoryconfiguration.properties): It contains the configurations for the InMemory storage engine (required).
33 | - [datumbox.mapdbconfiguration.properties](./src/main/resources/datumbox.mapdbconfiguration.properties): It contains the configurations for the MapDB storage engine (optional).
34 | - [logback.xml](./src/main/resources/logback.xml): It contains the configuration file for the logger (optional).
35 |
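These properties files are read from the classpath when the framework configuration object is built. The snippet below is a minimal sketch of that initialization, mirroring the code at the top of every example class; the `ConfigurationDemo` wrapper class is hypothetical and only exists for illustration:

    import com.datumbox.framework.common.Configuration;

    public class ConfigurationDemo {
        public static void main(String[] args) {
            //Reads datumbox.configuration.properties and the other required properties
            //files from the classpath and builds the default configuration object.
            Configuration configuration = Configuration.getConfiguration();

            //The storage engine and the concurrency settings can also be overridden
            //programmatically, as the commented-out setStorageConfiguration(...) and
            //getConcurrencyConfiguration() calls in the example classes show.
            System.out.println("Configuration loaded: " + configuration);
        }
    }
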
36 | Finally, the resources folder includes several [real world datasets](./src/main/resources/datasets/) which are used for testing.
37 |
38 | Useful Links
39 | ------------
40 |
41 | - [Datumbox Machine Learning Framework](https://github.com/datumbox/datumbox-framework/)
42 | - [Datumbox Zoo: Pre-trained models](https://github.com/datumbox/datumbox-framework-zoo/)
43 | - [Datumbox.com](http://www.datumbox.com/)
44 | - [Machine Learning Blog](http://blog.datumbox.com/)
45 |
46 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |     <groupId>com.datumbox</groupId>
7 |     <artifactId>datumbox-framework-examples</artifactId>
8 |     <version>0.8.3-SNAPSHOT</version>
9 |     <packaging>jar</packaging>
10 | 
11 |     <name>Code Examples for Datumbox Machine Learning Framework</name>
12 |     <description>Code examples on how to use the Datumbox Machine Learning Framework.</description>
13 |     <url>https://github.com/datumbox/datumbox-framework-examples/</url>
14 |     <organization>
15 |         <name>Datumbox</name>
16 |         <url>http://www.datumbox.com/</url>
17 |     </organization>
18 |     <inceptionYear>2013</inceptionYear>
19 | 
20 |     <developers>
21 |         <developer>
22 |             <id>bbriniotis</id>
23 |             <name>Vasilis Vryniotis</name>
24 |             <email>bbriniotis@datumbox.com</email>
25 |             <url>http://blog.datumbox.com/author/bbriniotis/</url>
26 |             <roles>
27 |                 <role>Project Lead Developer</role>
28 |             </roles>
29 |         </developer>
30 |     </developers>
31 | 
32 |     <contributors>
33 |         <contributor>
34 |             <name>Eleftherios Bampaletakis</name>
35 |             <email>lmpampaletakis@gmail.com</email>
36 |             <url>http://gr.linkedin.com/pub/eleftherios-bampaletakis/39/875/551</url>
37 |             <roles>
38 |                 <role>Java Consultant</role>
39 |             </roles>
40 |         </contributor>
41 |     </contributors>
42 | 
43 |     <licenses>
44 |         <license>
45 |             <name>Apache License, Version 2.0</name>
46 |             <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
47 |             <distribution>repo</distribution>
48 |             <comments>A business-friendly OSS license</comments>
49 |         </license>
50 |     </licenses>
51 | 
52 |     <issueManagement>
53 |         <url>https://github.com/datumbox/datumbox-framework-examples/issues/</url>
54 |         <system>GitHub Issues</system>
55 |     </issueManagement>
56 | 
57 |     <scm>
58 |         <url>scm:git:git@github.com:datumbox/datumbox-framework-examples.git</url>
59 |         <connection>scm:git:git@github.com:datumbox/datumbox-framework-examples.git</connection>
60 |         <developerConnection>scm:git:git@github.com:datumbox/datumbox-framework-examples.git</developerConnection>
61 |     </scm>
62 | 
63 |     <properties>
64 |         <datumbox-framework-lib-version>0.8.3-SNAPSHOT</datumbox-framework-lib-version>
65 |         <logback-classic-version>1.3.12</logback-classic-version>
66 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
67 |         <maven.compiler.source>11</maven.compiler.source>
68 |         <maven.compiler.target>11</maven.compiler.target>
69 |     </properties>
70 | 
71 |     <dependencies>
72 |         <dependency>
73 |             <groupId>com.datumbox</groupId>
74 |             <artifactId>datumbox-framework-lib</artifactId>
75 |             <version>${datumbox-framework-lib-version}</version>
76 |         </dependency>
77 |         <dependency>
78 |             <groupId>ch.qos.logback</groupId>
79 |             <artifactId>logback-classic</artifactId>
80 |             <version>${logback-classic-version}</version>
81 |         </dependency>
82 |     </dependencies>
83 | </project>
--------------------------------------------------------------------------------
/src/main/java/com/datumbox/examples/Classification.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2013-2020 Vasilis Vryniotis
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.datumbox.examples;
17 |
18 | import com.datumbox.framework.common.Configuration;
19 | import com.datumbox.framework.core.common.dataobjects.Dataframe;
20 | import com.datumbox.framework.core.common.dataobjects.Record;
21 | import com.datumbox.framework.common.dataobjects.TypeInference;
22 | import com.datumbox.framework.common.utilities.RandomGenerator;
23 | import com.datumbox.framework.core.machinelearning.MLBuilder;
24 | import com.datumbox.framework.core.machinelearning.classification.SoftMaxRegression;
25 | import com.datumbox.framework.core.machinelearning.featureselection.PCA;
26 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
27 | import com.datumbox.framework.core.machinelearning.modelselection.splitters.ShuffleSplitter;
28 | import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
29 |
30 | import java.io.*;
31 | import java.net.URISyntaxException;
32 | import java.nio.file.Paths;
33 | import java.util.LinkedHashMap;
34 | import java.util.Map;
35 | import java.util.zip.GZIPInputStream;
36 |
37 | /**
38 | * Classification example.
39 | *
40 | * @author Vasilis Vryniotis
41 | */
42 | public class Classification {
43 |
44 | /**
45 | * Example of how to use the algorithms of the framework directly in order to
46 | * perform classification. A similar approach can be used to perform clustering,
47 | * regression, build recommender systems or perform topic modeling and dimensionality
48 | * reduction.
49 | *
50 | * @param args the command line arguments
51 | */
52 | public static void main(String[] args) {
53 | /**
54 | * There are 5 configuration files in the resources folder:
55 | *
56 | * - datumbox.configuration.properties: It defines the default storage engine (required)
57 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required)
58 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required)
59 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional)
60 | * - logback.xml: It contains the configuration file for the logger (optional)
61 | */
62 |
63 | //Initialization
64 | //--------------
65 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects
66 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file
67 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default)
68 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine
69 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization
70 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level
71 |
72 |
73 |
74 |
75 | //Reading Data
76 | //------------
77 | Dataframe data;
78 | try (Reader fileReader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(Paths.get(Classification.class.getClassLoader().getResource("datasets/diabetes/diabetes.tsv.gz").toURI()).toFile())), "UTF-8"))) {
79 | LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
80 | headerDataTypes.put("pregnancies", TypeInference.DataType.NUMERICAL);
81 | headerDataTypes.put("plasma glucose", TypeInference.DataType.NUMERICAL);
82 | headerDataTypes.put("blood pressure", TypeInference.DataType.NUMERICAL);
83 | headerDataTypes.put("triceps thickness", TypeInference.DataType.NUMERICAL);
84 | headerDataTypes.put("serum insulin", TypeInference.DataType.NUMERICAL);
85 | headerDataTypes.put("bmi", TypeInference.DataType.NUMERICAL);
86 | headerDataTypes.put("dpf", TypeInference.DataType.NUMERICAL);
87 | headerDataTypes.put("age", TypeInference.DataType.NUMERICAL);
88 | headerDataTypes.put("test result", TypeInference.DataType.CATEGORICAL);
89 |
90 | data = Dataframe.Builder.parseCSVFile(fileReader, "test result", headerDataTypes, '\t', '"', "\r\n", null, null, configuration);
91 | }
92 | catch(UncheckedIOException | IOException | URISyntaxException ex) {
93 | throw new RuntimeException(ex);
94 | }
95 |
96 | //Split into train and test datasets
97 | ShuffleSplitter.Split split = new ShuffleSplitter(0.8, 1).split(data).next();
98 | Dataframe trainingDataframe = split.getTrain();
99 | Dataframe testingDataframe = split.getTest();
100 |
101 |
102 | //Transform Dataframe
103 | //-----------------
104 |
105 | //Scale continuous variables
106 | MinMaxScaler.TrainingParameters nsParams = new MinMaxScaler.TrainingParameters();
107 | MinMaxScaler numericalScaler = MLBuilder.create(nsParams, configuration);
108 |
109 | numericalScaler.fit_transform(trainingDataframe);
110 | numericalScaler.save("Diabetes");
111 |
112 |
113 |
114 | //Feature Selection
115 | //-----------------
116 |
117 | //Perform dimensionality reduction using PCA
118 |
119 | PCA.TrainingParameters featureSelectionParameters = new PCA.TrainingParameters();
120 | featureSelectionParameters.setMaxDimensions(trainingDataframe.xColumnSize()-1); //remove one dimension
121 | featureSelectionParameters.setWhitened(false);
122 | featureSelectionParameters.setVariancePercentageThreshold(0.99999995);
123 |
124 | PCA featureSelection = MLBuilder.create(featureSelectionParameters, configuration);
125 | featureSelection.fit_transform(trainingDataframe);
126 | featureSelection.save("Diabetes");
127 |
128 |
129 |
130 | //Fit the classifier
131 | //------------------
132 |
133 | SoftMaxRegression.TrainingParameters param = new SoftMaxRegression.TrainingParameters();
134 | param.setTotalIterations(200);
135 | param.setLearningRate(0.1);
136 |
137 | SoftMaxRegression classifier = MLBuilder.create(param, configuration);
138 | classifier.fit(trainingDataframe);
139 | classifier.save("Diabetes");
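//Note (sketch): because the model is persisted with save(), a later run could reload it with
//MLBuilder.load() instead of retraining, mirroring the usage shown in Regression.java:
//SoftMaxRegression loadedClassifier = MLBuilder.load(SoftMaxRegression.class, "Diabetes", configuration);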
140 |
141 |
142 | //Use the classifier
143 | //------------------
144 |
145 | //Apply the same numerical scaling on testingDataframe
146 | numericalScaler.transform(testingDataframe);
147 |
148 | //Apply the same featureSelection transformations on testingDataframe
149 | featureSelection.transform(testingDataframe);
150 |
151 | //Use the classifier to make predictions on the testingDataframe
152 | classifier.predict(testingDataframe);
153 |
154 | //Get validation metrics on the test set
155 | ClassificationMetrics vm = new ClassificationMetrics(testingDataframe);
156 |
157 | System.out.println("Results:");
158 | for(Map.Entry<Integer, Record> entry: testingDataframe.entries()) {
159 | Integer rId = entry.getKey();
160 | Record r = entry.getValue();
161 | System.out.println("Record "+rId+" - Real Y: "+r.getY()+", Predicted Y: "+r.getYPredicted());
162 | }
163 |
164 | System.out.println("Classifier Accuracy: "+vm.getAccuracy());
165 |
166 |
167 |
168 | //Clean up
169 | //--------
170 |
171 | //Delete scaler, featureselector and classifier.
172 | numericalScaler.delete();
173 | featureSelection.delete();
174 | classifier.delete();
175 |
176 | //Close Dataframes.
177 | trainingDataframe.close();
178 | testingDataframe.close();
179 | }
180 |
181 | }
182 |
--------------------------------------------------------------------------------
/src/main/java/com/datumbox/examples/Clustering.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2013-2020 Vasilis Vryniotis
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.datumbox.examples;
17 |
18 | import com.datumbox.framework.common.Configuration;
19 | import com.datumbox.framework.core.common.dataobjects.Dataframe;
20 | import com.datumbox.framework.core.common.dataobjects.Record;
21 | import com.datumbox.framework.common.dataobjects.TypeInference;
22 | import com.datumbox.framework.common.utilities.RandomGenerator;
23 | import com.datumbox.framework.core.machinelearning.MLBuilder;
24 | import com.datumbox.framework.core.machinelearning.clustering.Kmeans;
25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClusteringMetrics;
26 | import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
27 | import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
28 |
29 | import java.io.*;
30 | import java.net.URISyntaxException;
31 | import java.nio.file.Paths;
32 | import java.util.LinkedHashMap;
33 | import java.util.Map;
34 |
35 | /**
36 | * Clustering example.
37 | *
38 | * @author Vasilis Vryniotis
39 | */
40 | public class Clustering {
41 |
42 | /**
43 | * Example of how to use the algorithms of the framework directly in order to
44 | * perform clustering. A similar approach can be used to perform classification,
45 | * regression, build recommender systems or perform topic modeling and dimensionality
46 | * reduction.
47 | *
48 | * @param args the command line arguments
49 | */
50 | public static void main(String[] args) {
51 | /**
52 | * There are 5 configuration files in the resources folder:
53 | *
54 | * - datumbox.configuration.properties: It defines the default storage engine (required)
55 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required)
56 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required)
57 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional)
58 | * - logback.xml: It contains the configuration file for the logger (optional)
59 | */
60 |
61 | //Initialization
62 | //--------------
63 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects
64 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file
65 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default)
66 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine
67 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization
68 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level
69 |
70 |
71 |
72 | //Reading Data
73 | //------------
74 | Dataframe trainingDataframe;
75 | try (Reader fileReader = new InputStreamReader(new FileInputStream(Paths.get(Clustering.class.getClassLoader().getResource("datasets/heart-desease/heart.csv").toURI()).toFile()), "UTF-8")) {
76 | LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
77 | headerDataTypes.put("Age", TypeInference.DataType.NUMERICAL);
78 | headerDataTypes.put("Sex", TypeInference.DataType.CATEGORICAL);
79 | headerDataTypes.put("ChestPain", TypeInference.DataType.CATEGORICAL);
80 | headerDataTypes.put("RestBP", TypeInference.DataType.NUMERICAL);
81 | headerDataTypes.put("Cholesterol", TypeInference.DataType.NUMERICAL);
82 | headerDataTypes.put("BloodSugar", TypeInference.DataType.BOOLEAN);
83 | headerDataTypes.put("ECG", TypeInference.DataType.CATEGORICAL);
84 | headerDataTypes.put("MaxHeartRate", TypeInference.DataType.NUMERICAL);
85 | headerDataTypes.put("Angina", TypeInference.DataType.BOOLEAN);
86 | headerDataTypes.put("OldPeak", TypeInference.DataType.NUMERICAL);
87 | headerDataTypes.put("STSlope", TypeInference.DataType.ORDINAL);
88 | headerDataTypes.put("Vessels", TypeInference.DataType.NUMERICAL);
89 | headerDataTypes.put("Thal", TypeInference.DataType.CATEGORICAL);
90 | headerDataTypes.put("Class", TypeInference.DataType.CATEGORICAL);
91 |
92 | trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Class", headerDataTypes, ',', '"', "\r\n", null, null, configuration);
93 | }
94 | catch(UncheckedIOException | IOException | URISyntaxException ex) {
95 | throw new RuntimeException(ex);
96 | }
97 |
98 | //Store data and load them back
99 | trainingDataframe.save("HeartDeseaseDataset");
100 | Dataframe testingDataframe = Dataframe.Builder.load("HeartDeseaseDataset", configuration);
101 |
102 |
103 | //Transform Dataframe
104 | //-----------------
105 |
106 | //Convert Categorical variables to dummy variables (boolean) and scale continuous variables
107 | MinMaxScaler.TrainingParameters nsParams = new MinMaxScaler.TrainingParameters();
108 | MinMaxScaler numericalScaler = MLBuilder.create(nsParams, configuration);
109 |
110 | numericalScaler.fit_transform(trainingDataframe);
111 | numericalScaler.save("HeartDesease");
112 |
113 | OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
114 | OneHotEncoder categoricalEncoder = MLBuilder.create(ceParams, configuration);
115 |
116 | categoricalEncoder.fit_transform(trainingDataframe);
117 | categoricalEncoder.save("HeartDesease");
118 |
119 |
120 |
121 | //Fit the clusterer
122 | //-----------------
123 |
124 | Kmeans.TrainingParameters param = new Kmeans.TrainingParameters();
125 | param.setK(2);
126 | param.setMaxIterations(200);
127 | param.setInitializationMethod(Kmeans.TrainingParameters.Initialization.FORGY);
128 | param.setDistanceMethod(Kmeans.TrainingParameters.Distance.EUCLIDIAN);
129 | param.setWeighted(false);
130 | param.setCategoricalGamaMultiplier(1.0);
131 | param.setSubsetFurthestFirstcValue(2.0);
132 |
133 | Kmeans clusterer = MLBuilder.create(param, configuration);
134 | clusterer.fit(trainingDataframe);
135 | clusterer.save("HeartDesease");
136 |
137 |
138 | //Use the clusterer
139 | //-----------------
140 |
141 | //Apply the same scaling and encoding on testingDataframe
142 | numericalScaler.transform(testingDataframe);
143 | categoricalEncoder.transform(testingDataframe);
144 |
145 | //Make predictions on the test set
146 | clusterer.predict(testingDataframe);
147 |
148 | //Get validation metrics on the test set
149 | ClusteringMetrics vm = new ClusteringMetrics(testingDataframe);
150 |
151 | System.out.println("Results:");
152 | for(Map.Entry<Integer, Record> entry: testingDataframe.entries()) {
153 | Integer rId = entry.getKey();
154 | Record r = entry.getValue();
155 | System.out.println("Record "+rId+" - Original Y: "+r.getY()+", Predicted Cluster Id: "+r.getYPredicted());
156 | }
157 |
158 | System.out.println("Clusterer Purity: "+vm.getPurity());
159 |
160 |
161 |
162 | //Clean up
163 | //--------
164 |
165 | //Delete scaler, encoder, clusterer.
166 | numericalScaler.delete();
167 | categoricalEncoder.delete();
168 | clusterer.delete();
169 |
170 | //Delete the train and close the test Dataframe.
171 | trainingDataframe.delete();
172 | testingDataframe.close();
173 | }
174 |
175 | }
176 |
--------------------------------------------------------------------------------
/src/main/java/com/datumbox/examples/DataModeling.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2013-2020 Vasilis Vryniotis
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.datumbox.examples;
17 |
18 | import com.datumbox.framework.applications.datamodeling.Modeler;
19 | import com.datumbox.framework.common.Configuration;
20 | import com.datumbox.framework.core.common.dataobjects.Dataframe;
21 | import com.datumbox.framework.core.common.dataobjects.Record;
22 | import com.datumbox.framework.common.dataobjects.TypeInference;
23 | import com.datumbox.framework.common.utilities.RandomGenerator;
24 | import com.datumbox.framework.core.machinelearning.MLBuilder;
25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.LinearRegressionMetrics;
26 | import com.datumbox.framework.core.machinelearning.preprocessing.OneHotEncoder;
27 | import com.datumbox.framework.core.machinelearning.preprocessing.MinMaxScaler;
28 | import com.datumbox.framework.core.machinelearning.regression.NLMS;
29 |
30 | import java.io.*;
31 | import java.net.URISyntaxException;
32 | import java.nio.file.Paths;
33 | import java.util.Arrays;
34 | import java.util.LinkedHashMap;
35 | import java.util.Map;
36 |
37 | /**
38 | * DataModeling example.
39 | *
40 | * @author Vasilis Vryniotis
41 | */
42 | public class DataModeling {
43 |
44 | /**
45 | * Example of how to use the Modeler class.
46 | *
47 | * @param args the command line arguments
48 | */
49 | public static void main(String[] args) {
50 | /**
51 | * There are 5 configuration files in the resources folder:
52 | *
53 | * - datumbox.configuration.properties: It defines the default storage engine (required)
54 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required)
55 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required)
56 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional)
57 | * - logback.xml: It contains the configuration file for the logger (optional)
58 | */
59 |
60 | //Initialization
61 | //--------------
62 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects
63 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file
64 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default)
65 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine
66 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization
67 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level
68 |
69 |
70 |
71 | //Reading Data
72 | //------------
73 | Dataframe trainingDataframe;
74 | try (Reader fileReader = new InputStreamReader(new FileInputStream(Paths.get(Clustering.class.getClassLoader().getResource("datasets/labor-statistics/longley.csv").toURI()).toFile()), "UTF-8")) {
75 | LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
76 | headerDataTypes.put("Employed", TypeInference.DataType.NUMERICAL);
77 | headerDataTypes.put("GNP.deflator", TypeInference.DataType.NUMERICAL);
78 | headerDataTypes.put("GNP", TypeInference.DataType.NUMERICAL);
79 | headerDataTypes.put("Unemployed", TypeInference.DataType.NUMERICAL);
80 | headerDataTypes.put("Armed.Forces", TypeInference.DataType.NUMERICAL);
81 | headerDataTypes.put("Population", TypeInference.DataType.NUMERICAL);
82 | headerDataTypes.put("Year", TypeInference.DataType.NUMERICAL);
83 |
84 | trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Employed", headerDataTypes, ',', '"', "\r\n", null, null, configuration);
85 | }
86 | catch(UncheckedIOException | IOException | URISyntaxException ex) {
87 | throw new RuntimeException(ex);
88 | }
89 | Dataframe testingDataframe = trainingDataframe.copy();
90 |
91 |
92 |
93 | //Setup Training Parameters
94 | //-------------------------
95 | Modeler.TrainingParameters trainingParameters = new Modeler.TrainingParameters();
96 |
97 | //numerical scaling configuration
98 | MinMaxScaler.TrainingParameters nsParams = new MinMaxScaler.TrainingParameters();
99 | trainingParameters.setNumericalScalerTrainingParameters(nsParams);
100 |
101 | //categorical encoding configuration
102 | OneHotEncoder.TrainingParameters ceParams = new OneHotEncoder.TrainingParameters();
103 | trainingParameters.setCategoricalEncoderTrainingParameters(ceParams);
104 |
105 | //Set the feature selection configuration (an empty list means no feature selection is performed)
106 | trainingParameters.setFeatureSelectorTrainingParametersList(Arrays.asList());
107 |
108 | //Model Configuration
109 | trainingParameters.setModelerTrainingParameters(new NLMS.TrainingParameters());
110 |
111 |
112 |
113 | //Fit the modeler
114 | //---------------
115 | Modeler modeler = MLBuilder.create(trainingParameters, configuration);
116 | modeler.fit(trainingDataframe);
117 | modeler.save("LaborStatistics");
118 |
119 |
120 | //Use the modeler
121 | //---------------
122 |
123 | //Make predictions on the test set
124 | modeler.predict(testingDataframe);
125 |
126 | LinearRegressionMetrics vm = new LinearRegressionMetrics(testingDataframe);
127 |
128 | System.out.println("Test Results:");
129 | for(Map.Entry<Integer, Record> entry: testingDataframe.entries()) {
130 | Integer rId = entry.getKey();
131 | Record r = entry.getValue();
132 | System.out.println("Record "+rId+" - Real Y: "+r.getY()+", Predicted Y: "+r.getYPredicted());
133 | }
134 |
135 | System.out.println("Model Rsquare: "+vm.getRSquare());
136 |
137 |
138 |
139 | //Clean up
140 | //--------
141 |
142 | //Delete the modeler. This removes all files.
143 | modeler.delete();
144 |
145 | //Close Dataframes.
146 | trainingDataframe.close();
147 | testingDataframe.close();
148 | }
149 |
150 | }
151 |
--------------------------------------------------------------------------------
/src/main/java/com/datumbox/examples/Regression.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2013-2020 Vasilis Vryniotis
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.datumbox.examples;
17 |
18 | import com.datumbox.framework.common.Configuration;
19 | import com.datumbox.framework.core.common.dataobjects.Dataframe;
20 | import com.datumbox.framework.core.common.dataobjects.Record;
21 | import com.datumbox.framework.common.dataobjects.TypeInference;
22 | import com.datumbox.framework.common.utilities.RandomGenerator;
23 | import com.datumbox.framework.core.machinelearning.MLBuilder;
24 | import com.datumbox.framework.core.machinelearning.featureselection.PCA;
25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.LinearRegressionMetrics;
26 | import com.datumbox.framework.core.machinelearning.preprocessing.StandardScaler;
27 | import com.datumbox.framework.core.machinelearning.regression.MatrixLinearRegression;
28 |
29 | import java.io.*;
30 | import java.net.URISyntaxException;
31 | import java.nio.file.Paths;
32 | import java.util.LinkedHashMap;
33 | import java.util.Map;
34 |
35 | /**
36 | * Regression example.
37 | *
38 | * @author Vasilis Vryniotis
39 | */
40 | public class Regression {
41 |
42 | /**
43 | * Example of how to use the algorithms of the framework directly in order to
44 | * perform regression. A similar approach can be used to perform clustering,
45 | * classification, build recommender systems or perform topic modeling and dimensionality
46 | * reduction.
47 | *
48 | * @param args the command line arguments
49 | */
50 | public static void main(String[] args) {
51 | /**
52 | * There are 5 configuration files in the resources folder:
53 | *
54 | * - datumbox.configuration.properties: It defines the default storage engine (required)
55 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required)
56 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required)
57 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional)
58 | * - logback.xml: It contains the configuration file for the logger (optional)
59 | */
60 |
61 | //Initialization
62 | //--------------
63 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects
64 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file
65 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default)
66 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine
67 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization
68 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level
69 |
70 |
71 |
72 | //Reading Data
73 | //------------
74 | Dataframe trainingDataframe;
75 | try (Reader fileReader = new InputStreamReader(new FileInputStream(Paths.get(Clustering.class.getClassLoader().getResource("datasets/labor-statistics/longley.csv").toURI()).toFile()), "UTF-8")) {
76 | LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
77 | headerDataTypes.put("Employed", TypeInference.DataType.NUMERICAL);
78 | headerDataTypes.put("GNP.deflator", TypeInference.DataType.NUMERICAL);
79 | headerDataTypes.put("GNP", TypeInference.DataType.NUMERICAL);
80 | headerDataTypes.put("Unemployed", TypeInference.DataType.NUMERICAL);
81 | headerDataTypes.put("Armed.Forces", TypeInference.DataType.NUMERICAL);
82 | headerDataTypes.put("Population", TypeInference.DataType.NUMERICAL);
83 | headerDataTypes.put("Year", TypeInference.DataType.NUMERICAL);
84 |
85 | trainingDataframe = Dataframe.Builder.parseCSVFile(fileReader, "Employed", headerDataTypes, ',', '"', "\r\n", null, null, configuration);
86 | }
87 | catch(UncheckedIOException | IOException | URISyntaxException ex) {
88 | throw new RuntimeException(ex);
89 | }
90 | Dataframe testingDataframe = trainingDataframe.copy();
91 |
92 |
93 | //Transform Dataframe
94 | //-----------------
95 |
96 | //Scale continuous variables
97 | StandardScaler.TrainingParameters nsParams = new StandardScaler.TrainingParameters();
98 | nsParams.setScaleResponse(true);
99 | StandardScaler numericalScaler = MLBuilder.create(nsParams, configuration);
100 |
101 | numericalScaler.fit_transform(trainingDataframe);
102 | numericalScaler.save("LaborStatistics");
103 |
104 |
105 |
106 | //Feature Selection
107 | //-----------------
108 |
109 | //Perform dimensionality reduction using PCA
110 |
111 | PCA.TrainingParameters featureSelectionParameters = new PCA.TrainingParameters();
112 | featureSelectionParameters.setMaxDimensions(trainingDataframe.xColumnSize()-1); //remove one dimension
113 | featureSelectionParameters.setWhitened(false);
114 | featureSelectionParameters.setVariancePercentageThreshold(0.99999995);
115 |
116 | PCA featureSelection = MLBuilder.create(featureSelectionParameters, configuration);
117 | featureSelection.fit_transform(trainingDataframe);
118 | featureSelection.save("LaborStatistics");
119 |
120 |
121 |
122 | //Fit the regressor
123 | //-----------------
124 |
125 | MatrixLinearRegression.TrainingParameters param = new MatrixLinearRegression.TrainingParameters();
126 |
127 | MatrixLinearRegression regressor = MLBuilder.create(param, configuration);
128 | regressor.fit(trainingDataframe);
129 | regressor.save("LaborStatistics");
130 | regressor.close(); //close the regressor; we will load it again later
131 |
132 |
133 |
134 | //Use the regressor
135 | //------------------
136 |
137 | //Apply the same numerical scaling on testingDataframe
138 | numericalScaler.transform(testingDataframe);
139 |
140 | //Apply the same featureSelection transformations on testingDataframe
141 | featureSelection.transform(testingDataframe);
142 |
143 | //Load again the regressor
144 | regressor = MLBuilder.load(MatrixLinearRegression.class, "LaborStatistics", configuration);
145 | regressor.predict(testingDataframe);
146 |
147 | //Get validation metrics on the test set
148 | LinearRegressionMetrics vm = new LinearRegressionMetrics(testingDataframe);
149 |
150 | System.out.println("Results:");
151 | for(Map.Entry<Integer, Record> entry: testingDataframe.entries()) {
152 | Integer rId = entry.getKey();
153 | Record r = entry.getValue();
154 | System.out.println("Record "+rId+" - Real Y: "+r.getY()+", Predicted Y: "+r.getYPredicted());
155 | }
156 |
157 | System.out.println("Regressor Rsquare: "+vm.getRSquare());
158 |
159 |
160 |
161 | //Clean up
162 | //--------
163 |
164 | //Delete scaler, featureselector and regressor.
165 | numericalScaler.delete();
166 | featureSelection.delete();
167 | regressor.delete();
168 |
169 | //Close Dataframes.
170 | trainingDataframe.close();
171 | testingDataframe.close();
172 | }
173 |
174 | }
175 |
--------------------------------------------------------------------------------
/src/main/java/com/datumbox/examples/TextClassification.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2013-2020 Vasilis Vryniotis
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.datumbox.examples;
17 |
18 | import com.datumbox.framework.applications.nlp.TextClassifier;
19 | import com.datumbox.framework.common.Configuration;
20 | import com.datumbox.framework.core.common.dataobjects.Record;
21 | import com.datumbox.framework.common.utilities.RandomGenerator;
22 | import com.datumbox.framework.core.machinelearning.MLBuilder;
23 | import com.datumbox.framework.core.machinelearning.classification.MultinomialNaiveBayes;
24 | import com.datumbox.framework.core.machinelearning.featureselection.ChisquareSelect;
25 | import com.datumbox.framework.core.machinelearning.modelselection.metrics.ClassificationMetrics;
26 | import com.datumbox.framework.core.common.text.extractors.NgramsExtractor;
27 |
28 | import java.net.URI;
29 | import java.net.URISyntaxException;
30 | import java.util.Arrays;
31 | import java.util.HashMap;
32 | import java.util.Map;
33 |
34 |
35 | /**
36 | * Text Classification example.
37 | *
38 | * @author Vasilis Vryniotis
39 | */
40 | public class TextClassification {
41 |
42 | /**
43 | * Example of how to use the TextClassifier class.
44 | *
45 | * @param args the command line arguments
46 | * @throws java.net.URISyntaxException
47 | */
48 | public static void main(String[] args) throws URISyntaxException {
49 | /**
50 | * There are 5 configuration files in the resources folder:
51 | *
52 | * - datumbox.configuration.properties: It defines the default storage engine (required)
53 | * - datumbox.concurrencyconfiguration.properties: It controls the concurrency levels (required)
54 | * - datumbox.inmemoryconfiguration.properties: It contains the configurations for the InMemory storage engine (required)
55 | * - datumbox.mapdbconfiguration.properties: It contains the configurations for the MapDB storage engine (optional)
56 | * - logback.xml: It contains the configuration file for the logger (optional)
57 | */
58 |
59 | //Initialization
60 | //--------------
61 | RandomGenerator.setGlobalSeed(42L); //optionally set a specific seed for all Random objects
62 | Configuration configuration = Configuration.getConfiguration(); //default configuration based on properties file
63 | //configuration.setStorageConfiguration(new InMemoryConfiguration()); //use In-Memory engine (default)
64 | //configuration.setStorageConfiguration(new MapDBConfiguration()); //use MapDB engine
65 | //configuration.getConcurrencyConfiguration().setParallelized(true); //turn on/off the parallelization
66 | //configuration.getConcurrencyConfiguration().setMaxNumberOfThreadsPerTask(4); //set the concurrency level
67 |
68 |
69 |
70 | //Reading Data
71 | //------------
72 | Map