├── .github └── workflows │ └── ci_pipeline.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── analysis_options.yaml ├── autotest.sh ├── benchmark └── main.dart ├── example ├── black_friday │ ├── black_friday.csv │ └── black_friday.dart ├── dataset.csv └── main.dart ├── lib ├── ml_preprocessing.dart └── src │ ├── encoder │ ├── encoder.dart │ ├── encoder_impl.dart │ ├── encoder_type.dart │ ├── helpers │ │ ├── create_encoder_to_series_mapping.dart │ │ └── get_series_names_by_indices.dart │ ├── series_encoder │ │ ├── label_series_encoder.dart │ │ ├── one_hot_series_encoder.dart │ │ ├── series_encoder.dart │ │ ├── series_encoder_factory.dart │ │ └── series_encoder_factory_impl.dart │ ├── to_integer_labels.dart │ ├── to_one_hot_labels.dart │ └── unknown_value_handling_type.dart │ ├── normalizer │ ├── normalize.dart │ ├── normalizer.dart │ └── normalizer_impl.dart │ ├── pipeline │ ├── pipeable.dart │ ├── pipeline.dart │ └── pipeline_impl.dart │ └── standardizer │ ├── standardize.dart │ ├── standardizer.dart │ └── standardizer_impl.dart ├── pubspec.yaml └── test ├── encoder ├── encoder_impl_test.dart └── series_encoder │ ├── label_series_encoder_test.dart │ ├── one_hot_series_encoder_test.dart │ └── series_encoder_factory_impl.dart ├── helpers.dart ├── normalizer ├── normalize_test.dart └── normalizer_test.dart ├── pipeline ├── pipeline_integration_test.dart └── pipeline_test.dart └── standardizer ├── standardize_test.dart └── standardizer_test.dart /.github/workflows/ci_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: CI pipeline 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: dart-lang/setup-dart@v1 16 | 17 | - name: Print Dart SDK version 18 | run: dart --version 19 | 20 | - name: Install dependencies 21 | run: dart pub get 22 | 23 | - name: 
Verify formatting 24 | run: dart format --output=none --set-exit-if-changed . 25 | 26 | - name: Analyze project source 27 | run: dart analyze --fatal-infos 28 | 29 | - name: Run tests 30 | run: dart test 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | # See https://www.dartlang.org/guides/libraries/private-files 4 | 5 | # Files and directories created by pub 6 | .dart_tool/ 7 | .packages 8 | .pub/ 9 | build/ 10 | # If you're building an application, you may want to check-in your pubspec.lock 11 | pubspec.lock 12 | 13 | # Directory created by dartdoc 14 | # If you don't generate documentation locally you can remove this line. 15 | doc/api/ 16 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 7.0.2 4 | - `README.md`: 5 | - Fixed link to `black_friday` dataset 6 | 7 | ## 7.0.1 8 | - Added code formatting checking step to CI pipline 9 | - Corrected `README` examples 10 | - Added documentation to `Encoder` factory 11 | 12 | ## 7.0.0 13 | - `ml_datframe` 1.0.0 supported 14 | - `featureNames` parameter renamed to `columnNames` 15 | - `featureIds` parameter renamed to `columnIndices` 16 | - `encodeAsIntegerLabels` renamed to `toIntegerLabels` 17 | - `encodeAsOneHotLabels` renamed to `toOneHotLabels` 18 | 19 | ## 6.0.1 20 | - `pubspec.yaml`: `ml_dataframe` dependency updated 21 | 22 | ## 6.0.0 23 | - Null-safety added (stable release) 24 | 25 | ## 6.0.0-nullsafety.0 26 | - Null-safety added (beta release) 27 | 28 | ## 5.2.2 29 | - `ml_dataframe`: version 0.4.0 supported 30 | 31 | ## 5.2.1 32 | - `ml_dataframe`: version 0.3.0 supported 33 | - `CI`: github actions set up 34 | 35 | ## 5.2.0 36 | - `UnknownValueHandlingType` enum added to the lib's public API 37 | 38 | ## 5.1.2 
39 | - `ml_dataframe` 0.2.0 supported 40 | 41 | ## 5.1.1 42 | - `ml_dataframe` dependency updated 43 | 44 | ## 5.1.0 45 | - `Standardizer` entity added 46 | - `dtype` parameter added as an argument for `Pipeline.process` method 47 | 48 | ## 5.0.4 49 | - Default values for parameters `headerPrefix` and `headerPostfix` added where it applicable 50 | 51 | ## 5.0.3 52 | - `README` corrected (ml_dataframe version corrected) 53 | 54 | ## 5.0.2 55 | - `xrange` dependency removed 56 | - `ml_dataframe` 0.0.11 supported 57 | 58 | ## 5.0.1 59 | - `xrange` package version locked 60 | 61 | ## 5.0.0 62 | - `Encoder` interface changed: there is no more `encode` method, use `process` from `Pipeable` instead 63 | - `Normalizer` entity added 64 | - `normalize` operator added 65 | 66 | ## 4.0.0 67 | - `DataFrame` class split up into separate smaller entities 68 | - `DataFrame` class core moved to separate repository 69 | - `Pipeline` entity created 70 | - Categorical data encoders implemented `Pipeable` interface 71 | 72 | ## 3.4.0 73 | - `DataFrame`: `encodedColumnRanges` added 74 | 75 | ## 3.3.0 76 | - `ml_linalg` 10.0.0 supported 77 | 78 | ## 3.2.0 79 | - `ml_linalg` 9.0.0 supported 80 | 81 | ## 3.1.0 82 | - `Categorical data processing`: `encoders` parameter added to `DataFrame.fromCsv` constructor 83 | 84 | ## 3.0.0 85 | - `xrange` library supported: it's possible to provide `ZRange` object now instead of `tuple2` to specify a range of 86 | indices 87 | 88 | ## 2.0.0 89 | - `DataFrame` introduced 90 | 91 | ## 1.1.0 92 | - `Float32x4InterceptPreprocessor` added 93 | - `readme` updated 94 | 95 | ## 1.0.0 96 | - Package published 97 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, Ilya Gyrdymov 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://github.com/gyrdym/ml_preprocessing/workflows/CI%20pipeline/badge.svg)](https://github.com/gyrdym/ml_preprocessing/actions?query=branch%3Amaster+) 2 | [![Coverage Status](https://coveralls.io/repos/github/gyrdym/ml_preprocessing/badge.svg)](https://coveralls.io/github/gyrdym/ml_preprocessing) 3 | [![pub package](https://img.shields.io/pub/v/ml_preprocessing.svg)](https://pub.dartlang.org/packages/ml_preprocessing) 4 | [![Gitter Chat](https://badges.gitter.im/gyrdym/gyrdym.svg)](https://gitter.im/gyrdym/) 5 | 6 | # ml_preprocessing 7 | Data preprocessing algorithms 8 | 9 | ## What is data preprocessing? 10 | *Data preprocessing* is a set of techniques for data preparation before one can use the data in Machine Learning algorithms. 11 | 12 | ## Why is it needed? 13 | Let's say, you have a dataset: 14 | 15 | ```` 16 | ---------------------------------------------------------------------------------------- 17 | | Gender | Country | Height (cm) | Weight (kg) | Diabetes (1 - Positive, 0 - Negative) | 18 | ---------------------------------------------------------------------------------------- 19 | | Female | France | 165 | 55 | 1 | 20 | ---------------------------------------------------------------------------------------- 21 | | Female | Spain | 155 | 50 | 0 | 22 | ---------------------------------------------------------------------------------------- 23 | | Male | Spain | 175 | 75 | 0 | 24 | ---------------------------------------------------------------------------------------- 25 | | Male | Russia | 173 | 77 | N/A | 26 | ---------------------------------------------------------------------------------------- 27 | ```` 28 | 29 | Everything seems good for now. Say, you're about to train a classifier to predict if a person has diabetes. 
30 | But there is an obstacle - how can it be possible to use the data in mathematical equations with string-value columns 31 | (`Gender`, `Country`)? And things are getting even worse because of an empty (N/A) value in the `Diabetes` column. There 32 | should be a way to convert this data to a valid numerical representation. Here data preprocessing techniques come to play. 33 | You should decide, how to convert string data (aka *categorical data*) to numbers and how to treat empty values. Of 34 | course, you can come up with your unique algorithms to do all of these operations, but there are a lot of well-known 35 | techniques for doing all the conversions. 36 | 37 | The aim of the library is to give data scientists, who are interested in Dart programming language, these preprocessing 38 | techniques. 39 | 40 | ## Prerequisites 41 | 42 | The library depends on [DataFrame class](https://github.com/gyrdym/ml_dataframe/blob/master/lib/src/data_frame/data_frame.dart) 43 | from the [repo](https://github.com/gyrdym/ml_dataframe). It's necessary to use it as a dependency in your project, 44 | because you need to pack data into [DataFrame](https://github.com/gyrdym/ml_dataframe/blob/master/lib/src/data_frame/data_frame.dart) 45 | before doing preprocessing. An example with a part of pubspec.yaml: 46 | 47 | ```` 48 | dependencies: 49 | ... 50 | ml_dataframe: ^1.0.0 51 | ... 52 | ```` 53 | 54 | ## Usage examples 55 | 56 | ### Getting started 57 | 58 | Let's download some data from [Kaggle](https://www.kaggle.com) - let it be amazing [black friday](https://www.kaggle.com/datasets/sdolezel/black-friday) 59 | dataset. It's pretty interesting data with huge amount of observations (approx. 538000 rows) and a good number of 60 | categorical features. 
61 | 62 | First, import all necessary libraries: 63 | 64 | ````dart 65 | import 'package:ml_dataframe/ml_dataframe.dart'; 66 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 67 | ```` 68 | 69 | Then, we should read the csv and create a data frame: 70 | 71 | ````dart 72 | final dataFrame = await fromCsv('example/black_friday/black_friday.csv', 73 | columns: [2, 3, 5, 6, 7, 11]); 74 | ```` 75 | 76 | ### Categorical data 77 | 78 | After we get a dataframe, we may encode all the needed features. Let's analyze the dataset and decide, what features 79 | should be encoded. In our case these are: 80 | 81 | ````dart 82 | final featureNames = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status']; 83 | ```` 84 | 85 | ### One-hot encoding 86 | 87 | Let's fit the one-hot encoder. 88 | 89 | Why should we fit it? Categorical data encoder fitting - a process, when all the unique category values are being 90 | searched for in order to create an encoded labels list. After the fitting is complete, one may use the fitted encoder for 91 | the new data of the same source. 92 | 93 | In order to fit the encoder, it's needed to create the instance of the `Encoder` class and pass the fitting data as an 94 | argument to the constructor, along with the features to be encoded: 95 | 96 | 97 | ````dart 98 | final encoder = Encoder.oneHot( 99 | dataFrame, 100 | columnNames: featureNames, 101 | ); 102 | 103 | ```` 104 | 105 | Let's encode the features: 106 | 107 | ````dart 108 | final encoded = encoder.process(dataFrame); 109 | ```` 110 | 111 | We used the same dataframe here - it's absolutely normal since when we created the encoder, we just fit it with the 112 | dataframe, and now is the time to apply the dataframe to the fitted encoder. 113 | 114 | It's time to take a look at our processed data. 
Let's read it: 115 | 116 | ````dart 117 | final data = encoded.toMatrix(); 118 | 119 | print(data); 120 | ```` 121 | 122 | In the output we will see just numerical data, that's exactly what we wanted to reach. 123 | 124 | ### Label encoding 125 | 126 | Another well-known encoding method. The technique is the same - first, we should fit the encoder and after that, we 127 | may use this "trained" encoder in some applications: 128 | 129 | ````dart 130 | // fit encoder 131 | final encoder = Encoder.label( 132 | dataFrame, 133 | columnNames: featureNames, 134 | ); 135 | 136 | // apply fitted encoder to data 137 | final encoded = encoder.process(dataFrame); 138 | ```` 139 | 140 | ### Numerical data normalization 141 | 142 | Sometimes we need to have our numerical features normalized, which means we need to treat every dataframe row as a 143 | vector and divide this vector element-wise by its norm (Euclidean, Manhattan, etc.). To do so the library exposes 144 | `Normalizer` class: 145 | 146 | ````dart 147 | final normalizer = Normalizer(); // by default Euclidean norm will be used 148 | final transformed = normalizer.process(dataFrame); 149 | ```` 150 | 151 | Please, notice, that if your data has raw categorical values, the normalization will fail as it requires only numerical 152 | values. In this case, you should encode data (e.g. using one-hot encoding) before normalization. 153 | 154 | ### Data standardization 155 | 156 | A lot of machine learning algorithms require normally distributed data as their input. Normally distributed data 157 | means that every column in the data has zero mean and unit variance. One may reach this requirement using the 158 | `Standardizer` class. 
During the creation of the class instance, all the columns' mean values and deviation values are 159 | being extracted from the passed data and stored as fields of the class, in order to apply them to standardize the 160 | other (or the same that was used for the creation of the Standardizer) data: 161 | 162 | ````dart 163 | final dataFrame = DataFrame([ 164 | [ 1, 2, 3], 165 | [ 10, 20, 30], 166 | [100, 200, 300], 167 | ], headerExists: false); 168 | 169 | // fit standardizer 170 | final standardizer = Standardizer(dataFrame); 171 | 172 | // apply fitted standardizer to data 173 | final transformed = standardizer.process(dataFrame); 174 | ```` 175 | 176 | ### Pipeline 177 | 178 | There is a convenient way to organize a sequence of data preprocessing operations - `Pipeline`: 179 | 180 | ````dart 181 | final pipeline = Pipeline(dataFrame, [ 182 | toOneHotLabels(columnNames: ['Gender', 'Age', 'City_Category']), 183 | toIntegerLabels(columnNames: ['Stay_In_Current_City_Years', 'Marital_Status']), 184 | normalize(), 185 | standardize(), 186 | ]); 187 | ```` 188 | 189 | Once you create (or rather fit) a pipeline, you may use it further in your application: 190 | 191 | ````dart 192 | final processed = pipeline.process(dataFrame); 193 | ```` 194 | 195 | `toOneHotLabels`, `toIntegerLabels`, `normalize` and `standardize` are pipeable operator functions. 
196 | The pipeable operator function is a factory that takes fitting data and creates a fitted pipeable entity (e.g., 197 | `Normalizer` instance) 198 | -------------------------------------------------------------------------------- /analysis_options.yaml: -------------------------------------------------------------------------------- 1 | include: package:pedantic/analysis_options.yaml 2 | -------------------------------------------------------------------------------- /autotest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pub run build_runner test -- -p vm 4 | -------------------------------------------------------------------------------- /benchmark/main.dart: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /example/black_friday/black_friday.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 3 | 4 | Future processDataSetWithCategoricalData() async { 5 | final dataFrame = await fromCsv( 6 | 'example/black_friday/black_friday.csv', 7 | columnNames: [ 8 | 'Gender', 9 | 'Age', 10 | 'City_Category', 11 | 'Stay_In_Current_City_Years', 12 | 'Marital_Status' 13 | ], 14 | ); 15 | 16 | final encoded = Encoder.oneHot( 17 | dataFrame, 18 | columnNames: [ 19 | 'Gender', 20 | 'Age', 21 | 'City_Category', 22 | 'Stay_In_Current_City_Years', 23 | 'Marital_Status' 24 | ], 25 | ).process(dataFrame); 26 | 27 | final observations = encoded.toMatrix(); 28 | final genderEncoded = observations.sample(columnIndices: [0, 1]); 29 | final ageEncoded = observations.sample(columnIndices: [2, 3, 4, 5, 6, 7, 8]); 30 | final cityCategoryEncoded = observations.sample(columnIndices: [9, 10, 11]); 31 | final stayInCityEncoded = 32 | observations.sample(columnIndices: 
[12, 13, 14, 15, 16]); 33 | final maritalStatusEncoded = observations.sample(columnIndices: [17, 18]); 34 | 35 | print('Features:'); 36 | 37 | print(observations); 38 | 39 | print('feature matrix dimensions: ${observations.rowsNum} x ' 40 | '${observations.columnsNum};'); 41 | 42 | print('=============================='); 43 | 44 | print('Gender:'); 45 | print(genderEncoded); 46 | 47 | print('=============================='); 48 | 49 | print('Age'); 50 | print(ageEncoded); 51 | 52 | print('=============================='); 53 | 54 | print('City category'); 55 | print(cityCategoryEncoded); 56 | 57 | print('=============================='); 58 | 59 | print('Stay in current city (years)'); 60 | print(stayInCityEncoded); 61 | 62 | print('=============================='); 63 | 64 | print('Marital status'); 65 | print(maritalStatusEncoded); 66 | } 67 | 68 | Future main() async { 69 | await processDataSetWithCategoricalData(); 70 | } 71 | -------------------------------------------------------------------------------- /example/dataset.csv: -------------------------------------------------------------------------------- 1 | position,country,age,salary 2 | developer,Russia,21,1000 3 | ui designer,Russia,32,2000 4 | QA engineer,USA,27,2500 5 | QA engineer,Spain,25,2000 6 | developer,France,29,3000 7 | developer,China,23,1500 8 | ui designer,Japan,24,2000 9 | -------------------------------------------------------------------------------- /example/main.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 3 | 4 | Future main() async { 5 | final dataFrame = await fromCsv('example/dataset.csv', columns: [0, 1, 2, 3]); 6 | 7 | final pipeline = Pipeline(dataFrame, [ 8 | toOneHotLabels( 9 | columnNames: ['position'], 10 | headerPostfix: '_position', 11 | ), 12 | toIntegerLabels( 13 | columnNames: ['country'], 14 | ), 15 | ]); 16 | 17 | 
print(pipeline.process(dataFrame).toMatrix()); 18 | } 19 | -------------------------------------------------------------------------------- /lib/ml_preprocessing.dart: -------------------------------------------------------------------------------- 1 | export 'package:ml_linalg/norm.dart'; 2 | export 'package:ml_preprocessing/src/encoder/encoder.dart'; 3 | export 'package:ml_preprocessing/src/encoder/to_integer_labels.dart'; 4 | export 'package:ml_preprocessing/src/encoder/to_one_hot_labels.dart'; 5 | export 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 6 | export 'package:ml_preprocessing/src/normalizer/normalize.dart'; 7 | export 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 8 | export 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 9 | export 'package:ml_preprocessing/src/standardizer/standardize.dart'; 10 | export 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 11 | -------------------------------------------------------------------------------- /lib/src/encoder/encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_impl.dart'; 3 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 4 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 5 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 6 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 7 | 8 | /// Categorical data encoder factory. 9 | /// 10 | /// Algorithms that process data to create prediction models can't handle 11 | /// categorical data, since they are based on mathematical equations and work 12 | /// only with bare numbers. That means that the categorical data should be 13 | /// converted to numbers. 14 | /// 15 | /// The factory exposes different ways to convert categorical data into numbers. 
16 | abstract class Encoder implements Pipeable { 17 | /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] has a 18 | /// precedence over [columnNames]) from [fittingData], collects all unique 19 | /// values from the columns and builds a map `raw value` => `encoded value`. 20 | /// Once one calls the [process] method, the mapping will be applied. 21 | /// 22 | /// The mapping is built according to the following rules: 23 | /// 24 | /// Let's say, one has a list of values denoting a level of education: 25 | /// 26 | /// ``` 27 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 28 | /// ``` 29 | /// 30 | /// After applying the encoder, the source sequence will be looking 31 | /// like this: 32 | /// 33 | /// ``` 34 | /// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]] 35 | /// ``` 36 | /// 37 | /// In other words, the `one-hot` encoder created the following mapping: 38 | /// 39 | /// `BSc` => [1, 0, 0] 40 | /// 41 | /// `PhD` => [0, 1, 0] 42 | /// 43 | /// `High School` => [0, 0, 1] 44 | /// 45 | /// Keep in mind that if you apply the [process] method to your data, the 46 | /// number of columns will be increased since one categorical value in the 47 | /// case of one-hot encoding requires several cells. Headers for the new 48 | /// columns will be autogenerated from the categorical values. 49 | factory Encoder.oneHot( 50 | DataFrame fittingData, { 51 | Iterable? columnIndices, 52 | Iterable? 
columnNames, 53 | UnknownValueHandlingType unknownValueHandlingType = 54 | defaultUnknownValueHandlingType, 55 | }) => 56 | EncoderImpl( 57 | fittingData, 58 | EncoderType.oneHot, 59 | const SeriesEncoderFactoryImpl(), 60 | columnNames: columnNames, 61 | columnIndices: columnIndices, 62 | unknownValueHandlingType: unknownValueHandlingType, 63 | ); 64 | 65 | /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] has a 66 | /// precedence over [columnNames]) from [fittingData], collects all unique 67 | /// values from the columns and builds a map `raw value` => `encoded value`. 68 | /// Once one calls the [process] method, the mapping will be applied. 69 | /// 70 | /// The mapping is built according to the following rules: 71 | /// 72 | /// Let's say, one has a list of values denoting a level of education: 73 | /// 74 | /// ``` 75 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 76 | /// ``` 77 | /// 78 | /// After applying the encoder, the source list will be looking 79 | /// like this: 80 | /// 81 | /// ``` 82 | /// [0, 0, 1, 2, 1] 83 | /// ``` 84 | /// 85 | /// In other words, the `label` encoder created the following mapping: 86 | /// 87 | /// `BSc` => 0 88 | /// 89 | /// `PhD` => 1 90 | /// 91 | /// `High School` => 2 92 | factory Encoder.label( 93 | DataFrame fittingData, { 94 | Iterable? columnIndices, 95 | Iterable? 
columnNames, 96 | UnknownValueHandlingType unknownValueHandlingType = 97 | defaultUnknownValueHandlingType, 98 | }) => 99 | EncoderImpl( 100 | fittingData, 101 | EncoderType.label, 102 | const SeriesEncoderFactoryImpl(), 103 | columnNames: columnNames, 104 | columnIndices: columnIndices, 105 | unknownValueHandlingType: unknownValueHandlingType, 106 | ); 107 | } 108 | -------------------------------------------------------------------------------- /lib/src/encoder/encoder_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 4 | import 'package:ml_preprocessing/src/encoder/helpers/create_encoder_to_series_mapping.dart'; 5 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 6 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory.dart'; 7 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 8 | 9 | class EncoderImpl implements Encoder { 10 | EncoderImpl( 11 | DataFrame fittingData, 12 | EncoderType encoderType, 13 | SeriesEncoderFactory seriesEncoderFactory, { 14 | Iterable? columnIndices, 15 | Iterable? 
columnNames, 16 | String encodedHeaderPrefix = '', 17 | String encodedHeaderPostfix = '', 18 | UnknownValueHandlingType unknownValueHandlingType = 19 | defaultUnknownValueHandlingType, 20 | }) : _encoderBySeries = createEncoderToSeriesMapping( 21 | fittingData, 22 | columnNames, 23 | columnIndices, 24 | (series) => seriesEncoderFactory.createByType( 25 | encoderType, 26 | series, 27 | headerPostfix: encodedHeaderPostfix, 28 | headerPrefix: encodedHeaderPrefix, 29 | unknownValueHandlingType: unknownValueHandlingType, 30 | )); 31 | 32 | final Map _encoderBySeries; 33 | 34 | @override 35 | DataFrame process(DataFrame dataFrame) { 36 | final encoded = dataFrame.series.expand((series) => 37 | _encoderBySeries.containsKey(series.name) 38 | ? _encoderBySeries[series.name]!.encodeSeries(series) 39 | : [series]); 40 | 41 | return DataFrame.fromSeries(encoded); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /lib/src/encoder/encoder_type.dart: -------------------------------------------------------------------------------- 1 | /// A type of categorical data encoding 2 | /// 3 | /// Algorithms that process data to create prediction models can't handle 4 | /// categorical data, since they are based on mathematical equations and work 5 | /// only with bare numbers. That means that the categorical data should be 6 | /// converted to numbers. 7 | /// 8 | /// [EncoderType.label] converts categorical values into integer numbers. 
Let's 9 | /// say, one has a list of values denoting a level of education: 10 | /// 11 | /// ``` 12 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 13 | /// ``` 14 | /// 15 | /// After applying [EncoderType.label], the source list will be looking 16 | /// like this: 17 | /// 18 | /// ``` 19 | /// [0, 0, 1, 2, 1] 20 | /// ``` 21 | /// 22 | /// In other words, the `label` encoder created the following mapping: 23 | /// 24 | /// `BSc` => 0 25 | /// 26 | /// `PhD` => 1 27 | /// 28 | /// `High School` => 2 29 | /// 30 | /// [EncoderType.oneHot] converts categorical values into binary sequences. 31 | /// Let's say, one has a list of values denoting a level of education: 32 | /// 33 | /// ``` 34 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 35 | /// ``` 36 | /// 37 | /// After applying [EncoderType.oneHot], the source sequence will be looking 38 | /// like this: 39 | /// 40 | /// ``` 41 | /// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]] 42 | /// ``` 43 | /// 44 | /// In other words, the `one-hot` encoder created the following mapping: 45 | /// 46 | /// `BSc` => [1, 0, 0] 47 | /// 48 | /// `PhD` => [0, 1, 0] 49 | /// 50 | /// `High School` => [0, 0, 1] 51 | enum EncoderType { 52 | oneHot, 53 | label, 54 | } 55 | -------------------------------------------------------------------------------- /lib/src/encoder/helpers/create_encoder_to_series_mapping.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/helpers/get_series_names_by_indices.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 4 | 5 | Map createEncoderToSeriesMapping( 6 | DataFrame dataFrame, 7 | Iterable? predefinedSeriesNames, 8 | Iterable? seriesIndices, 9 | SeriesEncoder Function(Series series) seriesEncoderFactory, 10 | ) { 11 | final seriesNames = predefinedSeriesNames ?? 
12 | getSeriesNamesByIndices(dataFrame.header, seriesIndices!); 13 | final entries = seriesNames.map((name) { 14 | final series = dataFrame[name]; 15 | final encoder = seriesEncoderFactory(series); 16 | 17 | return MapEntry(name, encoder); 18 | }); 19 | 20 | return Map.fromEntries(entries); 21 | } 22 | -------------------------------------------------------------------------------- /lib/src/encoder/helpers/get_series_names_by_indices.dart: -------------------------------------------------------------------------------- 1 | import 'package:quiver/iterables.dart'; 2 | 3 | Iterable getSeriesNamesByIndices( 4 | Iterable seriesNames, Iterable indices) { 5 | final uniqueIndices = Set.from(indices); 6 | 7 | return enumerate(seriesNames) 8 | .where((indexedName) => uniqueIndices.contains(indexedName.index)) 9 | .map((indexedValue) => indexedValue.value); 10 | } 11 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/label_series_encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | 5 | class LabelSeriesEncoder implements SeriesEncoder { 6 | LabelSeriesEncoder( 7 | Series fittingData, { 8 | UnknownValueHandlingType unknownValueHandlingType = 9 | defaultUnknownValueHandlingType, 10 | String headerPrefix = '', 11 | String headerPostfix = '', 12 | }) : _unknownHandlingType = unknownValueHandlingType, 13 | _columnHeaderTpl = 14 | ((String label) => '$headerPrefix$label$headerPostfix'), 15 | _labels = Set.from(fittingData.data).toList(growable: false); 16 | 17 | final UnknownValueHandlingType _unknownHandlingType; 18 | final ColumnHeaderTemplateFn _columnHeaderTpl; 19 | final List _labels; 20 | 21 | @override 22 | Iterable encodeSeries(Series series) { 23 
| final shouldThrowErrorIfUnknown = 24 | _unknownHandlingType == UnknownValueHandlingType.error; 25 | 26 | return [ 27 | Series( 28 | _columnHeaderTpl(series.name), 29 | series.data.map((dynamic label) { 30 | if (!_labels.contains(label)) { 31 | if (shouldThrowErrorIfUnknown) { 32 | throw Exception('Unknown categorical value encountered - $label'); 33 | } 34 | 35 | return _labels.length; 36 | } 37 | 38 | return _labels.indexOf(label); 39 | }), 40 | isDiscrete: true, 41 | ), 42 | ]; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/one_hot_series_encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | 5 | class OneHotSeriesEncoder implements SeriesEncoder { 6 | OneHotSeriesEncoder( 7 | Series fittingData, { 8 | UnknownValueHandlingType unknownValueHandlingType = 9 | defaultUnknownValueHandlingType, 10 | String headerPrefix = '', 11 | String headerPostfix = '', 12 | }) : _unknownHandlingType = unknownValueHandlingType, 13 | _columnHeaderTpl = 14 | ((String label) => '$headerPrefix$label$headerPostfix'), 15 | _labels = Set.from(fittingData.data); 16 | 17 | final UnknownValueHandlingType _unknownHandlingType; 18 | final ColumnHeaderTemplateFn _columnHeaderTpl; 19 | final Set _labels; 20 | 21 | @override 22 | Iterable encodeSeries(Series series) => _labels.map((dynamic label) { 23 | final shouldThrowErrorIfUnknown = 24 | _unknownHandlingType == UnknownValueHandlingType.error; 25 | 26 | final data = series.data.map((dynamic value) { 27 | if (shouldThrowErrorIfUnknown && !_labels.contains(value)) { 28 | throw Exception('Unknown categorical value encountered - `$value` ' 29 | 'for series `${series.name}`'); 30 | } 31 | 32 | return 
value == label ? 1 : 0; 33 | }); 34 | 35 | return Series(_columnHeaderTpl(label.toString()), data, 36 | isDiscrete: true); 37 | }); 38 | } 39 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/series_encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | 3 | typedef ColumnHeaderTemplateFn = String Function(String label); 4 | 5 | abstract class SeriesEncoder { 6 | Iterable encodeSeries(Series series); 7 | } 8 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/series_encoder_factory.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 4 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 5 | 6 | abstract class SeriesEncoderFactory { 7 | SeriesEncoder createByType( 8 | EncoderType type, 9 | Series fittingData, { 10 | String headerPrefix, 11 | String headerPostfix, 12 | UnknownValueHandlingType unknownValueHandlingType, 13 | }); 14 | } 15 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/series_encoder_factory_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/label_series_encoder.dart'; 4 | import 'package:ml_preprocessing/src/encoder/series_encoder/one_hot_series_encoder.dart'; 5 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 6 | import 
'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory.dart'; 7 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 8 | 9 | class SeriesEncoderFactoryImpl implements SeriesEncoderFactory { 10 | const SeriesEncoderFactoryImpl(); 11 | 12 | @override 13 | SeriesEncoder createByType( 14 | EncoderType type, 15 | Series fittingData, { 16 | String headerPrefix = '', 17 | String headerPostfix = '', 18 | UnknownValueHandlingType unknownValueHandlingType = 19 | defaultUnknownValueHandlingType, 20 | }) { 21 | switch (type) { 22 | case EncoderType.label: 23 | return LabelSeriesEncoder( 24 | fittingData, 25 | headerPrefix: headerPrefix, 26 | headerPostfix: headerPostfix, 27 | unknownValueHandlingType: unknownValueHandlingType, 28 | ); 29 | 30 | case EncoderType.oneHot: 31 | return OneHotSeriesEncoder( 32 | fittingData, 33 | headerPrefix: headerPrefix, 34 | headerPostfix: headerPostfix, 35 | unknownValueHandlingType: unknownValueHandlingType, 36 | ); 37 | 38 | default: 39 | throw UnsupportedError('Unsupported encoder type - $type'); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/encoder/to_integer_labels.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_preprocessing/src/encoder/encoder_impl.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 4 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 5 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 6 | 7 | /// A factory function to use label categorical data encoder in the pipeline 8 | /// 9 | /// A usage example: 10 | /// 11 | /// ```dart 12 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 13 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 14 | /// 15 | /// void main() 
{ 16 | /// final dataframe = DataFrame([ 17 | /// ['col_1', 'col_2', 'col_3'], 18 | /// ['val_1', 1, false], 19 | /// ['val_2', 0.4, true], 20 | /// ['val_1', 5, false], 21 | /// ['val_3', 6, false], 22 | /// ]); 23 | /// 24 | /// // let's fit a pipeline 25 | /// final pipeline = Pipeline(dataframe, [ 26 | /// // 'col_1' column contains categorical data, let's encode it 27 | /// toIntegerLabels(columnNames: ['col_1']), 28 | /// ]); 29 | /// final processed = pipeline.process(dataframe); 30 | /// 31 | /// // since there are only 3 values in the series 'col_1', they will be 32 | /// // converted as follows: 33 | /// // 34 | /// // 'val_1' => 0 35 | /// // 'val_2' => 1 36 | /// // 'val_3' => 2 37 | /// print(processed); 38 | /// // DataFrame (4 x 3) 39 | /// // col_1 col_2 col_3 40 | /// // 0 1 false 41 | /// // 1 0.4 true 42 | /// // 0 5 false 43 | /// // 2 6 false 44 | /// } 45 | /// ``` 46 | PipeableOperatorFn toIntegerLabels({ 47 | Iterable? columnIndices, 48 | Iterable? columnNames, 49 | String headerPrefix = '', 50 | String headerPostfix = '', 51 | UnknownValueHandlingType unknownValueHandlingType = 52 | defaultUnknownValueHandlingType, 53 | }) => 54 | (data, {dtype}) => EncoderImpl( 55 | data, 56 | EncoderType.label, 57 | const SeriesEncoderFactoryImpl(), 58 | columnIndices: columnIndices, 59 | columnNames: columnNames, 60 | encodedHeaderPostfix: headerPostfix, 61 | encodedHeaderPrefix: headerPrefix, 62 | unknownValueHandlingType: unknownValueHandlingType, 63 | ); 64 | -------------------------------------------------------------------------------- /lib/src/encoder/to_one_hot_labels.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_preprocessing/src/encoder/encoder_impl.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 4 | import 
'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 5 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 6 | 7 | /// A factory function to use `one hot` categorical data encoder in the pipeline 8 | /// 9 | /// A usage example: 10 | /// 11 | /// ```dart 12 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 13 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 14 | /// 15 | /// void main() { 16 | /// final dataframe = DataFrame([ 17 | /// ['col_1', 'col_2', 'col_3'], 18 | /// ['val_1', 1, false], 19 | /// ['val_2', 0.4, true], 20 | /// ['val_1', 5, false], 21 | /// ]); 22 | /// 23 | /// // let's fit a pipeline 24 | /// final pipeline = Pipeline(dataframe, [ 25 | /// // 'col_1' column contains categorical data, let's encode it 26 | /// toOneHotLabels(columnNames: ['col_1']), 27 | /// ]); 28 | /// final processed = pipeline.process(dataframe); 29 | /// 30 | /// // since there are only two values in the series 'col_1', they will be 31 | /// // converted as follows: 32 | /// // 33 | /// // 'val_1' => 10 34 | /// // 'val_2' => 01 35 | /// print(processed); 36 | /// // DataFrame (3 x 4) 37 | /// // val_1 val_2 col_2 col_3 38 | /// // 1 0 1 false 39 | /// // 0 1 0.4 true 40 | /// // 1 0 5 false 41 | /// } 42 | /// ``` 43 | PipeableOperatorFn toOneHotLabels({ 44 | Iterable? columnIndices, 45 | Iterable? 
columnNames, 46 | String headerPrefix = '', 47 | String headerPostfix = '', 48 | UnknownValueHandlingType unknownValueHandlingType = 49 | defaultUnknownValueHandlingType, 50 | }) => 51 | (data, {dtype}) => EncoderImpl( 52 | data, 53 | EncoderType.oneHot, 54 | const SeriesEncoderFactoryImpl(), 55 | columnIndices: columnIndices, 56 | columnNames: columnNames, 57 | encodedHeaderPostfix: headerPostfix, 58 | encodedHeaderPrefix: headerPrefix, 59 | unknownValueHandlingType: unknownValueHandlingType, 60 | ); 61 | -------------------------------------------------------------------------------- /lib/src/encoder/unknown_value_handling_type.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | 4 | /// A way to handle unknown categorical data 5 | /// 6 | /// During processing new data one can encounter previously unseen value. Let's 7 | /// say, one has a list of values denoting a level of education: 8 | /// 9 | /// ``` 10 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 11 | /// ``` 12 | /// 13 | /// One successfully applied some [EncoderType] to the data, let's say, [EncoderType.label]. 14 | /// 15 | /// But what should one do if there is an unknown categorical value, e.g. 16 | /// 'School SAT', among new data to process through the same [Pipeline]? 
17 | /// 18 | /// [UnknownValueHandlingType.error] forces the pipeline to stop preprocessing 19 | /// and throw an error 20 | /// 21 | /// [UnknownValueHandlingType.ignore] makes it possible to continue the 22 | /// preprocessing as nothing happened - in this case depending on the [EncoderType] 23 | /// will be used an autogenerated encoded value 24 | enum UnknownValueHandlingType { 25 | error, 26 | ignore, 27 | } 28 | 29 | const defaultUnknownValueHandlingType = UnknownValueHandlingType.ignore; 30 | -------------------------------------------------------------------------------- /lib/src/normalizer/normalize.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_linalg/dtype.dart'; 2 | import 'package:ml_linalg/norm.dart'; 3 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 5 | import 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 6 | 7 | /// Returns a function that can be used in [Pipeline]. The function creates 8 | /// a [Normalizer] instance. 
Example: 9 | /// 10 | /// ```dart 11 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 12 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 13 | /// 14 | /// void main() { 15 | /// final data = DataFrame([ 16 | /// ['feature_1', 'feature_2', 'label'], 17 | /// [ 10, 33.2, 2], 18 | /// [ 20, -1, 4], 19 | /// [ 40, -10, 5], 20 | /// [ 55, 100, 10], 21 | /// ]); 22 | /// final pipeline = Pipeline(data, [ 23 | /// normalize(), 24 | /// ]); 25 | /// final processed = pipeline.process(data); 26 | /// 27 | /// print(processed); 28 | /// // DataFrame (4 x 3) 29 | /// // feature_1 feature_2 label 30 | /// // 0.287927508354187 0.9559193253517151 0.05758550018072128 31 | /// // 0.9794042110443115 -0.048970211297273636 0.19588084518909454 32 | /// // 0.9630868434906006 -0.24077171087265015 0.12038585543632507 33 | /// // 0.4800793528556824 0.8728715777397156 0.08728715777397156 34 | /// } 35 | /// ``` 36 | PipeableOperatorFn normalize([Norm norm = Norm.euclidean]) => 37 | (_, {dtype}) => Normalizer(norm, dtype ?? DType.float32); 38 | -------------------------------------------------------------------------------- /lib/src/normalizer/normalizer.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_linalg/dtype.dart'; 2 | import 'package:ml_linalg/norm.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/normalizer/normalizer_impl.dart'; 5 | 6 | /// A class that performs normalization of data. 7 | /// 8 | /// Normalization is a process aimed to make all values in a vector vary within 9 | /// the range from 0.0 to 1.0 - this makes it possible to treat all the values 10 | /// equally disregard their units. 11 | /// 12 | /// Normalization is applied row-wise. 
13 | /// 14 | /// Example: 15 | /// 16 | /// ```dart 17 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 18 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 19 | /// 20 | /// void main() { 21 | /// final data = DataFrame([ 22 | /// ['feature_1', 'feature_2', 'label'], 23 | /// [ 10, 33.2, 2], 24 | /// [ 20, -1, 4], 25 | /// [ 40, -10, 5], 26 | /// [ 55, 100, 10], 27 | /// ]); 28 | /// final normalizer = Normalizer(); 29 | /// final processed = normalizer.process(data); 30 | /// 31 | /// print(processed); 32 | /// // DataFrame (4 x 3) 33 | /// // feature_1 feature_2 label 34 | /// // 0.287927508354187 0.9559193253517151 0.05758550018072128 35 | /// // 0.9794042110443115 -0.048970211297273636 0.19588084518909454 36 | /// // 0.9630868434906006 -0.24077171087265015 0.12038585543632507 37 | /// // 0.4800793528556824 0.8728715777397156 0.08728715777397156 38 | /// } 39 | /// ``` 40 | abstract class Normalizer implements Pipeable { 41 | factory Normalizer([Norm norm, DType dtype]) = NormalizerImpl; 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/normalizer/normalizer_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_linalg/norm.dart'; 4 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 5 | 6 | class NormalizerImpl implements Normalizer { 7 | NormalizerImpl([this._norm = Norm.euclidean, this._dtype = DType.float32]); 8 | 9 | final Norm _norm; 10 | final DType _dtype; 11 | 12 | @override 13 | DataFrame process(DataFrame input) { 14 | final transformed = 15 | input.toMatrix(_dtype).mapRows((row) => row.normalize(_norm)); 16 | 17 | return DataFrame.fromMatrix(transformed, header: input.header); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /lib/src/pipeline/pipeable.dart: 
-------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | 4 | abstract class Pipeable { 5 | DataFrame process(DataFrame input); 6 | } 7 | 8 | typedef PipeableOperatorFn = Pipeable Function(DataFrame data, {DType? dtype}); 9 | -------------------------------------------------------------------------------- /lib/src/pipeline/pipeline.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeline_impl.dart'; 5 | 6 | /// A class that is used to organize data preprocessing stages in a pipeline 7 | /// manner. 8 | /// 9 | /// Building the pipeline is a `fitting` stage - it's a preliminary stage where 10 | /// operators extract metadata from the source data passed to [Pipeline] for 11 | /// future use, no preprocessing happens here. 12 | /// 13 | /// Once the `process` method is called, the actual data preprocessing comes to 14 | /// play. 15 | /// 16 | /// It's normal, when one uses the same data for fitting and processing, like 17 | /// in the example below. 
18 | /// 19 | /// Example: 20 | /// 21 | /// ```dart 22 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 23 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 24 | /// 25 | /// Future main() async { 26 | /// final dataFrame = await fromCsv('example/dataset.csv', columns: [0, 1, 2, 3]); 27 | /// 28 | /// final pipeline = Pipeline(dataFrame, [ 29 | /// toOneHotLabels( 30 | /// columnNames: ['position'], 31 | /// headerPostfix: '_position', 32 | /// ), 33 | /// toIntegerLabels( 34 | /// columnNames: ['country'], 35 | /// ), 36 | /// ]); 37 | /// 38 | /// final processed = pipeline.process(dataFrame); 39 | /// } 40 | /// ``` 41 | abstract class Pipeline { 42 | /// Takes [fittingData] to fit preprocessors from [operators] list 43 | /// in order to use them further for new data of the same source as 44 | /// [fittingData] via [process] method. 45 | factory Pipeline( 46 | DataFrame fittingData, Iterable operators, 47 | {DType dType}) = PipelineImpl; 48 | 49 | /// Applies fitted preprocessors to [dataFrame] and returns transformed 50 | /// data 51 | DataFrame process(DataFrame dataFrame); 52 | } 53 | -------------------------------------------------------------------------------- /lib/src/pipeline/pipeline_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 5 | 6 | class PipelineImpl implements Pipeline { 7 | PipelineImpl( 8 | DataFrame fittingData, 9 | Iterable operators, { 10 | DType dType = DType.float32, 11 | }) : _steps = operators.map((operator) => operator(fittingData)); 12 | 13 | final Iterable _steps; 14 | 15 | @override 16 | DataFrame process(DataFrame dataFrame) => 17 | _steps.fold(dataFrame, (processed, step) => step.process(processed)); 18 | } 19 |
-------------------------------------------------------------------------------- /lib/src/standardizer/standardize.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 5 | 6 | /// Returns a function that can be used in [Pipeline]. The function creates a 7 | /// [Standardizer] instance. Example: 8 | /// 9 | /// ```dart 10 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 11 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 12 | /// 13 | /// void main() { 14 | /// final data = DataFrame([ 15 | /// ['feature_1', 'feature_2', 'label'], 16 | /// [ 10, 33.2, 2], 17 | /// [ 20, -1, 4], 18 | /// [ 40, -10, 5], 19 | /// [ 55, 100, 10], 20 | /// ]); 21 | /// final pipeline = Pipeline(data, [ 22 | /// standardize(), 23 | /// ]); 24 | /// final processed = pipeline.process(data); 25 | /// 26 | /// print(processed); 27 | /// // DataFrame (4 x 3) 28 | /// // feature_1 feature_2 label 29 | /// // -1.217395305633545 0.06132180616259575 -1.1026456356048584 30 | /// // -0.6445034146308899 -0.7300761342048645 -0.42409446835517883 31 | /// // 0.5012804269790649 -0.9383387565612793 -0.08481889218091965 32 | /// // 1.3606183528900146 1.607093095779419 1.6115589141845703 33 | /// } 34 | /// ``` 35 | PipeableOperatorFn standardize() => 36 | (DataFrame fittingData, {dtype = DType.float32}) => 37 | Standardizer(fittingData, dtype: dtype!); 38 | -------------------------------------------------------------------------------- /lib/src/standardizer/standardizer.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/linalg.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | 
import 'package:ml_preprocessing/src/standardizer/standardizer_impl.dart'; 5 | 6 | /// A class that performs data standardization. 7 | /// 8 | /// Data standardization is a process, targeted to make the data look like 9 | /// normally distributed data (with zero mean and unit variance). 10 | /// 11 | /// Standardization applies column-wise. 12 | /// 13 | /// Example: 14 | /// 15 | /// ```dart 16 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 17 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 18 | /// 19 | /// void main() { 20 | /// final data = DataFrame([ 21 | /// ['feature_1', 'feature_2', 'label'], 22 | /// [ 10, 33.2, 2], 23 | /// [ 20, -1, 4], 24 | /// [ 40, -10, 5], 25 | /// [ 55, 100, 10], 26 | /// ]); 27 | /// final standardizer = Standardizer(data); 28 | /// final processed = standardizer.process(data); 29 | /// 30 | /// print(processed); 31 | /// // DataFrame (4 x 3) 32 | /// // feature_1 feature_2 label 33 | /// // -1.217395305633545 0.06132180616259575 -1.1026456356048584 34 | /// // -0.6445034146308899 -0.7300761342048645 -0.42409446835517883 35 | /// // 0.5012804269790649 -0.9383387565612793 -0.08481889218091965 36 | /// // 1.3606183528900146 1.607093095779419 1.6115589141845703 37 | /// } 38 | /// ``` 39 | abstract class Standardizer implements Pipeable { 40 | factory Standardizer( 41 | DataFrame fittingData, { 42 | DType dtype, 43 | }) = StandardizerImpl; 44 | } 45 | -------------------------------------------------------------------------------- /lib/src/standardizer/standardizer_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_linalg/vector.dart'; 4 | import 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 5 | 6 | class StandardizerImpl implements Standardizer { 7 | StandardizerImpl( 8 | DataFrame fittingData, { 9 | DType dtype = DType.float32, 10 | }) 
: _dtype = dtype, 11 | _mean = fittingData.toMatrix(dtype).mean(), 12 | _deviation = Vector.fromList( 13 | // TODO: Consider SIMD-aware mapping 14 | fittingData 15 | .toMatrix(dtype) 16 | .deviation() 17 | .map((el) => el == 0 ? 1 : el) 18 | .toList(), 19 | dtype: dtype, 20 | ) { 21 | if (!fittingData.toMatrix(dtype).hasData) { 22 | throw Exception('No data provided'); 23 | } 24 | } 25 | 26 | final DType _dtype; 27 | final Vector _mean; 28 | final Vector _deviation; 29 | 30 | /// Takes as an argument [input] with columns of various distribution types 31 | /// and returns a [DataFrame], columns of which are normally distributed 32 | @override 33 | DataFrame process(DataFrame input) { 34 | final inputAsMatrix = input.toMatrix(_dtype); 35 | 36 | if (inputAsMatrix.columnsNum != _deviation.length) { 37 | throw Exception('Passed dataframe differs from the one used during ' 38 | 'creation of the Standardizer: expected columns number - ' 39 | '${_deviation.length}, given - ${inputAsMatrix.columnsNum}.'); 40 | } 41 | 42 | final processedMatrix = 43 | inputAsMatrix.mapRows((row) => (row - _mean) / _deviation); 44 | final discreteColumnNames = input.series 45 | .where((series) => series.isDiscrete) 46 | .map((series) => series.name); 47 | 48 | return DataFrame.fromMatrix( 49 | processedMatrix, 50 | header: input.header, 51 | discreteColumnNames: discreteColumnNames, 52 | ); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: ml_preprocessing 2 | description: Popular data preprocessing algorithms for machine learning 3 | version: 7.0.2 4 | homepage: https://github.com/gyrdym/ml_preprocessing 5 | 6 | environment: 7 | sdk: '>=2.12.0 <3.0.0' 8 | 9 | dependencies: 10 | ml_dataframe: ^1.0.0 11 | ml_linalg: ^13.0.0 12 | quiver: ^3.0.0 13 | 14 | dev_dependencies: 15 | benchmark_harness: ^2.0.0 16 | mockito: ^5.0.2 17 | pedantic: 
^1.11.0 18 | test: ^1.16.8 19 | -------------------------------------------------------------------------------- /test/encoder/encoder_impl_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('EncoderImpl', () { 8 | final data = [ 9 | ['first', 'second', 'third', 'fourth'], 10 | [1, 'F', 'category_val_1', 10], 11 | [10, 'F', 'category_val_2', 20], 12 | [11, 'M', 'category_val_1', 10], 13 | [21, 'F', 'category_val_2', 30], 14 | [44, 'M', 'category_val_1', 10], 15 | [43, 'M', 'category_val_1', 30], 16 | [55, 'F', 'category_val_3', 10], 17 | ]; 18 | 19 | final unseenData = [ 20 | ['first', 'second', 'third', 'fourth'], 21 | [1, 'F', 'category_val_5', 10], 22 | [10, 'F', 'category_val_2', 20], 23 | [11, 'M', 'category_val_6', 10], 24 | ]; 25 | 26 | group('Encoder.oneHot', () { 27 | test('should encode multiple columns', () { 28 | final dataFrame = DataFrame(data); 29 | final encoder = Encoder.oneHot(dataFrame, 30 | columnNames: ['second', 'third', 'fourth']); 31 | final encoded = encoder.process(dataFrame); 32 | 33 | encoded.toMatrix(); 34 | 35 | expect( 36 | encoded.toMatrix(), 37 | equals([ 38 | [ 39 | 1, 40 | 1, 41 | 0, 42 | 1, 43 | 0, 44 | 0, 45 | 1, 46 | 0, 47 | 0, 48 | ], 49 | [ 50 | 10, 51 | 1, 52 | 0, 53 | 0, 54 | 1, 55 | 0, 56 | 0, 57 | 1, 58 | 0, 59 | ], 60 | [ 61 | 11, 62 | 0, 63 | 1, 64 | 1, 65 | 0, 66 | 0, 67 | 1, 68 | 0, 69 | 0, 70 | ], 71 | [ 72 | 21, 73 | 1, 74 | 0, 75 | 0, 76 | 1, 77 | 0, 78 | 0, 79 | 0, 80 | 1, 81 | ], 82 | [ 83 | 44, 84 | 0, 85 | 1, 86 | 1, 87 | 0, 88 | 0, 89 | 1, 90 | 0, 91 | 0, 92 | ], 93 | [ 94 | 43, 95 | 0, 96 | 1, 97 | 1, 98 | 0, 99 | 0, 100 | 0, 101 | 0, 102 | 1, 103 | ], 104 | [ 105 | 55, 106 | 1, 107 | 0, 108 | 0, 109 | 
0, 110 | 1, 111 | 1, 112 | 0, 113 | 0, 114 | ], 115 | ])); 116 | }); 117 | 118 | test('should use indices to access the needed series while encoding', () { 119 | final dataFrame = DataFrame(data); 120 | final encoder = Encoder.oneHot(dataFrame, columnIndices: [1, 2, 3]); 121 | final encoded = encoder.process(dataFrame); 122 | 123 | encoded.toMatrix(); 124 | 125 | expect( 126 | encoded.toMatrix(), 127 | equals([ 128 | [ 129 | 1, 130 | 1, 131 | 0, 132 | 1, 133 | 0, 134 | 0, 135 | 1, 136 | 0, 137 | 0, 138 | ], 139 | [ 140 | 10, 141 | 1, 142 | 0, 143 | 0, 144 | 1, 145 | 0, 146 | 0, 147 | 1, 148 | 0, 149 | ], 150 | [ 151 | 11, 152 | 0, 153 | 1, 154 | 1, 155 | 0, 156 | 0, 157 | 1, 158 | 0, 159 | 0, 160 | ], 161 | [ 162 | 21, 163 | 1, 164 | 0, 165 | 0, 166 | 1, 167 | 0, 168 | 0, 169 | 0, 170 | 1, 171 | ], 172 | [ 173 | 44, 174 | 0, 175 | 1, 176 | 1, 177 | 0, 178 | 0, 179 | 1, 180 | 0, 181 | 0, 182 | ], 183 | [ 184 | 43, 185 | 0, 186 | 1, 187 | 1, 188 | 0, 189 | 0, 190 | 0, 191 | 0, 192 | 1, 193 | ], 194 | [ 195 | 55, 196 | 1, 197 | 0, 198 | 0, 199 | 0, 200 | 1, 201 | 1, 202 | 0, 203 | 0, 204 | ], 205 | ])); 206 | }); 207 | 208 | test('should throw error if unknown value handling type is "error"', () { 209 | final trainingDataFrame = DataFrame(data); 210 | final unseenDataDataframe = DataFrame(unseenData); 211 | final encoder = Encoder.oneHot( 212 | trainingDataFrame, 213 | columnNames: ['second', 'third', 'fourth'], 214 | unknownValueHandlingType: UnknownValueHandlingType.error, 215 | ); 216 | final actual = () => encoder.process(unseenDataDataframe).toMatrix(); 217 | final expected = throwsException; 218 | 219 | expect(actual, expected); 220 | }); 221 | 222 | test( 223 | 'should ignore unknown value if unknown value handling type is ignpre', 224 | () { 225 | final trainingDataFrame = DataFrame(data); 226 | final unseenDataDataframe = DataFrame(unseenData); 227 | final encoder = Encoder.oneHot( 228 | trainingDataFrame, 229 | columnNames: ['second', 'third', 'fourth'], 230 
| unknownValueHandlingType: UnknownValueHandlingType.ignore, 231 | ); 232 | final actual = encoder.process(unseenDataDataframe).toMatrix(); 233 | final expected = [ 234 | [ 235 | 1, 236 | 1, 237 | 0, 238 | 0, 239 | 0, 240 | 0, 241 | 1, 242 | 0, 243 | 0, 244 | ], 245 | [ 246 | 10, 247 | 1, 248 | 0, 249 | 0, 250 | 1, 251 | 0, 252 | 0, 253 | 1, 254 | 0, 255 | ], 256 | [ 257 | 11, 258 | 0, 259 | 1, 260 | 0, 261 | 0, 262 | 0, 263 | 1, 264 | 0, 265 | 0, 266 | ], 267 | ]; 268 | 269 | expect(actual, expected); 270 | }); 271 | }); 272 | 273 | group('Encoder.label', () { 274 | test('should encode multiple columns', () { 275 | final dataFrame = DataFrame(data); 276 | final encoder = Encoder.label(dataFrame, 277 | columnNames: ['second', 'third', 'fourth']); 278 | final encoded = encoder.process(dataFrame); 279 | 280 | encoded.toMatrix(); 281 | 282 | expect( 283 | encoded.toMatrix(), 284 | equals([ 285 | [ 286 | 1, 287 | 0, 288 | 0, 289 | 0, 290 | ], 291 | [ 292 | 10, 293 | 0, 294 | 1, 295 | 1, 296 | ], 297 | [ 298 | 11, 299 | 1, 300 | 0, 301 | 0, 302 | ], 303 | [ 304 | 21, 305 | 0, 306 | 1, 307 | 2, 308 | ], 309 | [ 310 | 44, 311 | 1, 312 | 0, 313 | 0, 314 | ], 315 | [ 316 | 43, 317 | 1, 318 | 0, 319 | 2, 320 | ], 321 | [ 322 | 55, 323 | 0, 324 | 2, 325 | 0, 326 | ], 327 | ])); 328 | }); 329 | 330 | test('should use indices to access the needed series while encoding', () { 331 | final dataFrame = DataFrame(data); 332 | final encoder = Encoder.label(dataFrame, columnIndices: [1, 2, 3]); 333 | final encoded = encoder.process(dataFrame); 334 | 335 | encoded.toMatrix(); 336 | 337 | expect( 338 | encoded.toMatrix(), 339 | equals([ 340 | [ 341 | 1, 342 | 0, 343 | 0, 344 | 0, 345 | ], 346 | [ 347 | 10, 348 | 0, 349 | 1, 350 | 1, 351 | ], 352 | [ 353 | 11, 354 | 1, 355 | 0, 356 | 0, 357 | ], 358 | [ 359 | 21, 360 | 0, 361 | 1, 362 | 2, 363 | ], 364 | [ 365 | 44, 366 | 1, 367 | 0, 368 | 0, 369 | ], 370 | [ 371 | 43, 372 | 1, 373 | 0, 374 | 2, 375 | ], 376 | [ 377 | 55, 378 | 0, 379 | 
2, 380 | 0, 381 | ], 382 | ])); 383 | }); 384 | 385 | test('should throw error if unknown value handling type is error', () { 386 | final trainingDataFrame = DataFrame(data); 387 | final unseenDataDataframe = DataFrame(unseenData); 388 | final encoder = Encoder.label( 389 | trainingDataFrame, 390 | columnNames: ['second', 'third', 'fourth'], 391 | unknownValueHandlingType: UnknownValueHandlingType.error, 392 | ); 393 | final actual = () => encoder.process(unseenDataDataframe).toMatrix(); 394 | final expected = throwsException; 395 | 396 | expect(actual, expected); 397 | }); 398 | 399 | test( 400 | 'should ignore unknown value if unknown value handling type is ignpre', 401 | () { 402 | final trainingDataFrame = DataFrame(data); 403 | final unseenDataDataframe = DataFrame(unseenData); 404 | final encoder = Encoder.label( 405 | trainingDataFrame, 406 | columnNames: ['second', 'third', 'fourth'], 407 | unknownValueHandlingType: UnknownValueHandlingType.ignore, 408 | ); 409 | final actual = encoder.process(unseenDataDataframe).toMatrix(); 410 | final expected = [ 411 | [ 412 | 1, 413 | 0, 414 | 3, 415 | 0, 416 | ], 417 | [ 418 | 10, 419 | 0, 420 | 1, 421 | 1, 422 | ], 423 | [ 424 | 11, 425 | 1, 426 | 3, 427 | 0, 428 | ], 429 | ]; 430 | 431 | expect(actual, expected); 432 | }); 433 | }); 434 | }); 435 | } 436 | -------------------------------------------------------------------------------- /test/encoder/series_encoder/label_series_encoder_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/label_series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('LabelSeriesEncoder', () { 8 | test('should encode given series creating a collection of new series', () { 9 | final series = 10 | 
Series('just_header', ['q', '2ee', '0030', '123']); 11 | final encoder = LabelSeriesEncoder(series); 12 | final encoded = encoder.encodeSeries(series).toList(); 13 | 14 | expect(encoded, hasLength(1)); 15 | expect(encoded[0].data, equals([0, 1, 2, 3])); 16 | expect(encoded[0].isDiscrete, isTrue); 17 | }); 18 | 19 | test( 20 | 'should use source series header as a header of encoded one if ' 21 | 'neither header prefix nor header postfix are specified', () { 22 | final series = 23 | Series('just_header', ['q', '2ee', '0030', '123']); 24 | final encoder = LabelSeriesEncoder(series); 25 | final encoded = encoder.encodeSeries(series).toList(); 26 | 27 | expect(encoded, hasLength(1)); 28 | expect(encoded[0].name, 'just_header'); 29 | expect(encoded[0].isDiscrete, isTrue); 30 | }); 31 | 32 | test('should encode given series with repeating values', () { 33 | final series = Series('just_header', 34 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 35 | final encoder = LabelSeriesEncoder(series); 36 | final encoded = encoder.encodeSeries(series).toList(); 37 | 38 | expect(encoded, hasLength(1)); 39 | expect(encoded[0].data, equals([0, 1, 0, 0, 2, 3, 2])); 40 | expect(encoded[0].isDiscrete, isTrue); 41 | }); 42 | 43 | test('should consider given series name prefix', () { 44 | final series = Series('just_header', 45 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 46 | final encoder = LabelSeriesEncoder(series, headerPrefix: 'pref_'); 47 | final encoded = encoder.encodeSeries(series).toList(); 48 | 49 | expect(encoded, hasLength(1)); 50 | expect(encoded[0].name, 'pref_just_header'); 51 | expect(encoded[0].isDiscrete, isTrue); 52 | }); 53 | 54 | test('should consider given series name postfix', () { 55 | final series = Series('just_header', 56 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 57 | final encoder = LabelSeriesEncoder(series, headerPostfix: '_postf'); 58 | final encoded = encoder.encodeSeries(series).toList(); 59 | 60 | expect(encoded, hasLength(1)); 61 | 
expect(encoded[0].name, 'just_header_postf'); 62 | expect(encoded[0].isDiscrete, isTrue); 63 | }); 64 | 65 | test( 66 | 'should consider both given series name postfix and series name ' 67 | 'prefix', () { 68 | final series = Series('just_header', 69 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 70 | final encoder = LabelSeriesEncoder(series, 71 | headerPrefix: 'pref_', headerPostfix: '_postf'); 72 | final encoded = encoder.encodeSeries(series).toList(); 73 | 74 | expect(encoded, hasLength(1)); 75 | expect(encoded[0].name, 'pref_just_header_postf'); 76 | expect(encoded[0].isDiscrete, isTrue); 77 | }); 78 | 79 | test('should use fitted data to encode new one', () { 80 | final fittingData = Series('just_header', 81 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 82 | final encoder = LabelSeriesEncoder(fittingData, 83 | headerPrefix: 'pref_', headerPostfix: '_postf'); 84 | 85 | final newData = Series('just_header', 86 | ['q', 'q', 'q', 'q', '2ee', '2ee', '0030', 'q', '0030']); 87 | final encoded = encoder.encodeSeries(newData).toList(); 88 | 89 | expect(encoded, hasLength(1)); 90 | expect(encoded[0].data, equals([0, 0, 0, 0, 1, 1, 2, 0, 2])); 91 | expect(encoded[0].isDiscrete, isTrue); 92 | }); 93 | 94 | test( 95 | 'should throw error if unknown value handling strategy type is "throw ' 96 | 'error" and unknown value is encountered', () { 97 | final fittingData = Series('just_header', 98 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 99 | final encoder = LabelSeriesEncoder(fittingData, 100 | unknownValueHandlingType: UnknownValueHandlingType.error); 101 | final unknownValue = 'unknown_value'; 102 | final newData = Series('awesome_series', [ 103 | 'q', 104 | 'q', 105 | 'q', 106 | unknownValue, 107 | '2ee', 108 | '2ee', 109 | '0030', 110 | 'q', 111 | '0030' 112 | ]); 113 | 114 | final actual = () => 115 | encoder.encodeSeries(newData).map((series) => series.data.toList()); 116 | 117 | expect(actual, throwsException); 118 | }); 119 | 120 | test( 121 | 'should
encode unknown value as the last index of all labels if ' 122 | 'unknown value handling strategy is "ignore" and unknown value is ' 123 | 'encountered', () { 124 | final fittingData = Series('just_header', 125 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 126 | final encoder = LabelSeriesEncoder(fittingData, 127 | unknownValueHandlingType: UnknownValueHandlingType.ignore); 128 | final unknownValue = 'unknown_value'; 129 | final newData = Series('awesome_series', [ 130 | 'q', 131 | 'q', 132 | 'q', 133 | unknownValue, 134 | '2ee', 135 | '2ee', 136 | '0030', 137 | 'q', 138 | '0030' 139 | ]); 140 | final encoded = encoder.encodeSeries(newData).toList(); 141 | 142 | expect(encoded, hasLength(1)); 143 | expect(encoded[0].data, equals([0, 0, 0, 4, 1, 1, 2, 0, 2])); 144 | expect(encoded[0].isDiscrete, isTrue); 145 | }); 146 | }); 147 | } 148 | -------------------------------------------------------------------------------- /test/encoder/series_encoder/one_hot_series_encoder_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/one_hot_series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('OneHotSeriesEncoder', () { 8 | test('should encode given series creating a collection of new series', () { 9 | final series = 10 | Series('just_header', ['q', '2ee', '0030', '123']); 11 | final encoder = OneHotSeriesEncoder(series); 12 | final encoded = encoder.encodeSeries(series).toList(); 13 | 14 | expect(encoded, hasLength(4)); 15 | 16 | expect(encoded[0].data, equals([1, 0, 0, 0])); 17 | expect(encoded[1].data, equals([0, 1, 0, 0])); 18 | expect(encoded[2].data, equals([0, 0, 1, 0])); 19 | expect(encoded[3].data, equals([0, 0, 0, 1])); 20 | 21 | expect(encoded[0].isDiscrete, isTrue); 22 |
expect(encoded[1].isDiscrete, isTrue); 23 | expect(encoded[2].isDiscrete, isTrue); 24 | expect(encoded[3].isDiscrete, isTrue); 25 | }); 26 | 27 | test( 28 | 'should use categorical value as encoded series headers if neither ' 29 | 'header prefix nor header postfix are specified', () { 30 | final series = 31 | Series('just_header', ['q', '2ee', '0030', '123']); 32 | final encoder = OneHotSeriesEncoder(series); 33 | final encoded = encoder.encodeSeries(series).toList(); 34 | 35 | expect(encoded, hasLength(4)); 36 | 37 | expect(encoded[0].name, 'q'); 38 | expect(encoded[1].name, '2ee'); 39 | expect(encoded[2].name, '0030'); 40 | expect(encoded[3].name, '123'); 41 | 42 | expect(encoded[0].isDiscrete, isTrue); 43 | expect(encoded[1].isDiscrete, isTrue); 44 | expect(encoded[2].isDiscrete, isTrue); 45 | expect(encoded[3].isDiscrete, isTrue); 46 | }); 47 | 48 | test('should encode given series with repeating values', () { 49 | final series = Series('just_header', 50 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 51 | final encoder = OneHotSeriesEncoder(series); 52 | final encoded = encoder.encodeSeries(series).toList(); 53 | 54 | expect(encoded, hasLength(4)); 55 | 56 | expect(encoded[0].data, equals([1, 0, 1, 1, 0, 0, 0])); 57 | expect(encoded[1].data, equals([0, 1, 0, 0, 0, 0, 0])); 58 | expect(encoded[2].data, equals([0, 0, 0, 0, 1, 0, 1])); 59 | expect(encoded[3].data, equals([0, 0, 0, 0, 0, 1, 0])); 60 | 61 | expect(encoded[0].isDiscrete, isTrue); 62 | expect(encoded[1].isDiscrete, isTrue); 63 | expect(encoded[2].isDiscrete, isTrue); 64 | expect(encoded[3].isDiscrete, isTrue); 65 | }); 66 | 67 | test('should consider given series name prefix', () { 68 | final series = Series('just_header', 69 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 70 | final encoder = OneHotSeriesEncoder(series, headerPrefix: 'pref_'); 71 | final encoded = encoder.encodeSeries(series).toList(); 72 | 73 | expect(encoded, hasLength(4)); 74 | 75 | expect(encoded[0].name, 'pref_q'); 76 |
expect(encoded[1].name, 'pref_2ee'); 77 | expect(encoded[2].name, 'pref_0030'); 78 | expect(encoded[3].name, 'pref_123'); 79 | 80 | expect(encoded[0].isDiscrete, isTrue); 81 | expect(encoded[1].isDiscrete, isTrue); 82 | expect(encoded[2].isDiscrete, isTrue); 83 | expect(encoded[3].isDiscrete, isTrue); 84 | }); 85 | 86 | test('should consider given series name postfix', () { 87 | final series = Series('just_header', 88 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 89 | final encoder = OneHotSeriesEncoder(series, headerPostfix: '_postf'); 90 | final encoded = encoder.encodeSeries(series).toList(); 91 | 92 | expect(encoded, hasLength(4)); 93 | 94 | expect(encoded[0].name, 'q_postf'); 95 | expect(encoded[1].name, '2ee_postf'); 96 | expect(encoded[2].name, '0030_postf'); 97 | expect(encoded[3].name, '123_postf'); 98 | 99 | expect(encoded[0].isDiscrete, isTrue); 100 | expect(encoded[1].isDiscrete, isTrue); 101 | expect(encoded[2].isDiscrete, isTrue); 102 | expect(encoded[3].isDiscrete, isTrue); 103 | }); 104 | 105 | test( 106 | 'should consider both given series name postfix and series name ' 107 | 'prefix', () { 108 | final series = Series('just_header', 109 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 110 | final encoder = OneHotSeriesEncoder(series, 111 | headerPrefix: 'pref_', headerPostfix: '_postf'); 112 | final encoded = encoder.encodeSeries(series).toList(); 113 | 114 | expect(encoded, hasLength(4)); 115 | 116 | expect(encoded[0].name, 'pref_q_postf'); 117 | expect(encoded[1].name, 'pref_2ee_postf'); 118 | expect(encoded[2].name, 'pref_0030_postf'); 119 | expect(encoded[3].name, 'pref_123_postf'); 120 | 121 | expect(encoded[0].isDiscrete, isTrue); 122 | expect(encoded[1].isDiscrete, isTrue); 123 | expect(encoded[2].isDiscrete, isTrue); 124 | expect(encoded[3].isDiscrete, isTrue); 125 | }); 126 | 127 | test('should use fitted data to encode new one', () { 128 | final fittingData = Series('just_header', 129 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 
130 | final encoder = OneHotSeriesEncoder(fittingData, 131 | headerPrefix: 'pref_', headerPostfix: '_postf'); 132 | 133 | final newData = Series('just_header', 134 | ['q', 'q', 'q', 'q', '2ee', '2ee', '0030', 'q', '0030']); 135 | final encoded = encoder.encodeSeries(newData).toList(); 136 | 137 | expect(encoded, hasLength(4)); 138 | 139 | expect(encoded[0].data, equals([1, 1, 1, 1, 0, 0, 0, 1, 0])); 140 | expect(encoded[1].data, equals([0, 0, 0, 0, 1, 1, 0, 0, 0])); 141 | expect(encoded[2].data, equals([0, 0, 0, 0, 0, 0, 1, 0, 1])); 142 | expect(encoded[3].data, equals([0, 0, 0, 0, 0, 0, 0, 0, 0])); 143 | 144 | expect(encoded[0].isDiscrete, isTrue); 145 | expect(encoded[1].isDiscrete, isTrue); 146 | expect(encoded[2].isDiscrete, isTrue); 147 | expect(encoded[3].isDiscrete, isTrue); 148 | }); 149 | 150 | test( 151 | 'should throw error if unknown value handling strategy is to throw ' 152 | 'error and unknown value is encountered', () { 153 | final fittingData = Series('just_header', 154 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 155 | final encoder = OneHotSeriesEncoder(fittingData, 156 | unknownValueHandlingType: UnknownValueHandlingType.error); 157 | final unknownValue = 'unknown_value'; 158 | final newData = Series('awesome_series', [ 159 | 'q', 160 | 'q', 161 | 'q', 162 | unknownValue, 163 | '2ee', 164 | '2ee', 165 | '0030', 166 | 'q', 167 | '0030' 168 | ]); 169 | 170 | final actual = () => 171 | encoder.encodeSeries(newData).map((series) => series.data.toList()); 172 | 173 | expect(actual, throwsException); 174 | }); 175 | 176 | test( 177 | 'should encode unknown value as 0 if unknown value handling strategy ' 178 | 'is to ignore and unknown value is encountered', () { 179 | final fittingData = Series('just_header', 180 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 181 | final encoder = OneHotSeriesEncoder(fittingData, 182 | unknownValueHandlingType: UnknownValueHandlingType.ignore); 183 | final unknownValue = 'unknown_value'; 184 | final newData =
Series('awesome_series', [ 185 | 'q', 186 | 'q', 187 | 'q', 188 | unknownValue, 189 | '2ee', 190 | '2ee', 191 | '0030', 192 | 'q', 193 | '0030' 194 | ]); 195 | final encoded = encoder.encodeSeries(newData).toList(); 196 | 197 | expect(encoded, hasLength(4)); 198 | 199 | expect(encoded[0].data, equals([1, 1, 1, 0, 0, 0, 0, 1, 0])); 200 | expect(encoded[1].data, equals([0, 0, 0, 0, 1, 1, 0, 0, 0])); 201 | expect(encoded[2].data, equals([0, 0, 0, 0, 0, 0, 1, 0, 1])); 202 | expect(encoded[3].data, equals([0, 0, 0, 0, 0, 0, 0, 0, 0])); 203 | 204 | expect(encoded[0].isDiscrete, isTrue); 205 | expect(encoded[1].isDiscrete, isTrue); 206 | expect(encoded[2].isDiscrete, isTrue); 207 | expect(encoded[3].isDiscrete, isTrue); 208 | }); 209 | }); 210 | } 211 | -------------------------------------------------------------------------------- /test/encoder/series_encoder/series_encoder_factory_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/label_series_encoder.dart'; 4 | import 'package:ml_preprocessing/src/encoder/series_encoder/one_hot_series_encoder.dart'; 5 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 6 | import 'package:test/test.dart'; 7 | 8 | void main() { 9 | group('SeriesEncoderFactoryImpl', () { 10 | final factory = const SeriesEncoderFactoryImpl(); 11 | final series = Series( 12 | 'some_series', 13 | ['value_1', 'value_2', 'value_3'], 14 | isDiscrete: true, 15 | ); 16 | 17 | test('should create LabelSeriesEncoder', () { 18 | final encoderType = EncoderType.label; 19 | final actual = factory.createByType(encoderType, series); 20 | final expected = isA(); 21 | 22 | expect(actual, expected); 23 | }); 24 | 25 | test('should create OneHotSeriesEncoder', () { 26 | final encoderType = 
EncoderType.oneHot; 27 | final actual = factory.createByType(encoderType, series); 28 | final expected = isA(); 29 | 30 | expect(actual, expected); 31 | }); 32 | }); 33 | } 34 | -------------------------------------------------------------------------------- /test/helpers.dart: -------------------------------------------------------------------------------- 1 | import 'package:test/test.dart'; 2 | 3 | Matcher iterable2dAlmostEqualTo(Iterable> expected, 4 | [double precision = 1e-5]) => 5 | pairwiseCompare, Iterable>(expected, 6 | (Iterable expected, Iterable actual) { 7 | if (expected.length != actual.length) { 8 | return false; 9 | } 10 | for (var i = 0; i < expected.length; i++) { 11 | if ((expected.elementAt(i) - actual.elementAt(i)).abs() >= precision) { 12 | return false; 13 | } 14 | } 15 | return true; 16 | }, ''); 17 | 18 | Matcher iterableAlmostEqualTo(Iterable expected, 19 | [double precision = 1e-5]) => 20 | pairwiseCompare( 21 | expected, 22 | (expectedVal, actualVal) => 23 | (expectedVal - actualVal).abs() <= precision, 24 | ''); 25 | -------------------------------------------------------------------------------- /test/normalizer/normalize_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/normalizer/normalize.dart'; 3 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('normalize', () { 8 | test('should return normalizer factory', () { 9 | final normalizerFactory = normalize(); 10 | final normalizer = normalizerFactory(DataFrame([])); 11 | 12 | expect(normalizer, isA()); 13 | }); 14 | }); 15 | } 16 | -------------------------------------------------------------------------------- /test/normalizer/normalizer_test.dart: -------------------------------------------------------------------------------- 1 | import 
'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/linalg.dart'; 3 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | import '../helpers.dart'; 7 | 8 | void main() { 9 | group('Normalizer', () { 10 | test( 11 | 'should divide each row-vector by its euclidean norm and preserve ' 12 | 'the header of the input dataframe', () { 13 | final header = ['first', 'second', 'third']; 14 | final data = Matrix.fromList([ 15 | [10, 20, 30], 16 | [40, 50, 60], 17 | [90, 80, 70], 18 | [190, 180, 170], 19 | ]); 20 | final input = DataFrame.fromMatrix(data, header: header); 21 | final normalizer = Normalizer(); 22 | final transformed = normalizer.process(input); 23 | 24 | expect(transformed.header, equals(header)); 25 | expect( 26 | transformed.toMatrix(), 27 | iterable2dAlmostEqualTo([ 28 | [0.267, 0.534, 0.801], 29 | [0.455, 0.569, 0.683], 30 | [0.646, 0.574, 0.502], 31 | [0.608, 0.576, 0.544], 32 | ], 1e-3)); 33 | }); 34 | 35 | test( 36 | 'should divide each row-vector by its manhattan norm and preserve ' 37 | 'the header of the input dataframe', () { 38 | final header = ['first', 'second', 'third']; 39 | final data = Matrix.fromList([ 40 | [10, 20, 30], 41 | [40, 50, 60], 42 | [90, 80, 70], 43 | [190, 180, 170], 44 | ]); 45 | final input = DataFrame.fromMatrix(data, header: header); 46 | final normalizer = Normalizer(Norm.manhattan); 47 | final transformed = normalizer.process(input); 48 | 49 | expect(transformed.header, equals(header)); 50 | expect( 51 | transformed.toMatrix(), 52 | iterable2dAlmostEqualTo([ 53 | [0.166, 0.333, 0.500], 54 | [0.266, 0.333, 0.400], 55 | [0.375, 0.333, 0.291], 56 | [0.351, 0.333, 0.314], 57 | ], 1e-3)); 58 | }); 59 | }); 60 | } 61 | -------------------------------------------------------------------------------- /test/pipeline/pipeline_integration_test.dart: -------------------------------------------------------------------------------- 1 | import 
'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/to_integer_labels.dart'; 3 | import 'package:ml_preprocessing/src/encoder/to_one_hot_labels.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | void main() { 8 | group('Pipeline', () { 9 | test( 10 | 'should process steps, which return dataframes of different series ' 11 | 'number', () { 12 | final fittingData = DataFrame([ 13 | ['first', 'second', 'third', 'fourth'], 14 | [1, 'F', 'category_val_1', 10], 15 | [10, 'F', 'category_val_2', 20], 16 | [11, 'M', 'category_val_1', 10], 17 | [21, 'F', 'category_val_2', 30], 18 | [44, 'M', 'category_val_1', 10], 19 | [43, 'M', 'category_val_1', 30], 20 | [55, 'F', 'category_val_3', 10], 21 | ]); 22 | 23 | final pipeline = Pipeline(fittingData, [ 24 | toOneHotLabels(columnIndices: [1]), 25 | toOneHotLabels(columnIndices: [2, 3]), 26 | toIntegerLabels(columnNames: ['first']), 27 | ]); 28 | 29 | final result = pipeline.process(fittingData); 30 | 31 | expect( 32 | result.toMatrix(), 33 | equals([ 34 | [ 35 | 0, 36 | 1, 37 | 0, 38 | 1, 39 | 0, 40 | 0, 41 | 1, 42 | 0, 43 | 0, 44 | ], 45 | [ 46 | 1, 47 | 1, 48 | 0, 49 | 0, 50 | 1, 51 | 0, 52 | 0, 53 | 1, 54 | 0, 55 | ], 56 | [ 57 | 2, 58 | 0, 59 | 1, 60 | 1, 61 | 0, 62 | 0, 63 | 1, 64 | 0, 65 | 0, 66 | ], 67 | [ 68 | 3, 69 | 1, 70 | 0, 71 | 0, 72 | 1, 73 | 0, 74 | 0, 75 | 0, 76 | 1, 77 | ], 78 | [ 79 | 4, 80 | 0, 81 | 1, 82 | 1, 83 | 0, 84 | 0, 85 | 1, 86 | 0, 87 | 0, 88 | ], 89 | [ 90 | 5, 91 | 0, 92 | 1, 93 | 1, 94 | 0, 95 | 0, 96 | 0, 97 | 0, 98 | 1, 99 | ], 100 | [ 101 | 6, 102 | 1, 103 | 0, 104 | 0, 105 | 0, 106 | 1, 107 | 1, 108 | 0, 109 | 0, 110 | ], 111 | ])); 112 | }); 113 | 114 | test('should not rewrite previously encoded series', () { 115 | final fittingData = DataFrame([ 116 | ['first', 'second', 'third', 'fourth'], 117 | [1, 'F', 'category_val_1', 10], 118 | [10, 'F', 'category_val_2', 20], 119 | [11, 'M', 
'category_val_1', 10], 120 | [21, 'F', 'category_val_2', 30], 121 | [44, 'M', 'category_val_1', 10], 122 | [43, 'M', 'category_val_1', 30], 123 | [55, 'F', 'category_val_3', 10], 124 | ]); 125 | 126 | final pipeline = Pipeline(fittingData, [ 127 | toOneHotLabels(columnIndices: [1, 2]), 128 | toIntegerLabels(columnIndices: [0, 1, 3]), 129 | ]); 130 | 131 | final result = pipeline.process(fittingData); 132 | 133 | expect( 134 | result.rows, 135 | equals([ 136 | [ 137 | 0, 138 | 1, 139 | 0, 140 | 1, 141 | 0, 142 | 0, 143 | 0, 144 | ], 145 | [ 146 | 1, 147 | 1, 148 | 0, 149 | 0, 150 | 1, 151 | 0, 152 | 1, 153 | ], 154 | [ 155 | 2, 156 | 0, 157 | 1, 158 | 1, 159 | 0, 160 | 0, 161 | 0, 162 | ], 163 | [ 164 | 3, 165 | 1, 166 | 0, 167 | 0, 168 | 1, 169 | 0, 170 | 2, 171 | ], 172 | [ 173 | 4, 174 | 0, 175 | 1, 176 | 1, 177 | 0, 178 | 0, 179 | 0, 180 | ], 181 | [ 182 | 5, 183 | 0, 184 | 1, 185 | 1, 186 | 0, 187 | 0, 188 | 2, 189 | ], 190 | [ 191 | 6, 192 | 1, 193 | 0, 194 | 0, 195 | 0, 196 | 1, 197 | 0, 198 | ], 199 | ])); 200 | }); 201 | }); 202 | } 203 | -------------------------------------------------------------------------------- /test/pipeline/pipeline_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | class Plus10Processor implements Pipeable { 7 | @override 8 | DataFrame process(DataFrame input) => DataFrame.fromSeries( 9 | input.series.map((series) => Series( 10 | series.name, 11 | series.data.map((dynamic value) => value + 10), 12 | )), 13 | ); 14 | } 15 | 16 | class MultipleBy2Processor implements Pipeable { 17 | @override 18 | DataFrame process(DataFrame input) => DataFrame.fromSeries( 19 | input.series.map((series) => Series( 20 | series.name, 21 | series.data.map((dynamic value) => value * 2), 22 | 
)), 23 | ); 24 | } 25 | 26 | void main() { 27 | group('Pipeline', () { 28 | final fittingData = DataFrame([[]], headerExists: false); 29 | 30 | final targetData = DataFrame([ 31 | [20, 10, 30, 30], 32 | [30, 90, 20, 60], 33 | [40, 70, 50, 10], 34 | ], headerExists: false); 35 | 36 | test('should create a pipeline with predefined steps', () { 37 | final pipeline = Pipeline(fittingData, [ 38 | (data, {dtype}) => Plus10Processor(), 39 | (data, {dtype}) => MultipleBy2Processor(), 40 | ]); 41 | 42 | final result = pipeline.process(targetData); 43 | 44 | expect( 45 | result.toMatrix(), 46 | equals([ 47 | [60, 40, 80, 80], 48 | [80, 200, 60, 140], 49 | [100, 160, 120, 40], 50 | ])); 51 | }); 52 | }); 53 | } 54 | -------------------------------------------------------------------------------- /test/standardizer/standardize_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/standardizer/standardize.dart'; 4 | import 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | void main() { 8 | group('standardize', () { 9 | final dtype = DType.float32; 10 | 11 | test('should return a Standardizer factory function', () { 12 | final fittingData = DataFrame(>[ 13 | [1, 2, 3], 14 | ], headerExists: false); 15 | 16 | final standardizerFactory = standardize(); 17 | final standardizer = standardizerFactory(fittingData, dtype: dtype); 18 | 19 | expect(standardizer, isA()); 20 | }); 21 | }); 22 | } 23 | -------------------------------------------------------------------------------- /test/standardizer/standardizer_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_linalg/matrix.dart'; 4 | import 
'package:ml_preprocessing/ml_preprocessing.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | import '../helpers.dart'; 8 | 9 | void main() { 10 | const dtype = DType.float32; 11 | 12 | group('Standardizer', () { 13 | test( 14 | 'should extract deviation and mean values from fitting data and apply ' 15 | 'them to the same data in order to make the latter look like normally' 16 | 'distributed data (with zero mean and unit variance)', () { 17 | final fittingData = DataFrame(>[ 18 | [10, 21, 90, 20], 19 | [20, 66, 11, 30], 20 | [30, 55, 0, 70], 21 | [40, 33, 22, 20], 22 | ], headerExists: false); 23 | 24 | final standardizer = Standardizer(fittingData, dtype: dtype); 25 | final processed = standardizer.process(fittingData); 26 | 27 | expect( 28 | processed.toMatrix(dtype), 29 | iterable2dAlmostEqualTo([ 30 | [-1.34164079, -1.28449611, 1.68894093, -0.72760688], 31 | [-0.4472136, 1.25626543, -0.56298031, -0.24253563], 32 | [0.4472136, 0.63519039, -0.87653896, 1.69774938], 33 | [1.34164079, -0.6069597, -0.24942166, -0.72760688], 34 | ])); 35 | }); 36 | 37 | test( 38 | 'should extract deviation and mean values from fitting data and apply ' 39 | 'them to the previously unseen data in order to make the latter look ' 40 | 'like normally distributed data (with zero mean and unit ' 41 | 'variance)', () { 42 | final fittingData = DataFrame(>[ 43 | [10, 21, 90, 20], 44 | [20, 66, 11, 30], 45 | [30, 55, 0, 70], 46 | [40, 33, 22, 20], 47 | ], headerExists: false); 48 | 49 | final testData = DataFrame(>[ 50 | [80, 20, 11, -100], 51 | [90, -40, 27, 0], 52 | [10, 44, 96, 120], 53 | [50, -99, 73, 10], 54 | [88, -20, 36, 66], 55 | ], headerExists: false); 56 | 57 | final standardizer = Standardizer(fittingData, dtype: dtype); 58 | final processed = standardizer.process(testData); 59 | 60 | expect( 61 | processed.toMatrix(dtype), 62 | iterable2dAlmostEqualTo([ 63 | [4.91934955, -1.34095748, -0.56298031, -6.54846188], 64 | [5.81377674, -4.72863954, -0.106895, -1.69774938], 65 | 
[-1.34164079, 0.01411534, 1.85997292, 4.12310563], 66 | [2.23606798, -8.05986023, 1.20435028, -1.21267813], 67 | [5.6348913, -3.59941219, 0.14965299, 1.50372088], 68 | ])); 69 | }); 70 | 71 | test( 72 | 'should extract deviation and mean values from fitting data and apply ' 73 | 'them to the previously unseen data twice or more', () { 74 | final fittingData = DataFrame(>[ 75 | [10, 21, 90, 20], 76 | [20, 66, 11, 30], 77 | [30, 55, 0, 70], 78 | [40, 33, 22, 20], 79 | ], headerExists: false); 80 | 81 | final testData1 = DataFrame(>[ 82 | [80, 20, 11, -100], 83 | [90, -40, 27, 0], 84 | [10, 44, 96, 120], 85 | [50, -99, 73, 10], 86 | [88, -20, 36, 66], 87 | ], headerExists: false); 88 | 89 | final testData2 = DataFrame(>[ 90 | [1, 200, 33, 1000], 91 | [2, -440, 29, 0], 92 | [3, 414, 9, 0], 93 | ], headerExists: false); 94 | 95 | final standardizer = Standardizer(fittingData, dtype: dtype); 96 | 97 | final processed1 = standardizer.process(testData1); 98 | final processed2 = standardizer.process(testData2); 99 | 100 | expect( 101 | processed1.toMatrix(dtype), 102 | iterable2dAlmostEqualTo([ 103 | [4.91934955, -1.34095748, -0.56298031, -6.54846188], 104 | [5.81377674, -4.72863954, -0.106895, -1.69774938], 105 | [-1.34164079, 0.01411534, 1.85997292, 4.12310563], 106 | [2.23606798, -8.05986023, 1.20435028, -1.21267813], 107 | [5.6348913, -3.59941219, 0.14965299, 1.50372088], 108 | ])); 109 | 110 | expect( 111 | processed2.toMatrix(dtype), 112 | iterable2dAlmostEqualTo([ 113 | [-2.14662526, 8.82208869, 0.064137, 46.80937563], 114 | [-2.05718254, -27.31318658, -0.04988433, -1.69774938], 115 | [-1.96773982, 20.90482136, -0.61999097, -1.69774938], 116 | ])); 117 | }); 118 | 119 | test('should process a dataframe with only one column', () { 120 | final fittingData = DataFrame(>[ 121 | [10], 122 | [20], 123 | [30], 124 | [40], 125 | ], headerExists: false); 126 | 127 | final testData = DataFrame(>[ 128 | [80], 129 | [90], 130 | [10], 131 | [50], 132 | [88], 133 | ], 
headerExists: false); 134 | 135 | final standardizer = Standardizer(fittingData, dtype: dtype); 136 | final processed = standardizer.process(testData); 137 | 138 | expect( 139 | processed.toMatrix(dtype), 140 | iterable2dAlmostEqualTo([ 141 | [4.91934955], 142 | [5.81377674], 143 | [-1.34164079], 144 | [2.23606798], 145 | [5.6348913], 146 | ])); 147 | }); 148 | 149 | test('should process a dataframe with only one row', () { 150 | final fittingData = DataFrame(>[ 151 | [10, 21, 90, 20], 152 | ], headerExists: false); 153 | 154 | final testData = DataFrame(>[ 155 | [80, 20, 11, -100], 156 | [90, -40, 27, 0], 157 | [10, 44, 96, 120], 158 | [50, -99, 73, 10], 159 | [88, -20, 36, 66], 160 | ], headerExists: false); 161 | 162 | final standardizer = Standardizer(fittingData, dtype: dtype); 163 | final processed = standardizer.process(testData); 164 | 165 | expect( 166 | processed.toMatrix(dtype), 167 | equals([ 168 | [70, -1, -79, -120], 169 | [80, -61, -63, -20], 170 | [0, 23, 6, 100], 171 | [40, -120, -17, -10], 172 | [78, -41, -54, 46], 173 | ])); 174 | }); 175 | 176 | test('should make deviation of uniform columns equal to 1', () { 177 | final uniformColumn = Matrix.fromList([ 178 | [10], 179 | [10], 180 | [10], 181 | [10], 182 | ]); 183 | 184 | final otherColumns = Matrix.fromList([ 185 | [21, 90, 20], 186 | [66, 11, 30], 187 | [55, 0, 70], 188 | [33, 22, 20], 189 | ]); 190 | 191 | final fittingData = DataFrame.fromMatrix( 192 | Matrix.fromColumns([ 193 | ...uniformColumn.columns, 194 | ...otherColumns.columns, 195 | ], dtype: dtype), 196 | ); 197 | 198 | final testData = DataFrame(>[ 199 | [80, 20, 11, -100], 200 | [90, -40, 27, 0], 201 | [10, 44, 96, 120], 202 | [50, -99, 73, 10], 203 | [88, -20, 36, 66], 204 | ], headerExists: false); 205 | 206 | final standardizer = Standardizer(fittingData, dtype: dtype); 207 | final processed = standardizer.process(testData); 208 | 209 | expect( 210 | processed.toMatrix(dtype), 211 | iterable2dAlmostEqualTo([ 212 | [70, 
-1.34095748, -0.56298031, -6.54846188], 213 | [80, -4.72863954, -0.106895, -1.69774938], 214 | [0, 0.01411534, 1.85997292, 4.12310563], 215 | [40, -8.05986023, 1.20435028, -1.21267813], 216 | [78, -3.59941219, 0.14965299, 1.50372088], 217 | ])); 218 | }); 219 | 220 | test( 221 | 'should throw an exception if one tries to apply standardizer to a ' 222 | 'dataframe of inappropriate dimension (columns number in the test ' 223 | 'dataframe should be equal to a number of columns in the fitting ' 224 | 'dataframe)', () { 225 | final fittingData = DataFrame(>[ 226 | [10, 21, 90, 20], 227 | [20, 66, 11, 30], 228 | [30, 55, 0, 70], 229 | [40, 33, 22, 20], 230 | ], headerExists: false); 231 | 232 | final testData = DataFrame(>[ 233 | [80, 20, 11], 234 | [90, -40, 27], 235 | [10, 44, 96], 236 | [50, -99, 73], 237 | [88, -20, 36], 238 | ], headerExists: false); 239 | 240 | final standardizer = Standardizer(fittingData, dtype: dtype); 241 | 242 | expect(() => standardizer.process(testData), throwsException); 243 | }); 244 | 245 | test( 246 | 'should throw an exception if one tries to create a standardizer ' 247 | 'using empty dataframe', () { 248 | final fittingData = DataFrame(>[[]], headerExists: false); 249 | 250 | expect( 251 | () => Standardizer(fittingData, dtype: dtype), 252 | throwsException, 253 | ); 254 | }); 255 | }); 256 | } 257 | --------------------------------------------------------------------------------