├── .github └── workflows │ └── ci_pipeline.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── analysis_options.yaml ├── autotest.sh ├── benchmark └── main.dart ├── example ├── black_friday │ ├── black_friday.csv │ └── black_friday.dart ├── dataset.csv └── main.dart ├── lib ├── ml_preprocessing.dart └── src │ ├── encoder │ ├── encoder.dart │ ├── encoder_impl.dart │ ├── encoder_type.dart │ ├── helpers │ │ ├── create_encoder_to_series_mapping.dart │ │ └── get_series_names_by_indices.dart │ ├── series_encoder │ │ ├── label_series_encoder.dart │ │ ├── one_hot_series_encoder.dart │ │ ├── series_encoder.dart │ │ ├── series_encoder_factory.dart │ │ └── series_encoder_factory_impl.dart │ ├── to_integer_labels.dart │ ├── to_one_hot_labels.dart │ └── unknown_value_handling_type.dart │ ├── normalizer │ ├── normalize.dart │ ├── normalizer.dart │ └── normalizer_impl.dart │ ├── pipeline │ ├── pipeable.dart │ ├── pipeline.dart │ └── pipeline_impl.dart │ └── standardizer │ ├── standardize.dart │ ├── standardizer.dart │ └── standardizer_impl.dart ├── pubspec.yaml └── test ├── encoder ├── encoder_impl_test.dart └── series_encoder │ ├── label_series_encoder_test.dart │ ├── one_hot_series_encoder_test.dart │ └── series_encoder_factory_impl.dart ├── helpers.dart ├── normalizer ├── normalize_test.dart └── normalizer_test.dart ├── pipeline ├── pipeline_integration_test.dart └── pipeline_test.dart └── standardizer ├── standardize_test.dart └── standardizer_test.dart /.github/workflows/ci_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: CI pipeline 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: dart-lang/setup-dart@v1 16 | 17 | - name: Print Dart SDK version 18 | run: dart --version 19 | 20 | - name: Install dependencies 21 | run: dart pub get 22 | 23 | - name: 
Verify formatting 24 | run: dart format --output=none --set-exit-if-changed . 25 | 26 | - name: Analyze project source 27 | run: dart analyze --fatal-infos 28 | 29 | - name: Run tests 30 | run: dart test 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | # See https://www.dartlang.org/guides/libraries/private-files 4 | 5 | # Files and directories created by pub 6 | .dart_tool/ 7 | .packages 8 | .pub/ 9 | build/ 10 | # If you're building an application, you may want to check-in your pubspec.lock 11 | pubspec.lock 12 | 13 | # Directory created by dartdoc 14 | # If you don't generate documentation locally you can remove this line. 15 | doc/api/ 16 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 7.0.2 4 | - `README.md`: 5 | - Fixed link to `black_friday` dataset 6 | 7 | ## 7.0.1 8 | - Added code formatting checking step to CI pipline 9 | - Corrected `README` examples 10 | - Added documentation to `Encoder` factory 11 | 12 | ## 7.0.0 13 | - `ml_datframe` 1.0.0 supported 14 | - `featureNames` parameter renamed to `columnNames` 15 | - `featureIds` parameter renamed to `columnIndices` 16 | - `encodeAsIntegerLabels` renamed to `toIntegerLabels` 17 | - `encodeAsOneHotLabels` renamed to `toOneHotLabels` 18 | 19 | ## 6.0.1 20 | - `pubspec.yaml`: `ml_dataframe` dependency updated 21 | 22 | ## 6.0.0 23 | - Null-safety added (stable release) 24 | 25 | ## 6.0.0-nullsafety.0 26 | - Null-safety added (beta release) 27 | 28 | ## 5.2.2 29 | - `ml_dataframe`: version 0.4.0 supported 30 | 31 | ## 5.2.1 32 | - `ml_dataframe`: version 0.3.0 supported 33 | - `CI`: github actions set up 34 | 35 | ## 5.2.0 36 | - `UnknownValueHandlingType` enum added to the lib's public API 37 | 38 | ## 5.1.2 
39 | - `ml_dataframe` 0.2.0 supported 40 | 41 | ## 5.1.1 42 | - `ml_dataframe` dependency updated 43 | 44 | ## 5.1.0 45 | - `Standardizer` entity added 46 | - `dtype` parameter added as an argument for `Pipeline.process` method 47 | 48 | ## 5.0.4 49 | - Default values for parameters `headerPrefix` and `headerPostfix` added where it applicable 50 | 51 | ## 5.0.3 52 | - `README` corrected (ml_dataframe version corrected) 53 | 54 | ## 5.0.2 55 | - `xrange` dependency removed 56 | - `ml_dataframe` 0.0.11 supported 57 | 58 | ## 5.0.1 59 | - `xrange` package version locked 60 | 61 | ## 5.0.0 62 | - `Encoder` interface changed: there is no more `encode` method, use `process` from `Pipeable` instead 63 | - `Normalizer` entity added 64 | - `normalize` operator added 65 | 66 | ## 4.0.0 67 | - `DataFrame` class split up into separate smaller entities 68 | - `DataFrame` class core moved to separate repository 69 | - `Pipeline` entity created 70 | - Categorical data encoders implemented `Pipeable` interface 71 | 72 | ## 3.4.0 73 | - `DataFrame`: `encodedColumnRanges` added 74 | 75 | ## 3.3.0 76 | - `ml_linalg` 10.0.0 supported 77 | 78 | ## 3.2.0 79 | - `ml_linalg` 9.0.0 supported 80 | 81 | ## 3.1.0 82 | - `Categorical data processing`: `encoders` parameter added to `DataFrame.fromCsv` constructor 83 | 84 | ## 3.0.0 85 | - `xrange` library supported: it's possible to provide `ZRange` object now instead of `tuple2` to specify a range of 86 | indices 87 | 88 | ## 2.0.0 89 | - `DataFrame` introduced 90 | 91 | ## 1.1.0 92 | - `Float32x4InterceptPreprocessor` added 93 | - `readme` updated 94 | 95 | ## 1.0.0 96 | - Package published 97 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, Ilya Gyrdymov 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://github.com/gyrdym/ml_preprocessing/workflows/CI%20pipeline/badge.svg)](https://github.com/gyrdym/ml_preprocessing/actions?query=branch%3Amaster+) 2 | [![Coverage Status](https://coveralls.io/repos/github/gyrdym/ml_preprocessing/badge.svg)](https://coveralls.io/github/gyrdym/ml_preprocessing) 3 | [![pub package](https://img.shields.io/pub/v/ml_preprocessing.svg)](https://pub.dartlang.org/packages/ml_preprocessing) 4 | [![Gitter Chat](https://badges.gitter.im/gyrdym/gyrdym.svg)](https://gitter.im/gyrdym/) 5 | 6 | # ml_preprocessing 7 | Data preprocessing algorithms 8 | 9 | ## What is data preprocessing? 10 | *Data preprocessing* is a set of techniques for data preparation before one can use the data in Machine Learning algorithms. 11 | 12 | ## Why is it needed? 13 | Let's say, you have a dataset: 14 | 15 | ```` 16 | ---------------------------------------------------------------------------------------- 17 | | Gender | Country | Height (cm) | Weight (kg) | Diabetes (1 - Positive, 0 - Negative) | 18 | ---------------------------------------------------------------------------------------- 19 | | Female | France | 165 | 55 | 1 | 20 | ---------------------------------------------------------------------------------------- 21 | | Female | Spain | 155 | 50 | 0 | 22 | ---------------------------------------------------------------------------------------- 23 | | Male | Spain | 175 | 75 | 0 | 24 | ---------------------------------------------------------------------------------------- 25 | | Male | Russia | 173 | 77 | N/A | 26 | ---------------------------------------------------------------------------------------- 27 | ```` 28 | 29 | Everything seems good for now. Say, you're about to train a classifier to predict if a person has diabetes. 
30 | But there is an obstacle - how can it be possible to use the data in mathematical equations with string-value columns 31 | (`Gender`, `Country`)? And things are getting even worse because of an empty (N/A) value in the `Diabetes` column. There 32 | should be a way to convert this data to a valid numerical representation. Here data preprocessing techniques come to play. 33 | You should decide, how to convert string data (aka *categorical data*) to numbers and how to treat empty values. Of 34 | course, you can come up with your unique algorithms to do all of these operations, but there are a lot of well-known 35 | techniques for doing all the conversions. 36 | 37 | The aim of the library is to give data scientists, who are interested in Dart programming language, these preprocessing 38 | techniques. 39 | 40 | ## Prerequisites 41 | 42 | The library depends on [DataFrame class](https://github.com/gyrdym/ml_dataframe/blob/master/lib/src/data_frame/data_frame.dart) 43 | from the [repo](https://github.com/gyrdym/ml_dataframe). It's necessary to use it as a dependency in your project, 44 | because you need to pack data into [DataFrame](https://github.com/gyrdym/ml_dataframe/blob/master/lib/src/data_frame/data_frame.dart) 45 | before doing preprocessing. An example with a part of pubspec.yaml: 46 | 47 | ```` 48 | dependencies: 49 | ... 50 | ml_dataframe: ^1.0.0 51 | ... 52 | ```` 53 | 54 | ## Usage examples 55 | 56 | ### Getting started 57 | 58 | Let's download some data from [Kaggle](https://www.kaggle.com) - let it be amazing [black friday](https://www.kaggle.com/datasets/sdolezel/black-friday) 59 | dataset. It's pretty interesting data with huge amount of observations (approx. 538000 rows) and a good number of 60 | categorical features. 
61 | 62 | First, import all necessary libraries: 63 | 64 | ````dart 65 | import 'package:ml_dataframe/ml_dataframe.dart'; 66 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 67 | ```` 68 | 69 | Then, we should read the csv and create a data frame: 70 | 71 | ````dart 72 | final dataFrame = await fromCsv('example/black_friday/black_friday.csv', 73 | columns: [2, 3, 5, 6, 7, 11]); 74 | ```` 75 | 76 | ### Categorical data 77 | 78 | After we get a dataframe, we may encode all the needed features. Let's analyze the dataset and decide, what features 79 | should be encoded. In our case these are: 80 | 81 | ````dart 82 | final featureNames = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status']; 83 | ```` 84 | 85 | ### One-hot encoding 86 | 87 | Let's fit the one-hot encoder. 88 | 89 | Why should we fit it? Categorical data encoder fitting - a process, when all the unique category values are being 90 | searched for in order to create an encoded labels list. After the fitting is complete, one may use the fitted encoder for 91 | the new data of the same source. 92 | 93 | In order to fit the encoder, it's needed to create the instance of the `Encoder` class and pass the fitting data as an 94 | argument to the constructor, along with the features to be encoded: 95 | 96 | 97 | ````dart 98 | final encoder = Encoder.oneHot( 99 | dataFrame, 100 | columnNames: featureNames, 101 | ); 102 | 103 | ```` 104 | 105 | Let's encode the features: 106 | 107 | ````dart 108 | final encoded = encoder.process(dataFrame); 109 | ```` 110 | 111 | We used the same dataframe here - it's absolutely normal since when we created the encoder, we just fit it with the 112 | dataframe, and now is the time to apply the dataframe to the fitted encoder. 113 | 114 | It's time to take a look at our processed data. 
Let's read it: 115 | 116 | ````dart 117 | final data = encoded.toMatrix(); 118 | 119 | print(data); 120 | ```` 121 | 122 | In the output we will see just numerical data, that's exactly what we wanted to reach. 123 | 124 | ### Label encoding 125 | 126 | Another well-known encoding method. The technique is the same - first, we should fit the encoder and after that, we 127 | may use this "trained" encoder in some applications: 128 | 129 | ````dart 130 | // fit encoder 131 | final encoder = Encoder.label( 132 | dataFrame, 133 | columnNames: featureNames, 134 | ); 135 | 136 | // apply fitted encoder to data 137 | final encoded = encoder.process(dataFrame); 138 | ```` 139 | 140 | ### Numerical data normalization 141 | 142 | Sometimes we need to have our numerical features normalized, which means we need to treat every dataframe row as a 143 | vector and divide this vector element-wise by its norm (Euclidean, Manhattan, etc.). To do so the library exposes 144 | `Normalizer` class: 145 | 146 | ````dart 147 | final normalizer = Normalizer(); // by default Euclidean norm will be used 148 | final transformed = normalizer.process(dataFrame); 149 | ```` 150 | 151 | Please, notice, that if your data has raw categorical values, the normalization will fail as it requires only numerical 152 | values. In this case, you should encode data (e.g. using one-hot encoding) before normalization. 153 | 154 | ### Data standardization 155 | 156 | A lot of machine learning algorithms require normally distributed data as their input. Normally distributed data 157 | means that every column in the data has zero mean and unit variance. One may reach this requirement using the 158 | `Standardizer` class. 
During the creation of the class instance, all the columns' mean values and deviation values are 159 | being extracted from the passed data and stored as fields of the class, in order to apply them to standardize the 160 | other (or the same that was used for the creation of the Standardizer) data: 161 | 162 | ````dart 163 | final dataFrame = DataFrame([ 164 | [ 1, 2, 3], 165 | [ 10, 20, 30], 166 | [100, 200, 300], 167 | ], headerExists: false); 168 | 169 | // fit standardizer 170 | final standardizer = Standardizer(dataFrame); 171 | 172 | // apply fitted standardizer to data 173 | final transformed = standardizer.process(dataFrame); 174 | ```` 175 | 176 | ### Pipeline 177 | 178 | There is a convenient way to organize a sequence of data preprocessing operations - `Pipeline`: 179 | 180 | ````dart 181 | final pipeline = Pipeline(dataFrame, [ 182 | toOneHotLabels(columnNames: ['Gender', 'Age', 'City_Category']), 183 | toIntegerLabels(columnNames: ['Stay_In_Current_City_Years', 'Marital_Status']), 184 | normalize(), 185 | standardize(), 186 | ]); 187 | ```` 188 | 189 | Once you create (or rather fit) a pipeline, you may use it further in your application: 190 | 191 | ````dart 192 | final processed = pipeline.process(dataFrame); 193 | ```` 194 | 195 | `toOneHotLabels`, `toIntegerLabels`, `normalize` and `standardize` are pipeable operator functions. 
196 | The pipeable operator function is a factory that takes fitting data and creates a fitted pipeable entity (e.g., 197 | `Normalizer` instance) 198 | -------------------------------------------------------------------------------- /analysis_options.yaml: -------------------------------------------------------------------------------- 1 | include: package:pedantic/analysis_options.yaml 2 | -------------------------------------------------------------------------------- /autotest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pub run build_runner test -- -p vm 4 | -------------------------------------------------------------------------------- /benchmark/main.dart: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /example/black_friday/black_friday.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 3 | 4 | Future processDataSetWithCategoricalData() async { 5 | final dataFrame = await fromCsv( 6 | 'example/black_friday/black_friday.csv', 7 | columnNames: [ 8 | 'Gender', 9 | 'Age', 10 | 'City_Category', 11 | 'Stay_In_Current_City_Years', 12 | 'Marital_Status' 13 | ], 14 | ); 15 | 16 | final encoded = Encoder.oneHot( 17 | dataFrame, 18 | columnNames: [ 19 | 'Gender', 20 | 'Age', 21 | 'City_Category', 22 | 'Stay_In_Current_City_Years', 23 | 'Marital_Status' 24 | ], 25 | ).process(dataFrame); 26 | 27 | final observations = encoded.toMatrix(); 28 | final genderEncoded = observations.sample(columnIndices: [0, 1]); 29 | final ageEncoded = observations.sample(columnIndices: [2, 3, 4, 5, 6, 7, 8]); 30 | final cityCategoryEncoded = observations.sample(columnIndices: [9, 10, 11]); 31 | final stayInCityEncoded = 32 | observations.sample(columnIndices: 
[12, 13, 14, 15, 16]); 33 | final maritalStatusEncoded = observations.sample(columnIndices: [17, 18]); 34 | 35 | print('Features:'); 36 | 37 | print(observations); 38 | 39 | print('feature matrix dimensions: ${observations.rowsNum} x ' 40 | '${observations.columnsNum};'); 41 | 42 | print('=============================='); 43 | 44 | print('Gender:'); 45 | print(genderEncoded); 46 | 47 | print('=============================='); 48 | 49 | print('Age'); 50 | print(ageEncoded); 51 | 52 | print('=============================='); 53 | 54 | print('City category'); 55 | print(cityCategoryEncoded); 56 | 57 | print('=============================='); 58 | 59 | print('Stay in current city (years)'); 60 | print(stayInCityEncoded); 61 | 62 | print('=============================='); 63 | 64 | print('Marital status'); 65 | print(maritalStatusEncoded); 66 | } 67 | 68 | Future main() async { 69 | await processDataSetWithCategoricalData(); 70 | } 71 | -------------------------------------------------------------------------------- /example/dataset.csv: -------------------------------------------------------------------------------- 1 | position,country,age,salary 2 | developer,Russia,21,1000 3 | ui designer,Russia,32,2000 4 | QA engineer,USA,27,2500 5 | QA engineer,Spain,25,2000 6 | developer,France,29,3000 7 | developer,China,23,1500 8 | ui designer,Japan,24,2000 9 | -------------------------------------------------------------------------------- /example/main.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 3 | 4 | Future main() async { 5 | final dataFrame = await fromCsv('example/dataset.csv', columns: [0, 1, 2, 3]); 6 | 7 | final pipeline = Pipeline(dataFrame, [ 8 | toOneHotLabels( 9 | columnNames: ['position'], 10 | headerPostfix: '_position', 11 | ), 12 | toIntegerLabels( 13 | columnNames: ['country'], 14 | ), 15 | ]); 16 | 17 | 
print(pipeline.process(dataFrame).toMatrix()); 18 | } 19 | -------------------------------------------------------------------------------- /lib/ml_preprocessing.dart: -------------------------------------------------------------------------------- 1 | export 'package:ml_linalg/norm.dart'; 2 | export 'package:ml_preprocessing/src/encoder/encoder.dart'; 3 | export 'package:ml_preprocessing/src/encoder/to_integer_labels.dart'; 4 | export 'package:ml_preprocessing/src/encoder/to_one_hot_labels.dart'; 5 | export 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 6 | export 'package:ml_preprocessing/src/normalizer/normalize.dart'; 7 | export 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 8 | export 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 9 | export 'package:ml_preprocessing/src/standardizer/standardize.dart'; 10 | export 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 11 | -------------------------------------------------------------------------------- /lib/src/encoder/encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_impl.dart'; 3 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 4 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 5 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 6 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 7 | 8 | /// Categorical data encoder factory. 9 | /// 10 | /// Algorithms that process data to create prediction models can't handle 11 | /// categorical data, since they are based on mathematical equations and work 12 | /// only with bare numbers. That means that the categorical data should be 13 | /// converted to numbers. 14 | /// 15 | /// The factory exposes different ways to convert categorical data into numbers. 
16 | abstract class Encoder implements Pipeable { 17 | /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] has a 18 | /// precedence over [columnNames]) from [fittingData], collects all unique 19 | /// values from the columns and builds a map `raw value` => `encoded value`. 20 | /// Once one calls the [process] method, the mapping will be applied. 21 | /// 22 | /// The mapping is built according to the following rules: 23 | /// 24 | /// Let's say, one has a list of values denoting a level of education: 25 | /// 26 | /// ``` 27 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 28 | /// ``` 29 | /// 30 | /// After applying the encoder, the source sequence will be looking 31 | /// like this: 32 | /// 33 | /// ``` 34 | /// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]] 35 | /// ``` 36 | /// 37 | /// In other words, the `one-hot` encoder created the following mapping: 38 | /// 39 | /// `BSc` => [1, 0, 0] 40 | /// 41 | /// `PhD` => [0, 1, 0] 42 | /// 43 | /// `High School` => [0, 0, 1] 44 | /// 45 | /// Keep in mind that if you apply the [process] method to your data, the 46 | /// number of columns will be increased since one categorical value in the 47 | /// case of one-hot encoding requires several cells. Headers for the new 48 | /// columns will be autogenerated from the categorical values. 49 | factory Encoder.oneHot( 50 | DataFrame fittingData, { 51 | Iterable? columnIndices, 52 | Iterable? 
columnNames, 53 | UnknownValueHandlingType unknownValueHandlingType = 54 | defaultUnknownValueHandlingType, 55 | }) => 56 | EncoderImpl( 57 | fittingData, 58 | EncoderType.oneHot, 59 | const SeriesEncoderFactoryImpl(), 60 | columnNames: columnNames, 61 | columnIndices: columnIndices, 62 | unknownValueHandlingType: unknownValueHandlingType, 63 | ); 64 | 65 | /// Gets columns by [columnIndices] or [columnNames] ([columnIndices] has a 66 | /// precedence over [columnNames]) from [fittingData], collects all unique 67 | /// values from the columns and builds a map `raw value` => `encoded value`. 68 | /// Once one calls the [process] method, the mapping will be applied. 69 | /// 70 | /// The mapping is built according to the following rules: 71 | /// 72 | /// Let's say, one has a list of values denoting a level of education: 73 | /// 74 | /// ``` 75 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 76 | /// ``` 77 | /// 78 | /// After applying the encoder, the source list will be looking 79 | /// like this: 80 | /// 81 | /// ``` 82 | /// [0, 0, 1, 2, 1] 83 | /// ``` 84 | /// 85 | /// In other words, the `label` encoder created the following mapping: 86 | /// 87 | /// `BSc` => 0 88 | /// 89 | /// `PhD` => 1 90 | /// 91 | /// `High School` => 2 92 | factory Encoder.label( 93 | DataFrame fittingData, { 94 | Iterable? columnIndices, 95 | Iterable? 
columnNames, 96 | UnknownValueHandlingType unknownValueHandlingType = 97 | defaultUnknownValueHandlingType, 98 | }) => 99 | EncoderImpl( 100 | fittingData, 101 | EncoderType.label, 102 | const SeriesEncoderFactoryImpl(), 103 | columnNames: columnNames, 104 | columnIndices: columnIndices, 105 | unknownValueHandlingType: unknownValueHandlingType, 106 | ); 107 | } 108 | -------------------------------------------------------------------------------- /lib/src/encoder/encoder_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 4 | import 'package:ml_preprocessing/src/encoder/helpers/create_encoder_to_series_mapping.dart'; 5 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 6 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory.dart'; 7 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 8 | 9 | class EncoderImpl implements Encoder { 10 | EncoderImpl( 11 | DataFrame fittingData, 12 | EncoderType encoderType, 13 | SeriesEncoderFactory seriesEncoderFactory, { 14 | Iterable? columnIndices, 15 | Iterable? 
columnNames, 16 | String encodedHeaderPrefix = '', 17 | String encodedHeaderPostfix = '', 18 | UnknownValueHandlingType unknownValueHandlingType = 19 | defaultUnknownValueHandlingType, 20 | }) : _encoderBySeries = createEncoderToSeriesMapping( 21 | fittingData, 22 | columnNames, 23 | columnIndices, 24 | (series) => seriesEncoderFactory.createByType( 25 | encoderType, 26 | series, 27 | headerPostfix: encodedHeaderPostfix, 28 | headerPrefix: encodedHeaderPrefix, 29 | unknownValueHandlingType: unknownValueHandlingType, 30 | )); 31 | 32 | final Map _encoderBySeries; 33 | 34 | @override 35 | DataFrame process(DataFrame dataFrame) { 36 | final encoded = dataFrame.series.expand((series) => 37 | _encoderBySeries.containsKey(series.name) 38 | ? _encoderBySeries[series.name]!.encodeSeries(series) 39 | : [series]); 40 | 41 | return DataFrame.fromSeries(encoded); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /lib/src/encoder/encoder_type.dart: -------------------------------------------------------------------------------- 1 | /// A type of categorical data encoding 2 | /// 3 | /// Algorithms that process data to create prediction models can't handle 4 | /// categorical data, since they are based on mathematical equations and work 5 | /// only with bare numbers. That means that the categorical data should be 6 | /// converted to numbers. 7 | /// 8 | /// [EncoderType.label] converts categorical values into integer numbers. 
Let's 9 | /// say, one has a list of values denoting a level of education: 10 | /// 11 | /// ``` 12 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 13 | /// ``` 14 | /// 15 | /// After applying [EncoderType.label], the source list will be looking 16 | /// like this: 17 | /// 18 | /// ``` 19 | /// [0, 0, 1, 2, 1] 20 | /// ``` 21 | /// 22 | /// In other words, the `label` encoder created the following mapping: 23 | /// 24 | /// `BSc` => 0 25 | /// 26 | /// `PhD` => 1 27 | /// 28 | /// `High School` => 2 29 | /// 30 | /// [EncoderType.oneHot] converts categorical values into binary sequences. 31 | /// Let's say, one has a list of values denoting a level of education: 32 | /// 33 | /// ``` 34 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 35 | /// ``` 36 | /// 37 | /// After applying [EncoderType.oneHot], the source sequence will be looking 38 | /// like this: 39 | /// 40 | /// ``` 41 | /// [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]] 42 | /// ``` 43 | /// 44 | /// In other words, the `one-hot` encoder created the following mapping: 45 | /// 46 | /// `BSc` => [1, 0, 0] 47 | /// 48 | /// `PhD` => [0, 1, 0] 49 | /// 50 | /// `High School` => [0, 0, 1] 51 | enum EncoderType { 52 | oneHot, 53 | label, 54 | } 55 | -------------------------------------------------------------------------------- /lib/src/encoder/helpers/create_encoder_to_series_mapping.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/helpers/get_series_names_by_indices.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 4 | 5 | Map createEncoderToSeriesMapping( 6 | DataFrame dataFrame, 7 | Iterable? predefinedSeriesNames, 8 | Iterable? seriesIndices, 9 | SeriesEncoder Function(Series series) seriesEncoderFactory, 10 | ) { 11 | final seriesNames = predefinedSeriesNames ?? 
12 | getSeriesNamesByIndices(dataFrame.header, seriesIndices!); 13 | final entries = seriesNames.map((name) { 14 | final series = dataFrame[name]; 15 | final encoder = seriesEncoderFactory(series); 16 | 17 | return MapEntry(name, encoder); 18 | }); 19 | 20 | return Map.fromEntries(entries); 21 | } 22 | -------------------------------------------------------------------------------- /lib/src/encoder/helpers/get_series_names_by_indices.dart: -------------------------------------------------------------------------------- 1 | import 'package:quiver/iterables.dart'; 2 | 3 | Iterable getSeriesNamesByIndices( 4 | Iterable seriesNames, Iterable indices) { 5 | final uniqueIndices = Set.from(indices); 6 | 7 | return enumerate(seriesNames) 8 | .where((indexedName) => uniqueIndices.contains(indexedName.index)) 9 | .map((indexedValue) => indexedValue.value); 10 | } 11 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/label_series_encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | 5 | class LabelSeriesEncoder implements SeriesEncoder { 6 | LabelSeriesEncoder( 7 | Series fittingData, { 8 | UnknownValueHandlingType unknownValueHandlingType = 9 | defaultUnknownValueHandlingType, 10 | String headerPrefix = '', 11 | String headerPostfix = '', 12 | }) : _unknownHandlingType = unknownValueHandlingType, 13 | _columnHeaderTpl = 14 | ((String label) => '$headerPrefix$label$headerPostfix'), 15 | _labels = Set.from(fittingData.data).toList(growable: false); 16 | 17 | final UnknownValueHandlingType _unknownHandlingType; 18 | final ColumnHeaderTemplateFn _columnHeaderTpl; 19 | final List _labels; 20 | 21 | @override 22 | Iterable encodeSeries(Series series) { 23 
| final shouldThrowErrorIfUnknown = 24 | _unknownHandlingType == UnknownValueHandlingType.error; 25 | 26 | return [ 27 | Series( 28 | _columnHeaderTpl(series.name), 29 | series.data.map((dynamic label) { 30 | if (!_labels.contains(label)) { 31 | if (shouldThrowErrorIfUnknown) { 32 | throw Exception('Unknown categorical value encountered - $label'); 33 | } 34 | 35 | return _labels.length; 36 | } 37 | 38 | return _labels.indexOf(label); 39 | }), 40 | isDiscrete: true, 41 | ), 42 | ]; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/one_hot_series_encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | 5 | class OneHotSeriesEncoder implements SeriesEncoder { 6 | OneHotSeriesEncoder( 7 | Series fittingData, { 8 | UnknownValueHandlingType unknownValueHandlingType = 9 | defaultUnknownValueHandlingType, 10 | String headerPrefix = '', 11 | String headerPostfix = '', 12 | }) : _unknownHandlingType = unknownValueHandlingType, 13 | _columnHeaderTpl = 14 | ((String label) => '$headerPrefix$label$headerPostfix'), 15 | _labels = Set.from(fittingData.data); 16 | 17 | final UnknownValueHandlingType _unknownHandlingType; 18 | final ColumnHeaderTemplateFn _columnHeaderTpl; 19 | final Set _labels; 20 | 21 | @override 22 | Iterable encodeSeries(Series series) => _labels.map((dynamic label) { 23 | final shouldThrowErrorIfUnknown = 24 | _unknownHandlingType == UnknownValueHandlingType.error; 25 | 26 | final data = series.data.map((dynamic value) { 27 | if (shouldThrowErrorIfUnknown && !_labels.contains(value)) { 28 | throw Exception('Unknown categorical value encountered - `$value` ' 29 | 'for series `${series.name}`'); 30 | } 31 | 32 | return 
value == label ? 1 : 0; 33 | }); 34 | 35 | return Series(_columnHeaderTpl(label.toString()), data, 36 | isDiscrete: true); 37 | }); 38 | } 39 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/series_encoder.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | 3 | typedef ColumnHeaderTemplateFn = String Function(String label); 4 | 5 | abstract class SeriesEncoder { 6 | Iterable encodeSeries(Series series); 7 | } 8 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/series_encoder_factory.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 4 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 5 | 6 | abstract class SeriesEncoderFactory { 7 | SeriesEncoder createByType( 8 | EncoderType type, 9 | Series fittingData, { 10 | String headerPrefix, 11 | String headerPostfix, 12 | UnknownValueHandlingType unknownValueHandlingType, 13 | }); 14 | } 15 | -------------------------------------------------------------------------------- /lib/src/encoder/series_encoder/series_encoder_factory_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/label_series_encoder.dart'; 4 | import 'package:ml_preprocessing/src/encoder/series_encoder/one_hot_series_encoder.dart'; 5 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder.dart'; 6 | import 
'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory.dart'; 7 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 8 | 9 | class SeriesEncoderFactoryImpl implements SeriesEncoderFactory { 10 | const SeriesEncoderFactoryImpl(); 11 | 12 | @override 13 | SeriesEncoder createByType( 14 | EncoderType type, 15 | Series fittingData, { 16 | String headerPrefix = '', 17 | String headerPostfix = '', 18 | UnknownValueHandlingType unknownValueHandlingType = 19 | defaultUnknownValueHandlingType, 20 | }) { 21 | switch (type) { 22 | case EncoderType.label: 23 | return LabelSeriesEncoder( 24 | fittingData, 25 | headerPrefix: headerPrefix, 26 | headerPostfix: headerPostfix, 27 | unknownValueHandlingType: unknownValueHandlingType, 28 | ); 29 | 30 | case EncoderType.oneHot: 31 | return OneHotSeriesEncoder( 32 | fittingData, 33 | headerPrefix: headerPrefix, 34 | headerPostfix: headerPostfix, 35 | unknownValueHandlingType: unknownValueHandlingType, 36 | ); 37 | 38 | default: 39 | throw UnsupportedError('Unsupported encoder type - $type'); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/encoder/to_integer_labels.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_preprocessing/src/encoder/encoder_impl.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 4 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 5 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 6 | 7 | /// A factory function to use label categorical data encoder in the pipeline 8 | /// 9 | /// A usage example: 10 | /// 11 | /// ```dart 12 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 13 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 14 | /// 15 | /// void main() 
{ 16 | /// final dataframe = DataFrame([ 17 | /// ['col_1', 'col_2', 'col_3'], 18 | /// ['val_1', 1, false], 19 | /// ['val_2', 0.4, true], 20 | /// ['val_1', 5, false], 21 | /// ['val_3', 6, false], 22 | /// ]); 23 | /// 24 | /// // let's fit a pipeline 25 | /// final pipeline = Pipeline(dataframe, [ 26 | /// // 'col_1' column contains categorical data, let's encode it 27 | /// toIntegerLabels(columnNames: ['col_1']), 28 | /// ]); 29 | /// final processed = pipeline.process(dataframe); 30 | /// 31 | /// // since there are only 3 values in the series 'col_1', they will be 32 | /// // converted as follows: 33 | /// // 34 | /// // 'val_1' => 0 35 | /// // 'val_2' => 1 36 | /// // 'val_3' => 2 37 | /// print(processed); 38 | /// // DataFrame (4 x 3) 39 | /// // col_1 col_2 col_3 40 | /// // 0 1 false 41 | /// // 1 0.4 true 42 | /// // 0 5 false 43 | /// // 2 6 false 44 | /// } 45 | /// ``` 46 | PipeableOperatorFn toIntegerLabels({ 47 | Iterable? columnIndices, 48 | Iterable? columnNames, 49 | String headerPrefix = '', 50 | String headerPostfix = '', 51 | UnknownValueHandlingType unknownValueHandlingType = 52 | defaultUnknownValueHandlingType, 53 | }) => 54 | (data, {dtype}) => EncoderImpl( 55 | data, 56 | EncoderType.label, 57 | const SeriesEncoderFactoryImpl(), 58 | columnIndices: columnIndices, 59 | columnNames: columnNames, 60 | encodedHeaderPostfix: headerPostfix, 61 | encodedHeaderPrefix: headerPrefix, 62 | unknownValueHandlingType: unknownValueHandlingType, 63 | ); 64 | -------------------------------------------------------------------------------- /lib/src/encoder/to_one_hot_labels.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_preprocessing/src/encoder/encoder_impl.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 4 | import 
'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 5 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 6 | 7 | /// A factory function to use `one hot` categorical data encoder in the pipeline 8 | /// 9 | /// A usage example: 10 | /// 11 | /// ```dart 12 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 13 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 14 | /// 15 | /// void main() { 16 | /// final dataframe = DataFrame([ 17 | /// ['col_1', 'col_2', 'col_3'], 18 | /// ['val_1', 1, false], 19 | /// ['val_2', 0.4, true], 20 | /// ['val_1', 5, false], 21 | /// ]); 22 | /// 23 | /// // let's fit a pipeline 24 | /// final pipeline = Pipeline(dataframe, [ 25 | /// // 'col_1' column contains categorical data, let's encode it 26 | /// toOneHotLabels(columnNames: ['col_1']), 27 | /// ]); 28 | /// final processed = pipeline.process(dataframe); 29 | /// 30 | /// // since there are only two values in the series 'col_1', they will be 31 | /// // converted as follows: 32 | /// // 33 | /// // 'val_1' => 10 34 | /// // 'val_2' => 01 35 | /// print(processed); 36 | /// // DataFrame (3 x 4) 37 | /// // val_1 val_2 col_2 col_3 38 | /// // 1 0 1 false 39 | /// // 0 1 0.4 true 40 | /// // 1 0 5 false 41 | /// } 42 | /// ``` 43 | PipeableOperatorFn toOneHotLabels({ 44 | Iterable? columnIndices, 45 | Iterable? 
columnNames, 46 | String headerPrefix = '', 47 | String headerPostfix = '', 48 | UnknownValueHandlingType unknownValueHandlingType = 49 | defaultUnknownValueHandlingType, 50 | }) => 51 | (data, {dtype}) => EncoderImpl( 52 | data, 53 | EncoderType.oneHot, 54 | const SeriesEncoderFactoryImpl(), 55 | columnIndices: columnIndices, 56 | columnNames: columnNames, 57 | encodedHeaderPostfix: headerPostfix, 58 | encodedHeaderPrefix: headerPrefix, 59 | unknownValueHandlingType: unknownValueHandlingType, 60 | ); 61 | -------------------------------------------------------------------------------- /lib/src/encoder/unknown_value_handling_type.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | 4 | /// A way to handle unknown categorical data 5 | /// 6 | /// During processing new data one can encounter previously unseen value. Let's 7 | /// say, one has a list of values denoting a level of education: 8 | /// 9 | /// ``` 10 | /// ['BSc', 'BSc', 'PhD', 'High School', 'PhD'] 11 | /// ``` 12 | /// 13 | /// One successfully applied some [EncoderType] to the data, let's say, [EncoderType.label]. 14 | /// 15 | /// But what should one do if there is an unknown categorical value, e.g. 16 | /// 'School SAT', among new data to process through the same [Pipeline]? 
17 | /// 18 | /// [UnknownValueHandlingType.error] forces the pipeline to stop preprocessing 19 | /// and throw an error 20 | /// 21 | /// [UnknownValueHandlingType.ignore] makes it possible to continue the 22 | /// preprocessing as nothing happened - in this case depending on the [EncoderType] 23 | /// will be used an autogenerated encoded value 24 | enum UnknownValueHandlingType { 25 | error, 26 | ignore, 27 | } 28 | 29 | const defaultUnknownValueHandlingType = UnknownValueHandlingType.ignore; 30 | -------------------------------------------------------------------------------- /lib/src/normalizer/normalize.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_linalg/dtype.dart'; 2 | import 'package:ml_linalg/norm.dart'; 3 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 5 | import 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 6 | 7 | /// Returns a function that can be used in [Pipeline]. The function creates 8 | /// a [Normalizer] instance. 
Example: 9 | /// 10 | /// ```dart 11 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 12 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 13 | /// 14 | /// void main() { 15 | /// final data = DataFrame([ 16 | /// ['feature_1', 'feature_2', 'label'], 17 | /// [ 10, 33.2, 2], 18 | /// [ 20, -1, 4], 19 | /// [ 40, -10, 5], 20 | /// [ 55, 100, 10], 21 | /// ]); 22 | /// final pipeline = Pipeline(data, [ 23 | /// normalize(), 24 | /// ]); 25 | /// final processed = pipeline.process(data); 26 | /// 27 | /// print(processed); 28 | /// // DataFrame (4 x 3) 29 | /// // feature_1 feature_2 label 30 | /// // 0.287927508354187 0.9559193253517151 0.05758550018072128 31 | /// // 0.9794042110443115 -0.048970211297273636 0.19588084518909454 32 | /// // 0.9630868434906006 -0.24077171087265015 0.12038585543632507 33 | /// // 0.4800793528556824 0.8728715777397156 0.08728715777397156 34 | /// } 35 | /// ``` 36 | PipeableOperatorFn normalize([Norm norm = Norm.euclidean]) => 37 | (_, {dtype}) => Normalizer(norm, dtype ?? DType.float32); 38 | -------------------------------------------------------------------------------- /lib/src/normalizer/normalizer.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_linalg/dtype.dart'; 2 | import 'package:ml_linalg/norm.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/normalizer/normalizer_impl.dart'; 5 | 6 | /// A class that performs normalization of data. 7 | /// 8 | /// Normalization is a process aimed to make all values in a vector vary within 9 | /// the range from 0.0 to 1.0 - this makes it possible to treat all the values 10 | /// equally disregard their units. 11 | /// 12 | /// Normalization is applied row-wise. 
13 | /// 14 | /// Example: 15 | /// 16 | /// ```dart 17 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 18 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 19 | /// 20 | /// void main() { 21 | /// final data = DataFrame([ 22 | /// ['feature_1', 'feature_2', 'label'], 23 | /// [ 10, 33.2, 2], 24 | /// [ 20, -1, 4], 25 | /// [ 40, -10, 5], 26 | /// [ 55, 100, 10], 27 | /// ]); 28 | /// final normalizer = Normalizer(); 29 | /// final processed = normalizer.process(data); 30 | /// 31 | /// print(processed); 32 | /// // DataFrame (4 x 3) 33 | /// // feature_1 feature_2 label 34 | /// // 0.287927508354187 0.9559193253517151 0.05758550018072128 35 | /// // 0.9794042110443115 -0.048970211297273636 0.19588084518909454 36 | /// // 0.9630868434906006 -0.24077171087265015 0.12038585543632507 37 | /// // 0.4800793528556824 0.8728715777397156 0.08728715777397156 38 | /// } 39 | /// ``` 40 | abstract class Normalizer implements Pipeable { 41 | factory Normalizer([Norm norm, DType dtype]) = NormalizerImpl; 42 | } 43 | -------------------------------------------------------------------------------- /lib/src/normalizer/normalizer_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_linalg/norm.dart'; 4 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 5 | 6 | class NormalizerImpl implements Normalizer { 7 | NormalizerImpl([this._norm = Norm.euclidean, this._dtype = DType.float32]); 8 | 9 | final Norm _norm; 10 | final DType _dtype; 11 | 12 | @override 13 | DataFrame process(DataFrame input) { 14 | final transformed = 15 | input.toMatrix(_dtype).mapRows((row) => row.normalize(_norm)); 16 | 17 | return DataFrame.fromMatrix(transformed, header: input.header); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /lib/src/pipeline/pipeable.dart: 
-------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | 4 | abstract class Pipeable { 5 | DataFrame process(DataFrame input); 6 | } 7 | 8 | typedef PipeableOperatorFn = Pipeable Function(DataFrame data, {DType? dtype}); 9 | -------------------------------------------------------------------------------- /lib/src/pipeline/pipeline.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeline_impl.dart'; 5 | 6 | /// A class that is used to organize data preprocessing stages in a pipeline 7 | /// manner. 8 | /// 9 | /// Building the pipeline is a `fitting` stage - it's a preliminary stage where 10 | /// operators extract metadata from the source data passed to [Pipeline] for 11 | /// future use, no preprocessing happens here. 12 | /// 13 | /// Once the `process` method is called, the actual data preprocessing comes to 14 | /// play. 15 | /// 16 | /// It's normal, when one uses the same data for fitting and processing, like 17 | /// in the example below. 
18 | /// 19 | /// Example: 20 | /// 21 | /// ```dart 22 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 23 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 24 | /// 25 | /// Future main() async { 26 | /// final dataFrame = await fromCsv('example/dataset.csv', columns: [0, 1, 2, 3]); 27 | /// 28 | /// final pipeline = Pipeline(dataFrame, [ 29 | /// toOneHotLabels( 30 | /// columnNames: ['position'], 31 | /// headerPostfix: '_position', 32 | /// ), 33 | /// toIntegerLabels( 34 | /// columnNames: ['country'], 35 | /// ), 36 | /// ]); 37 | /// 38 | /// final processed = pipeline.process(dataFrame); 39 | /// } 40 | /// ``` 41 | abstract class Pipeline { 42 | /// Takes [fittingData] to fit preprocessors from [operators] list 43 | /// in order to use them further for new data of the same source as 44 | /// [fittingData] via [process] method. 45 | factory Pipeline( 46 | DataFrame fittingData, Iterable operators, 47 | {DType dType}) = PipelineImpl; 48 | 49 | /// Applies fitted preprocessors to [dataFrame] and returns transformed 50 | /// data 51 | DataFrame process(DataFrame dataFrame); 52 | } 53 | -------------------------------------------------------------------------------- /lib/src/pipeline/pipeline_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 5 | 6 | class PipelineImpl implements Pipeline { 7 | PipelineImpl( 8 | DataFrame fittingData, 9 | Iterable operators, { 10 | DType dType = DType.float32, 11 | }) : _steps = operators.map((operator) => operator(fittingData)); 12 | 13 | final Iterable _steps; 14 | 15 | @override 16 | DataFrame process(DataFrame dataFrame) => 17 | _steps.fold(dataFrame, (processed, step) => step.process(processed)); 18 | } 19 |
-------------------------------------------------------------------------------- /lib/src/standardizer/standardize.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 5 | 6 | /// Returns a function that can be used in [Pipeline]. The function creates a 7 | /// [Standardizer] instance. Example: 8 | /// 9 | /// ```dart 10 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 11 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 12 | /// 13 | /// void main() { 14 | /// final data = DataFrame([ 15 | /// ['feature_1', 'feature_2', 'label'], 16 | /// [ 10, 33.2, 2], 17 | /// [ 20, -1, 4], 18 | /// [ 40, -10, 5], 19 | /// [ 55, 100, 10], 20 | /// ]); 21 | /// final pipeline = Pipeline(data, [ 22 | /// standardize(), 23 | /// ]); 24 | /// final processed = pipeline.process(data); 25 | /// 26 | /// print(processed); 27 | /// // DataFrame (4 x 3) 28 | /// // feature_1 feature_2 label 29 | /// // -1.217395305633545 0.06132180616259575 -1.1026456356048584 30 | /// // -0.6445034146308899 -0.7300761342048645 -0.42409446835517883 31 | /// // 0.5012804269790649 -0.9383387565612793 -0.08481889218091965 32 | /// // 1.3606183528900146 1.607093095779419 1.6115589141845703 33 | /// } 34 | /// ``` 35 | PipeableOperatorFn standardize() => 36 | (DataFrame fittingData, {dtype = DType.float32}) => 37 | Standardizer(fittingData, dtype: dtype!); 38 | -------------------------------------------------------------------------------- /lib/src/standardizer/standardizer.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/linalg.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | 
import 'package:ml_preprocessing/src/standardizer/standardizer_impl.dart'; 5 | 6 | /// A class that performs data standardization. 7 | /// 8 | /// Data standardization is a process, targeted to make the data look like 9 | /// normally distributed data (with zero mean and unit variance). 10 | /// 11 | /// Standardization applies column-wise. 12 | /// 13 | /// Example: 14 | /// 15 | /// ```dart 16 | /// import 'package:ml_dataframe/ml_dataframe.dart'; 17 | /// import 'package:ml_preprocessing/ml_preprocessing.dart'; 18 | /// 19 | /// void main() { 20 | /// final data = DataFrame([ 21 | /// ['feature_1', 'feature_2', 'label'], 22 | /// [ 10, 33.2, 2], 23 | /// [ 20, -1, 4], 24 | /// [ 40, -10, 5], 25 | /// [ 55, 100, 10], 26 | /// ]); 27 | /// final standardizer = Standardizer(data); 28 | /// final processed = standardizer.process(data); 29 | /// 30 | /// print(processed); 31 | /// // DataFrame (4 x 3) 32 | /// // feature_1 feature_2 label 33 | /// // -1.217395305633545 0.06132180616259575 -1.1026456356048584 34 | /// // -0.6445034146308899 -0.7300761342048645 -0.42409446835517883 35 | /// // 0.5012804269790649 -0.9383387565612793 -0.08481889218091965 36 | /// // 1.3606183528900146 1.607093095779419 1.6115589141845703 37 | /// } 38 | /// ``` 39 | abstract class Standardizer implements Pipeable { 40 | factory Standardizer( 41 | DataFrame fittingData, { 42 | DType dtype, 43 | }) = StandardizerImpl; 44 | } 45 | -------------------------------------------------------------------------------- /lib/src/standardizer/standardizer_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_linalg/vector.dart'; 4 | import 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 5 | 6 | class StandardizerImpl implements Standardizer { 7 | StandardizerImpl( 8 | DataFrame fittingData, { 9 | DType dtype = DType.float32, 10 | }) 
: _dtype = dtype, 11 | _mean = fittingData.toMatrix(dtype).mean(), 12 | _deviation = Vector.fromList( 13 | // TODO: Consider SIMD-aware mapping 14 | fittingData 15 | .toMatrix(dtype) 16 | .deviation() 17 | .map((el) => el == 0 ? 1 : el) 18 | .toList(), 19 | dtype: dtype, 20 | ) { 21 | if (!fittingData.toMatrix(dtype).hasData) { 22 | throw Exception('No data provided'); 23 | } 24 | } 25 | 26 | final DType _dtype; 27 | final Vector _mean; 28 | final Vector _deviation; 29 | 30 | /// Takes as an argument [input] with columns of various distribution types 31 | /// and returns a [DataFrame], columns of which are normally distributed 32 | @override 33 | DataFrame process(DataFrame input) { 34 | final inputAsMatrix = input.toMatrix(_dtype); 35 | 36 | if (inputAsMatrix.columnsNum != _deviation.length) { 37 | throw Exception('Passed dataframe differs from the one used during ' 38 | 'creation of the Standardizer: expected columns number - ' 39 | '${_deviation.length}, given - ${inputAsMatrix.columnsNum}.'); 40 | } 41 | 42 | final processedMatrix = 43 | inputAsMatrix.mapRows((row) => (row - _mean) / _deviation); 44 | final discreteColumnNames = input.series 45 | .where((series) => series.isDiscrete) 46 | .map((series) => series.name); 47 | 48 | return DataFrame.fromMatrix( 49 | processedMatrix, 50 | header: input.header, 51 | discreteColumnNames: discreteColumnNames, 52 | ); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /pubspec.yaml: -------------------------------------------------------------------------------- 1 | name: ml_preprocessing 2 | description: Popular data preprocessing algorithms for machine learning 3 | version: 7.0.2 4 | homepage: https://github.com/gyrdym/ml_preprocessing 5 | 6 | environment: 7 | sdk: '>=2.12.0 <3.0.0' 8 | 9 | dependencies: 10 | ml_dataframe: ^1.0.0 11 | ml_linalg: ^13.0.0 12 | quiver: ^3.0.0 13 | 14 | dev_dependencies: 15 | benchmark_harness: ^2.0.0 16 | mockito: ^5.0.2 17 | pedantic: 
^1.11.0 18 | test: ^1.16.8 19 | -------------------------------------------------------------------------------- /test/encoder/encoder_impl_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('EncoderImpl', () { 8 | final data = [ 9 | ['first', 'second', 'third', 'fourth'], 10 | [1, 'F', 'category_val_1', 10], 11 | [10, 'F', 'category_val_2', 20], 12 | [11, 'M', 'category_val_1', 10], 13 | [21, 'F', 'category_val_2', 30], 14 | [44, 'M', 'category_val_1', 10], 15 | [43, 'M', 'category_val_1', 30], 16 | [55, 'F', 'category_val_3', 10], 17 | ]; 18 | 19 | final unseenData = [ 20 | ['first', 'second', 'third', 'fourth'], 21 | [1, 'F', 'category_val_5', 10], 22 | [10, 'F', 'category_val_2', 20], 23 | [11, 'M', 'category_val_6', 10], 24 | ]; 25 | 26 | group('Encoder.oneHot', () { 27 | test('should encode multiple columns', () { 28 | final dataFrame = DataFrame(data); 29 | final encoder = Encoder.oneHot(dataFrame, 30 | columnNames: ['second', 'third', 'fourth']); 31 | final encoded = encoder.process(dataFrame); 32 | 33 | encoded.toMatrix(); 34 | 35 | expect( 36 | encoded.toMatrix(), 37 | equals([ 38 | [ 39 | 1, 40 | 1, 41 | 0, 42 | 1, 43 | 0, 44 | 0, 45 | 1, 46 | 0, 47 | 0, 48 | ], 49 | [ 50 | 10, 51 | 1, 52 | 0, 53 | 0, 54 | 1, 55 | 0, 56 | 0, 57 | 1, 58 | 0, 59 | ], 60 | [ 61 | 11, 62 | 0, 63 | 1, 64 | 1, 65 | 0, 66 | 0, 67 | 1, 68 | 0, 69 | 0, 70 | ], 71 | [ 72 | 21, 73 | 1, 74 | 0, 75 | 0, 76 | 1, 77 | 0, 78 | 0, 79 | 0, 80 | 1, 81 | ], 82 | [ 83 | 44, 84 | 0, 85 | 1, 86 | 1, 87 | 0, 88 | 0, 89 | 1, 90 | 0, 91 | 0, 92 | ], 93 | [ 94 | 43, 95 | 0, 96 | 1, 97 | 1, 98 | 0, 99 | 0, 100 | 0, 101 | 0, 102 | 1, 103 | ], 104 | [ 105 | 55, 106 | 1, 107 | 0, 108 | 0, 109 | 
0, 110 | 1, 111 | 1, 112 | 0, 113 | 0, 114 | ], 115 | ])); 116 | }); 117 | 118 | test('should use indices to access the needed series while encoding', () { 119 | final dataFrame = DataFrame(data); 120 | final encoder = Encoder.oneHot(dataFrame, columnIndices: [1, 2, 3]); 121 | final encoded = encoder.process(dataFrame); 122 | 123 | encoded.toMatrix(); 124 | 125 | expect( 126 | encoded.toMatrix(), 127 | equals([ 128 | [ 129 | 1, 130 | 1, 131 | 0, 132 | 1, 133 | 0, 134 | 0, 135 | 1, 136 | 0, 137 | 0, 138 | ], 139 | [ 140 | 10, 141 | 1, 142 | 0, 143 | 0, 144 | 1, 145 | 0, 146 | 0, 147 | 1, 148 | 0, 149 | ], 150 | [ 151 | 11, 152 | 0, 153 | 1, 154 | 1, 155 | 0, 156 | 0, 157 | 1, 158 | 0, 159 | 0, 160 | ], 161 | [ 162 | 21, 163 | 1, 164 | 0, 165 | 0, 166 | 1, 167 | 0, 168 | 0, 169 | 0, 170 | 1, 171 | ], 172 | [ 173 | 44, 174 | 0, 175 | 1, 176 | 1, 177 | 0, 178 | 0, 179 | 1, 180 | 0, 181 | 0, 182 | ], 183 | [ 184 | 43, 185 | 0, 186 | 1, 187 | 1, 188 | 0, 189 | 0, 190 | 0, 191 | 0, 192 | 1, 193 | ], 194 | [ 195 | 55, 196 | 1, 197 | 0, 198 | 0, 199 | 0, 200 | 1, 201 | 1, 202 | 0, 203 | 0, 204 | ], 205 | ])); 206 | }); 207 | 208 | test('should throw error if unknown value handling type is "error"', () { 209 | final trainingDataFrame = DataFrame(data); 210 | final unseenDataDataframe = DataFrame(unseenData); 211 | final encoder = Encoder.oneHot( 212 | trainingDataFrame, 213 | columnNames: ['second', 'third', 'fourth'], 214 | unknownValueHandlingType: UnknownValueHandlingType.error, 215 | ); 216 | final actual = () => encoder.process(unseenDataDataframe).toMatrix(); 217 | final expected = throwsException; 218 | 219 | expect(actual, expected); 220 | }); 221 | 222 | test( 223 | 'should ignore unknown value if unknown value handling type is ignpre', 224 | () { 225 | final trainingDataFrame = DataFrame(data); 226 | final unseenDataDataframe = DataFrame(unseenData); 227 | final encoder = Encoder.oneHot( 228 | trainingDataFrame, 229 | columnNames: ['second', 'third', 'fourth'], 230 
| unknownValueHandlingType: UnknownValueHandlingType.ignore, 231 | ); 232 | final actual = encoder.process(unseenDataDataframe).toMatrix(); 233 | final expected = [ 234 | [ 235 | 1, 236 | 1, 237 | 0, 238 | 0, 239 | 0, 240 | 0, 241 | 1, 242 | 0, 243 | 0, 244 | ], 245 | [ 246 | 10, 247 | 1, 248 | 0, 249 | 0, 250 | 1, 251 | 0, 252 | 0, 253 | 1, 254 | 0, 255 | ], 256 | [ 257 | 11, 258 | 0, 259 | 1, 260 | 0, 261 | 0, 262 | 0, 263 | 1, 264 | 0, 265 | 0, 266 | ], 267 | ]; 268 | 269 | expect(actual, expected); 270 | }); 271 | }); 272 | 273 | group('Encoder.label', () { 274 | test('should encode multiple columns', () { 275 | final dataFrame = DataFrame(data); 276 | final encoder = Encoder.label(dataFrame, 277 | columnNames: ['second', 'third', 'fourth']); 278 | final encoded = encoder.process(dataFrame); 279 | 280 | encoded.toMatrix(); 281 | 282 | expect( 283 | encoded.toMatrix(), 284 | equals([ 285 | [ 286 | 1, 287 | 0, 288 | 0, 289 | 0, 290 | ], 291 | [ 292 | 10, 293 | 0, 294 | 1, 295 | 1, 296 | ], 297 | [ 298 | 11, 299 | 1, 300 | 0, 301 | 0, 302 | ], 303 | [ 304 | 21, 305 | 0, 306 | 1, 307 | 2, 308 | ], 309 | [ 310 | 44, 311 | 1, 312 | 0, 313 | 0, 314 | ], 315 | [ 316 | 43, 317 | 1, 318 | 0, 319 | 2, 320 | ], 321 | [ 322 | 55, 323 | 0, 324 | 2, 325 | 0, 326 | ], 327 | ])); 328 | }); 329 | 330 | test('should use indices to access the needed series while encoding', () { 331 | final dataFrame = DataFrame(data); 332 | final encoder = Encoder.label(dataFrame, columnIndices: [1, 2, 3]); 333 | final encoded = encoder.process(dataFrame); 334 | 335 | encoded.toMatrix(); 336 | 337 | expect( 338 | encoded.toMatrix(), 339 | equals([ 340 | [ 341 | 1, 342 | 0, 343 | 0, 344 | 0, 345 | ], 346 | [ 347 | 10, 348 | 0, 349 | 1, 350 | 1, 351 | ], 352 | [ 353 | 11, 354 | 1, 355 | 0, 356 | 0, 357 | ], 358 | [ 359 | 21, 360 | 0, 361 | 1, 362 | 2, 363 | ], 364 | [ 365 | 44, 366 | 1, 367 | 0, 368 | 0, 369 | ], 370 | [ 371 | 43, 372 | 1, 373 | 0, 374 | 2, 375 | ], 376 | [ 377 | 55, 378 | 0, 379 | 
2, 380 | 0, 381 | ], 382 | ])); 383 | }); 384 | 385 | test('should throw error if unknown value handling type is error', () { 386 | final trainingDataFrame = DataFrame(data); 387 | final unseenDataDataframe = DataFrame(unseenData); 388 | final encoder = Encoder.label( 389 | trainingDataFrame, 390 | columnNames: ['second', 'third', 'fourth'], 391 | unknownValueHandlingType: UnknownValueHandlingType.error, 392 | ); 393 | final actual = () => encoder.process(unseenDataDataframe).toMatrix(); 394 | final expected = throwsException; 395 | 396 | expect(actual, expected); 397 | }); 398 | 399 | test( 400 | 'should ignore unknown value if unknown value handling type is ignpre', 401 | () { 402 | final trainingDataFrame = DataFrame(data); 403 | final unseenDataDataframe = DataFrame(unseenData); 404 | final encoder = Encoder.label( 405 | trainingDataFrame, 406 | columnNames: ['second', 'third', 'fourth'], 407 | unknownValueHandlingType: UnknownValueHandlingType.ignore, 408 | ); 409 | final actual = encoder.process(unseenDataDataframe).toMatrix(); 410 | final expected = [ 411 | [ 412 | 1, 413 | 0, 414 | 3, 415 | 0, 416 | ], 417 | [ 418 | 10, 419 | 0, 420 | 1, 421 | 1, 422 | ], 423 | [ 424 | 11, 425 | 1, 426 | 3, 427 | 0, 428 | ], 429 | ]; 430 | 431 | expect(actual, expected); 432 | }); 433 | }); 434 | }); 435 | } 436 | -------------------------------------------------------------------------------- /test/encoder/series_encoder/label_series_encoder_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/label_series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('LabelSeriesEncoder', () { 8 | test('should encode given series creating a collection of new series', () { 9 | final series = 10 | 
Series('just_header', ['q', '2ee', '0030', '123']); 11 | final encoder = LabelSeriesEncoder(series); 12 | final encoded = encoder.encodeSeries(series).toList(); 13 | 14 | expect(encoded, hasLength(1)); 15 | expect(encoded[0].data, equals([0, 1, 2, 3])); 16 | expect(encoded[0].isDiscrete, isTrue); 17 | }); 18 | 19 | test( 20 | 'should use source series header as a header of encoded one if ' 21 | 'neither header prefix nor header postfix are specified', () { 22 | final series = 23 | Series('just_header', ['q', '2ee', '0030', '123']); 24 | final encoder = LabelSeriesEncoder(series); 25 | final encoded = encoder.encodeSeries(series).toList(); 26 | 27 | expect(encoded, hasLength(1)); 28 | expect(encoded[0].name, 'just_header'); 29 | expect(encoded[0].isDiscrete, isTrue); 30 | }); 31 | 32 | test('should encode given series with repeating values', () { 33 | final series = Series('just_header', 34 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 35 | final encoder = LabelSeriesEncoder(series); 36 | final encoded = encoder.encodeSeries(series).toList(); 37 | 38 | expect(encoded, hasLength(1)); 39 | expect(encoded[0].data, equals([0, 1, 0, 0, 2, 3, 2])); 40 | expect(encoded[0].isDiscrete, isTrue); 41 | }); 42 | 43 | test('should consider given series name prefix', () { 44 | final series = Series('just_header', 45 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 46 | final encoder = LabelSeriesEncoder(series, headerPrefix: 'pref_'); 47 | final encoded = encoder.encodeSeries(series).toList(); 48 | 49 | expect(encoded, hasLength(1)); 50 | expect(encoded[0].name, 'pref_just_header'); 51 | expect(encoded[0].isDiscrete, isTrue); 52 | }); 53 | 54 | test('should consider given series name postfix', () { 55 | final series = Series('just_header', 56 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 57 | final encoder = LabelSeriesEncoder(series, headerPostfix: '_postf'); 58 | final encoded = encoder.encodeSeries(series).toList(); 59 | 60 | expect(encoded, hasLength(1)); 61 | 
expect(encoded[0].name, 'just_header_postf'); 62 | expect(encoded[0].isDiscrete, isTrue); 63 | }); 64 | 65 | test( 66 | 'should consider both given series name postfix and series name ' 67 | 'prefix', () { 68 | final series = Series('just_header', 69 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 70 | final encoder = LabelSeriesEncoder(series, 71 | headerPrefix: 'pref_', headerPostfix: '_postf'); 72 | final encoded = encoder.encodeSeries(series).toList(); 73 | 74 | expect(encoded, hasLength(1)); 75 | expect(encoded[0].name, 'pref_just_header_postf'); 76 | expect(encoded[0].isDiscrete, isTrue); 77 | }); 78 | 79 | test('should use fitted data to encode new one', () { 80 | final fittingData = Series('just_header', 81 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 82 | final encoder = LabelSeriesEncoder(fittingData, 83 | headerPrefix: 'pref_', headerPostfix: '_postf'); 84 | 85 | final newData = Series('just_header', 86 | ['q', 'q', 'q', 'q', '2ee', '2ee', '0030', 'q', '0030']); 87 | final encoded = encoder.encodeSeries(newData).toList(); 88 | 89 | expect(encoded, hasLength(1)); 90 | expect(encoded[0].data, equals([0, 0, 0, 0, 1, 1, 2, 0, 2])); 91 | expect(encoded[0].isDiscrete, isTrue); 92 | }); 93 | 94 | test( 95 | 'should throw error if unknown value handling strategy type is "throw ' 96 | 'error" and unknown value is encountered', () { 97 | final fittingData = Series('just_header', 98 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 99 | final encoder = LabelSeriesEncoder(fittingData, 100 | unknownValueHandlingType: UnknownValueHandlingType.error); 101 | final unknownValue = 'unknown_value'; 102 | final newData = Series('awesome_series', [ 103 | 'q', 104 | 'q', 105 | 'q', 106 | unknownValue, 107 | '2ee', 108 | '2ee', 109 | '0030', 110 | 'q', 111 | '0030' 112 | ]); 113 | 114 | final actual = () => 115 | encoder.encodeSeries(newData).map((series) => series.data.toList()); 116 | 117 | expect(actual, throwsException); 118 | }); 119 | 120 | test( 121 | 'should
encode unknown value as the last index of all labels if ' 122 | 'unknown value handling strategy is "ignore" and unknown value is ' 123 | 'encountered', () { 124 | final fittingData = Series('just_header', 125 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 126 | final encoder = LabelSeriesEncoder(fittingData, 127 | unknownValueHandlingType: UnknownValueHandlingType.ignore); 128 | final unknownValue = 'unknown_value'; 129 | final newData = Series('awesome_series', [ 130 | 'q', 131 | 'q', 132 | 'q', 133 | unknownValue, 134 | '2ee', 135 | '2ee', 136 | '0030', 137 | 'q', 138 | '0030' 139 | ]); 140 | final encoded = encoder.encodeSeries(newData).toList(); 141 | 142 | expect(encoded, hasLength(1)); 143 | expect(encoded[0].data, equals([0, 0, 0, 4, 1, 1, 2, 0, 2])); 144 | expect(encoded[0].isDiscrete, isTrue); 145 | }); 146 | }); 147 | } 148 | -------------------------------------------------------------------------------- /test/encoder/series_encoder/one_hot_series_encoder_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/series_encoder/one_hot_series_encoder.dart'; 3 | import 'package:ml_preprocessing/src/encoder/unknown_value_handling_type.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('OneHotSeriesEncoder', () { 8 | test('should encode given series creating a collection of new series', () { 9 | final series = 10 | Series('just_header', ['q', '2ee', '0030', '123']); 11 | final encoder = OneHotSeriesEncoder(series); 12 | final encoded = encoder.encodeSeries(series).toList(); 13 | 14 | expect(encoded, hasLength(4)); 15 | 16 | expect(encoded[0].data, equals([1, 0, 0, 0])); 17 | expect(encoded[1].data, equals([0, 1, 0, 0])); 18 | expect(encoded[2].data, equals([0, 0, 1, 0])); 19 | expect(encoded[3].data, equals([0, 0, 0, 1])); 20 | 21 | expect(encoded[0].isDiscrete, isTrue); 22 |
expect(encoded[1].isDiscrete, isTrue); 23 | expect(encoded[2].isDiscrete, isTrue); 24 | expect(encoded[3].isDiscrete, isTrue); 25 | }); 26 | 27 | test( 28 | 'should use categorical value as encoded series headers if neither ' 29 | 'header prefix nor header postfix are specified', () { 30 | final series = 31 | Series('just_header', ['q', '2ee', '0030', '123']); 32 | final encoder = OneHotSeriesEncoder(series); 33 | final encoded = encoder.encodeSeries(series).toList(); 34 | 35 | expect(encoded, hasLength(4)); 36 | 37 | expect(encoded[0].name, 'q'); 38 | expect(encoded[1].name, '2ee'); 39 | expect(encoded[2].name, '0030'); 40 | expect(encoded[3].name, '123'); 41 | 42 | expect(encoded[0].isDiscrete, isTrue); 43 | expect(encoded[1].isDiscrete, isTrue); 44 | expect(encoded[2].isDiscrete, isTrue); 45 | expect(encoded[3].isDiscrete, isTrue); 46 | }); 47 | 48 | test('should encode given series with repeating values', () { 49 | final series = Series('just_header', 50 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 51 | final encoder = OneHotSeriesEncoder(series); 52 | final encoded = encoder.encodeSeries(series).toList(); 53 | 54 | expect(encoded, hasLength(4)); 55 | 56 | expect(encoded[0].data, equals([1, 0, 1, 1, 0, 0, 0])); 57 | expect(encoded[1].data, equals([0, 1, 0, 0, 0, 0, 0])); 58 | expect(encoded[2].data, equals([0, 0, 0, 0, 1, 0, 1])); 59 | expect(encoded[3].data, equals([0, 0, 0, 0, 0, 1, 0])); 60 | 61 | expect(encoded[0].isDiscrete, isTrue); 62 | expect(encoded[1].isDiscrete, isTrue); 63 | expect(encoded[2].isDiscrete, isTrue); 64 | expect(encoded[3].isDiscrete, isTrue); 65 | }); 66 | 67 | test('should consider given series name prefix', () { 68 | final series = Series('just_header', 69 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 70 | final encoder = OneHotSeriesEncoder(series, headerPrefix: 'pref_'); 71 | final encoded = encoder.encodeSeries(series).toList(); 72 | 73 | expect(encoded, hasLength(4)); 74 | 75 | expect(encoded[0].name, 'pref_q'); 76 |
expect(encoded[1].name, 'pref_2ee'); 77 | expect(encoded[2].name, 'pref_0030'); 78 | expect(encoded[3].name, 'pref_123'); 79 | 80 | expect(encoded[0].isDiscrete, isTrue); 81 | expect(encoded[1].isDiscrete, isTrue); 82 | expect(encoded[2].isDiscrete, isTrue); 83 | expect(encoded[3].isDiscrete, isTrue); 84 | }); 85 | 86 | test('should consider given series name postfix', () { 87 | final series = Series('just_header', 88 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 89 | final encoder = OneHotSeriesEncoder(series, headerPostfix: '_postf'); 90 | final encoded = encoder.encodeSeries(series).toList(); 91 | 92 | expect(encoded, hasLength(4)); 93 | 94 | expect(encoded[0].name, 'q_postf'); 95 | expect(encoded[1].name, '2ee_postf'); 96 | expect(encoded[2].name, '0030_postf'); 97 | expect(encoded[3].name, '123_postf'); 98 | 99 | expect(encoded[0].isDiscrete, isTrue); 100 | expect(encoded[1].isDiscrete, isTrue); 101 | expect(encoded[2].isDiscrete, isTrue); 102 | expect(encoded[3].isDiscrete, isTrue); 103 | }); 104 | 105 | test( 106 | 'should consider both given series name postfix and series name ' 107 | 'prefix', () { 108 | final series = Series('just_header', 109 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 110 | final encoder = OneHotSeriesEncoder(series, 111 | headerPrefix: 'pref_', headerPostfix: '_postf'); 112 | final encoded = encoder.encodeSeries(series).toList(); 113 | 114 | expect(encoded, hasLength(4)); 115 | 116 | expect(encoded[0].name, 'pref_q_postf'); 117 | expect(encoded[1].name, 'pref_2ee_postf'); 118 | expect(encoded[2].name, 'pref_0030_postf'); 119 | expect(encoded[3].name, 'pref_123_postf'); 120 | 121 | expect(encoded[0].isDiscrete, isTrue); 122 | expect(encoded[1].isDiscrete, isTrue); 123 | expect(encoded[2].isDiscrete, isTrue); 124 | expect(encoded[3].isDiscrete, isTrue); 125 | }); 126 | 127 | test('should use fitted data to encode new one', () { 128 | final fittingData = Series('just_header', 129 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 
130 | final encoder = OneHotSeriesEncoder(fittingData, 131 | headerPrefix: 'pref_', headerPostfix: '_postf'); 132 | 133 | final newData = Series('just_header', 134 | ['q', 'q', 'q', 'q', '2ee', '2ee', '0030', 'q', '0030']); 135 | final encoded = encoder.encodeSeries(newData).toList(); 136 | 137 | expect(encoded, hasLength(4)); 138 | 139 | expect(encoded[0].data, equals([1, 1, 1, 1, 0, 0, 0, 1, 0])); 140 | expect(encoded[1].data, equals([0, 0, 0, 0, 1, 1, 0, 0, 0])); 141 | expect(encoded[2].data, equals([0, 0, 0, 0, 0, 0, 1, 0, 1])); 142 | expect(encoded[3].data, equals([0, 0, 0, 0, 0, 0, 0, 0, 0])); 143 | 144 | expect(encoded[0].isDiscrete, isTrue); 145 | expect(encoded[1].isDiscrete, isTrue); 146 | expect(encoded[2].isDiscrete, isTrue); 147 | expect(encoded[3].isDiscrete, isTrue); 148 | }); 149 | 150 | test( 151 | 'should throw error if unknown value handling strategy is to throw ' 152 | 'error and unknown value is encountered', () { 153 | final fittingData = Series('just_header', 154 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 155 | final encoder = OneHotSeriesEncoder(fittingData, 156 | unknownValueHandlingType: UnknownValueHandlingType.error); 157 | final unknownValue = 'unknown_value'; 158 | final newData = Series('awesome_series', [ 159 | 'q', 160 | 'q', 161 | 'q', 162 | unknownValue, 163 | '2ee', 164 | '2ee', 165 | '0030', 166 | 'q', 167 | '0030' 168 | ]); 169 | 170 | final actual = () => 171 | encoder.encodeSeries(newData).map((series) => series.data.toList()); 172 | 173 | expect(actual, throwsException); 174 | }); 175 | 176 | test( 177 | 'should encode unknown value as 0 if unknown value handling strategy ' 178 | 'is to ignore and unknown value is encountered', () { 179 | final fittingData = Series('just_header', 180 | ['q', '2ee', 'q', 'q', '0030', '123', '0030']); 181 | final encoder = OneHotSeriesEncoder(fittingData, 182 | unknownValueHandlingType: UnknownValueHandlingType.ignore); 183 | final unknownValue = 'unknown_value'; 184 | final newData =
Series('awesome_series', [ 185 | 'q', 186 | 'q', 187 | 'q', 188 | unknownValue, 189 | '2ee', 190 | '2ee', 191 | '0030', 192 | 'q', 193 | '0030' 194 | ]); 195 | final encoded = encoder.encodeSeries(newData).toList(); 196 | 197 | expect(encoded, hasLength(4)); 198 | 199 | expect(encoded[0].data, equals([1, 1, 1, 0, 0, 0, 0, 1, 0])); 200 | expect(encoded[1].data, equals([0, 0, 0, 0, 1, 1, 0, 0, 0])); 201 | expect(encoded[2].data, equals([0, 0, 0, 0, 0, 0, 1, 0, 1])); 202 | expect(encoded[3].data, equals([0, 0, 0, 0, 0, 0, 0, 0, 0])); 203 | 204 | expect(encoded[0].isDiscrete, isTrue); 205 | expect(encoded[1].isDiscrete, isTrue); 206 | expect(encoded[2].isDiscrete, isTrue); 207 | expect(encoded[3].isDiscrete, isTrue); 208 | }); 209 | }); 210 | } 211 | -------------------------------------------------------------------------------- /test/encoder/series_encoder/series_encoder_factory_impl.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/encoder_type.dart'; 3 | import 'package:ml_preprocessing/src/encoder/series_encoder/label_series_encoder.dart'; 4 | import 'package:ml_preprocessing/src/encoder/series_encoder/one_hot_series_encoder.dart'; 5 | import 'package:ml_preprocessing/src/encoder/series_encoder/series_encoder_factory_impl.dart'; 6 | import 'package:test/test.dart'; 7 | 8 | void main() { 9 | group('SeriesEncoderFactoryImpl', () { 10 | final factory = const SeriesEncoderFactoryImpl(); 11 | final series = Series( 12 | 'some_series', 13 | ['value_1', 'value_2', 'value_3'], 14 | isDiscrete: true, 15 | ); 16 | 17 | test('should create LabelSeriesEncoder', () { 18 | final encoderType = EncoderType.label; 19 | final actual = factory.createByType(encoderType, series); 20 | final expected = isA(); 21 | 22 | expect(actual, expected); 23 | }); 24 | 25 | test('should create OneHotSeriesEncoder', () { 26 | final encoderType = 
EncoderType.oneHot; 27 | final actual = factory.createByType(encoderType, series); 28 | final expected = isA(); 29 | 30 | expect(actual, expected); 31 | }); 32 | }); 33 | } 34 | -------------------------------------------------------------------------------- /test/helpers.dart: -------------------------------------------------------------------------------- 1 | import 'package:test/test.dart'; 2 | 3 | Matcher iterable2dAlmostEqualTo(Iterable> expected, 4 | [double precision = 1e-5]) => 5 | pairwiseCompare, Iterable>(expected, 6 | (Iterable expected, Iterable actual) { 7 | if (expected.length != actual.length) { 8 | return false; 9 | } 10 | for (var i = 0; i < expected.length; i++) { 11 | if ((expected.elementAt(i) - actual.elementAt(i)).abs() >= precision) { 12 | return false; 13 | } 14 | } 15 | return true; 16 | }, ''); 17 | 18 | Matcher iterableAlmostEqualTo(Iterable expected, 19 | [double precision = 1e-5]) => 20 | pairwiseCompare( 21 | expected, 22 | (expectedVal, actualVal) => 23 | (expectedVal - actualVal).abs() <= precision, 24 | ''); 25 | -------------------------------------------------------------------------------- /test/normalizer/normalize_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/normalizer/normalize.dart'; 3 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | void main() { 7 | group('normalize', () { 8 | test('should return normalizer factory', () { 9 | final normalizerFactory = normalize(); 10 | final normalizer = normalizerFactory(DataFrame([])); 11 | 12 | expect(normalizer, isA()); 13 | }); 14 | }); 15 | } 16 | -------------------------------------------------------------------------------- /test/normalizer/normalizer_test.dart: -------------------------------------------------------------------------------- 1 | import 
'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/linalg.dart'; 3 | import 'package:ml_preprocessing/src/normalizer/normalizer.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | import '../helpers.dart'; 7 | 8 | void main() { 9 | group('Normalizer', () { 10 | test( 11 | 'should divide each row-vector by its euclidean norm and preserve ' 12 | 'the header of the input dataframe', () { 13 | final header = ['first', 'second', 'third']; 14 | final data = Matrix.fromList([ 15 | [10, 20, 30], 16 | [40, 50, 60], 17 | [90, 80, 70], 18 | [190, 180, 170], 19 | ]); 20 | final input = DataFrame.fromMatrix(data, header: header); 21 | final normalizer = Normalizer(); 22 | final transformed = normalizer.process(input); 23 | 24 | expect(transformed.header, equals(header)); 25 | expect( 26 | transformed.toMatrix(), 27 | iterable2dAlmostEqualTo([ 28 | [0.267, 0.534, 0.801], 29 | [0.455, 0.569, 0.683], 30 | [0.646, 0.574, 0.502], 31 | [0.608, 0.576, 0.544], 32 | ], 1e-3)); 33 | }); 34 | 35 | test( 36 | 'should divide each row-vector by its manhattan norm and preserve ' 37 | 'the header of the input dataframe', () { 38 | final header = ['first', 'second', 'third']; 39 | final data = Matrix.fromList([ 40 | [10, 20, 30], 41 | [40, 50, 60], 42 | [90, 80, 70], 43 | [190, 180, 170], 44 | ]); 45 | final input = DataFrame.fromMatrix(data, header: header); 46 | final normalizer = Normalizer(Norm.manhattan); 47 | final transformed = normalizer.process(input); 48 | 49 | expect(transformed.header, equals(header)); 50 | expect( 51 | transformed.toMatrix(), 52 | iterable2dAlmostEqualTo([ 53 | [0.166, 0.333, 0.500], 54 | [0.266, 0.333, 0.400], 55 | [0.375, 0.333, 0.291], 56 | [0.351, 0.333, 0.314], 57 | ], 1e-3)); 58 | }); 59 | }); 60 | } 61 | -------------------------------------------------------------------------------- /test/pipeline/pipeline_integration_test.dart: -------------------------------------------------------------------------------- 1 | import 
'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/src/encoder/to_integer_labels.dart'; 3 | import 'package:ml_preprocessing/src/encoder/to_one_hot_labels.dart'; 4 | import 'package:ml_preprocessing/src/pipeline/pipeline.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | void main() { 8 | group('Pipeline', () { 9 | test( 10 | 'should process steps, which return dataframes of different series ' 11 | 'number', () { 12 | final fittingData = DataFrame([ 13 | ['first', 'second', 'third', 'fourth'], 14 | [1, 'F', 'category_val_1', 10], 15 | [10, 'F', 'category_val_2', 20], 16 | [11, 'M', 'category_val_1', 10], 17 | [21, 'F', 'category_val_2', 30], 18 | [44, 'M', 'category_val_1', 10], 19 | [43, 'M', 'category_val_1', 30], 20 | [55, 'F', 'category_val_3', 10], 21 | ]); 22 | 23 | final pipeline = Pipeline(fittingData, [ 24 | toOneHotLabels(columnIndices: [1]), 25 | toOneHotLabels(columnIndices: [2, 3]), 26 | toIntegerLabels(columnNames: ['first']), 27 | ]); 28 | 29 | final result = pipeline.process(fittingData); 30 | 31 | expect( 32 | result.toMatrix(), 33 | equals([ 34 | [ 35 | 0, 36 | 1, 37 | 0, 38 | 1, 39 | 0, 40 | 0, 41 | 1, 42 | 0, 43 | 0, 44 | ], 45 | [ 46 | 1, 47 | 1, 48 | 0, 49 | 0, 50 | 1, 51 | 0, 52 | 0, 53 | 1, 54 | 0, 55 | ], 56 | [ 57 | 2, 58 | 0, 59 | 1, 60 | 1, 61 | 0, 62 | 0, 63 | 1, 64 | 0, 65 | 0, 66 | ], 67 | [ 68 | 3, 69 | 1, 70 | 0, 71 | 0, 72 | 1, 73 | 0, 74 | 0, 75 | 0, 76 | 1, 77 | ], 78 | [ 79 | 4, 80 | 0, 81 | 1, 82 | 1, 83 | 0, 84 | 0, 85 | 1, 86 | 0, 87 | 0, 88 | ], 89 | [ 90 | 5, 91 | 0, 92 | 1, 93 | 1, 94 | 0, 95 | 0, 96 | 0, 97 | 0, 98 | 1, 99 | ], 100 | [ 101 | 6, 102 | 1, 103 | 0, 104 | 0, 105 | 0, 106 | 1, 107 | 1, 108 | 0, 109 | 0, 110 | ], 111 | ])); 112 | }); 113 | 114 | test('should not rewrite previously encoded series', () { 115 | final fittingData = DataFrame([ 116 | ['first', 'second', 'third', 'fourth'], 117 | [1, 'F', 'category_val_1', 10], 118 | [10, 'F', 'category_val_2', 20], 119 | [11, 'M', 
'category_val_1', 10], 120 | [21, 'F', 'category_val_2', 30], 121 | [44, 'M', 'category_val_1', 10], 122 | [43, 'M', 'category_val_1', 30], 123 | [55, 'F', 'category_val_3', 10], 124 | ]); 125 | 126 | final pipeline = Pipeline(fittingData, [ 127 | toOneHotLabels(columnIndices: [1, 2]), 128 | toIntegerLabels(columnIndices: [0, 1, 3]), 129 | ]); 130 | 131 | final result = pipeline.process(fittingData); 132 | 133 | expect( 134 | result.rows, 135 | equals([ 136 | [ 137 | 0, 138 | 1, 139 | 0, 140 | 1, 141 | 0, 142 | 0, 143 | 0, 144 | ], 145 | [ 146 | 1, 147 | 1, 148 | 0, 149 | 0, 150 | 1, 151 | 0, 152 | 1, 153 | ], 154 | [ 155 | 2, 156 | 0, 157 | 1, 158 | 1, 159 | 0, 160 | 0, 161 | 0, 162 | ], 163 | [ 164 | 3, 165 | 1, 166 | 0, 167 | 0, 168 | 1, 169 | 0, 170 | 2, 171 | ], 172 | [ 173 | 4, 174 | 0, 175 | 1, 176 | 1, 177 | 0, 178 | 0, 179 | 0, 180 | ], 181 | [ 182 | 5, 183 | 0, 184 | 1, 185 | 1, 186 | 0, 187 | 0, 188 | 2, 189 | ], 190 | [ 191 | 6, 192 | 1, 193 | 0, 194 | 0, 195 | 0, 196 | 1, 197 | 0, 198 | ], 199 | ])); 200 | }); 201 | }); 202 | } 203 | -------------------------------------------------------------------------------- /test/pipeline/pipeline_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_preprocessing/ml_preprocessing.dart'; 3 | import 'package:ml_preprocessing/src/pipeline/pipeable.dart'; 4 | import 'package:test/test.dart'; 5 | 6 | class Plus10Processor implements Pipeable { 7 | @override 8 | DataFrame process(DataFrame input) => DataFrame.fromSeries( 9 | input.series.map((series) => Series( 10 | series.name, 11 | series.data.map((dynamic value) => value + 10), 12 | )), 13 | ); 14 | } 15 | 16 | class MultipleBy2Processor implements Pipeable { 17 | @override 18 | DataFrame process(DataFrame input) => DataFrame.fromSeries( 19 | input.series.map((series) => Series( 20 | series.name, 21 | series.data.map((dynamic value) => value * 2), 22 | 
)), 23 | ); 24 | } 25 | 26 | void main() { 27 | group('Pipeline', () { 28 | final fittingData = DataFrame([[]], headerExists: false); 29 | 30 | final targetData = DataFrame([ 31 | [20, 10, 30, 30], 32 | [30, 90, 20, 60], 33 | [40, 70, 50, 10], 34 | ], headerExists: false); 35 | 36 | test('should create a pipeline with predefined steps', () { 37 | final pipeline = Pipeline(fittingData, [ 38 | (data, {dtype}) => Plus10Processor(), 39 | (data, {dtype}) => MultipleBy2Processor(), 40 | ]); 41 | 42 | final result = pipeline.process(targetData); 43 | 44 | expect( 45 | result.toMatrix(), 46 | equals([ 47 | [60, 40, 80, 80], 48 | [80, 200, 60, 140], 49 | [100, 160, 120, 40], 50 | ])); 51 | }); 52 | }); 53 | } 54 | -------------------------------------------------------------------------------- /test/standardizer/standardize_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_preprocessing/src/standardizer/standardize.dart'; 4 | import 'package:ml_preprocessing/src/standardizer/standardizer.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | void main() { 8 | group('standardize', () { 9 | final dtype = DType.float32; 10 | 11 | test('should return a Standardizer factory function', () { 12 | final fittingData = DataFrame(>[ 13 | [1, 2, 3], 14 | ], headerExists: false); 15 | 16 | final standardizerFactory = standardize(); 17 | final standardizer = standardizerFactory(fittingData, dtype: dtype); 18 | 19 | expect(standardizer, isA()); 20 | }); 21 | }); 22 | } 23 | -------------------------------------------------------------------------------- /test/standardizer/standardizer_test.dart: -------------------------------------------------------------------------------- 1 | import 'package:ml_dataframe/ml_dataframe.dart'; 2 | import 'package:ml_linalg/dtype.dart'; 3 | import 'package:ml_linalg/matrix.dart'; 4 | import 
'package:ml_preprocessing/ml_preprocessing.dart'; 5 | import 'package:test/test.dart'; 6 | 7 | import '../helpers.dart'; 8 | 9 | void main() { 10 | const dtype = DType.float32; 11 | 12 | group('Standardizer', () { 13 | test( 14 | 'should extract deviation and mean values from fitting data and apply ' 15 | 'them to the same data in order to make the latter look like normally' 16 | 'distributed data (with zero mean and unit variance)', () { 17 | final fittingData = DataFrame(>[ 18 | [10, 21, 90, 20], 19 | [20, 66, 11, 30], 20 | [30, 55, 0, 70], 21 | [40, 33, 22, 20], 22 | ], headerExists: false); 23 | 24 | final standardizer = Standardizer(fittingData, dtype: dtype); 25 | final processed = standardizer.process(fittingData); 26 | 27 | expect( 28 | processed.toMatrix(dtype), 29 | iterable2dAlmostEqualTo([ 30 | [-1.34164079, -1.28449611, 1.68894093, -0.72760688], 31 | [-0.4472136, 1.25626543, -0.56298031, -0.24253563], 32 | [0.4472136, 0.63519039, -0.87653896, 1.69774938], 33 | [1.34164079, -0.6069597, -0.24942166, -0.72760688], 34 | ])); 35 | }); 36 | 37 | test( 38 | 'should extract deviation and mean values from fitting data and apply ' 39 | 'them to the previously unseen data in order to make the latter look ' 40 | 'like normally distributed data (with zero mean and unit ' 41 | 'variance)', () { 42 | final fittingData = DataFrame(>[ 43 | [10, 21, 90, 20], 44 | [20, 66, 11, 30], 45 | [30, 55, 0, 70], 46 | [40, 33, 22, 20], 47 | ], headerExists: false); 48 | 49 | final testData = DataFrame(>[ 50 | [80, 20, 11, -100], 51 | [90, -40, 27, 0], 52 | [10, 44, 96, 120], 53 | [50, -99, 73, 10], 54 | [88, -20, 36, 66], 55 | ], headerExists: false); 56 | 57 | final standardizer = Standardizer(fittingData, dtype: dtype); 58 | final processed = standardizer.process(testData); 59 | 60 | expect( 61 | processed.toMatrix(dtype), 62 | iterable2dAlmostEqualTo([ 63 | [4.91934955, -1.34095748, -0.56298031, -6.54846188], 64 | [5.81377674, -4.72863954, -0.106895, -1.69774938], 65 | 
[-1.34164079, 0.01411534, 1.85997292, 4.12310563], 66 | [2.23606798, -8.05986023, 1.20435028, -1.21267813], 67 | [5.6348913, -3.59941219, 0.14965299, 1.50372088], 68 | ])); 69 | }); 70 | 71 | test( 72 | 'should extract deviation and mean values from fitting data and apply ' 73 | 'them to the previously unseen data twice or more', () { 74 | final fittingData = DataFrame(>[ 75 | [10, 21, 90, 20], 76 | [20, 66, 11, 30], 77 | [30, 55, 0, 70], 78 | [40, 33, 22, 20], 79 | ], headerExists: false); 80 | 81 | final testData1 = DataFrame(>[ 82 | [80, 20, 11, -100], 83 | [90, -40, 27, 0], 84 | [10, 44, 96, 120], 85 | [50, -99, 73, 10], 86 | [88, -20, 36, 66], 87 | ], headerExists: false); 88 | 89 | final testData2 = DataFrame(>[ 90 | [1, 200, 33, 1000], 91 | [2, -440, 29, 0], 92 | [3, 414, 9, 0], 93 | ], headerExists: false); 94 | 95 | final standardizer = Standardizer(fittingData, dtype: dtype); 96 | 97 | final processed1 = standardizer.process(testData1); 98 | final processed2 = standardizer.process(testData2); 99 | 100 | expect( 101 | processed1.toMatrix(dtype), 102 | iterable2dAlmostEqualTo([ 103 | [4.91934955, -1.34095748, -0.56298031, -6.54846188], 104 | [5.81377674, -4.72863954, -0.106895, -1.69774938], 105 | [-1.34164079, 0.01411534, 1.85997292, 4.12310563], 106 | [2.23606798, -8.05986023, 1.20435028, -1.21267813], 107 | [5.6348913, -3.59941219, 0.14965299, 1.50372088], 108 | ])); 109 | 110 | expect( 111 | processed2.toMatrix(dtype), 112 | iterable2dAlmostEqualTo([ 113 | [-2.14662526, 8.82208869, 0.064137, 46.80937563], 114 | [-2.05718254, -27.31318658, -0.04988433, -1.69774938], 115 | [-1.96773982, 20.90482136, -0.61999097, -1.69774938], 116 | ])); 117 | }); 118 | 119 | test('should process a dataframe with only one column', () { 120 | final fittingData = DataFrame(>[ 121 | [10], 122 | [20], 123 | [30], 124 | [40], 125 | ], headerExists: false); 126 | 127 | final testData = DataFrame(>[ 128 | [80], 129 | [90], 130 | [10], 131 | [50], 132 | [88], 133 | ], 
headerExists: false); 134 | 135 | final standardizer = Standardizer(fittingData, dtype: dtype); 136 | final processed = standardizer.process(testData); 137 | 138 | expect( 139 | processed.toMatrix(dtype), 140 | iterable2dAlmostEqualTo([ 141 | [4.91934955], 142 | [5.81377674], 143 | [-1.34164079], 144 | [2.23606798], 145 | [5.6348913], 146 | ])); 147 | }); 148 | 149 | test('should process a dataframe with only one row', () { 150 | final fittingData = DataFrame(>[ 151 | [10, 21, 90, 20], 152 | ], headerExists: false); 153 | 154 | final testData = DataFrame(>[ 155 | [80, 20, 11, -100], 156 | [90, -40, 27, 0], 157 | [10, 44, 96, 120], 158 | [50, -99, 73, 10], 159 | [88, -20, 36, 66], 160 | ], headerExists: false); 161 | 162 | final standardizer = Standardizer(fittingData, dtype: dtype); 163 | final processed = standardizer.process(testData); 164 | 165 | expect( 166 | processed.toMatrix(dtype), 167 | equals([ 168 | [70, -1, -79, -120], 169 | [80, -61, -63, -20], 170 | [0, 23, 6, 100], 171 | [40, -120, -17, -10], 172 | [78, -41, -54, 46], 173 | ])); 174 | }); 175 | 176 | test('should make deviation of uniform columns equal to 1', () { 177 | final uniformColumn = Matrix.fromList([ 178 | [10], 179 | [10], 180 | [10], 181 | [10], 182 | ]); 183 | 184 | final otherColumns = Matrix.fromList([ 185 | [21, 90, 20], 186 | [66, 11, 30], 187 | [55, 0, 70], 188 | [33, 22, 20], 189 | ]); 190 | 191 | final fittingData = DataFrame.fromMatrix( 192 | Matrix.fromColumns([ 193 | ...uniformColumn.columns, 194 | ...otherColumns.columns, 195 | ], dtype: dtype), 196 | ); 197 | 198 | final testData = DataFrame(>[ 199 | [80, 20, 11, -100], 200 | [90, -40, 27, 0], 201 | [10, 44, 96, 120], 202 | [50, -99, 73, 10], 203 | [88, -20, 36, 66], 204 | ], headerExists: false); 205 | 206 | final standardizer = Standardizer(fittingData, dtype: dtype); 207 | final processed = standardizer.process(testData); 208 | 209 | expect( 210 | processed.toMatrix(dtype), 211 | iterable2dAlmostEqualTo([ 212 | [70, 
-1.34095748, -0.56298031, -6.54846188], 213 | [80, -4.72863954, -0.106895, -1.69774938], 214 | [0, 0.01411534, 1.85997292, 4.12310563], 215 | [40, -8.05986023, 1.20435028, -1.21267813], 216 | [78, -3.59941219, 0.14965299, 1.50372088], 217 | ])); 218 | }); 219 | 220 | test( 221 | 'should throw an exception if one tries to apply standardizer to a ' 222 | 'dataframe of inappropriate dimension (columns number in the test ' 223 | 'dataframe should be equal to a number of columns in the fitting ' 224 | 'dataframe)', () { 225 | final fittingData = DataFrame(>[ 226 | [10, 21, 90, 20], 227 | [20, 66, 11, 30], 228 | [30, 55, 0, 70], 229 | [40, 33, 22, 20], 230 | ], headerExists: false); 231 | 232 | final testData = DataFrame(>[ 233 | [80, 20, 11], 234 | [90, -40, 27], 235 | [10, 44, 96], 236 | [50, -99, 73], 237 | [88, -20, 36], 238 | ], headerExists: false); 239 | 240 | final standardizer = Standardizer(fittingData, dtype: dtype); 241 | 242 | expect(() => standardizer.process(testData), throwsException); 243 | }); 244 | 245 | test( 246 | 'should throw an exception if one tries to create a standardizer ' 247 | 'using empty dataframe', () { 248 | final fittingData = DataFrame(>[[]], headerExists: false); 249 | 250 | expect( 251 | () => Standardizer(fittingData, dtype: dtype), 252 | throwsException, 253 | ); 254 | }); 255 | }); 256 | } 257 | --------------------------------------------------------------------------------