├── .coveragerc
├── .editorconfig
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── dependabot.yml
│   └── workflows
│       ├── dependabot.yml
│       ├── greetings.yml
│       ├── lint.yml
│       ├── release.yml
│       └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pysen
│   ├── pyproject.toml
│   └── setup.cfg
├── CODEOWNERS
├── LICENSE
├── pyproject.toml
├── requirements-training.txt
├── sapientml_core
│   ├── __init__.py
│   ├── adaptation
│   │   ├── __init__.py
│   │   ├── artifacts
│   │   │   ├── PY310
│   │   │   │   └── label_order.json
│   │   │   ├── PY311
│   │   │   │   └── label_order.json
│   │   │   ├── PY39
│   │   │   │   └── label_order.json
│   │   │   └── label_order.json
│   │   └── generation
│   │       ├── __init__.py
│   │       ├── pipeline_template.py
│   │       ├── predicate.py
│   │       ├── preprocessing_label.py
│   │       └── template_based_adaptation.py
│   ├── datastore
│   │   └── localfile
│   │       ├── __init__.py
│   │       ├── export_modules
│   │       │   ├── sample_dataset.py
│   │       │   └── split_timeseries_dataset.py
│   │       ├── generator.py
│   │       └── templates
│   │           ├── concat_train_validation.py.jinja
│   │           ├── drop_ignore_columns.py.jinja
│   │           ├── drop_inf_or_nan_rows.py.jinja
│   │           ├── load_localfile.py.jinja
│   │           ├── load_localfile_predict.py.jinja
│   │           ├── load_localfile_train.py.jinja
│   │           ├── set_index.py.jinja
│   │           ├── set_validation_as_test.py.jinja
│   │           ├── split.py.jinja
│   │           └── subsample.py.jinja
│   ├── design
│   │   ├── __init__.py
│   │   ├── label_util.py
│   │   ├── pp_component_groups.py
│   │   └── search_space.py
│   ├── enums.py
│   ├── explain
│   │   ├── AutoEDA.py
│   │   ├── AutoVisualization.py
│   │   ├── code_miner.py
│   │   ├── code_template.py
│   │   ├── main.py
│   │   ├── pipeline_explanation.py
│   │   └── templates
│   │       └── jupyter_content.json
│   ├── generator.py
│   ├── internal_path.py
│   ├── meta_features.py
│   ├── models
│   │   ├── PY310
│   │   │   ├── feature_importance.json
│   │   │   ├── mp_model_1.pkl
│   │   │   ├── mp_model_2.pkl
│   │   │   └── pp_models.pkl
│   │   ├── PY311
│   │   │   ├── feature_importance.json
│   │   │   ├── mp_model_1.pkl
│   │   │   ├── mp_model_2.pkl
│   │   │   └── pp_models.pkl
│   │   ├── PY39
│   │   │   ├── feature_importance.json
│   │   │   ├── mp_model_1.pkl
│   │   │   ├── mp_model_2.pkl
│   │   │   └── pp_models.pkl
│   │   ├── feature_importance.json
│   │   ├── model_metafeatures_test.csv
│   │   ├── mp_model_1.pkl
│   │   ├── mp_model_2.pkl
│   │   └── pp_models.pkl
│   ├── params.py
│   ├── preprocess
│   │   └── default
│   │       ├── __init__.py
│   │       ├── generator.py
│   │       ├── params.py
│   │       └── templates
│   │           ├── drop_one_value_columns.py.jinja
│   │           ├── handle_inf_columns.py.jinja
│   │           ├── handle_iterable_values.py.jinja
│   │           ├── handle_japanese_text.py.jinja
│   │           ├── handle_mixed_typed_columns.py.jinja
│   │           ├── none_has_columns.py.jinja
│   │           └── rename_columns.py.jinja
│   ├── ps_macros.py
│   ├── seeding
│   │   ├── __init__.py
│   │   └── predictor.py
│   ├── templates
│   │   ├── explainability_templates
│   │   │   ├── component_description.json
│   │   │   ├── model_explanation.py.jinja
│   │   │   └── preprocessing_explanation.py.jinja
│   │   ├── model_templates
│   │   │   ├── classification_post_process.jinja
│   │   │   ├── hyperparameter_tuning.py.jinja
│   │   │   ├── hyperparameters.py.jinja
│   │   │   ├── hyperparameters_default_value.py.jinja
│   │   │   ├── model.py.jinja
│   │   │   ├── model_predict.py.jinja
│   │   │   ├── model_test.py.jinja
│   │   │   └── model_train.py.jinja
│   │   ├── other_templates
│   │   │   ├── confusion_matrix.py.jinja
│   │   │   ├── drop_columns.py.jinja
│   │   │   ├── evaluation.py.jinja
│   │   │   ├── evaluation_test.py.jinja
│   │   │   ├── hyperparameter_tuning_evaluation.py.jinja
│   │   │   ├── inverse_target.py.jinja
│   │   │   ├── permutation_importance.py.jinja
│   │   │   ├── prediction_result.py.jinja
│   │   │   ├── preprocess_dataset.py.jinja
│   │   │   ├── shap.py.jinja
│   │   │   ├── target_separation_predict.py.jinja
│   │   │   ├── target_separation_test.py.jinja
│   │   │   ├── target_separation_train.py.jinja
│   │   │   └── target_separation_validation.py.jinja
│   │   ├── pipeline_predict.py.jinja
│   │   ├── pipeline_test.py.jinja
│   │   ├── pipeline_train.py.jinja
│   │   ├── pipeline_validation.py.jinja
│   │   └── preprocessing_templates
│   │       ├── DATE.py.jinja
│   │       ├── DATE_predict.jinja
│   │       ├── DATE_train.jinja
│   │       ├── LabelEncoder.py.jinja
│   │       ├── LabelEncoder_predict.py.jinja
│   │       ├── LabelEncoder_train.py.jinja
│   │       ├── Processing.py.jinja
│   │       ├── Processing_predict.py.jinja
│   │       ├── Processing_train.py.jinja
│   │       ├── SMOTE.py.jinja
│   │       ├── STANDARD.py.jinja
│   │       ├── STANDARD_predict.py.jinja
│   │       ├── STANDARD_train.py.jinja
│   │       ├── TfidfVectorizer.py.jinja
│   │       ├── TfidfVectorizer_predict.py.jinja
│   │       ├── TfidfVectorizer_train.py.jinja
│   │       ├── fillna-type-numeric.py.jinja
│   │       ├── fillna-type-numeric_predict.py.jinja
│   │       ├── fillna-type-numeric_train.py.jinja
│   │       ├── fillna-type-string.py.jinja
│   │       ├── fillna-type-string_predict.py.jinja
│   │       ├── fillna-type-string_train.py.jinja
│   │       ├── get_dummies.py.jinja
│   │       ├── get_dummies_predict.py.jinja
│   │       ├── get_dummies_train.py.jinja
│   │       ├── log.py.jinja
│   │       ├── log_predict.py.jinja
│   │       └── log_train.py.jinja
│   ├── training
│   │   ├── augmentation
│   │   │   ├── mutation_results.py
│   │   │   ├── mutation_runner.py
│   │   │   └── mutator.py
│   │   ├── dataflowmodel
│   │   │   ├── ast_operation.py
│   │   │   ├── dependent_api_extractor.py
│   │   │   └── determine_label_order.py
│   │   ├── denoising
│   │   │   ├── ast_info_collector.py
│   │   │   ├── dataset_snapshot_extractor.py
│   │   │   ├── determine_used_features.py
│   │   │   ├── df_collector.py
│   │   │   └── static_analysis_of_columns.py
│   │   ├── meta_feature_extractor.py
│   │   ├── meta_feature_selector.py
│   │   ├── meta_model_trainer.py
│   │   ├── pp_model_trainer.py
│   │   ├── project.py
│   │   └── project_corpus.py
│   └── util
│       └── file_util.py
└── tests
    ├── __init__.py
    ├── fixtures
    │   ├── datasets
    │   │   ├── testdata_df.csv
    │   │   ├── testdata_df_light.csv
    │   │   ├── testdata_test.csv
    │   │   ├── testdata_train.csv
    │   │   └── testdata_valid.csv
    │   ├── outputs
    │   │   └── .gitignore
    │   └── params
    │       ├── config.pkl
    │       ├── dataset.pkl
    │       └── task.pkl
    └── sapientml
        ├── __init__.py
        ├── conftest.py
        ├── test_generatedcode.py
        └── test_generatedcode_additional_patterns.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | */.env/*
4 | */.venv/*
5 | */.cache/*
6 | */tmp/*
7 | */mining/collector.py
8 | */utilities/dataset_utility.py
9 |
10 | [report]
11 | exclude_lines =
12 | pragma: no cover
13 | if __name__ == .__main__.:
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*.py]
4 | indent_style = space
5 | indent_size = 4
6 | insert_final_newline = true
7 | trim_trailing_whitespace = true
8 | end_of_line = lf
9 | charset = utf-8
10 |
11 | [*.{json,csv}]
12 | insert_final_newline = unset
13 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Show your code calling `generate_code()`.
16 |
17 | <details>
18 | <summary>script</summary>
19 |
20 | ```python
21 | # Paste your code here. The following is an example.
22 | from sapientml import SapientMLGenerator
23 | sml = SapientMLGenerator()
24 | sml.generate_code('your arguments')
25 | ```
26 | </details>
27 | 
28 | 2. Attach the datasets or dataframes input to `generate_code()` if possible.
29 | 3. Show the generated code such as `1_default.py` if it was generated.
30 |
31 | <details>
32 | <summary>generated code</summary>
33 |
34 | ```python
35 | # Paste the generated code here.
36 | ```
37 | </details>
38 | 
39 | 4. Show the messages from SapientML and/or the generated code.
40 |
41 | **Expected behavior**
42 | A clear and concise description of what you expected to happen.
43 |
44 | **Environment (please complete the following information):**
45 | - OS: [e.g. Ubuntu 20.04]
46 | - Docker Version (if applicable): [e.g. Docker version 20.10.17, build 100c701]
47 | - Python Version: [e.g. 3.9.12]
48 | - SapientML Version: [e.g. 2.3.4]
49 |
50 |
51 | **Additional context**
52 | Add any other context about the problem here.
53 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: 'enhancement'
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # To get started with Dependabot version updates, you'll need to specify which
16 | # package ecosystems to update and where the package manifests are located.
17 | # Please see the documentation for all configuration options:
18 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
19 |
20 | version: 2
21 | updates:
22 | - package-ecosystem: "pip" # See documentation for possible values
23 | directory: "/" # Location of package manifests
24 | schedule:
25 | interval: "weekly"
26 |
--------------------------------------------------------------------------------
/.github/workflows/dependabot.yml:
--------------------------------------------------------------------------------
1 | name: Dependabot auto approve and merge
2 | on: pull_request
3 |
4 | permissions:
5 | pull-requests: write
6 | contents: write
7 |
8 | jobs:
9 | dependabot:
10 | runs-on: ubuntu-latest
11 | if: github.actor == 'dependabot[bot]'
12 | steps:
13 | - name: Dependabot metadata
14 | id: metadata
15 | uses: dependabot/fetch-metadata@v1
16 | with:
17 | github-token: "${{ secrets.GITHUB_TOKEN }}"
18 | - name: Approve a PR
19 | run: gh pr review --approve "$PR_URL"
20 | env:
21 | PR_URL: ${{github.event.pull_request.html_url}}
22 | GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
23 | - name: Enable auto-merge for Dependabot PRs
24 | if: steps.metadata.outputs.update-type == 'version-update:semver-patch'
25 | run: gh pr merge --auto --merge "$PR_URL"
26 | env:
27 | PR_URL: ${{github.event.pull_request.html_url}}
28 | GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
29 |
--------------------------------------------------------------------------------
/.github/workflows/greetings.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name: Greetings
16 |
17 | on: [pull_request_target, issues]
18 |
19 | jobs:
20 | greeting:
21 | runs-on: ubuntu-latest
22 | permissions:
23 | issues: write
24 | pull-requests: write
25 | steps:
26 | - uses: actions/first-interaction@v1
27 | with:
28 | repo-token: ${{ secrets.GITHUB_TOKEN }}
29 | issue-message: "# 🎉 Thanks for submitting the issue to SapientML!!\n\nWe have the [Discord](https://discord.gg/59yshERFD9) server. Please join the server!"
30 | pr-message: "# 🎉 Thanks for submitting the PR to SapientML!!\n\nHere is the [Contribution Guideline](https://github.com/sapientml/sapientml/blob/main/CONTRIBUTING.md).\nWe would like you to read the document and follow it.\nIf you have any questions or anything to be discussed, please join the [Discord](https://discord.gg/59yshERFD9) server and chat with us.\nThanks again!"
31 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name: Lint
16 |
17 | on:
18 | pull_request:
19 | branches:
20 | - main
21 |
22 | env:
23 | PYTHON_VERSION: "3.10"
24 | POETRY_VERSION: "1.5.1"
25 | POETRY_URL: https://install.python-poetry.org
26 |
27 | jobs:
28 | test:
29 | runs-on: ubuntu-latest
30 | steps:
31 | - name: Checkout
32 | uses: actions/checkout@v4
33 | - name: Cache Packages
34 | uses: actions/cache@v4
35 | with:
36 | path: ~/.local
37 | key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }}
38 |
39 | - name: Set up Python ${{ env.PYTHON_VERSION }}
40 | uses: actions/setup-python@v4
41 | with:
42 | python-version: ${{ env.PYTHON_VERSION }}
43 |
44 | - name: Install Dependencies
45 | run: pip install pysen flake8 black isort==5.12.0
46 |
47 | - name: Pysen run lint
48 | run: pysen run lint
49 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name: Release
16 |
17 | on:
18 | push:
19 | tags:
20 | - '*.*.*'
21 |
22 | env:
23 | POETRY_VERSION: "1.7.1"
24 | POETRY_URL: https://install.python-poetry.org
25 |
26 | jobs:
27 | test:
28 | strategy:
29 | matrix:
30 | version: ["3.10", "3.11"]
31 | test: [test_misc,
32 | test_regressor_works_number, test_regressor_works_with_nosparse,
33 | test_classifier_category_binary_num_noproba, test_classifier_category_binary_num_proba,
34 | test_classifier_category_multi_nonnum_metric_noproba, test_classifier_category_multi_nonnum_metric_proba,
35 | test_classifier_category_binary_boolean_metric_noproba, test_classifier_category_binary_boolean_metric_proba,
36 | test_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_classifier_category_multi_nonnum_noproba_metric_with_proba,
37 | test_classifier_notext_nonegative_explanatry, test_classifier_works_with
38 | ]
39 | runs-on: ubuntu-latest
40 | steps:
41 | - name: Checkout
42 | uses: actions/checkout@v4
43 |
44 | - name: Set up Python ${{ matrix.version }}
45 | uses: actions/setup-python@v4
46 | with:
47 | python-version: ${{ matrix.version }}
48 |
49 | - name: Install Poetry
50 | run: |
51 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }}
52 | echo "$HOME/.local/bin" >> $GITHUB_PATH
53 |
54 | - name: Install Dependencies
55 | run: poetry install
56 |
57 | - name: Pytest
58 | run: poetry run -- pytest -k ${{ matrix.test }}
59 |
60 | - name: Upload Coverage
61 | uses: actions/upload-artifact@v4
62 | with:
63 | name: ${{ matrix.test }}
64 | include-hidden-files: true
65 | path: .coverage
66 | retention-days: 1
67 | overwrite: true
68 |
69 |
70 | additional_test:
71 | strategy:
72 | matrix:
73 | version: ["3.10", "3.11"]
74 | test: [test_additional_misc,
75 | test_additional_regressor_works_number, test_additional_regressor_works_with_nosparse,
76 | test_additional_classifier_category_binary_nonnum_noproba, test_additional_classifier_category_binary_nonnum_proba,
77 | test_additional_classifier_category_binary_num_noproba, test_additional_classifier_category_binary_num_proba,
78 | test_additional_classifier_category_multi_nonnum_metric_noproba, test_additional_classifier_category_multi_nonnum_metric_proba,
79 | test_additional_classifier_category_multi_num_metric_noproba, test_additional_classifier_category_multi_num_metric_proba,
80 | test_additional_classifier_category_binary_boolean_metric_noproba, test_additional_classifier_category_binary_boolean_metric_proba,
81 | test_additional_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_additional_classifier_category_multi_nonnum_noproba_metric_with_proba,
82 | test_additional_classifier_works_with
83 | ]
84 | runs-on: ubuntu-latest
85 | steps:
86 | - name: Checkout
87 | uses: actions/checkout@v4
88 |
89 | - name: Set up Python ${{ matrix.version }}
90 | uses: actions/setup-python@v4
91 | with:
92 | python-version: ${{ matrix.version }}
93 |
94 | - name: Install Poetry
95 | run: |
96 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }}
97 | echo "$HOME/.local/bin" >> $GITHUB_PATH
98 |
99 | - name: Install Dependencies
100 | run: poetry install
101 |
102 | - name: Pytest
103 | run: poetry run -- pytest -k ${{ matrix.test }}
104 |
105 | - name: Upload Coverage
106 | uses: actions/upload-artifact@v4
107 | with:
108 | name: ${{ matrix.test }}
109 | include-hidden-files: true
110 | path: .coverage
111 | retention-days: 1
112 | overwrite: true
113 |
114 | report_coverage:
115 | runs-on: ubuntu-latest
116 | needs:
117 | - test
118 | - additional_test
119 | steps:
120 | - name: Checkout
121 | uses: actions/checkout@v4
122 |
123 | - name: Set up Python 3.11
124 | uses: actions/setup-python@v4
125 | with:
126 | python-version: 3.11
127 |
128 | - name: Download Coverage Files
129 | uses: actions/download-artifact@v4
130 |
131 | - name: Install coverage
132 | run: pip install coverage
133 |
134 | - name: Combine Coverage Files
135 | run: |
136 | mv --backup=t */.coverage .
137 | coverage combine -a
138 | coverage report
139 |
140 | - name: Report Coverage to CodeCov
141 | uses: codecov/codecov-action@v3
142 | with:
143 | token: ${{ secrets.CODECOV_TOKEN }}
144 |
145 | release:
146 | name: Release
147 | runs-on: ubuntu-latest
148 | steps:
149 | - name: Checkout
150 | uses: actions/checkout@v4
151 |
152 | - name: Set up Python 3.10
153 | uses: actions/setup-python@v4
154 | with:
155 | python-version: "3.10"
156 |
157 | - name: Install Poetry
158 | run: |
159 | curl -sSL https://install.python-poetry.org | python - -y
160 |
161 | - name: Update PATH
162 | run: echo "$HOME/.local/bin" >> $GITHUB_PATH
163 |
164 | - name: Set Version
165 | run: |
166 | SEMVER=$(git describe --exact-match --tags HEAD)
167 | sed -i "s/\(version *= *\).*/\1\"$SEMVER\"/" pyproject.toml
168 |
169 | - name: Build project for distribution
170 | run: poetry build
171 |
172 | - name: Check Version
173 | id: check-version
174 | run: |
175 | [[ "$(poetry version --short)" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] || echo prerelease=true >> $GITHUB_OUTPUT
176 |
177 | - name: Create Release
178 | uses: ncipollo/release-action@v1
179 | with:
180 | artifacts: "dist/*"
181 | token: ${{ secrets.GITHUB_TOKEN }}
182 | draft: false
183 | prerelease: ${{ steps.check-version.outputs.prerelease == 'true' }}
184 |
185 | - name: Publish to PyPI
186 | env:
187 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
188 | run: poetry publish --skip-existing
189 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name: Testing
16 |
17 | on:
18 | pull_request:
19 | branches:
20 | - main
21 |
22 | env:
23 | POETRY_VERSION: "1.5.1"
24 | POETRY_URL: https://install.python-poetry.org
25 |
26 | jobs:
27 | test:
28 | strategy:
29 | matrix:
30 | version: ["3.10", "3.11"]
31 | test: [test_misc, test_regressor_works_number, test_regressor_works_with_nosparse,
32 | test_classifier_category_binary_num_noproba, test_classifier_category_binary_num_proba,
33 | test_classifier_category_multi_nonnum_metric_noproba, test_classifier_category_multi_nonnum_metric_proba,
34 | test_classifier_category_binary_boolean_metric_noproba, test_classifier_category_binary_boolean_metric_proba,
35 | test_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_classifier_category_multi_nonnum_noproba_metric_with_proba,
36 | test_classifier_notext_nonegative_explanatry, test_classifier_works_with,
37 | ]
38 | runs-on: ubuntu-latest
39 | steps:
40 | - name: Checkout
41 | uses: actions/checkout@v4
42 |
43 | - name: Set up Python ${{ matrix.version }}
44 | uses: actions/setup-python@v4
45 | with:
46 | python-version: ${{ matrix.version }}
47 |
48 | - name: Install Poetry
49 | run: |
50 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }}
51 | echo "$HOME/.local/bin" >> $GITHUB_PATH
52 |
53 | - name: Install Dependencies
54 | run: poetry install
55 |
56 | - name: Pytest
57 | run: poetry run -- pytest -k ${{ matrix.test }}
58 |
59 | - name: Upload Coverage
60 | uses: actions/upload-artifact@v4
61 | with:
62 | name: ${{ matrix.test }}
63 | include-hidden-files: true
64 | path: .coverage
65 | retention-days: 1
66 | overwrite: true
67 |
68 | report_coverage:
69 | runs-on: ubuntu-latest
70 | needs:
71 | - test
72 | steps:
73 | - name: Checkout
74 | uses: actions/checkout@v4
75 |
76 | - name: Set up Python 3.11
77 | uses: actions/setup-python@v4
78 | with:
79 | python-version: 3.11
80 |
81 | - name: Download Coverage Files
82 | uses: actions/download-artifact@v4
83 |
84 | - name: Install coverage
85 | run: pip install coverage
86 |
87 | - name: Combine Coverage Files
88 | run: |
89 | mv --backup=t */.coverage .
90 | coverage combine -a
91 | coverage report
92 |
93 | - name: Report Coverage to CodeCov
94 | uses: codecov/codecov-action@v3
95 | with:
96 | token: ${{ secrets.CODECOV_TOKEN }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | poetry.lock
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: local
3 | hooks:
4 | - id: pysen
5 | name: Run pysen
6 | entry: pysen run_files lint
7 | language: system
8 | types: [file, python]
9 |
--------------------------------------------------------------------------------
/.pysen/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool]
2 | [tool.black] # automatically generated by pysen
3 | # pysen ignores and overwrites any modifications
4 | line-length = 120
5 | target-version = ["py310"]
6 |
7 | [tool.isort] # automatically generated by pysen
8 | # pysen ignores and overwrites any modifications
9 | default_section = "THIRDPARTY"
10 | ensure_newline_before_comments = true
11 | force_grid_wrap = 0
12 | force_single_line = false
13 | include_trailing_comma = true
14 | line_length = 120
15 | multi_line_output = 3
16 | use_parentheses = true
17 |
--------------------------------------------------------------------------------
/.pysen/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # automatically generated by pysen
3 | # pysen ignores and overwrites any modifications
4 | # e203: black treats : as a binary operator
5 | # e231: black doesn't put a space after ,
6 | # e501: black may exceed the line-length to follow other style rules
7 | # w503 or w504: either one needs to be disabled to select w error codes
8 | ignore = E203,E231,E501,W503
9 | max-line-length = 120
10 | select = B,B950,C,E,F,W
11 |
12 |
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @sapientml/maintainers
2 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | authors = ["The SapientML Authors"]
3 | description = "A SapientML plugin providing SapientMLGenerator"
4 | license = "Apache-2.0"
5 | maintainers = [
6 | "Kosaku Kimura ",
7 | "Akira Ura ",
8 | ]
9 | name = "sapientml-core"
10 | version = "0"
11 |
12 | [tool.poetry.dependencies]
13 | catboost = ">=1.2.3"
14 | imbalanced-learn = ">=0.11,<0.13"
15 | ipykernel = "^6.25.1"
16 | japanize-matplotlib = "^1.1.3"
17 | jinja2 = "^3.1.2"
18 | libcst = "^1.0.1"
19 | lightgbm = "^4.0.0"
20 | nbconvert = "^7.7.4"
21 | nbformat = "^5.9.2"
22 | nltk = "^3.8.1"
23 | numba = ">=0.57.1,<0.61.0"
24 | optuna = ">=3.2,<5.0"
25 | python = ">=3.9,<3.13"
26 | sapientml = "*"
27 | scikit-learn = "1.5.2"
28 | scipy = "^1.11.1"
29 | seaborn = ">=0.12.2,<0.14.0"
30 | shap = ">=0.43,<0.47"
31 | tqdm = "^4.66.1"
32 | xgboost = ">=1.7.6,<3.0.0"
33 | mecab-python3 = "^1.0.6"
34 | ipadic = "^1.0.0"
35 | fasttext-wheel = "^0.9.2"
36 | requests = "^2.31.0"
37 |
38 | [tool.poetry.group.dev.dependencies]
39 | black = ">=23.7,<25.0"
40 | flake8 = ">=6.1,<8.0"
41 | isort = "^5.12.0"
42 | pre-commit = ">=3.3.3,<5.0.0"
43 | pysen = ">=0.10.5,<0.12.0"
44 | pytest = ">=7.4,<9.0"
45 | pytest-cov = ">=4.1,<7.0"
46 | pytest-xdist = "^3.3.1"
47 |
48 | [build-system]
49 | build-backend = "poetry.core.masonry.api"
50 | requires = ["poetry-core>=1.0.0"]
51 |
52 | [tool.poetry.plugins."sapientml.config"]
53 | sapientml = "sapientml_core:SapientMLConfig"
54 |
55 | [tool.poetry.plugins."sapientml.pipeline_generator"]
56 | sapientml = "sapientml_core:SapientMLGenerator"
57 |
58 | [tool.poetry.plugins."sapientml.datastore"]
59 | localfile = "sapientml_core.datastore.localfile:LocalFile"
60 |
61 | [tool.poetry.plugins."sapientml.preprocess"]
62 | default = "sapientml_core.preprocess.default:DefaultPreprocess"
63 |
64 | [tool.poetry.plugins."sapientml.export_modules"]
65 | sample-dataset = "sapientml_core.datastore.localfile.export_modules"
66 |
67 | [tool.pysen]
68 | version = "0.11.0"
69 |
70 | [tool.pysen-cli]
71 | settings_dir = ".pysen"
72 |
73 | [tool.pysen.lint]
74 | enable_black = true
75 | enable_flake8 = true
76 | enable_isort = true
77 | enable_mypy = false
78 | line_length = 120
79 | py_version = "py310"
80 |
81 | [tool.pysen.lint.source]
82 | includes = ["sapientml_core/", "tests/"]
83 |
84 | [tool.pytest.ini_options]
85 | addopts = "-s -x --cov=sapientml_core"
86 | testpaths = ["tests"]
87 |
--------------------------------------------------------------------------------
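
The `[tool.poetry.plugins."sapientml.*"]` tables above register entry points through which the `sapientml` host package discovers this plugin. Below is a minimal discovery sketch using only the standard library and the group names declared above; this is not SapientML's actual loading code, which may differ.

```python
# Minimal sketch of entry-point discovery for the plugin groups declared
# above; the real loading logic lives in the sapientml host package.
from importlib.metadata import entry_points  # group= requires Python >= 3.10

for ep in entry_points(group="sapientml.pipeline_generator"):
    generator_cls = ep.load()  # e.g. sapientml_core:SapientMLGenerator
    print(ep.name, "->", generator_cls)
```
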
/requirements-training.txt:
--------------------------------------------------------------------------------
1 | category-encoders==2.6.4
2 | patsy==0.5.6
3 | statsmodels==0.14.4
4 | tensorflow==2.18.0
5 | wordcloud==1.9.4
--------------------------------------------------------------------------------
/sapientml_core/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .generator import SapientMLGenerator
16 | from .params import SapientMLConfig
17 |
18 | __all__ = ["SapientMLGenerator", "SapientMLConfig"]
19 |
--------------------------------------------------------------------------------
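
Because of these re-exports, the plugin's public API is importable from the package root. A trivial usage sketch:

```python
# Usage sketch: the names re-exported by sapientml_core/__init__.py.
from sapientml_core import SapientMLConfig, SapientMLGenerator

print(SapientMLGenerator.__name__, SapientMLConfig.__name__)
```
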
/sapientml_core/adaptation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/sapientml_core/adaptation/artifacts/PY310/label_order.json:
--------------------------------------------------------------------------------
1 | [
2 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas",
3 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas",
4 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom",
5 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy",
6 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas",
7 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas",
8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas",
9 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas",
10 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy",
11 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas",
12 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
13 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas",
14 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas",
15 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom",
16 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom",
17 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas",
18 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom",
20 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas",
21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom",
22 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas",
23 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
24 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas",
25 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas",
26 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas",
27 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas",
28 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas",
29 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
30 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas",
31 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas"
32 | ]
--------------------------------------------------------------------------------
/sapientml_core/adaptation/artifacts/PY311/label_order.json:
--------------------------------------------------------------------------------
1 | [
2 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas",
3 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom",
4 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas",
5 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas",
6 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy",
7 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas",
8 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy",
9 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom",
10 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas",
11 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
12 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas",
13 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas",
14 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom",
15 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas",
16 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom",
17 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom",
18 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas",
19 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
20 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas",
21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
22 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas",
23 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas",
24 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas",
25 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas",
26 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas",
27 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas",
28 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas",
29 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas",
30 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas",
31 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas"
32 | ]
--------------------------------------------------------------------------------
/sapientml_core/adaptation/artifacts/PY39/label_order.json:
--------------------------------------------------------------------------------
1 | [
2 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
3 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas",
4 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas",
5 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas",
6 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas",
7 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom",
9 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy",
10 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas",
11 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas",
12 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom",
13 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
14 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
15 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas",
16 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom",
17 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas",
18 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas",
19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas",
20 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas",
21 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas",
22 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas",
23 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas",
24 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas",
25 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom",
26 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas",
27 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas",
28 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom",
29 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas",
30 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas",
31 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy"
32 | ]
--------------------------------------------------------------------------------
/sapientml_core/adaptation/artifacts/label_order.json:
--------------------------------------------------------------------------------
1 | [
2 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas",
3 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas",
4 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas",
5 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy",
6 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas",
7 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
9 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas",
10 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas",
11 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas",
12 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom",
13 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas",
14 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas",
15 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom",
16 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas",
17 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas",
18 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas",
19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas",
20 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas",
21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas",
22 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas",
23 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom",
24 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom",
25 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom",
26 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas",
27 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas",
28 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas",
29 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy",
30 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy",
31 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas"
32 | ]
--------------------------------------------------------------------------------
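
Each entry in these label_order.json artifacts appears to encode an ordered pair of preprocessing component labels joined by `#`. A minimal parsing sketch under that assumption (the "runs before" reading is inferred from the file name, not confirmed by the source):

```python
import json

# Split each "A#B" entry into an ordered (before, after) pair.
with open("sapientml_core/adaptation/artifacts/label_order.json") as f:
    pairs = [tuple(entry.split("#", 1)) for entry in json.load(f)]

for before, after in pairs[:3]:
    print(f"{before}  ->  {after}")
```
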
/sapientml_core/adaptation/generation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/sapientml_core/adaptation/generation/predicate.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from ...enums import Operator
16 |
17 |
18 | class Predicate:
19 | """A class to represent the predicate.
20 |
21 | This class represents the data structure for loading a decision tree
22 | condition/predicate and provides a function that can evaluate whether
23 | the predicate is true for a particular column.
24 |
25 | """
26 |
27 | feature_name = ""
28 | _operator = ""
29 | _comparison_value = ""
30 |
31 | def __init__(self, feature_name, operator, comparison_value):
32 | """Constructs all the necessary attributes for the predicate object.
33 |
34 | Parameters
35 | ----------
36 | feature_name : str
37 | Meta feature name
38 | operator : Operator
39 | comparison_value : float
40 |
41 | """
42 | self.feature_name = feature_name
43 | self._operator = operator
44 | self._comparison_value = comparison_value
45 |
46 | def evaluate_predicate(self, meta_features):
47 | """Evaluate whether the predicate is true for a particular column.
48 |
49 | Parameters
50 | ----------
51 | meta_features : dict
52 |
53 | Returns
54 | -------
55 | result : bool
56 |
57 | Notes
58 | -----
59 | Returns False if the feature is missing from ``meta_features``
60 | or the predicate cannot be evaluated.
61 |
62 | """
63 | try:
64 | actual_value = meta_features[self.feature_name]
65 | if actual_value == -1 or actual_value == 0:
66 | return False
67 | if actual_value is None:
68 | return False
69 | except Exception:
70 | return False
71 |
72 | result = False
73 | if self._operator is Operator.GREATER_THAN:
74 | result = actual_value > self._comparison_value
75 | elif self._operator is Operator.GREATER_THAN_OR_EQUAL_TO:
76 | result = actual_value >= self._comparison_value
77 | elif self._operator is Operator.EQUAL_TO:
78 | result = actual_value == self._comparison_value
79 | elif self._operator is Operator.LESS_THAN:
80 | result = actual_value < self._comparison_value
81 | elif self._operator is Operator.LESS_THAN_OR_EQUAL_TO:
82 | result = actual_value <= self._comparison_value
83 |
84 | return result
85 |
--------------------------------------------------------------------------------
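
A hypothetical usage sketch for `Predicate`; the meta-feature name and values below are made up for illustration:

```python
from sapientml_core.adaptation.generation.predicate import Predicate
from sapientml_core.enums import Operator

# Hypothetical meta-feature name and threshold.
p = Predicate("feature:num_of_rows", Operator.GREATER_THAN, 100.0)
print(p.evaluate_predicate({"feature:num_of_rows": 250}))  # True: 250 > 100.0
print(p.evaluate_predicate({}))  # False: the feature is missing
```
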
/sapientml_core/adaptation/generation/preprocessing_label.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from ...enums import Operator
16 | from .predicate import Predicate
17 |
18 |
19 | class PreprocessingLabel:
20 | """A class to represent the preprocessinglabel.
21 |
22 | This script identifies the relevant columns in the dataset
23 | for each feature engineering components.
24 |
25 | """
26 |
27 | def __init__(self, label_name, meta_features, predicates):
28 | """Constructs all the necessary attributes for the preprocessinglabel object.
29 |
30 | Parameters
31 | ----------
32 | label_name : str
33 | Component name.
34 | meta_features : list
35 | Meta features selected.
36 | predicates : list
37 | Predicate details.
38 |
39 | """
40 | self.label_name = label_name
41 | self.meta_features = meta_features
42 | self.predicate_objects = list()
43 | self._build_predicate_objects(predicates)
44 | self.relevant_columns = list()
45 | self.components_before = list()
46 | self.components_after = list()
47 | self.alternative_components = list()
48 |
49 | def __str__(self):
50 | return self.label_name
51 |
52 | def __repr__(self):
53 | return str(self)
54 |
55 | def _build_predicate_objects(self, predicates):
56 | for pred in predicates:
57 | feature_name = pred["feature_name"]
58 | operator = self._get_operator(pred["operator"])
59 | comparison_value = pred["threshold"]
60 | p = Predicate(feature_name, operator, comparison_value)
61 | self.predicate_objects.append(p)
62 |
63 | def _get_operator(self, op_string):
64 | if op_string == ">":
65 | return Operator.GREATER_THAN
66 | elif op_string == ">=":
67 | return Operator.GREATER_THAN_OR_EQUAL_TO
68 | elif op_string == "<":
69 | return Operator.LESS_THAN
70 | elif op_string == "<=":
71 | return Operator.LESS_THAN_OR_EQUAL_TO
72 | elif op_string == "==" or op_string == "=":
73 | return Operator.EQUAL_TO
74 | else:
75 | return Operator.NOT_EQUAL_TO
76 |
77 | def get_relevant_columns(self, dataset_summary, target, ignore_columns):
78 | """get_relevant_columns.
79 |
80 | Parameters
81 | ----------
82 | dataset_summary : DatasetSummary
83 | Object of the datasetsummary class.
84 | target : list
85 | ignore_columns : list
86 |
87 | Returns
88 | -------
89 | rel_columns_list : list
90 | Return the relevant column list.
91 |
92 | """
93 | rel_columns_list = []
94 |
95 | # approach 1: conjunction: a column is relevant if and only if all of the predicates applicable to that component are true
96 | # approach 2: disjunction: a column is relevant if and only if at least one of the predicates applicable to that component is true
97 | approach = 2
98 |
99 | for column_name, column in dataset_summary.columns.items():
100 | if column_name in ignore_columns:
101 | continue
102 |
103 | # error handling for log transform: don't apply if any col value <= 0
104 | if "PREPROCESS:Scaling:log" in self.label_name:
105 | if column.has_negative_value:
106 | continue
107 |
108 | result = list() # holds boolean results of all predicates applicable to a column
109 | for p in self.predicate_objects:
110 | # special handling of "target_imbalance_score" feature, since it should only be applied on target column
111 | if p.feature_name == "feature:target_imbalance_score":
112 | if column_name not in target:
113 | result.append(False)
114 | continue
115 | result.append(p.evaluate_predicate(column.meta_features))
116 |
117 | if approach == 1: # conjunction
118 | if all(result):
119 | rel_columns_list.append(column_name)
120 | elif approach == 2: # disjunction
121 | if any(result):
122 | rel_columns_list.append(column_name)
123 |
124 | return rel_columns_list
125 |
--------------------------------------------------------------------------------
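
A hypothetical sketch of constructing a `PreprocessingLabel` from the kind of predicate dictionaries `_build_predicate_objects` expects; the feature name and threshold are made up:

```python
from sapientml_core.adaptation.generation.preprocessing_label import PreprocessingLabel

# _build_predicate_objects reads "feature_name", "operator", and
# "threshold" from each predicate dictionary.
predicates = [{"feature_name": "feature:missing_values_presence", "operator": ">", "threshold": 0.0}]
label = PreprocessingLabel(
    "PREPROCESS:MissingValues:fillna:pandas",
    ["feature:missing_values_presence"],
    predicates,
)
print(label)  # PREPROCESS:MissingValues:fillna:pandas
```
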
/sapientml_core/datastore/localfile/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .generator import LocalFile, LocalFileConfig
16 |
17 | __all__ = ["LocalFile", "LocalFileConfig"]
18 |
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/export_modules/sample_dataset.py:
--------------------------------------------------------------------------------
1 | from decimal import ROUND_HALF_UP, Decimal
2 |
3 | import pandas as pd
4 | from sklearn.model_selection import train_test_split
5 |
6 |
7 | def _sampled_training(dev_training_dataset, train_size, stratify, task_type) -> pd.DataFrame:
8 | sampled_training_dataset, _ = train_test_split(
9 | dev_training_dataset,
10 | train_size=train_size,
11 | stratify=stratify if task_type == "classification" else None,
12 | )
13 | return sampled_training_dataset # type: ignore
14 |
15 |
16 | def sample_dataset(
17 | dataframe: pd.DataFrame,
18 | sample_size: int,
19 | target_columns: list[str],
20 | task_type: str,
21 | ) -> pd.DataFrame:
22 | # Sample the training set if the dataset is big
23 | # FIXME
24 | sampled_training_dataset = None
25 | num_of_rows = len(dataframe.index)
26 | if num_of_rows >= sample_size:
27 | rare_labels = []
28 | dataframe_alltargets = None
29 | if task_type == "classification":
30 | dataframe_alltargets = dataframe[target_columns].astype(str).apply("".join, axis=1)
31 | label_count = dataframe_alltargets.value_counts()
32 | rare_labels = label_count.loc[label_count == 1].index.tolist()
33 |
34 | if rare_labels and dataframe_alltargets is not None:
35 | dataframe_rare = dataframe[dataframe_alltargets.isin(rare_labels)]
36 | rare_index = dataframe_rare.index.values
37 |
38 | dataframe_wo_rare = dataframe.drop(rare_index)
39 |
40 | num_of_labels = [len(dataframe_wo_rare[target].value_counts()) for target in target_columns]
41 |
42 | rare_to_all_ratio = int(
43 | Decimal(sample_size * len(dataframe_rare) / len(dataframe)).quantize(
44 | Decimal("0"), rounding=ROUND_HALF_UP
45 | )
46 | )
47 | not_rare_to_all_ratio = int(
48 | Decimal(sample_size * len(dataframe_wo_rare) / len(dataframe)).quantize(
49 | Decimal("0"), rounding=ROUND_HALF_UP
50 | )
51 | )
52 |
53 | stratify_wo_rare = None
54 |
55 | if len(dataframe_rare) == len(dataframe):
56 | sampled_training_dataset = _sampled_training(dataframe, sample_size, None, task_type)
57 |
58 | elif rare_to_all_ratio in [0, 1]:
59 | sampled_training_dataset_rare = dataframe_rare
60 |
61 | if max(num_of_labels) >= sample_size:
62 | stratify_wo_rare = None
63 | else:
64 | stratify_wo_rare = dataframe_wo_rare[target_columns]
65 | sampled_training_dataset_wo_rare = _sampled_training(
66 | dataframe_wo_rare,
67 | sample_size - len(sampled_training_dataset_rare),
68 | stratify_wo_rare,
69 | task_type,
70 | )
71 |
72 | sampled_training_dataset = pd.concat(
73 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore
74 | )
75 |
76 | elif not_rare_to_all_ratio in [0, 1]:
77 | sampled_training_dataset_wo_rare = dataframe_wo_rare
78 | sampled_training_dataset_rare = _sampled_training(
79 | dataframe_rare,
80 | sample_size - len(sampled_training_dataset_wo_rare),
81 | None,
82 | task_type,
83 | )
84 |
85 | sampled_training_dataset = pd.concat(
86 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore
87 | )
88 |
89 | else:
90 | if max(num_of_labels) >= sample_size:
91 | stratify_wo_rare = None
92 | else:
93 | stratify_wo_rare = dataframe_wo_rare[target_columns]
94 |
95 | sampled_training_dataset_wo_rare = _sampled_training(
96 | dataframe_wo_rare, not_rare_to_all_ratio, stratify_wo_rare, task_type
97 | )
98 | sampled_training_dataset_rare = _sampled_training(dataframe_rare, rare_to_all_ratio, None, task_type)
99 |
100 | sampled_training_dataset = pd.concat(
101 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore
102 | )
103 |
104 | else:
105 | num_of_labels = [len(dataframe[target].value_counts()) for target in target_columns]
106 | if max(num_of_labels) >= sample_size:
107 | stratify_wo_rare = None
108 | else:
109 | stratify_wo_rare = dataframe[target_columns]
110 |
111 | sampled_training_dataset = _sampled_training(dataframe, sample_size, stratify_wo_rare, task_type)
112 | return sampled_training_dataset
113 | else:
114 | return dataframe
115 |
--------------------------------------------------------------------------------
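A minimal usage sketch of sample_dataset (the frame, sizes, and column names below are illustrative). In generated pipelines this module is exported as lib/sample_dataset.py, hence the import path:

import pandas as pd
from lib.sample_dataset import sample_dataset

df = pd.DataFrame({"f1": range(100), "label": ["a", "b"] * 50})
# 100 rows >= sample_size, so a stratified subsample of 20 rows is returned.
sampled = sample_dataset(df, sample_size=20, target_columns=["label"], task_type="classification")
print(len(sampled))  # 20

--------------------------------------------------------------------------------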
/sapientml_core/datastore/localfile/export_modules/split_timeseries_dataset.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import TimeSeriesSplit
2 |
3 |
4 | def split_dataset(dataset, split_column_name, split_num, split_index):
5 | dataset = dataset.sort_values(split_column_name)  # order rows chronologically before splitting
6 | splitter = TimeSeriesSplit(n_splits=split_num)
7 | train_idx, test_idx = list(splitter.split(dataset))[split_index]  # select one fold of TimeSeriesSplit
8 | train_dataset, test_dataset = dataset.iloc[train_idx], dataset.iloc[test_idx]
9 | for col in train_dataset.columns:  # fill all-NaN train columns with type-appropriate defaults
10 | if train_dataset[col].isnull().all():
11 | if test_dataset[col].dtype == float or test_dataset[col].dtype == int:
12 | train_dataset.loc[:, col] = 0
13 | elif test_dataset[col].dtype == object:
14 | train_dataset.loc[:, col] = ""
15 | elif test_dataset[col].dtype == bool:
16 | train_dataset.loc[:, col] = False
17 | return train_dataset, test_dataset
18 |
--------------------------------------------------------------------------------
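A quick sketch of split_dataset on a synthetic frame (the dates and sizes are illustrative):

import pandas as pd
from lib.split_timeseries_dataset import split_dataset

df = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=10), "y": range(10)})
# TimeSeriesSplit(n_splits=4) yields folds with growing train windows; split_index=3
# selects the last fold, so the test rows are the chronologically latest block.
train, test = split_dataset(df, split_column_name="date", split_num=4, split_index=3)
print(len(train), len(test))  # 8 2

--------------------------------------------------------------------------------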
/sapientml_core/datastore/localfile/templates/concat_train_validation.py.jinja:
--------------------------------------------------------------------------------
1 | train_dataset = pd.concat([train_dataset, validation_dataset]).reset_index(drop=True)
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/drop_ignore_columns.py.jinja:
--------------------------------------------------------------------------------
1 | # DROP IGNORED COLUMNS
2 | ignore_columns = {{ ignore_columns }}
3 |
4 | {% if train %}
5 | train_dataset = train_dataset.drop(ignore_columns, axis=1, errors="ignore")
6 | {% endif %}
7 | {% if validation %}
8 | validation_dataset = validation_dataset.drop(ignore_columns, axis=1, errors="ignore")
9 | {% endif %}
10 | {% if test %}
11 | test_dataset = test_dataset.drop(ignore_columns, axis=1, errors="ignore")
12 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/drop_inf_or_nan_rows.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | train_dataset = train_dataset[~train_dataset[{{target_columns}}].isin([np.inf, -np.inf, np.nan]).any(axis=1)]
3 |
4 |
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/load_localfile.py.jinja:
--------------------------------------------------------------------------------
1 | # LOAD DATA
2 | import pandas as pd
3 |
4 | {% if dataset.training_data_path.endswith(".pkl") %}
5 | train_dataset = pd.read_pickle(r"{{ dataset.training_data_path }}")
6 | {% else %}
7 | train_dataset = pd.read_csv(r"{{ dataset.training_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}")
8 | {% endif %}
9 |
10 | {% if dataset.validation_data_path %}
11 | {% if dataset.validation_data_path.endswith(".pkl") %}
12 | validation_dataset = pd.read_pickle(r"{{ dataset.validation_data_path }}")
13 | {% else %}
14 | validation_dataset = pd.read_csv(r"{{ dataset.validation_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}")
15 | {% endif %}
16 | {% endif %}{# if dataset.validation_data_path #}
17 |
18 | {% if not validation and dataset.test_data_path %}
19 | {% if dataset.test_data_path.endswith(".pkl") %}
20 | test_dataset = pd.read_pickle(r"{{ dataset.test_data_path }}")
21 | {% else %}
22 | test_dataset = pd.read_csv(r"{{ dataset.test_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}")
23 | {% endif %}
24 | {% endif %}{# if not validation and dataset.test_data_path #}
25 |
26 |
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/load_localfile_predict.py.jinja:
--------------------------------------------------------------------------------
1 | # LOAD DATA
2 | import pandas as pd
3 |
4 | {% if dataset.training_data_path.endswith(".pkl") %}
5 | test_dataset = pd.read_pickle("./test.pkl")
6 | {% else %}
7 | test_dataset = pd.read_csv("./test.csv", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}")
8 | {% endif %}
9 |
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/load_localfile_train.py.jinja:
--------------------------------------------------------------------------------
1 | # LOAD DATA
2 | import pandas as pd
3 |
4 | {% if dataset.training_data_path.endswith(".pkl") %}
5 | train_dataset = pd.read_pickle("./training.pkl")
6 | {% else %}
7 | train_dataset = pd.read_csv("./training.csv", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}")
8 | {% endif %}
9 |
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/set_index.py.jinja:
--------------------------------------------------------------------------------
1 | # SET ID_COLUMNS TO DATAFRAME'S INDEX
2 | id_columns_for_prediction = {{ id_columns_for_prediction }}
3 | test_dataset = test_dataset.set_index(id_columns_for_prediction, drop=False)
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/set_validation_as_test.py.jinja:
--------------------------------------------------------------------------------
1 | test_dataset = validation_dataset
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/templates/split.py.jinja:
--------------------------------------------------------------------------------
1 | {% if (validation and (not dataset.validation_data_path)) or ((not validation) and (not dataset.test_data_path)) %}
2 |
3 | # TRAIN-TEST SPLIT
4 | {% if task.split_method == "random" %}
5 | {% if task.split_stratification %}
6 | from sklearn.model_selection import train_test_split
7 | def split_dataset(dataset, train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}):
8 | train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state, stratify=dataset["{{task.target_columns[0]}}"])
9 | return train_dataset, test_dataset
10 | {% else %}
11 | from sklearn.model_selection import train_test_split
12 | def split_dataset(dataset, train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}):
13 | train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state)
14 | return train_dataset, test_dataset
15 | {% endif %}
16 | {% elif task.split_method == "group" %}
17 | from sklearn.model_selection import GroupShuffleSplit
18 | def split_dataset(dataset, split_column_name="{{ task.split_column_name }}", train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}):
19 | splitter = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_state)
20 | train_idx, test_idx = next(splitter.split(dataset, groups=dataset[split_column_name]))
21 | train_dataset, test_dataset = dataset.iloc[train_idx], dataset.iloc[test_idx]
22 | return train_dataset, test_dataset
23 | {% else %}{# time #}
24 | from lib.split_timeseries_dataset import split_dataset
25 | {% endif %}
26 | {% if not dataset.test_data_path %}
27 | {% if task.split_method == "random" or task.split_method == "group" %}
28 | train_dataset, test_dataset = split_dataset(train_dataset)
29 | {% else %}
30 | train_dataset, test_dataset = split_dataset(train_dataset, split_column_name="{{ task.split_column_name }}", split_num={{ task.time_split_num }}, split_index={{ task.time_split_index}})
31 | {% endif %}
32 | {% endif %}
33 | {% if validation %}
34 | {% endif %}
35 | {% endif %}
36 | {% if validation and (not dataset.validation_data_path) %}
37 | {% if task.split_method == "random" or task.split_method == "group" %}
38 | train_dataset, validation_dataset = split_dataset(train_dataset)
39 | {% else %}
40 | train_dataset, validation_dataset = split_dataset(train_dataset, split_column_name="{{ task.split_column_name }}", split_num={{ task.time_split_num }}, split_index={{ task.time_split_index}})
41 | {% endif %}
42 | {% endif %}
43 |
--------------------------------------------------------------------------------
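For reference, a sketch of what this template renders to for a random, non-stratified split with no test_data_path (the train_size and random_state values are illustrative):

# TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split
def split_dataset(dataset, train_size=0.75, random_state=17):
    train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state)
    return train_dataset, test_dataset
train_dataset, test_dataset = split_dataset(train_dataset)

--------------------------------------------------------------------------------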
/sapientml_core/datastore/localfile/templates/subsample.py.jinja:
--------------------------------------------------------------------------------
1 | # SUBSAMPLE
2 | # If train_dataset has at least sample_size rows, subsample it down to sample_size for speedup.
3 | from lib.sample_dataset import sample_dataset
4 | train_dataset = sample_dataset(
5 | dataframe=train_dataset,
6 | sample_size={{ sample_size }},
7 | target_columns={{ task.target_columns }},
8 | task_type='{{ task.task_type }}'
9 | )
10 |
11 |
--------------------------------------------------------------------------------
/sapientml_core/design/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/design/__init__.py
--------------------------------------------------------------------------------
/sapientml_core/design/label_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | name_to_label_mapping = {
17 | "random forest": {
18 | "c": "MODEL:Classifier:RandomForestClassifier:sklearn",
19 | "r": "MODEL:Regressor:RandomForestRegressor:sklearn",
20 | },
21 | "extra tree": {
22 | "c": "MODEL:Classifier:ExtraTreesClassifier:sklearn",
23 | "r": "MODEL:Regressor:ExtraTreesRegressor:sklearn",
24 | },
25 | "lightgbm": {"c": "MODEL:Classifier:LGBMClassifier:lightgbm", "r": "MODEL:Regressor:LGBMRegressor:lightgbm"},
26 | "xgboost": {"c": "MODEL:Classifier:XGBClassifier:xgboost", "r": "MODEL:Regressor:XGBRegressor:xgboost"},
27 | "catboost": {
28 | "c": "MODEL:Classifier:CatBoostClassifier:catboost",
29 | "r": "MODEL:Regressor:CatBoostRegressor:catboost",
30 | },
31 | "gradient boosting": {
32 | "c": "MODEL:Classifier:GradientBoostingClassifier:sklearn",
33 | "r": "MODEL:Regressor:GradientBoostingRegressor:sklearn",
34 | },
35 | "adaboost": {"c": "MODEL:Classifier:AdaBoostClassifier:sklearn", "r": "MODEL:Regressor:AdaBoostRegressor:sklearn"},
36 | "decision tree": {
37 | "c": "MODEL:Classifier:DecisionTreeClassifier:sklearn",
38 | "r": "MODEL:Regressor:DecisionTreeRegressor:sklearn",
39 | },
40 | "svm": {"c": "MODEL:Classifier:SVC:sklearn", "r": "MODEL:Regressor:SVR:sklearn"},
41 | "linear svm": {"c": "MODEL:Classifier:LinearSVC:sklearn", "r": "MODEL:Regressor:LinearSVR:sklearn"},
42 | "logistic/linear regression": {
43 | "c": "MODEL:Classifier:LogisticRegression:sklearn",
44 | "r": "MODEL:Regressor:LinearRegression:sklearn",
45 | },
46 | "lasso": {"r": "MODEL:Regressor:Lasso:sklearn"},
47 | "sgd": {"c": "MODEL:Classifier:SGDClassifier:sklearn", "r": "MODEL:Regressor:SGDRegressor:sklearn"},
48 | "mlp": {"c": "MODEL:Classifier:MLPClassifier:sklearn", "r": "MODEL:Regressor:MLPRegressor:sklearn"},
49 | "multinomial nb": {"c": "MODEL:Classifier:MultinomialNB:sklearn"},
50 | "gaussian nb": {"c": "MODEL:Classifier:GaussianNB:sklearn"},
51 | "bernoulli nb": {"c": "MODEL:Classifier:BernoulliNB:sklearn"},
52 | }
53 |
54 |
55 | def map_label_to_name():
56 | """
57 | Build a mapping from internal ML component labels to human-readable model names.
58 |
59 | Returns
60 | -------
61 | label_to_name : dict[str, str]
62 | Mapping from each internal label to its model name.
63 | """
64 | label_to_name = {"MODEL:Classifier:LGBMClassifier:lgbm": "lightgbm", "MODEL:Regressor:train:xgboost": "xgboost"}
65 | for k, v in name_to_label_mapping.items():
66 | for k1, v1 in v.items():
67 | label_to_name[v1] = k
68 | return label_to_name
69 |
--------------------------------------------------------------------------------
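A short sketch of the mapping direction, for orientation (labels taken from name_to_label_mapping above):

from sapientml_core.design.label_util import map_label_to_name

label_to_name = map_label_to_name()
print(label_to_name["MODEL:Classifier:LGBMClassifier:lightgbm"])  # lightgbm
print(label_to_name["MODEL:Regressor:Lasso:sklearn"])  # lasso

--------------------------------------------------------------------------------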
/sapientml_core/design/pp_component_groups.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | drop_label_list = [
17 | "PREPROCESS:MissingValues:dropna:pandas",
18 | "PREPROCESS:MissingValues:notnull:pandas",
19 | "PREPROCESS:MissingValues:isnull:pandas",
20 | ]
21 | filler_label = [
22 | "PREPROCESS:MissingValues:fillna:pandas",
23 | "PREPROCESS:MissingValues:SimpleImputer:sklearn",
24 | "PREPROCESS:MissingValues:KNNImputer:sklearn",
25 | "PREPROCESS:MissingValues:replace:pandas",
26 | "PREPROCESS:MissingValues:random:custom",
27 | "PREPROCESS:MissingValues:interpolate:sklearn",
28 | ]
29 | in_place_converter = [
30 | "PREPROCESS:Category:LabelEncoder:sklearn",
31 | "PREPROCESS:Category:factorize:pandas",
32 | "PREPROCESS:Category:replace:pandas",
33 | "PREPROCESS:Category:map:custom",
34 | "PREPROCESS:Category:apply:pandas",
35 | "PREPROCESS:Category:custom:pandas",
36 | ]
37 | one_hot = [
38 | "PREPROCESS:Category:get_dummies:pandas",
39 | "PREPROCESS:Category:OneHotEncoder:sklearn",
40 | "PREPROCESS:Category:LabelBinarizer:sklearn",
41 | ]
42 |
43 | text_vect = ["PREPROCESS:Text:CountVectorizer:sklearn", "PREPROCESS:Text:TfidfVectorizer:sklearn"]
44 |
45 | scaling = [
46 | "PREPROCESS:Scaling:STANDARD:sklearn",
47 | "PREPROCESS:Scaling:MIN_MAX:custom",
48 | "PREPROCESS:Scaling:MIN_MAX:sklearn",
49 | "PREPROCESS:Scaling:STANDARD:custom",
50 | "PREPROCESS:Scaling:Robust:sklearn",
51 | "PREPROCESS:Scaling:STANDARD:Pandas",
52 | "PREPROCESS:Scaling:normalize:sklearn",
53 | "PREPROCESS:Scaling:normalize:Pandas",
54 | "PREPROCESS:Scaling:STANDARD:pandas",
55 | ]
56 |
57 | date = [
58 | "PREPROCESS:GenerateColumn:date:pandas",
59 | "PREPROCESS:GenerateColumn:DATE:pandas",
60 | "PREPROCESS:GenerateColumn:DATE:custom",
61 | ]
62 |
63 | text_processing = [
64 | "PREPROCESS:Text:lower:pandas",
65 | "PREPROCESS:Text:remove_non_alpha:custom",
66 | "PREPROCESS:Text:tokenize:nltk",
67 | "PREPROCESS:Text:Lemmtize:nltk",
68 | ]
69 |
70 | balancing = [
71 | "PREPROCESS:Balancing:SMOTE:imblearn",
72 | "PREPROCESS:Balancing:resample:custom",
73 | "PREPROCESS:Balancing:sample:custom",
74 | ]
75 |
76 | log_transform = [
77 | "PREPROCESS:Scaling:log1p:numpy",
78 | "PREPROCESS:Scaling:power:custom",
79 | "PREPROCESS:Scaling:log:numpy",
80 | "PREPROCESS:Scaling:sqrt:numpy",
81 | "PREPROCESS:Scaling:exp:numpy",
82 | "PREPROCESS:Scaling:log:custom",
83 | "PREPROCESS:Scaling:power_transform:sklearn",
84 | ]
85 |
--------------------------------------------------------------------------------
/sapientml_core/design/search_space.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .. import ps_macros
16 | from .pp_component_groups import (
17 | balancing,
18 | date,
19 | drop_label_list,
20 | filler_label,
21 | in_place_converter,
22 | log_transform,
23 | one_hot,
24 | scaling,
25 | text_processing,
26 | text_vect,
27 | )
28 |
29 | target_labels = [
30 | ps_macros.FILL,
31 | ps_macros.IN_PLACE_CONVERT,
32 | ps_macros.ONE_HOT,
33 | ps_macros.VECT,
34 | ps_macros.DATE,
35 | ps_macros.LEMMITIZE,
36 | ps_macros.BALANCING,
37 | ps_macros.SCALING,
38 | ps_macros.LOG,
39 | ]
40 |
41 | # Manually created semantic labels
42 | # Semantic labels are those that cannot be discriminated by our current list of meta-features
43 |
44 |
45 | label_mapping = {
46 | # macros.DROP: drop_label_list,
47 | ps_macros.FILL: filler_label,
48 | ps_macros.IN_PLACE_CONVERT: in_place_converter,
49 | ps_macros.ONE_HOT: one_hot,
50 | ps_macros.VECT: text_vect,
51 | ps_macros.MISSING: drop_label_list + filler_label,
52 | ps_macros.CATG: in_place_converter + one_hot,
53 | ps_macros.DATE: date,
54 | ps_macros.LEMMITIZE: text_processing,
55 | ps_macros.SCALING: scaling,
56 | ps_macros.BALANCING: balancing,
57 | ps_macros.LOG: log_transform,
58 | }
59 |
60 | project_related_metadata = ["file_name", "notebook_name", "csv_name", "accuracy", "target_column_name"]
61 |
62 | meta_feature_list = [
63 | ps_macros.CATG_PRESENCE,
64 | ps_macros.TEXT_PRESENCE,
65 | ps_macros.BINARY_CATG_PRESENCE,
66 | ps_macros.SMALL_CATG_PRESENCE,
67 | ps_macros.LARGE_CATG_PRESENCE,
68 | ps_macros.MISSING_PRESENCE,
69 | ps_macros.NORMALIZED_MEAN,
70 | ps_macros.NORMALIZED_STD_DEV,
71 | ps_macros.NORMALIZED_VARIATION_ACROSS_COLUMNS,
72 | ps_macros.DATE_PRESENCE,
73 | ps_macros.IMBALANCE,
74 | ps_macros.MAX_SKEW,
75 | ]
76 |
--------------------------------------------------------------------------------
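A sketch of how a semantic label expands into its concrete component group (values defined in ps_macros.py and pp_component_groups.py):

from sapientml_core import ps_macros
from sapientml_core.design.search_space import label_mapping

print(ps_macros.MISSING)  # PREPROCESS:MissingValues:all
print(label_mapping[ps_macros.MISSING][:2])
# ['PREPROCESS:MissingValues:dropna:pandas', 'PREPROCESS:MissingValues:notnull:pandas']

--------------------------------------------------------------------------------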
/sapientml_core/enums.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import enum
16 |
17 |
18 | # Comparison operators appearing in decision paths of the FE/pre-processing meta-models.
19 | class Operator(enum.Enum):
20 | EQUAL_TO = enum.auto()
21 | NOT_EQUAL_TO = enum.auto()
22 | GREATER_THAN = enum.auto()
23 | GREATER_THAN_OR_EQUAL_TO = enum.auto()
24 | LESS_THAN = enum.auto()
25 | LESS_THAN_OR_EQUAL_TO = enum.auto()
26 |
--------------------------------------------------------------------------------
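A sketch of evaluating a decision-path condition with Operator (the compare helper below is hypothetical, not part of the package):

from sapientml_core.enums import Operator

def compare(value, op, threshold):
    # Hypothetical helper: evaluate "value <op> threshold" for a decision-path predicate.
    return {
        Operator.EQUAL_TO: value == threshold,
        Operator.NOT_EQUAL_TO: value != threshold,
        Operator.GREATER_THAN: value > threshold,
        Operator.GREATER_THAN_OR_EQUAL_TO: value >= threshold,
        Operator.LESS_THAN: value < threshold,
        Operator.LESS_THAN_OR_EQUAL_TO: value <= threshold,
    }[op]

print(compare(0.7, Operator.GREATER_THAN, 0.5))  # True

--------------------------------------------------------------------------------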
/sapientml_core/explain/code_template.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import datetime
16 |
17 |
18 | class Code_Template:
19 | """Code Template class."""
20 |
21 | def __init__(self):
22 | self.str_reverse = {"NOW": str(datetime.datetime.now())}
23 |
24 | def update(self, lines):
25 | """update method.
26 |
27 | Parameters
28 | ----------
29 | lines : list[str]
30 | Lines of a block of code from the jupyter content template.
31 |
32 | Returns
33 | -------
34 | out : list[str]
35 | The same lines with each placeholder key replaced by its value.
36 |
37 | """
38 | out = []
39 | for line in lines:
40 | for key in self.str_reverse:
41 | line = line.replace(key, self.str_reverse[key])
42 | out.append(line)
43 | return out
44 |
--------------------------------------------------------------------------------
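A usage sketch of Code_Template (the output timestamp will vary):

from sapientml_core.explain.code_template import Code_Template

ct = Code_Template()
print(ct.update(["# Generated at NOW"]))  # e.g. ['# Generated at 2024-01-01 12:34:56.789012']

--------------------------------------------------------------------------------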
/sapientml_core/explain/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Literal, Optional
16 |
17 | import pandas as pd
18 | from sapientml.params import CancellationToken
19 | from sapientml.util.logging import setup_logger
20 |
21 | from .AutoEDA import EDA
22 | from .AutoVisualization import AutoVisualization_Class
23 | from .code_miner import Miner
24 |
25 | logger = setup_logger()
26 |
27 |
28 | def process(
29 | visualization: bool,
30 | eda: bool,
31 | dataframe: pd.DataFrame,
32 | script_path: str,
33 | target_columns: list[str],
34 | problem_type: Literal["regression", "classification"],
35 | ignore_columns: Optional[list[str]] = None,
36 | skeleton: Optional[dict] = None,
37 | explanation: Optional[dict] = None,
38 | run_info: Optional[dict] = None,
39 | internal_execution: bool = False,
40 | timeout: int = 0,
41 | cancel: Optional[CancellationToken] = None,
42 | ):
43 | """process function.
44 |
45 | Parameters
46 | ----------
47 | visualization : bool
48 | If True, generate visualization code.
49 | eda : bool
50 | If True, generate EDA description blocks.
51 | dataframe : pd.DataFrame
52 | Input dataframe.
53 | script_path : str
54 | Path of the script.
55 | target_columns : list[str]
56 | Names of target columns.
57 | problem_type : Literal["regression", "classification"]
58 | Type of problem, either regression or classification.
59 | ignore_columns : list[str], optional
60 | Column names which must not be used and must be dropped.
61 | skeleton : dict, optional
62 | Probability scores and other details of preprocess and model components.
63 | explanation : dict, optional
64 | Explanation of the pipelines.
65 | run_info : dict, optional
66 | Execution results, logs, and other information.
67 | internal_execution : bool
68 | If True, execute the generated notebooks internally.
69 | timeout : int
70 | Timeout for notebook execution.
71 | cancel : CancellationToken, optional
72 | Token used to cancel execution.
73 | Returns
74 | -------
75 | output_files : list[str]
76 | List of generated .ipynb files.
77 |
78 | """
79 | output_files = None
80 |
81 | if visualization:
82 | # Call AutoVisualization to generate visualization codes
83 | AV = AutoVisualization_Class()
84 | visualization_code = AV.AutoVisualization(
85 | df=dataframe,
86 | target_columns=target_columns,
87 | problem_type=problem_type,
88 | ignore_columns=ignore_columns,
89 | )
90 | else:
91 | visualization_code = None
92 |
93 | if eda:
94 | # handle list(tuple, dict) value in dataframe.
95 | for col in dataframe.columns:
96 | exist_list_values = [x for x in dataframe[col] if type(x) in [list, tuple, dict]]
97 | if len(exist_list_values) > 0:
98 | dataframe[col] = dataframe[col].fillna("").astype(str)
99 | eda = EDA(dataframe, target_columns, log_level=2)
100 |
101 | eda.check_consistency(convert=False)
102 |
103 | categories, desc = eda.cat_process(threshold=0.01, IQR_activation=True, z_activation=True)
104 |
105 | initial_blocks = eda.description
106 | else:
107 | initial_blocks = []
108 |
109 | code_miner = Miner(
110 | script_path,
111 | init_blocks=initial_blocks,
112 | visualization_code=visualization_code,
113 | logger=logger,
114 | skeleton=skeleton,
115 | explanation=explanation,
116 | run_info=run_info,
117 | )
118 | output_files = code_miner.save_all(execution=internal_execution, timeout=timeout, cancel=cancel)
119 | return output_files
120 |
--------------------------------------------------------------------------------
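A sketch of a typical call to process (assumes a train.csv and a generated final_script.py exist; the file names and target column are illustrative):

import pandas as pd
from sapientml_core.explain.main import process

df = pd.read_csv("train.csv")
notebooks = process(
    visualization=True,
    eda=True,
    dataframe=df,
    script_path="final_script.py",
    target_columns=["target"],
    problem_type="classification",
)

--------------------------------------------------------------------------------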
/sapientml_core/explain/templates/jupyter_content.json:
--------------------------------------------------------------------------------
1 | {"# BEGIN": [["# Use a generic Kaggle dataset path to start"], []], "#*** PIPELINE ***": [["We have to preprocess the dataset as first step.", "Then, we will generate a pipeline to train a model."], []], "# LOAD DATA": [["# Input Dataset"], []], "# PREPROCESSING-number": [["# Feature Engineering"], []], "# DETATCH TARGET": [[], []], "# TRAIN TEST SPLIT": [["## Split Train/Test", "We have to seprate train and test before start straining a model"], []], "# MODEL": [["# Train a Model"], []]}
--------------------------------------------------------------------------------
/sapientml_core/internal_path.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from pathlib import Path
17 |
18 | sapientml_core_root = Path(__file__).parents[0]
19 |
20 | adaptation_root_dir = sapientml_core_root / "adaptation"
21 | artifacts_path = adaptation_root_dir / "artifacts"
22 | model_path = sapientml_core_root / "models"
23 |
24 | benchmark_path = sapientml_core_root / "benchmarks"
25 | corpus_path = sapientml_core_root / "corpus"
26 | training_cache = sapientml_core_root / ".cache"
27 |
28 | execution_cache_dir = training_cache / "exec_info"
29 | analysis_dir = training_cache / "analysis"
30 | clean_notebooks_dir_name = "clean-notebooks"
31 | clean_dir = corpus_path / clean_notebooks_dir_name
32 | project_labels_path = corpus_path / "annotated-notebooks" / "annotated-notebooks-1140.csv"
33 |
--------------------------------------------------------------------------------
/sapientml_core/models/PY310/mp_model_1.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/mp_model_1.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY310/mp_model_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/mp_model_2.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY310/pp_models.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/pp_models.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY311/mp_model_1.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/mp_model_1.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY311/mp_model_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/mp_model_2.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY311/pp_models.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/pp_models.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY39/mp_model_1.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/mp_model_1.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY39/mp_model_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/mp_model_2.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/PY39/pp_models.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/pp_models.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/mp_model_1.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/mp_model_1.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/mp_model_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/mp_model_2.pkl
--------------------------------------------------------------------------------
/sapientml_core/models/pp_models.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/pp_models.pkl
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .generator import DefaultPreprocess
16 | from .params import DefaultPreprocessConfig
17 |
18 | __all__ = ["DefaultPreprocess", "DefaultPreprocessConfig"]
19 |
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/params.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from sapientml.params import Config, String
4 |
5 |
6 | class DefaultPreprocessConfig(Config):
7 | """Configuration arguments for DefaultPreprocess class.
8 |
9 | Attributes
10 | ----------
11 | use_pos_list : Optional[list[str]]
12 | List of parts of speech to keep during text analysis.
13 | This option is used for Japanese text analysis.
14 | Choose from the following parts of speech:
15 | "名詞" (noun), "動詞" (verb), "形容詞" (adjective), "形容動詞" (adjectival noun), "副詞" (adverb).
16 | use_word_stemming : bool, default True
17 | Whether to apply word stemming.
18 | This option is used for Japanese text analysis.
19 |
20 | """
21 |
22 | use_pos_list: Optional[list[String]] = ["名詞", "動詞", "助動詞", "形容詞", "副詞"]
23 | use_word_stemming: bool = True
24 |
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/templates/drop_one_value_columns.py.jinja:
--------------------------------------------------------------------------------
1 | # DISCARD COLUMNS WITH ONE VALUE ONLY
2 | cols_one_value_only = {{ cols_one_value_only }}
3 | {% if training %}
4 | train_dataset = train_dataset.drop(cols_one_value_only, axis=1, errors="ignore")
5 | {% endif %}
6 | {% if test %}
7 | test_dataset = test_dataset.drop(cols_one_value_only, axis=1, errors="ignore")
8 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/templates/handle_inf_columns.py.jinja:
--------------------------------------------------------------------------------
1 | # CONVERT INF TO NAN
2 | import numpy as np
3 | cols_inf_values = {{ cols_inf_values }}
4 | {% if training %}
5 | train_dataset[cols_inf_values] = train_dataset[cols_inf_values].replace([-np.inf, np.inf], np.nan)
6 | {% endif %}
7 | {% if test %}
8 | test_dataset[cols_inf_values] = test_dataset[cols_inf_values].replace([-np.inf, np.inf], np.nan)
9 | {% endif %}
10 |
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/templates/handle_iterable_values.py.jinja:
--------------------------------------------------------------------------------
1 | # HANDLE ITERABLE VALUES IN DATAFRAME
2 | cols_iterable_values = {{ cols_iterable_values }}
3 | for col in cols_iterable_values:
4 | {% if training %}
5 | train_dataset[col] = train_dataset[col].fillna("").astype(str)
6 | {% endif %}
7 | {% if test %}
8 | test_dataset[col] = test_dataset[col].fillna("").astype(str)
9 | {% endif %}
10 |
11 |
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/templates/handle_japanese_text.py.jinja:
--------------------------------------------------------------------------------
1 | # HANDLE JAPANESE TEXT
2 | import MeCab
3 | import ipadic
4 | tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS)
5 | use_pos_list = {{ config.use_pos_list }}
6 | use_word_stemming = {{ config.use_word_stemming }}
7 | def tokenize(text, use_pos_list, use_word_stemming, tokenizer):
8 | node = tokenizer.parseToNode(text)
9 | terms = []
10 | while node:
11 | features = node.feature.split(",")
12 | pos = features[0]
13 | if pos != "BOS/EOS":
14 | if use_word_stemming:
15 | term = features[6]
16 | if (pos == "名詞") & (features[1] == "数"):
17 | term = node.surface
18 | else:
19 | term = node.surface
20 | if use_pos_list:
21 | if pos in use_pos_list:
22 | terms.append(term)
23 | else:
24 | terms.append(term)
25 | node = node.next
26 | return " ".join(terms)
27 | cols_japanese_text = {{ cols_japanese_text}}
28 | for col in cols_japanese_text:
29 | {% if training %}
30 | train_dataset[col] = train_dataset[col].fillna("").apply(lambda x: tokenize(x, use_pos_list, use_word_stemming, tokenizer))
31 | {% endif %}
32 | {% if test %}
33 | test_dataset[col] = test_dataset[col].fillna("").apply(lambda x: tokenize(x, use_pos_list, use_word_stemming, tokenizer))
34 | {% endif %}
35 |
36 |
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/templates/handle_mixed_typed_columns.py.jinja:
--------------------------------------------------------------------------------
1 | # HANDLE MIXED TYPE
2 | import numpy as np
3 | cols_numeric_and_string = {{ cols_numeric_and_string}}
4 | for col in cols_numeric_and_string:
5 | {% if training %}
6 | train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan)
7 | train_dataset[col + '__str'] = np.where(train_dataset[col + '__str'].notnull(), train_dataset[col + '__str'].astype(str), np.nan)
8 | train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float)
9 | train_dataset = train_dataset.drop(col, axis=1)
10 | {% endif %}
11 | {% if test %}
12 | test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan)
13 | test_dataset[col + '__str'] = np.where(test_dataset[col + '__str'].notnull(), test_dataset[col + '__str'].astype(str), np.nan)
14 | test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float)
15 | test_dataset = test_dataset.drop(col, axis=1)
16 | {% endif %}
17 |
--------------------------------------------------------------------------------
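A sketch of the mixed-type split on a toy column (the template body rendered with training=True and a single column named "col"):

import numpy as np
import pandas as pd

train_dataset = pd.DataFrame({"col": ["1.5", "abc", "2"]})
train_dataset["col__str"] = np.where(pd.to_numeric(train_dataset["col"], errors="coerce").isnull(), train_dataset["col"], np.nan)
train_dataset["col__str"] = np.where(train_dataset["col__str"].notnull(), train_dataset["col__str"].astype(str), np.nan)
train_dataset["col__num"] = np.where(pd.to_numeric(train_dataset["col"], errors="coerce").isnull(), np.nan, train_dataset["col"]).astype(float)
train_dataset = train_dataset.drop("col", axis=1)
print(train_dataset)
#   col__str  col__num
# 0      NaN       1.5
# 1      abc       NaN
# 2      NaN       2.0

--------------------------------------------------------------------------------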
/sapientml_core/preprocess/default/templates/none_has_columns.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | {% if training %}
3 | train_dataset = train_dataset.replace([None], np.nan)
4 | {% endif %}
5 | {% if test %}
6 | test_dataset = test_dataset.replace([None], np.nan)
7 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/preprocess/default/templates/rename_columns.py.jinja:
--------------------------------------------------------------------------------
1 | # Remove special symbols that interfere with visualization and model training
2 | import re
3 | cols_has_symbols = {{ cols_has_symbols }}
4 | inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
5 | {% if training %}
6 | train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
7 | {% endif %}
8 | {% if test %}
9 | test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
10 | {% endif %}
--------------------------------------------------------------------------------
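What the inhibited-symbol pattern strips, on an illustrative column name:

import re
inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
print(inhibited_symbol_pattern.sub("", "price[usd]:2024"))  # priceusd2024

--------------------------------------------------------------------------------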
/sapientml_core/ps_macros.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | FILL = "PREPROCESS:MissingValues:fillna:pandas"
17 | IN_PLACE_CONVERT = "PREPROCESS:Category:LabelEncoder:sklearn"
18 | ONE_HOT = "PREPROCESS:Category:get_dummies:pandas"
19 | VECT = "PREPROCESS:Text:TfidfVectorizer:sklearn"
20 | MISSING = "PREPROCESS:MissingValues:all"
21 | CATG = "PREPROCESS:Category:all"
22 | SCALING = "PREPROCESS:Scaling:STANDARD:sklearn"
23 | DATE = "PREPROCESS:GenerateColumn:DATE:pandas"
24 | LEMMITIZE = "PREPROCESS:TextProcessing:Processing:custom"
25 | BALANCING = "PREPROCESS:Balancing:SMOTE:imblearn"
26 | LOG = "PREPROCESS:Scaling:log:custom"
27 |
28 | # Revised meta-features
29 |
30 | CATG_PRESENCE = "feature:str_category_presence"
31 | TEXT_PRESENCE = "feature:str_text_presence"
32 | BINARY_CATG_PRESENCE = "feature:str_category_binary_presence"
33 | SMALL_CATG_PRESENCE = "feature:str_category_small_presence"
34 | LARGE_CATG_PRESENCE = "feature:str_category_large_presence"
35 | DATE_PRESENCE = "feature:str_date_presence"
36 | STR_OTHER = "feature:str_other"
37 |
38 | MISSING_PRESENCE = "feature:missing_values_presence"
39 | DATE_PRESENCE = "feature:str_date_presence"
40 |
41 | NORMALIZED_MEAN = "feature:max_normalized_mean"
42 | NORMALIZED_STD_DEV = "feature:max_normalized_stddev"
43 | NORMALIZED_VARIATION_ACROSS_COLUMNS = "feature:normalized_variation_across_columns"
44 | IMBALANCE = "feature:target_imbalance_score"
45 | MAX_SKEW = "feature:max_skewness"
46 |
47 |
48 | TASK_CLASSIFICATION = "classification"
49 | TASK_REGRESSION = "regression"
50 |
--------------------------------------------------------------------------------
/sapientml_core/seeding/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/sapientml_core/templates/explainability_templates/model_explanation.py.jinja:
--------------------------------------------------------------------------------
1 | # Component: {{ target_component_name }}
2 | # Efficient Cause: {{ target_component_name }} is required in this pipeline since the dataset has {{ relevant_meta_feature_list }}.
--------------------------------------------------------------------------------
/sapientml_core/templates/explainability_templates/preprocessing_explanation.py.jinja:
--------------------------------------------------------------------------------
1 | # Component: {{ target_component_name }}
2 | # Efficient Cause: {{ target_component_name }} is required in this pipeline since the dataset has {{ relevant_meta_feature_list }}. The relevant features are: {{ relevant_column_list }}.
3 | # Purpose: {{ api_description }}
4 | # Form:
5 | # Input: {{ data_shape }}
6 | # Key hyperparameters used: {{ hyperparameters_description }}
7 | # Alternatives: Although {{ alternative_component_list }} can also be used for this dataset, {{ target_component_name }} is used because it has more {{ relevant_meta_feature_1 }} than {{ relevant_meta_feature_2 }}.
8 | # Order: {{ target_component_name }} should be applied {{ before_or_after }} {{ dependent_component_list }}
--------------------------------------------------------------------------------
/sapientml_core/templates/model_templates/classification_post_process.jinja:
--------------------------------------------------------------------------------
1 | # POST PROCESSING
2 | {% if pipeline.adaptation_metric.startswith("MAP_") %}
3 | y_pred_sorted_index = pd.DataFrame(np.argsort(-y_pred))
4 | y_pred = y_pred_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
5 | {% else %}
6 | if np.shape(y_pred)[1] == 2:
7 | y_pred = y_pred[:, 1]
8 | {% endif %}
--------------------------------------------------------------------------------
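A sketch of what the MAP_* branch computes, on toy probabilities (the three classes and the model stand-in are illustrative):

import numpy as np
import pandas as pd

class model:  # stand-in exposing the classes_ attribute the template relies on
    classes_ = np.array(["a", "b", "c"])

y_pred = np.array([[0.2, 0.5, 0.3]])
y_pred_sorted_index = pd.DataFrame(np.argsort(-y_pred))
y_pred = y_pred_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
print(y_pred)  # [['b' 'c' 'a']] -- class labels ranked by predicted probability

--------------------------------------------------------------------------------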
/sapientml_core/templates/model_templates/hyperparameter_tuning.py.jinja:
--------------------------------------------------------------------------------
1 | # HYPERPARAMETER OPTIMIZATION
2 | import optuna
3 | from {{import_library}} import {{ model_name }}
4 |
5 |
6 | # TODO: use cross-validation here, e.g., optuna.integration.OptunaSearchCV()
7 | class Objective(object):
8 | def __init__(self, feature_train, target_train, feature_test, target_test, __random_state):
9 | self.feature_train = feature_train
10 | self.target_train = target_train
11 | self.feature_test = feature_test
12 | self.target_test = target_test
13 | self.__random_state = __random_state
14 | def __call__(self, trial):
15 | def set_hyperparameters(trial):
16 | params = {}
17 | {{ params }}
18 | return params
19 |
20 | # SET DATA
21 | import numpy as np
22 |
23 | if isinstance(self.feature_train, pd.DataFrame):
24 | feature_train = self.feature_train
25 | elif isinstance(self.feature_train, np.ndarray):
26 | feature_train = pd.DataFrame(self.feature_train)
27 | else:
28 | feature_train = pd.DataFrame(self.feature_train.toarray())
29 |
30 | if isinstance(self.target_train, pd.DataFrame):
31 | target_train = self.target_train
32 | elif isinstance(self.target_train, np.ndarray):
33 | target_train = pd.DataFrame(self.target_train)
34 | else:
35 | target_train = pd.DataFrame(self.target_train.toarray())
36 |
37 | if isinstance(self.feature_test, pd.DataFrame):
38 | feature_test = self.feature_test
39 | elif isinstance(self.feature_test, np.ndarray):
40 | feature_test = pd.DataFrame(self.feature_test)
41 | else:
42 | feature_test = pd.DataFrame(self.feature_test.toarray())
43 |
44 | if isinstance(self.target_test, pd.DataFrame):
45 | {% if 'inverse_target' in pipeline.pipeline_json %}
46 | target_test = self.target_test.copy()
47 | {% else %}
48 | target_test = self.target_test
49 | {% endif %}
50 | elif isinstance(self.target_test, np.ndarray):
51 | target_test = pd.DataFrame(self.target_test)
52 | else:
53 | target_test = pd.DataFrame(self.target_test.toarray())
54 |
55 | # MODEL
56 | params = set_hyperparameters(trial)
57 | {% if flag_no_random_seed_model %}
58 | model = {{ model_name }}(**params)
59 | {% else %}
60 | model = {{ model_name }}(random_state=self.__random_state, **params)
61 | {% endif %}
62 | {% if is_multioutput_regression%}
63 | from sklearn.multioutput import MultiOutputRegressor
64 |
65 | model = MultiOutputRegressor(model)
66 | {% elif is_multioutput_classification %}
67 | from sklearn.multioutput import MultiOutputClassifier
68 |
69 | model = MultiOutputClassifier(model)
70 | {% endif %}
71 | {% set xgbclassifier = "XGBClassifier" %}
72 | {% if model_name == xgbclassifier %}
73 | from sklearn.preprocessing import LabelEncoder
74 |
75 | label_encoder = LabelEncoder()
76 | target_train = label_encoder.fit_transform(target_train)
77 | {% endif %}
78 |
79 | {% if pipeline.task.target_columns|length == 1 %}
80 | {% if model_name == xgbclassifier %}
81 | model.fit(feature_train, target_train.ravel())
82 | {% else %}
83 | model.fit(feature_train, target_train.values.ravel())
84 | {% endif %}
85 | {% else %}
86 | model.fit(feature_train, target_train)
87 | {% endif %}
88 | {% if flag_predict_proba == False %}
89 | y_pred = model.predict(feature_test)
90 | {% if model_name == xgbclassifier and not flag_predict_proba%}
91 | y_pred = label_encoder.inverse_transform(y_pred)
92 | {% endif %}
93 | {% elif flag_predict_proba == True %}
94 | y_pred = model.predict_proba(feature_test)
95 | {% filter indent(width=8, first=True) %}
96 | {{ binary_classification_snippet }}
97 | {% endfilter %}
98 | {% endif %}
99 |
100 | {% if 'inverse_target' in pipeline.pipeline_json %}
101 | {% filter indent(width=8, first=True) %}
102 | {{ pipeline.pipeline_json['inverse_target_hpo']['code'] }}
103 | {% endfilter %}
104 | {% endif %}
105 |
106 | {{ evaluation }}
107 |
108 | return score
109 |
110 | n_trials = {{ pipeline.config.hyperparameter_tuning_n_trials }}
111 | timeout = {{ timeout }}
112 | random_state = {{ pipeline.config.hyperparameter_tuning_random_state}}
113 | random_state_model = {{ pipeline.config.seed_for_model}}
114 |
115 | {% set maximize_metrics = [macros.Metric.AUC.value, macros.Metric.Accuracy.value, macros.Metric.F1.value, macros.Metric.R2.value, macros.Metric.Gini.value, macros.Metric.ROC_AUC.value] %}
116 | {% set minimize_metrics = [macros.Metric.RMSE.value, macros.Metric.RMSLE.value, macros.Metric.MAE.value, macros.Metric.LogLoss.value] %}
117 |
118 | {% if pipeline.adaptation_metric in maximize_metrics %}
119 | direction = 'maximize'
120 | {% elif pipeline.adaptation_metric in minimize_metrics %}
121 | direction = 'minimize'
122 | {% else %}
123 | direction = 'maximize'
124 | {% endif %}
125 |
126 | study = optuna.create_study(direction=direction,
127 | sampler=optuna.samplers.TPESampler(seed=random_state))
128 | {{ enqueue_default_hyperparameters }}
129 | study.optimize(Objective(feature_train, target_train, feature_test, target_test, random_state_model),
130 | n_trials=n_trials,
131 | timeout=timeout)
132 | best_params = study.best_params
133 |
134 | print("best params:", best_params)
135 | print("RESULT: {{ pipeline.task.adaptation_metric }}: " + str(study.best_value))
--------------------------------------------------------------------------------
/sapientml_core/templates/model_templates/hyperparameters_default_value.py.jinja:
--------------------------------------------------------------------------------
1 | {% if model_name == 'RandomForestClassifier' %}
2 | default_hyperparameters = {'class_weight': None, 'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100, 'oob_score': False}
3 | {% elif model_name == 'RandomForestRegressor' %}
4 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 100, 'oob_score': False}
5 | {% elif model_name == 'ExtraTreesClassifier' %}
6 | default_hyperparameters = {'class_weight': None, 'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
7 | {% elif model_name == 'ExtraTreesRegressor' %}
8 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 100}
9 | {% elif model_name == 'GradientBoostingClassifier' %}
10 | default_hyperparameters = {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0}
11 | {% elif model_name == 'GradientBoostingRegressor' %}
12 | default_hyperparameters = {'alpha': 0.9, 'criterion': 'friedman_mse', 'loss': 'squared_error', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0}
13 | {% elif model_name == 'AdaBoostClassifier' %}
14 | default_hyperparameters = {'algorithm': 'SAMME.R', 'n_estimators': 50}
15 | {% elif model_name == 'AdaBoostRegressor' %}
16 | default_hyperparameters = {'loss': 'linear', 'n_estimators': 50}
17 | {% elif model_name == 'DecisionTreeClassifier' %}
18 | default_hyperparameters = {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1}
19 | {% elif model_name == 'DecisionTreeRegressor' %}
20 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': None, 'min_samples_leaf': 1}
21 | {% elif model_name == 'SVC' %}
22 | default_hyperparameters = {'C': 1.0, 'class_weight': None}
23 | {% elif model_name == 'SVR' %}
24 | default_hyperparameters = {'C': 1.0}
25 | {% elif model_name == 'LinearSVC' %}
26 | default_hyperparameters = {'C': 1.0, 'class_weight': None, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'penalty': 'l2'}
27 | {% elif model_name == 'LinearSVR' %}
28 | default_hyperparameters = {'C': 1.0, 'intercept_scaling': 1.0, 'loss': 'epsilon_insensitive'}
29 | {% elif model_name == 'LogisticRegression' %}
30 | default_hyperparameters = {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
31 | {% elif model_name == 'SGDClassifier' %}
32 | default_hyperparameters = {'alpha': 0.0001, 'class_weight': None, 'early_stopping': False, 'loss': 'hinge', 'penalty': 'l2'}
33 | {% elif model_name == 'SGDRegressor' %}
34 | default_hyperparameters = {'alpha': 0.0001, 'loss': 'squared_error', 'penalty': 'l2'}
35 | {% elif model_name == 'Lasso' %}
36 | default_hyperparameters = {'alpha': 1.0}
37 | {% elif model_name == 'MLPClassifier' %}
38 | default_hyperparameters = {'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'}
39 | {% elif model_name == 'MLPRegressor' %}
40 | default_hyperparameters = {'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'}
41 | {% elif model_name == 'LGBMClassifier' or model_name == 'LGBMRegressor' %}
42 | default_hyperparameters = {'class_weight': None, 'colsample_bytree': 1.0, 'min_child_samples': 20, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1.0, 'subsample_freq': 0}
43 | {% elif model_name == 'XGBClassifier' %}
44 | default_hyperparameters = {'colsample_bytree': 1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1}
45 | {% elif model_name == 'XGBRegressor' %}
46 | default_hyperparameters = {'colsample_bytree': 1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1}
47 | {% elif model_name == 'CatBoostClassifier' or model_name == 'CatBoostRegressor' %}
48 | default_hyperparameters = {'boosting_type': 'Plain', 'depth': 6, 'bootstrap_type': 'MVS', 'silent': True}
49 | {% else %}
50 | default_hyperparameters = {}
51 | {% endif %}
52 | study.enqueue_trial(default_hyperparameters)
--------------------------------------------------------------------------------
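A minimal sketch of the enqueue-then-optimize flow this template implements (the objective and values are illustrative, not a real pipeline metric):

import optuna

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    return float(n_estimators)  # stand-in for a validation score

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
default_hyperparameters = {"n_estimators": 100}
study.enqueue_trial(default_hyperparameters)  # first trial evaluates the defaults
study.optimize(objective, n_trials=3)
print(study.best_params)

--------------------------------------------------------------------------------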
/sapientml_core/templates/model_templates/model.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from {{import_library}} import {{ model_name }}
3 |
4 | {% if "CatBoost" in model_name %}
5 | {% set silent="silent=True, " %}
6 | {% else %}
7 | {% set silent="" %}
8 | {% endif %}
9 | {% if model_arg == "HPO_noRandomSeed" %}
10 | model = {{ model_name }}(**best_params)
11 | {% elif model_arg == "HPO_RandomSeed" %}
12 | random_state_model = {{ pipeline.config.seed_for_model }}
13 | model = {{ model_name }}(random_state=random_state_model, **best_params)
14 | {% elif model_arg == "noHPO_noRandomSeed" %}
15 | model = {{ model_name }}({{ silent }}{{ params }})
16 | {% elif model_arg == "noHPO_RandomSeed" %}
17 | random_state_model = {{ pipeline.config.seed_for_model }}
18 | model = {{ model_name }}({{ silent }}random_state=random_state_model, {{ params }})
19 | {% endif %}
20 |
21 | {% if is_multioutput_regression %}
22 | from sklearn.multioutput import MultiOutputRegressor
23 |
24 | model = MultiOutputRegressor(model)
25 | {% elif is_multioutput_classification %}
26 | from sklearn.multioutput import MultiOutputClassifier
27 |
28 | model = MultiOutputClassifier(model)
29 | {% endif %}
30 | {% set xgbclassifier = "XGBClassifier" %}
31 | {% if is_multioutput_classification %}
32 | from sklearn.preprocessing import LabelEncoder
33 | label_encoders = {}
34 | for i, column in enumerate(target_train.columns):
35 | le = LabelEncoder()
36 | target_train[column] = le.fit_transform(target_train[column])
37 | label_encoders[column] = le
38 | {% elif model_name == xgbclassifier %}
39 | from sklearn.preprocessing import LabelEncoder
40 |
41 | label_encoder = LabelEncoder()
42 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS)
43 | {% endif %}
44 | {% if pipeline.task.target_columns|length == 1 %}
45 | model.fit(feature_train, target_train.values.ravel())
46 | {% else %}
47 | model.fit(feature_train, target_train)
48 | {% endif %}
49 | y_pred = model.predict(feature_test)
50 | {% if flag_predict_proba and (not pipeline.adaptation_metric.startswith("MAP_")) and (pipeline.adaptation_metric != "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
51 | y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1)
52 | {% endif %}
53 | {% if is_multioutput_classification %}
54 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
55 | for column in TARGET_COLUMNS:
56 | y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
57 | y_pred = y_pred_df
58 | {% elif model_name == xgbclassifier and (not pipeline.adaptation_metric.startswith("MAP_")) and (pipeline.adaptation_metric != "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
59 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)
60 | {% endif %}
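
For orientation, this is roughly how the template above could render for model_name == 'RandomForestClassifier', model_arg == 'noHPO_RandomSeed', a single target column, and no probability post-processing (a hand-rendered sketch; the toy frames stand in for the variables the surrounding pipeline templates define):

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    # Toy stand-ins for the pipeline-provided variables.
    feature_train = pd.DataFrame({"x1": [0, 1, 2, 3], "x2": [1, 0, 1, 0]})
    target_train = pd.DataFrame({"y": [0, 0, 1, 1]})
    feature_test = pd.DataFrame({"x1": [1, 2], "x2": [0, 1]})

    random_state_model = 42
    model = RandomForestClassifier(random_state=random_state_model, n_estimators=100)

    # One target column, so the template emits .values.ravel()
    model.fit(feature_train, target_train.values.ravel())
    y_pred = model.predict(feature_test)
    print(y_pred)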
--------------------------------------------------------------------------------
/sapientml_core/templates/model_templates/model_predict.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | {% set xgbclassifier = "XGBClassifier" %}
4 |
5 | with open('model.pkl', 'rb') as f:
6 | model = pickle.load(f)
7 |
8 | {% if (pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT) %}
9 | y_pred = model.predict(feature_test)
10 | {% endif %}
11 | {% if pipeline.adaptation_metric and flag_predict_proba %}
12 | y_prob = model.predict_proba(feature_test)
13 | {% endif %}
14 | {% if model_name == xgbclassifier or is_multioutput_classification %}
15 | with open('target_LabelEncoder.pkl', 'rb') as f:
16 | label_encoder = pickle.load(f)
17 | {% endif %}
18 | {% if is_multioutput_classification %}
19 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
20 | for column in TARGET_COLUMNS:
21 | y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int))
22 | y_pred = y_pred_df
23 | {% elif model_name == xgbclassifier and ((pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT)) %}
24 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)
25 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/model_templates/model_test.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from {{import_library}} import {{ model_name }}
3 |
4 | {% if "CatBoost" in model_name %}
5 | {% set silent="silent=True, " %}
6 | {% else %}
7 | {% set silent="" %}
8 | {% endif %}
9 | {% if model_arg == "HPO_noRandomSeed" %}
10 | model = {{ model_name }}(**best_params)
11 | {% elif model_arg == "HPO_RandomSeed" %}
12 | random_state_model = {{ pipeline.config.seed_for_model }}
13 | model = {{ model_name }}(random_state=random_state_model, **best_params)
14 | {% elif model_arg == "noHPO_noRandomSeed" %}
15 | model = {{ model_name }}({{ silent }}{{ params }})
16 | {% elif model_arg == "noHPO_RandomSeed" %}
17 | random_state_model = {{ pipeline.config.seed_for_model }}
18 | model = {{ model_name }}({{ silent }}random_state=random_state_model, {{ params }})
19 | {% endif %}
20 |
21 | {% if is_multioutput_regression %}
22 | from sklearn.multioutput import MultiOutputRegressor
23 |
24 | model = MultiOutputRegressor(model)
25 | {% elif is_multioutput_classification %}
26 | from sklearn.multioutput import MultiOutputClassifier
27 |
28 | model = MultiOutputClassifier(model)
29 | {% endif %}
30 | {% set xgbclassifier = "XGBClassifier" %}
31 | {% if is_multioutput_classification %}
32 | from sklearn.preprocessing import LabelEncoder
33 | label_encoders = {}
34 | for i, column in enumerate(target_train.columns):
35 | le = LabelEncoder()
36 | target_train[column] = le.fit_transform(target_train[column])
37 | label_encoders[column] = le
38 | {% elif model_name == xgbclassifier %}
39 | from sklearn.preprocessing import LabelEncoder
40 |
41 | label_encoder = LabelEncoder()
42 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS)
43 | {% endif %}
44 | {% if pipeline.task.target_columns|length == 1 %}
45 | model.fit(feature_train, target_train.values.ravel())
46 | {% else %}
47 | model.fit(feature_train, target_train)
48 | {% endif %}
49 | y_pred = model.predict(feature_test)
50 |
51 | {% if is_multioutput_classification %}
52 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
53 | for column in TARGET_COLUMNS:
54 | y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
55 | y_pred = y_pred_df
56 | {% elif model_name == xgbclassifier %}
57 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)
58 | {% endif %}
59 |
60 | {% if pipeline.task.task_type == 'classification' %}
61 | y_prob = model.predict_proba(feature_test)
62 |
63 | # POST PROCESSING
64 | {% if pipeline.adaptation_metric.startswith("MAP_") %}
65 | y_prob_sorted_index = pd.DataFrame(np.argsort(-y_prob))
66 | y_prob_map_k = y_prob_sorted_index.apply(lambda x: model.classes_[x]).to_numpy()
67 | {% endif %}
68 |
69 | {% if not is_multioutput_classification %}
70 | if np.shape(y_prob)[1] == 2:
71 | y_prob = y_prob[:, 1]
72 | {% endif %}
73 |
74 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/model_templates/model_train.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from {{import_library}} import {{ model_name }}
3 |
4 | {% if model_arg == "HPO_noRandomSeed" %}
5 | model = {{ model_name }}(**best_params)
6 | {% elif model_arg == "HPO_RandomSeed" %}
7 | random_state_model = {{ pipeline.config.seed_for_model }}
8 | model = {{ model_name }}(random_state=random_state_model, **best_params)
9 | {% elif model_arg == "noHPO_noRandomSeed" %}
10 | model = {{ model_name }}({{ params }})
11 | {% elif model_arg == "noHPO_RandomSeed" %}
12 | random_state_model = {{ pipeline.config.seed_for_model }}
13 | model = {{ model_name }}(random_state=random_state_model, {{ params }})
14 | {% endif %}
15 |
16 | {% if is_multioutput_regression %}
17 | from sklearn.multioutput import MultiOutputRegressor
18 |
19 | model = MultiOutputRegressor(model)
20 | {% elif is_multioutput_classification %}
21 | from sklearn.multioutput import MultiOutputClassifier
22 |
23 | model = MultiOutputClassifier(model)
24 | {% endif %}
25 | {% set xgbclassifier = "XGBClassifier" %}
26 | {% if is_multioutput_classification %}
27 | from sklearn.preprocessing import LabelEncoder
28 | label_encoders = {}
29 | for i, column in enumerate(target_train.columns):
30 | le = LabelEncoder()
31 | target_train[column] = le.fit_transform(target_train[column])
32 | label_encoders[column] = le
33 | with open('target_LabelEncoder.pkl', 'wb') as f:
34 | pickle.dump(label_encoders, f)
35 | {% elif model_name == xgbclassifier %}
36 | from sklearn.preprocessing import LabelEncoder
37 |
38 | label_encoder = LabelEncoder()
39 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS)
40 | with open('target_LabelEncoder.pkl', 'wb') as f:
41 | pickle.dump(label_encoder, f)
42 |
43 | {% endif %}
44 | {% if pipeline.task.target_columns|length == 1 %}
45 | model.fit(feature_train, target_train.values.ravel())
46 | {% else %}
47 | model.fit(feature_train, target_train)
48 | {% endif %}
49 | with open('model.pkl', 'wb') as f:
50 | pickle.dump(model, f)
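
The train and predict templates communicate only through pickle files ('model.pkl' here, plus 'target_LabelEncoder.pkl' when a label encoder is fitted); a minimal round trip under the same convention, with toy data standing in for the pipeline's variables:

    import pickle

    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    feature_train = pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0]})
    target_train = pd.DataFrame({"y": [0, 0, 1, 1]})

    model = LogisticRegression()
    model.fit(feature_train, target_train.values.ravel())
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)

    # The rendered model_predict.py later reloads the very same object.
    with open("model.pkl", "rb") as f:
        model = pickle.load(f)
    print(model.predict(pd.DataFrame({"x": [1.5]})))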
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/confusion_matrix.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import ConfusionMatrixDisplay
2 | {% if pipeline.task.target_columns|length == 1 %}
3 | ConfusionMatrixDisplay.from_predictions(target_test, y_pred)
4 | {% elif is_multioutput_classification %}
5 | for i, column in enumerate(y_pred.columns):
6 | disp = ConfusionMatrixDisplay.from_predictions(target_test[column], y_pred[column].values)
7 | disp.ax_.set_title(column)
8 | {% else %}
9 | for i, column in enumerate(target_test.columns):
10 | disp = ConfusionMatrixDisplay.from_predictions(target_test[column], y_pred[:, i])
11 | disp.ax_.set_title(column)
12 | {% endif %}
13 |
14 |
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/drop_columns.py.jinja:
--------------------------------------------------------------------------------
1 | # DISCARD IRRELEVANT COLUMNS
2 | irrelevant_columns = {{ irrelevant_columns }}
3 | {% if train %}
4 | train_dataset = train_dataset.drop(irrelevant_columns, axis=1, errors="ignore")
5 | {% endif %}
6 | {% if test %}
7 | test_dataset = test_dataset.drop(irrelevant_columns, axis=1, errors="ignore")
8 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/evaluation.py.jinja:
--------------------------------------------------------------------------------
1 | {% if pipeline.adaptation_metric == macros.Metric.AUC.value %}
2 | from sklearn.metrics import roc_auc_score
3 | {% if pipeline.task.is_multiclass == True %}
4 | auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr")
5 | {% else %}
6 | auc = roc_auc_score(target_test, y_pred)
7 | {% endif %}
8 | print('RESULT: AUC Score: ' + str(auc))
9 | {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (not pipeline.is_multi_class_multi_targets) %}
10 | from sklearn.metrics import accuracy_score
11 |
12 | accuracy = accuracy_score(target_test, y_pred)
13 | print('RESULT: Accuracy: ' + str(accuracy))
14 | {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (pipeline.is_multi_class_multi_targets) %}
15 | from sklearn.metrics import accuracy_score
16 |
17 | __accs = []
18 | for i, col in enumerate(target_test.columns):
19 | one_acc = accuracy_score(target_test[col], y_pred[col])
20 | __accs.append(one_acc)
21 | print(f"RESULT: Accuracy : {str(sum(__accs)/len(__accs))}")
22 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value and not is_multioutput_classification %}
23 | from sklearn import metrics
24 |
25 | f1 = metrics.f1_score(target_test, y_pred, average='macro')
26 | print('RESULT: F1 Score: ' + str(f1))
27 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value and is_multioutput_classification %}
28 | from sklearn import metrics
29 |
30 | __f1s = []
31 | for i, col in enumerate(target_test.columns):
32 | one_f1 = metrics.f1_score(target_test[col], y_pred[col], average='macro')
33 | __f1s.append(one_f1)
34 | print(f"RESULT: F1 Score : {str(sum(__f1s)/len(__f1s))}")
35 | {% elif pipeline.adaptation_metric == macros.Metric.R2.value %}
36 | from sklearn import metrics
37 |
38 | r2 = metrics.r2_score(target_test, y_pred)
39 | print('RESULT: R2 Score:', str(r2))
40 | {% elif pipeline.adaptation_metric == macros.Metric.RMSE.value %}
41 | from sklearn.metrics import mean_squared_error
42 |
43 | rmse = mean_squared_error(target_test, y_pred, squared=False)
44 | print('RESULT: RMSE:', str(rmse))
45 | {% elif pipeline.adaptation_metric == macros.Metric.RMSLE.value %}
46 | import numpy as np
47 | from sklearn.metrics import mean_squared_log_error
48 |
49 | target_test = np.clip(target_test, 0, None)
50 | y_pred = np.clip(y_pred, 0, None)
51 | rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred))
52 | print('RESULT: RMSLE:', str(rmsle))
53 | {% elif pipeline.adaptation_metric == macros.Metric.Gini.value %}
54 | from sklearn.metrics import roc_auc_score
55 | {% if pipeline.task.is_multiclass == True %}
56 | gini = 2 * roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1
57 | {% else %}
58 | gini = 2 * roc_auc_score(target_test, y_pred) - 1
59 | {% endif %}
60 | print('RESULT: Gini: ' + str(gini))
61 | {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %}
62 | from sklearn.metrics import mean_absolute_error
63 |
64 | mae = mean_absolute_error(target_test, y_pred)
65 | print('RESULT: MAE:', str(mae))
66 | {% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %}
67 | from sklearn.metrics import log_loss
68 |
69 | log_loss_score = log_loss(target_test, y_pred)
70 | print('RESULT: Log Loss:', str(log_loss_score))
71 | {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %}
72 | from sklearn.metrics import roc_auc_score
73 | {% if pipeline.task.is_multiclass == True %}
74 | __roc_auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr")
75 | {% else %}
76 | __roc_auc = roc_auc_score(target_test, y_pred)
77 | {% endif %}
78 | print('RESULT: ROC AUC:', str(__roc_auc))
79 | {% elif pipeline.adaptation_metric == macros.Metric.MCC.value %}
80 | from sklearn.metrics import matthews_corrcoef
81 |
82 | mcc = matthews_corrcoef(target_test, y_pred)
83 | print('RESULT: MCC:', str(mcc))
84 | {% elif pipeline.adaptation_metric.startswith("MAP_") %}
85 | {% set k = pipeline.adaptation_metric.split("_")[1] %}
86 | def apk(actual, predicted, k):
87 | if len(predicted)>k:
88 | predicted = predicted[:k]
89 |
90 | score = 0.0
91 | num_hits = 0.0
92 |
93 | for i,p in enumerate(predicted):
94 | if p in actual and p not in predicted[:i]:
95 | num_hits += 1.0
96 | score += num_hits / (i+1.0)
97 |
98 | return score / min(len(actual), k)
99 |
100 | def mapk(actual, predicted, k):
101 | """ Computes the mean average precision at k.
102 |
103 | Args:
104 |         actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted
105 |         predicted (list[list[str]] or ndarray): A list of lists of predicted elements
106 | (In each list, arrange in the order you predicted.)
107 | k (int): The maximum number of predicted elements
108 |
109 | Returns:
110 | double: The mean average precision at k over the input lists
111 | """
112 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
113 |
114 | map_k = mapk(target_test.to_numpy(), y_pred, k={{ k }})
115 | print('RESULT: MAP@K: ' + str(map_k))
116 | {% elif pipeline.adaptation_metric == macros.Metric.QWK.value %}
117 | from sklearn.metrics import cohen_kappa_score
118 |
119 | qwk = cohen_kappa_score(target_test, y_pred, weights='quadratic')
120 | print('RESULT: QWK:', str(qwk))
121 | {% elif pipeline.adaptation_metric == macros.Metric.MAPE.value %}
122 | from sklearn.metrics import mean_absolute_percentage_error
123 |
124 | mape = mean_absolute_percentage_error(target_test, y_pred)
125 | print('RESULT: MAPE:', str(mape))
126 |
127 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %}
128 | from sklearn import metrics
129 |
130 | r2 = metrics.r2_score(target_test, y_pred)
131 | print('RESULT: R2 Score:', str(r2))
132 | {% else %}
133 | from sklearn import metrics
134 |
135 | f1 = metrics.f1_score(target_test, y_pred, average='macro')
136 | print('RESULT: F1 Score: ' + str(f1))
137 | {% endif %}
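
The apk/mapk helpers above reward correct items that appear early in each prediction list. A worked example using the same definitions (toy inputs): for actual [['a'], ['b']] and predictions [['a', 'c'], ['c', 'b']] with k=2, the first row scores 1.0 (hit at rank 1), the second 0.5 (hit at rank 2), so MAP@2 is 0.75:

    import numpy as np

    def apk(actual, predicted, k):
        if len(predicted) > k:
            predicted = predicted[:k]
        score, num_hits = 0.0, 0.0
        for i, p in enumerate(predicted):
            if p in actual and p not in predicted[:i]:
                num_hits += 1.0
                score += num_hits / (i + 1.0)
        return score / min(len(actual), k)

    def mapk(actual, predicted, k):
        return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

    print(mapk([["a"], ["b"]], [["a", "c"], ["c", "b"]], k=2))  # 0.75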
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/evaluation_test.py.jinja:
--------------------------------------------------------------------------------
1 | {% if pipeline.task.task_type == macros.TASK_CLASSIFICATION %}
2 |
3 | ## Metric: F1
4 | from sklearn.metrics import f1_score
5 | {% if is_multioutput_classification %}
6 | f1_scores = []
7 | for i, column in enumerate(target_test.columns):
8 | f1_score_value = f1_score(target_test[column], y_pred[column], average='macro')
9 | f1_scores.append(f1_score_value)
10 | average_f1_score = np.mean(f1_scores)
11 | print('RESULT: Average F1 Score:', str(average_f1_score))
12 | {% else %}
13 | f1 = f1_score(target_test, y_pred, average='macro')
14 | print('RESULT: F1 Score: ' + str(f1))
15 | {% endif %}
16 |
17 | ## Metric: Accuracy
18 | from sklearn.metrics import accuracy_score
19 | {% if not pipeline.is_multi_class_multi_targets %}
20 | accuracy = accuracy_score(target_test, y_pred)
21 | print('RESULT: Accuracy: ' + str(accuracy))
22 | {% elif pipeline.is_multi_class_multi_targets %}
23 | __accs = []
24 | for i, col in enumerate(target_test.columns):
25 | one_acc = accuracy_score(target_test[col], y_pred[col])
26 | __accs.append(one_acc)
27 | print(f"RESULT: Average Accuracy : {str(sum(__accs)/len(__accs))}")
28 | {% endif %}
29 |
30 | ## Metric: AUC and Gini
31 | from sklearn.metrics import roc_auc_score
32 | {% if is_multioutput_classification %}
33 | auc_scores = []
34 | gini_scores = []
35 | for i, column in enumerate(target_test.columns):
36 | if y_prob[i].ndim == 2 and y_prob[i].shape[1] == 2:
37 | auc_score = roc_auc_score(target_test[column], y_prob[i][:, 1])
38 | elif y_prob[i].ndim == 2:
39 | auc_score = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr")
40 | gini_score = 2 * auc_score - 1
41 | auc_scores.append(auc_score)
42 | gini_scores.append(gini_score)
43 | auc = np.mean(auc_scores)
44 | gini = np.mean(gini_scores)
45 | print('RESULT: Average AUC Score:', str(auc))
46 | print('RESULT: Average Gini Score:', str(gini))
47 | {% else %}
48 | {% if pipeline.task.is_multiclass == True %}
49 | auc = roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr")
50 | {% else %}
51 | auc = roc_auc_score(target_test, y_prob)
52 | {% endif %}
53 | gini = 2 * auc - 1
54 | print('RESULT: AUC Score: ' + str(auc))
55 | print('RESULT: Gini: ' + str(gini))
56 | {% endif %}
57 |
58 | ## Metric: Log Loss
59 | from sklearn.metrics import log_loss
60 | {% if is_multioutput_classification %}
61 | log_loss_scores = []
62 | for i, column in enumerate(target_test.columns):
63 | loss = log_loss(target_test[column], y_prob[i])
64 | log_loss_scores.append(loss)
65 | avg_log_loss = np.mean(log_loss_scores)
66 | print('RESULT: Average Log Loss:', str(avg_log_loss))
67 | {% else %}
68 | log_loss_score = log_loss(target_test, y_prob)
69 | print('RESULT: Log Loss:', str(log_loss_score))
70 | {% endif %}
71 |
72 | {% if not is_multioutput_classification %}
73 |
74 | ## Metric: MCC
75 | from sklearn.metrics import matthews_corrcoef
76 |
77 | mcc = matthews_corrcoef(target_test, y_pred)
78 | print('RESULT: MCC:', str(mcc))
79 |
80 | ## Metric: QWK
81 | from sklearn.metrics import cohen_kappa_score
82 |
83 | qwk = cohen_kappa_score(target_test, y_pred, weights='quadratic')
84 | print('RESULT: QWK:', str(qwk))
85 |
86 | {% if pipeline.adaptation_metric.startswith("MAP_") %}
87 | ## Metric: MAP@K
88 | {% set k = pipeline.adaptation_metric.split("_")[1] %}
89 | def apk(actual, predicted, k):
90 | if len(predicted)>k:
91 | predicted = predicted[:k]
92 |
93 | score = 0.0
94 | num_hits = 0.0
95 |
96 | for i,p in enumerate(predicted):
97 | if p in actual and p not in predicted[:i]:
98 | num_hits += 1.0
99 | score += num_hits / (i+1.0)
100 |
101 | return score / min(len(actual), k)
102 |
103 | def mapk(actual, predicted, k):
104 | """ Computes the mean average precision at k.
105 |
106 | Args:
107 |         actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted
108 |         predicted (list[list[str]] or ndarray): A list of lists of predicted elements
109 | (In each list, arrange in the order you predicted.)
110 | k (int): The maximum number of predicted elements
111 |
112 | Returns:
113 | double: The mean average precision at k over the input lists
114 | """
115 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
116 |
117 | map_k = mapk(target_test.to_numpy(), y_prob_map_k, k={{ k }})
118 | print('RESULT: MAP@K: ' + str(map_k))
119 |
120 | {% endif %}
121 | {% endif %}
122 |
123 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %}
124 |
125 | ## Metric: R2
126 | from sklearn import metrics
127 |
128 | r2 = metrics.r2_score(target_test, y_pred)
129 | print('RESULT: R2 Score:', str(r2))
130 |
131 | ## Metric: RMSE
132 | from sklearn.metrics import mean_squared_error
133 |
134 | rmse = mean_squared_error(target_test, y_pred, squared=False)
135 | print('RESULT: RMSE:', str(rmse))
136 |
137 | ## Metric: RMSLE
138 | import numpy as np
139 | from sklearn.metrics import mean_squared_log_error
140 |
141 | target_test = np.clip(target_test, 0, None)
142 | y_pred = np.clip(y_pred, 0, None)
143 | rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred))
144 | print('RESULT: RMSLE:', str(rmsle))
145 |
146 | ## Metric: MAE
147 | from sklearn.metrics import mean_absolute_error
148 |
149 | mae = mean_absolute_error(target_test, y_pred)
150 | print('RESULT: MAE:', str(mae))
151 |
152 | ## Metric: MAPE
153 | from sklearn.metrics import mean_absolute_percentage_error
154 |
155 | mape = mean_absolute_percentage_error(target_test, y_pred)
156 | print('RESULT: MAPE:', str(mape))
157 |
158 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/hyperparameter_tuning_evaluation.py.jinja:
--------------------------------------------------------------------------------
1 | {% if pipeline.adaptation_metric == macros.Metric.AUC.value %}
2 | from sklearn.metrics import roc_auc_score
3 | {% if pipeline.task.is_multiclass == True %}
4 | score = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr")
5 | {% else %}
6 | score = roc_auc_score(target_test, y_pred)
7 | {% endif %}
8 | {% elif pipeline.adaptation_metric == macros.Metric.Accuracy.value %}
9 | from sklearn.metrics import accuracy_score
10 | score = accuracy_score(target_test, y_pred)
11 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value %}
12 | from sklearn import metrics
13 | score = metrics.f1_score(target_test, y_pred, average='macro')
14 | {% elif pipeline.adaptation_metric == macros.Metric.R2.value %}
15 | from sklearn import metrics
16 | score = metrics.r2_score(target_test, y_pred)
17 | {% elif pipeline.adaptation_metric == macros.Metric.RMSE.value %}
18 | from sklearn.metrics import mean_squared_error
19 | score = mean_squared_error(target_test, y_pred, squared=False)
20 | {% elif pipeline.adaptation_metric == macros.Metric.RMSLE.value %}
21 | import numpy as np
22 | from sklearn.metrics import mean_squared_log_error
23 | target_test = np.clip(target_test, 0, None)
24 | y_pred = np.clip(y_pred, 0, None)
25 | score = np.sqrt(mean_squared_log_error(target_test, y_pred))
26 | {% elif pipeline.adaptation_metric == macros.Metric.Gini.value %}
27 | from sklearn.metrics import roc_auc_score
28 | {% if pipeline.task.is_multiclass == True %}
29 | score = 2 * roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1
30 | {% else %}
31 | score = 2 * roc_auc_score(target_test, y_pred) - 1
32 | {% endif %}
33 | {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %}
34 | from sklearn.metrics import mean_absolute_error
35 | score = mean_absolute_error(target_test, y_pred)
36 | {% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %}
37 | from sklearn.metrics import log_loss
38 | score = log_loss(target_test, y_pred)
39 | {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %}
40 | from sklearn.metrics import roc_auc_score
41 | {% if pipeline.task.is_multiclass == True %}
42 | score = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr")
43 | {% else %}
44 | score = roc_auc_score(target_test, y_pred)
45 | {% endif %}
46 | {% elif pipeline.adaptation_metric.startswith("MAP_") %}
47 | {% set k = pipeline.adaptation_metric.split("_")[1] %}
48 | def apk(actual, predicted, k):
49 | if len(predicted)>k:
50 | predicted = predicted[:k]
51 |
52 | score = 0.0
53 | num_hits = 0.0
54 |
55 | for i,p in enumerate(predicted):
56 | if p in actual and p not in predicted[:i]:
57 | num_hits += 1.0
58 | score += num_hits / (i+1.0)
59 |
60 | return score / min(len(actual), k)
61 |
62 | def mapk(actual, predicted, k):
63 | """ Computes the mean average precision at k.
64 |
65 | Args:
66 |         actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted
67 |         predicted (list[list[str]] or ndarray): A list of lists of predicted elements
68 | (In each list, arrange in the order you predicted.)
69 | k (int): The maximum number of predicted elements
70 |
71 | Returns:
72 | double: The mean average precision at k over the input lists
73 | """
74 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
75 |
76 | score = mapk(target_test.to_numpy(), y_pred, k={{ k }})
77 | {% elif pipeline.adaptation_metric == macros.Metric.MAPE.value %}
78 | from sklearn.metrics import mean_absolute_percentage_error
79 | score = mean_absolute_percentage_error(target_test, y_pred)
80 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %}
81 | from sklearn import metrics
82 | score = metrics.r2_score(target_test, y_pred)
83 | {% else %}
84 | from sklearn import metrics
85 | score = metrics.f1_score(target_test, y_pred, average='macro')
86 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/inverse_target.py.jinja:
--------------------------------------------------------------------------------
1 | # INVERSE TARGET
2 | import numpy as np
3 |
4 | COLS_TO_BE_INVERSED = list(set(NUMERIC_COLS_TO_SCALE) & set(TARGET_COLUMNS))
5 | {% if flag_hyperparameter_tuning %}
6 | target_test[COLS_TO_BE_INVERSED] = np.expm1(target_test[COLS_TO_BE_INVERSED])
7 | y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
8 | {% else %}
9 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
10 | target_test[COLS_TO_BE_INVERSED] = np.expm1(target_test[COLS_TO_BE_INVERSED])
11 | y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=test_dataset.index)
12 | {% endif %}
13 | y_pred[COLS_TO_BE_INVERSED] = np.expm1(y_pred[COLS_TO_BE_INVERSED])
14 | y_pred = y_pred.to_numpy()
15 |
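
The np.expm1 calls assume the earlier scaling step applied np.log1p to the target columns in NUMERIC_COLS_TO_SCALE, so both the ground truth and the predictions are mapped back to the original scale. A minimal sketch of that round trip:

    import numpy as np
    import pandas as pd

    original = pd.Series([0.0, 9.0, 99.0], name="price")
    scaled = np.log1p(original)    # what the scaling preprocessor applies
    restored = np.expm1(scaled)    # what INVERSE TARGET undoes
    print(np.allclose(original, restored))  # True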
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/permutation_importance.py.jinja:
--------------------------------------------------------------------------------
1 | # PERMUTATION IMPORTANCE
2 | from sklearn.inspection import permutation_importance
3 | {% if pipeline.task.target_columns|length == 1 %}
4 | {% set TARGET_TRAIN = 'target_train[TARGET_COLUMNS[0]]' %}
5 | {% else %}
6 | {% set TARGET_TRAIN = 'target_train' %}
7 | {% endif %}
8 | {% if pipeline.sparse_matrix %}
9 | if len(feature_train.columns) <= 100:
10 | perm = permutation_importance(model, feature_train.sparse.to_dense(), {{ TARGET_TRAIN }},
11 | n_repeats=5,
12 | random_state=0)
13 | perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean})
14 | perm_df.to_csv("./permutation_importance.csv", index=False)
15 | {% else %}
16 | if len(feature_train.columns) <= 100:
17 | perm = permutation_importance(model, feature_train, {{ TARGET_TRAIN }},
18 | n_repeats=5,
19 | random_state=0)
20 | perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean})
21 | perm_df.to_csv("./permutation_importance.csv", index=False)
22 | {% endif %}
23 |
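
The len(feature_train.columns) <= 100 guard keeps the permutation loop affordable on wide data. A self-contained sketch of the same permutation_importance call on toy data:

    import pandas as pd
    from sklearn.inspection import permutation_importance
    from sklearn.linear_model import LogisticRegression

    feature_train = pd.DataFrame({"x1": [0, 1, 2, 3, 4, 5], "x2": [1, 0, 1, 0, 1, 0]})
    target_train = pd.Series([0, 0, 0, 1, 1, 1], name="y")

    model = LogisticRegression().fit(feature_train, target_train)
    perm = permutation_importance(model, feature_train, target_train,
                                  n_repeats=5, random_state=0)
    perm_df = pd.DataFrame({"feature": feature_train.columns,
                            "importance": perm.importances_mean})
    print(perm_df)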
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/prediction_result.py.jinja:
--------------------------------------------------------------------------------
1 | # OUTPUT PREDICTION
2 | {% set xgbclassifier = "XGBClassifier" %}
3 | {% if pipeline.config.predict_option == macros.PRED_PROBABILITY and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
4 | prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index)
5 | {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
6 | prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index)
7 | {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
8 | prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index)
9 | {% elif pipeline.config.predict_option is none and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
10 | prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index)
11 | {% elif pipeline.config.predict_option is none and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
12 | prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index)
13 | {% elif pipeline.config.predict_option is none and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
14 | prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index)
15 | {% elif pipeline.adaptation_metric.startswith("MAP_") %}
16 | {% set k = pipeline.adaptation_metric.split("_")[1] %}
17 | {% if y_prob_map_k is none %}
18 | prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS[0] + "_" + str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index)
19 | {% elif is_multioutput_classification %}
20 | prediction = y_pred
21 | {% else %}
22 | prediction = pd.DataFrame(y_prob_map_k, columns=[TARGET_COLUMNS[0] + "_" + str(i) for i in range(1, y_prob_map_k.shape[1] + 1)], index=feature_test.index)
23 | {% endif %}
24 | {% else %}
25 | prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
26 | {% endif %}
27 | prediction.to_csv("./prediction_result.csv")
28 |
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/preprocess_dataset.py.jinja:
--------------------------------------------------------------------------------
1 | # Export preprocessed dataset
2 | import time
3 | timestamp_str = time.strftime("%Y%m%d_%H%M%S")
4 | preprocess_dataset=pd.concat([pd.concat([feature_train,
5 | target_train], axis=1),
6 | pd.concat([feature_test,
7 | target_test], axis=1)])
8 | preprocess_dataset.to_pickle(f"./preprocess_dataset_{timestamp_str}.pickle")
9 |
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/shap.py.jinja:
--------------------------------------------------------------------------------
1 | # SHAP is computed only for the following models to keep execution time manageable.
2 | {% set lgbmclassifier = "LGBMClassifier" %}
3 | models_for_shap = ['XGBClassifier', 'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor', 'GradientBoostingClassifier', 'GradientBoostingRegressor']
4 | if model.__class__.__name__ in models_for_shap:
5 | import shap
6 | feature_shap = feature_train.sample(1000) if feature_train.shape[0] > 1000 else feature_train
7 | {% if model_name == lgbmclassifier %}
8 | explainer = shap.Explainer(model,feature_shap)
9 | {% else %}
10 | explainer = shap.Explainer(model)
11 | {% endif %}
12 | shap_values = explainer(feature_shap)
13 |
14 | # summarize the effects of all the features
15 | shap.plots.beeswarm(shap_values)
16 |
17 |     # bar plots
18 | shap.plots.bar(shap_values)
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/target_separation_predict.py.jinja:
--------------------------------------------------------------------------------
1 | # DETACH TARGET
2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }}
3 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
4 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1)
5 | target_test = test_dataset[TARGET_COLUMNS].copy()
6 | else:
7 | feature_test = test_dataset
8 |
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/target_separation_test.py.jinja:
--------------------------------------------------------------------------------
1 | # DETACH TARGET
2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }}
3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)
4 | target_train = train_dataset[TARGET_COLUMNS].copy()
5 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
6 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1)
7 | target_test = test_dataset[TARGET_COLUMNS].copy()
8 | else:
9 | feature_test = test_dataset
10 |
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/target_separation_train.py.jinja:
--------------------------------------------------------------------------------
1 | # DETACH TARGET
2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }}
3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)
4 | target_train = train_dataset[TARGET_COLUMNS].copy()
--------------------------------------------------------------------------------
/sapientml_core/templates/other_templates/target_separation_validation.py.jinja:
--------------------------------------------------------------------------------
1 | # DETACH TARGET
2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }}
3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)
4 | target_train = train_dataset[TARGET_COLUMNS].copy()
5 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1)
6 | target_test = test_dataset[TARGET_COLUMNS].copy()
7 |
8 |
--------------------------------------------------------------------------------
/sapientml_core/templates/pipeline_predict.py.jinja:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | {% if 'discard_columns' in pipeline_json %}
4 |
5 | {{ pipeline_json['discard_columns']['code_predict'] }}
6 | {% endif %}
7 | {% if 'preprocessing_before_target_separation' in pipeline_json %}
8 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %}
9 | {% for code in component['code_predict'] %}
10 |
11 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
12 | {{ code }}
13 | {% endfor %}
14 | {% endfor %}
15 | {% endif %}
16 | {% if 'target_separation' in pipeline_json %}
17 |
18 | {{ pipeline_json['target_separation']['code_predict'] }}
19 | {% endif %}
20 | {% if 'preprocessing_after_target_separation' in pipeline_json %}
21 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %}
22 | {% for code in component['code_predict'] %}
23 |
24 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
25 | {{ code }}
26 | {% endfor %}
27 | {% endfor %}
28 | {% endif %}
29 | {% if 'preprocessing_after_train_test_split' in pipeline_json %}
30 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %}
31 | {% for code in component['code_predict'] %}
32 |
33 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
34 | {{ code }}
35 | {% endfor %}
36 | {% endfor %}
37 | {% endif %}
38 | {% if 'model' in pipeline_json %}
39 |
40 | # MODEL
41 | {{ pipeline_json['model']['code_predict'] }}
42 | {% endif %}
43 | {% if 'inverse_target' in pipeline_json %}
44 |
45 | {{ pipeline_json['inverse_target']['code'] }}
46 | {% endif %}
47 | {% if 'evaluation' in pipeline_json %}
48 |
49 | # EVALUATION
50 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
51 | {% filter indent(width=4, first=True) %}
52 | {{ pipeline_json['evaluation']['code_predict'] }}
53 | {% endfilter %}
54 | {% endif %}
55 | {% if 'output_prediction' in pipeline_json %}
56 |
57 | {{ pipeline_json['output_prediction']['code'] }}
58 | {% endif %}
59 |
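
Each pipeline template is driven by a pipeline_json dict whose entries hold pre-generated code snippets. A minimal sketch of that rendering mechanism with jinja2 (the snippet value below is a placeholder, not generated code):

    from jinja2 import Template

    template = Template(
        "{% if 'model' in pipeline_json %}\n"
        "# MODEL\n"
        "{{ pipeline_json['model']['code_predict'] }}\n"
        "{% endif %}"
    )
    pipeline_json = {"model": {"code_predict": "y_pred = model.predict(feature_test)"}}
    print(template.render(pipeline_json=pipeline_json))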
--------------------------------------------------------------------------------
/sapientml_core/templates/pipeline_test.py.jinja:
--------------------------------------------------------------------------------
1 | {% if 'discard_columns' in pipeline_json %}
2 |
3 | {{ pipeline_json['discard_columns']['code'] }}
4 | {% endif %}
5 | {% if 'preprocessing_before_target_separation' in pipeline_json %}
6 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %}
7 | {% for code in component['code'] %}
8 |
9 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
10 | {{ code }}
11 | {% endfor %}
12 | {% endfor %}
13 | {% endif %}
14 | {% if 'target_separation' in pipeline_json %}
15 |
16 | {{ pipeline_json['target_separation']['code_test'] }}
17 | {% endif %}
18 | {% if 'preprocessing_after_target_separation' in pipeline_json %}
19 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %}
20 | {% for code in component['code'] %}
21 |
22 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
23 | {{ code }}
24 | {% endfor %}
25 | {% endfor %}
26 | {% endif %}
27 | {% if 'preprocessing_after_train_test_split' in pipeline_json %}
28 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %}
29 | {% for code in component['code'] %}
30 |
31 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
32 | {{ code }}
33 | {% endfor %}
34 | {% endfor %}
35 | {% endif %}
36 | {% if flag_hyperparameter_tuning %}
37 |
38 | # BEST PARAMETERS IN THE CANDIDATE SCRIPT
39 | # PLEASE SEE THE CANDIDATE SCRIPTS FOR THE HYPERPARAMETER OPTIMIZATION CODE
40 | best_params = study.best_params
41 | {% endif %}
42 |
43 | {% if 'preprocess_dataset' in pipeline_json %}
44 | {{ pipeline_json['preprocess_dataset']['code_test'] }}
45 |
46 | {% endif %}
47 | {% if 'model' in pipeline_json %}
48 |
49 | # MODEL
50 | {{ pipeline_json['model']['code_test'] }}
51 | {% endif %}
52 | {% if 'inverse_target' in pipeline_json %}
53 |
54 | {{ pipeline_json['inverse_target']['code'] }}
55 | {% endif %}
56 | {% if 'evaluation' in pipeline_json %}
57 |
58 | # EVALUATION
59 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
60 | {% filter indent(width=4, first=True) %}
61 | {{ pipeline_json['evaluation']['code_test'] }}
62 | {% endfilter %}
63 | {% endif %}
64 |
65 | {% if 'confusion_matrix' in pipeline_json and pipeline.task.task_type == 'classification'%}
66 | # Confusion Matrix
67 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
68 | {% filter indent(width=4, first=True) %}
69 | {{ pipeline_json['confusion_matrix']['code'] }}
70 | {% endfilter %}
71 | {% endif %}
72 | {% if 'output_prediction' in pipeline_json %}
73 |
74 | {{ pipeline_json['output_prediction']['code'] }}
75 | {% endif %}
76 | {% if 'permutation_importance' in pipeline_json %}
77 |
78 | {{ pipeline_json['permutation_importance']['code'] }}
79 | {% endif %}
80 |
81 | {% if 'shap' in pipeline_json and not pipeline.task.is_multiclass %}
82 |
83 | {{ pipeline_json['shap']['code'] }}
84 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/pipeline_train.py.jinja:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | {% if 'discard_columns' in pipeline_json %}
4 |
5 | {{ pipeline_json['discard_columns']['code_train'] }}
6 | {% endif %}
7 | {% if 'preprocessing_before_target_separation' in pipeline_json %}
8 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %}
9 | {% for code in component['code_train'] %}
10 |
11 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
12 | {{ code }}
13 | {% endfor %}
14 | {% endfor %}
15 | {% endif %}
16 | {% if 'target_separation' in pipeline_json %}
17 |
18 | {{ pipeline_json['target_separation']['code_train'] }}
19 | {% endif %}
20 | {% if 'preprocessing_after_target_separation' in pipeline_json %}
21 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %}
22 | {% for code in component['code_train'] %}
23 |
24 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
25 | {{ code }}
26 | {% endfor %}
27 | {% endfor %}
28 | {% endif %}
29 | {% if 'preprocessing_after_train_test_split' in pipeline_json %}
30 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %}
31 | {% for code in component['code_train'] %}
32 |
33 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
34 | {{ code }}
35 | {% endfor %}
36 | {% endfor %}
37 | {% endif %}
38 | {% if flag_hyperparameter_tuning %}
39 |
40 | # BEST PARAMETERS IN THE CANDIDATE SCRIPT
41 | # PLEASE SEE THE CANDIDATE SCRIPTS FOR THE HYPERPARAMETER OPTIMIZATION CODE
42 | best_params = study.best_params
43 | {% endif %}
44 | {% if 'model' in pipeline_json %}
45 |
46 | # MODEL
47 | {{ pipeline_json['model']['code_train'] }}
48 | {% endif %}
49 |
--------------------------------------------------------------------------------
/sapientml_core/templates/pipeline_validation.py.jinja:
--------------------------------------------------------------------------------
1 | {% if 'discard_columns' in pipeline_json %}
2 |
3 | {{ pipeline_json['discard_columns']['code'] }}
4 | {% endif %}
5 | {% if 'preprocessing_before_target_separation' in pipeline_json %}
6 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %}
7 | {% for code in component['code'] %}
8 |
9 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
10 | {{ code }}
11 | {% endfor %}
12 | {% endfor %}
13 | {% endif %}
14 | {% if 'target_separation' in pipeline_json %}
15 |
16 | {{ pipeline_json['target_separation']['code_validation'] }}
17 | {% endif %}
18 | {% if 'preprocessing_after_target_separation' in pipeline_json %}
19 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %}
20 | {% for code in component['code'] %}
21 |
22 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
23 | {{ code }}
24 | {% endfor %}
25 | {% endfor %}
26 | {% endif %}
27 | {% if 'preprocessing_after_train_test_split' in pipeline_json %}
28 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %}
29 | {% for code in component['code'] %}
30 |
31 | # PREPROCESSING-{{ component['id'] + loop.index0 }}
32 | {{ code }}
33 | {% endfor %}
34 | {% endfor %}
35 | {% endif %}
36 | {% if flag_hyperparameter_tuning %}
37 |
38 | {{ pipeline_json['hyperparameter_optimization']['code'] }}
39 | {% else %}
40 | {% if 'model' in pipeline_json %}
41 | {% if 'preprocess_dataset' in pipeline_json %}
42 | {{ pipeline_json['preprocess_dataset']['code_test'] }}
43 |
44 | {% endif %}
45 |
46 | # MODEL
47 | {{ pipeline_json['model']['code'] }}
48 | {% endif %}
49 | {% if 'inverse_target' in pipeline_json %}
50 |
51 | {{ pipeline_json['inverse_target']['code'] }}
52 | {% endif %}
53 | {% if 'evaluation' in pipeline_json %}
54 |
55 | # EVALUATION
56 | {{ pipeline_json['evaluation']['code_validation'] }}
57 | {% endif %}
58 | {% endif %}{# if flag_hyperparameter_tuning #}
59 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/DATE.py.jinja:
--------------------------------------------------------------------------------
1 | DATE_COLUMNS = {{ columns }}
2 | for _col in DATE_COLUMNS:
3 | train_date_col = pd.to_datetime({{ train_dataset }}[_col], errors='coerce')
4 | {{ train_dataset }}[_col + "_year"] = train_date_col.dt.year.fillna(-1)
5 | {{ train_dataset }}[_col + "_month"] = train_date_col.dt.month.fillna(-1)
6 | {{ train_dataset }}[_col + "_day"] = train_date_col.dt.day.fillna(-1)
7 | {{ train_dataset }}[_col + "_day_of_week"] = train_date_col.dt.dayofweek.fillna(-1)
8 | {{ train_dataset }}.drop(_col, axis=1, inplace=True)
9 |
10 | test_date_col = pd.to_datetime({{ test_dataset }}[_col], errors='coerce')
11 |     {{ test_dataset }}[_col + "_year"] = test_date_col.dt.year.fillna(-1)
12 | {{ test_dataset }}[_col + "_month"] = test_date_col.dt.month.fillna(-1)
13 | {{ test_dataset }}[_col + "_day"] = test_date_col.dt.day.fillna(-1)
14 | {{ test_dataset }}[_col + "_day_of_week"] = test_date_col.dt.dayofweek.fillna(-1)
15 | {{ test_dataset }}.drop(_col, axis=1, inplace=True)
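
A sketch of the expansion on a tiny frame; errors='coerce' turns unparsable values into NaT, which the fillna(-1) calls then mark with -1:

    import pandas as pd

    train_dataset = pd.DataFrame({"d": ["2021-03-05", "not a date"]})
    date_col = pd.to_datetime(train_dataset["d"], errors="coerce")  # invalid -> NaT
    train_dataset["d_year"] = date_col.dt.year.fillna(-1)
    train_dataset["d_month"] = date_col.dt.month.fillna(-1)
    train_dataset["d_day"] = date_col.dt.day.fillna(-1)
    train_dataset["d_day_of_week"] = date_col.dt.dayofweek.fillna(-1)
    train_dataset.drop("d", axis=1, inplace=True)
    print(train_dataset)  # second row becomes -1.0 in every derived column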
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/DATE_predict.jinja:
--------------------------------------------------------------------------------
1 | DATE_COLUMNS = {{ columns }}
2 | for _col in DATE_COLUMNS:
3 | test_date_col = pd.to_datetime({{ test_dataset }}[_col], errors='coerce')
4 |     {{ test_dataset }}[_col + "_year"] = test_date_col.dt.year.fillna(-1)
5 | {{ test_dataset }}[_col + "_month"] = test_date_col.dt.month.fillna(-1)
6 | {{ test_dataset }}[_col + "_day"] = test_date_col.dt.day.fillna(-1)
7 | {{ test_dataset }}[_col + "_day_of_week"] = test_date_col.dt.dayofweek.fillna(-1)
8 | {{ test_dataset }}.drop(_col, axis=1, inplace=True)
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/DATE_train.jinja:
--------------------------------------------------------------------------------
1 | DATE_COLUMNS = {{ columns }}
2 | for _col in DATE_COLUMNS:
3 | train_date_col = pd.to_datetime({{ train_dataset }}[_col], errors='coerce')
4 | {{ train_dataset }}[_col + "_year"] = train_date_col.dt.year.fillna(-1)
5 | {{ train_dataset }}[_col + "_month"] = train_date_col.dt.month.fillna(-1)
6 | {{ train_dataset }}[_col + "_day"] = train_date_col.dt.day.fillna(-1)
7 | {{ train_dataset }}[_col + "_day_of_week"] = train_date_col.dt.dayofweek.fillna(-1)
8 | {{ train_dataset }}.drop(_col, axis=1, inplace=True)
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/LabelEncoder.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import OrdinalEncoder
2 |
3 | CATEGORICAL_COLS = {{ columns }}
4 | ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
5 | {{ train_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS])
6 | {{ test_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS])
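
Despite the file name, the template uses OrdinalEncoder; handle_unknown="use_encoded_value" maps categories unseen at fit time to -1 instead of raising. A small sketch:

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    train = pd.DataFrame({"color": ["red", "blue", "red"]})
    test = pd.DataFrame({"color": ["blue", "green"]})  # "green" is unseen

    ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    train[["color"]] = ordinal_encoder.fit_transform(train[["color"]])
    test[["color"]] = ordinal_encoder.transform(test[["color"]])
    print(test)  # "green" becomes -1.0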
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/LabelEncoder_predict.py.jinja:
--------------------------------------------------------------------------------
1 | with open('ordinalEncoder.pkl', 'rb') as f:
2 | ordinal_encoder = pickle.load(f)
3 |
4 | CATEGORICAL_COLS = {{ columns }}
5 | {{ test_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS])
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/LabelEncoder_train.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import OrdinalEncoder
2 |
3 | CATEGORICAL_COLS = {{ columns }}
4 | ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
5 | {{ train_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS])
6 |
7 | with open('ordinalEncoder.pkl', 'wb') as f:
8 | pickle.dump(ordinal_encoder, f)
9 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/Processing.py.jinja:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 |
4 | import nltk
5 |
6 | TEXT_COLUMNS = {{ columns }}
7 | def process_text(__dataset):
8 | for _col in TEXT_COLUMNS:
9 | process_text = [t.lower() for t in __dataset[_col]]
10 |
11 | # strip all punctuation
12 | table = str.maketrans('', '', string.punctuation)
13 | process_text = [t.translate(table) for t in process_text]
14 |
15 | # convert all numbers in text to 'num'
16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text]
17 | __dataset[_col] = process_text
18 | return __dataset
19 |
20 | {{ train_dataset }} = process_text({{ train_dataset }})
21 | {{ test_dataset }} = process_text({{ test_dataset }})
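
The cleaning lowercases each value, strips punctuation, and collapses digit runs into the token 'num'. A one-column sketch of the same steps:

    import re
    import string

    import pandas as pd

    df = pd.DataFrame({"text": ["Hello, World! 42 times."]})
    table = str.maketrans('', '', string.punctuation)
    df["text"] = [re.sub(r'\d+', 'num', t.lower().translate(table)) for t in df["text"]]
    print(df["text"].tolist())  # ['hello world num times']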
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/Processing_predict.py.jinja:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 |
4 | import nltk
5 |
6 | TEXT_COLUMNS = {{ columns }}
7 | def process_text(__dataset):
8 | for _col in TEXT_COLUMNS:
9 | process_text = [t.lower() for t in __dataset[_col]]
10 |
11 | # strip all punctuation
12 | table = str.maketrans('', '', string.punctuation)
13 | process_text = [t.translate(table) for t in process_text]
14 |
15 | # convert all numbers in text to 'num'
16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text]
17 | __dataset[_col] = process_text
18 | return __dataset
19 |
20 | {{ test_dataset }} = process_text({{ test_dataset }})
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/Processing_train.py.jinja:
--------------------------------------------------------------------------------
1 | import re
2 | import string
3 |
4 | import nltk
5 |
6 | TEXT_COLUMNS = {{ columns }}
7 | def process_text(__dataset):
8 | for _col in TEXT_COLUMNS:
9 | process_text = [t.lower() for t in __dataset[_col]]
10 |
11 | # strip all punctuation
12 | table = str.maketrans('', '', string.punctuation)
13 | process_text = [t.translate(table) for t in process_text]
14 |
15 | # convert all numbers in text to 'num'
16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text]
17 | __dataset[_col] = process_text
18 | return __dataset
19 |
20 | {{ train_dataset }} = process_text({{ train_dataset }})
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/SMOTE.py.jinja:
--------------------------------------------------------------------------------
1 | from imblearn.over_sampling import SMOTE
2 |
3 | smote = SMOTE(random_state=0)
4 | {% if pipeline.sparse_matrix %}
5 | feature_columns = feature_train.columns
6 | feature_train = feature_train.sparse.to_coo()
7 | feature_train, target_train = smote.fit_resample(feature_train, target_train)
8 | feature_train = pd.DataFrame.sparse.from_spmatrix(feature_train, columns=feature_columns)
9 | {% else %}
10 | feature_train, target_train = smote.fit_resample(feature_train, target_train)
11 | {% endif %}
12 |
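
SMOTE synthesizes new minority-class rows by interpolating between nearest neighbors, so the minority class must contain more samples than k_neighbors (5 by default). A toy sketch:

    import pandas as pd
    from imblearn.over_sampling import SMOTE

    feature_train = pd.DataFrame({"x": [0.0, 0.1, 0.2, 0.3, 10.0, 10.1]})
    target_train = pd.Series([0, 0, 0, 0, 1, 1], name="y")

    # k_neighbors must be smaller than the minority class size (2 here).
    smote = SMOTE(random_state=0, k_neighbors=1)
    feature_train, target_train = smote.fit_resample(feature_train, target_train)
    print(target_train.value_counts())  # both classes now have 4 rows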
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/STANDARD.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import StandardScaler
2 |
3 | standard_scaler = StandardScaler(with_mean=False)
4 | {% if pipeline.sparse_matrix %}
5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %}
6 | {% else %}
7 | {% set dataframe = "pd.DataFrame" %}
8 | {% endif %}
9 | {{ train_dataset }} = {{ dataframe }}(standard_scaler.fit_transform({{ train_dataset }}), columns={{ train_dataset }}.columns, index={{ train_dataset }}.index)
10 | {{ test_dataset }} = {{ dataframe }}(standard_scaler.transform({{ test_dataset }}), columns={{ test_dataset }}.columns, index={{ test_dataset }}.index)
11 |
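
with_mean=False divides by each column's standard deviation without centering, which is what keeps sparse inputs sparse (subtracting the mean would densify them). A dense-frame sketch of the same call:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    train_dataset = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
    standard_scaler = StandardScaler(with_mean=False)  # scale only; sparse-safe
    train_dataset = pd.DataFrame(standard_scaler.fit_transform(train_dataset),
                                 columns=train_dataset.columns, index=train_dataset.index)
    print(train_dataset["x"].tolist())  # each value divided by the column's std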
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/STANDARD_predict.py.jinja:
--------------------------------------------------------------------------------
1 | with open('standardScaler.pkl', 'rb') as f:
2 | standard_scaler = pickle.load(f)
3 |
4 | {% if pipeline.sparse_matrix %}
5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %}
6 | {% else %}
7 | {% set dataframe = "pd.DataFrame" %}
8 | {% endif %}
9 | {{ test_dataset }} = {{ dataframe }}(standard_scaler.transform({{ test_dataset }}), columns={{ test_dataset }}.columns, index={{ test_dataset }}.index)
10 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/STANDARD_train.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import StandardScaler
2 |
3 | standard_scaler = StandardScaler(with_mean=False)
4 | {% if pipeline.sparse_matrix %}
5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %}
6 | {% else %}
7 | {% set dataframe = "pd.DataFrame" %}
8 | {% endif %}
9 | {{ train_dataset }} = {{ dataframe }}(standard_scaler.fit_transform({{ train_dataset }}), columns={{ train_dataset }}.columns, index={{ train_dataset }}.index)
10 |
11 | with open('standardScaler.pkl', 'wb') as f:
12 | pickle.dump(standard_scaler, f)
13 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/TfidfVectorizer.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import TfidfVectorizer
2 |
3 | TEXT_COLUMNS = {{ columns }}
4 | temp_train_data = {{ train_dataset }}[TEXT_COLUMNS]
5 | temp_test_data = {{ test_dataset }}[TEXT_COLUMNS]
6 | # Make the entire dataframe sparse to avoid it being converted into a dense matrix.
7 | {{ train_dataset }} = {{ train_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0))
8 | {{ test_dataset }} = {{ test_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0))
9 |
10 | {% if pipeline.config.use_word_list %}
11 | {% if pipeline.config.use_word_list is mapping %}
12 | # Use only specified words as features for each column
13 | use_word_list = {{ pipeline.config.use_word_list }}
14 | for col, word_list in use_word_list.items():
15 | word_list = [word.lower() for word in word_list]
16 | word_list = list(set(word_list))
17 | use_word_list[col] = word_list
18 | for _col in TEXT_COLUMNS:
19 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list.get(_col))
20 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col])
21 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
22 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
23 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1)
24 | vector_test = tfidfvectorizer.transform(temp_test_data[_col])
25 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index)
26 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1)
27 | {% else %}
28 | # Use only specified words as features
29 | use_word_list = {{ pipeline.config.use_word_list }}
30 | use_word_list = [word.lower() for word in use_word_list]
31 | use_word_list = list(set(use_word_list))
32 | for _col in TEXT_COLUMNS:
33 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list)
34 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col])
35 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
36 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
37 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1)
38 | vector_test = tfidfvectorizer.transform(temp_test_data[_col])
39 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index)
40 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1)
41 | {% endif %}
42 | {% else %}
43 | for _col in TEXT_COLUMNS:
44 | tfidfvectorizer = TfidfVectorizer(max_features=3000)
45 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col])
46 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
47 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
48 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1)
49 | vector_test = tfidfvectorizer.transform(temp_test_data[_col])
50 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index)
51 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1)
52 | {% endif %}
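A standalone sketch of the per-column TF-IDF step this template emits; the "review" column and its values are illustrative, not from the corpus:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_dataset = pd.DataFrame({"review": ["good movie", "bad movie", "good plot"]})
temp_train_data = train_dataset[["review"]]
# Make the remaining frame sparse, as the template does, before concatenating.
train_dataset = train_dataset.drop(["review"], axis=1).astype(pd.SparseDtype("float64", 0))

tfidfvectorizer = TfidfVectorizer(max_features=3000)
vector_train = tfidfvectorizer.fit_transform(temp_train_data["review"])
# Prefix each TF-IDF feature with its source column name.
feature_names = ["_".join(["review", name]) for name in tfidfvectorizer.get_feature_names_out()]
vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
train_dataset = pd.concat([train_dataset, vector_train], axis=1)
print(train_dataset.columns.tolist())  # ['review_bad', 'review_good', 'review_movie', 'review_plot']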
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/TfidfVectorizer_predict.py.jinja:
--------------------------------------------------------------------------------
1 | TEXT_COLUMNS = {{ columns }}
2 | temp_test_data = {{ test_dataset }}[TEXT_COLUMNS]
3 | # Make the entire dataframe sparse to avoid it being converted into a dense matrix.
4 | {{ test_dataset }} = {{ test_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0))
5 | with open('tfidfVectorizer.pkl', 'rb') as f:
6 | vectorizers = pickle.load(f)
7 | for _col in TEXT_COLUMNS:
8 | tfidfvectorizer = vectorizers[_col]
9 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
10 | vector_test = tfidfvectorizer.transform(temp_test_data[_col])
11 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index)
12 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1)
13 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/TfidfVectorizer_train.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import TfidfVectorizer
2 |
3 | TEXT_COLUMNS = {{ columns }}
4 | temp_train_data = {{ train_dataset }}[TEXT_COLUMNS]
5 | # Make the entire dataframe sparse to avoid it being converted into a dense matrix.
6 | {{ train_dataset }} = {{ train_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0))
7 | vectorizers = {}
8 |
9 | {% if pipeline.config.use_word_list %}
10 | {% if pipeline.config.use_word_list is mapping %}
11 | # Use only specified words as features for each column
12 | use_word_list = {{ pipeline.config.use_word_list }}
13 | for col, word_list in use_word_list.items():
14 | word_list = [word.lower() for word in word_list]
15 | word_list = list(set(word_list))
16 | use_word_list[col] = word_list
17 | for _col in TEXT_COLUMNS:
18 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list.get(_col))
19 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col])
20 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
21 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
22 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1)
23 | vectorizers[_col] = tfidfvectorizer
24 | {% else %}
25 | # Use only specified words as features
26 | use_word_list = {{ pipeline.config.use_word_list }}
27 | use_word_list = [word.lower() for word in use_word_list]
28 | use_word_list = list(set(use_word_list))
29 | for _col in TEXT_COLUMNS:
30 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list)
31 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col])
32 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
33 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
34 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1)
35 | vectorizers[_col] = tfidfvectorizer
36 | {% endif %}
37 | {% else %}
38 | for _col in TEXT_COLUMNS:
39 | tfidfvectorizer = TfidfVectorizer(max_features=3000)
40 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col])
41 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()]
42 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index)
43 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1)
44 | vectorizers[_col] = tfidfvectorizer
45 | {% endif %}
46 |
47 | with open('tfidfVectorizer.pkl', 'wb') as f:
48 | pickle.dump(vectorizers, f)
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/fillna-type-numeric.py.jinja:
--------------------------------------------------------------------------------
1 | {% if columns %}
2 | import numpy as np
3 | from sklearn.impute import SimpleImputer
4 |
5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }}
6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
7 | {{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES])
8 | {{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES])
9 | {% endif %}
10 | {% if cols_almost_missing_numeric %}
11 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }}
12 | {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0)
13 | {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0)
14 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/fillna-type-numeric_predict.py.jinja:
--------------------------------------------------------------------------------
1 | {% if columns %}
2 | with open('simpleimputer-numeric.pkl', 'rb') as f:
3 | simple_imputer = pickle.load(f)
4 |
5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }}
6 | {{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES])
7 | {% endif %}
8 | {% if cols_almost_missing_numeric %}
9 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }}
10 | {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0)
11 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/fillna-type-numeric_train.py.jinja:
--------------------------------------------------------------------------------
1 | {% if columns %}
2 | import numpy as np
3 | from sklearn.impute import SimpleImputer
4 |
5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }}
6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
7 | {{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES])
8 |
9 | with open('simpleimputer-numeric.pkl', 'wb') as f:
10 | pickle.dump(simple_imputer, f)
11 | {% endif %}
12 | {% if cols_almost_missing_numeric %}
13 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }}
14 | {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0)
15 | {% endif %}
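A toy run of the imputation this template emits, with a hypothetical "age" column; it shows the train-time mean being reused at predict time, which is why the fitted imputer is pickled above:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

train = pd.DataFrame({"age": [20.0, np.nan, 40.0]})
test = pd.DataFrame({"age": [np.nan]})

simple_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
train[["age"]] = simple_imputer.fit_transform(train[["age"]])
test[["age"]] = simple_imputer.transform(test[["age"]])
print(test["age"].iloc[0])  # 30.0 -- the train mean, not a test statistic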
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja:
--------------------------------------------------------------------------------
1 | {% if columns %}
2 | import numpy as np
3 | from sklearn.impute import SimpleImputer
4 |
5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }}
6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
7 | {{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES])
8 | {{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES])
9 | {% endif %}
10 | {% if cols_almost_missing_string %}
11 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
12 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
13 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
14 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
15 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
16 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja:
--------------------------------------------------------------------------------
1 | {% if columns %}
2 | with open('simpleimputer-string.pkl', 'rb') as f:
3 | simple_imputer = pickle.load(f)
4 |
5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }}
6 | {{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES])
7 | {% endif %}
8 | {% if cols_almost_missing_string %}
9 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
10 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
11 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
12 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja:
--------------------------------------------------------------------------------
1 | {% if columns %}
2 | import numpy as np
3 | from sklearn.impute import SimpleImputer
4 |
5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }}
6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
7 | {{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES])
8 |
9 | with open('simpleimputer-string.pkl', 'wb') as f:
10 | pickle.dump(simple_imputer, f)
11 | {% endif %}
12 | {% if cols_almost_missing_string %}
13 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }}
14 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str)
15 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('')
16 | {% endif %}
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/get_dummies.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import OneHotEncoder
2 |
3 | CATEGORICAL_COLS = {{ columns }}
4 | onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
5 | train_encoded = pd.DataFrame(onehot_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ train_dataset }}.index)
6 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, train_encoded ], axis=1)
7 | {{ train_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True)
8 | test_encoded = pd.DataFrame(onehot_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ test_dataset }}.index)
9 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, test_encoded ], axis=1)
10 | {{ test_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True)
11 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/get_dummies_predict.py.jinja:
--------------------------------------------------------------------------------
1 | with open('oneHotEncoder.pkl', 'rb') as f:
2 | onehot_encoder = pickle.load(f)
3 |
4 | CATEGORICAL_COLS = {{ columns }}
5 | test_encoded = pd.DataFrame(onehot_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ test_dataset }}.index)
6 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, test_encoded ], axis=1)
7 | {{ test_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True)
8 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/get_dummies_train.py.jinja:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import OneHotEncoder
2 |
3 | CATEGORICAL_COLS = {{ columns }}
4 | onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
5 | train_encoded = pd.DataFrame(onehot_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ train_dataset }}.index)
6 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, train_encoded ], axis=1)
7 | {{ train_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True)
8 |
9 | with open('oneHotEncoder.pkl', 'wb') as f:
10 | pickle.dump(onehot_encoder, f)
11 |
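Why handle_unknown='ignore' matters at predict time, shown on fabricated data: a category never seen during fit encodes as an all-zeros row instead of raising an error.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
train = pd.DataFrame({"color": ["red", "blue"]})
test = pd.DataFrame({"color": ["green"]})  # unseen category

onehot_encoder.fit(train[["color"]])
print(onehot_encoder.get_feature_names_out())     # ['color_blue' 'color_red']
print(onehot_encoder.transform(test[["color"]]))  # [[0. 0.]]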
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/log.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | NUMERIC_COLS_TO_SCALE = {{ columns }}
4 | {{ train_dataset }}[NUMERIC_COLS_TO_SCALE] = np.log1p({{ train_dataset }}[NUMERIC_COLS_TO_SCALE]).replace([np.inf, -np.inf], np.nan).fillna({{ train_dataset }}[NUMERIC_COLS_TO_SCALE].mean())
5 |
6 |
7 | NUMERIC_COLS_TO_SCALE_FOR_TEST = list(set({{ test_dataset }}.columns) & set(NUMERIC_COLS_TO_SCALE))
8 | {{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST]).replace([np.inf, -np.inf], np.nan).fillna({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST].mean())
9 |
10 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/log_predict.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | NUMERIC_COLS_TO_SCALE = {{ columns }}
4 | NUMERIC_COLS_TO_SCALE_FOR_TEST = list(set({{ test_dataset }}.columns) & set(NUMERIC_COLS_TO_SCALE))
5 | {{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST]).replace([np.inf, -np.inf], np.nan).fillna({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST].mean())
6 |
7 |
--------------------------------------------------------------------------------
/sapientml_core/templates/preprocessing_templates/log_train.py.jinja:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | NUMERIC_COLS_TO_SCALE = {{ columns }}
4 | {{ train_dataset }}[NUMERIC_COLS_TO_SCALE] = np.log1p({{ train_dataset }}[NUMERIC_COLS_TO_SCALE]).replace([np.inf, -np.inf], np.nan).fillna({{ train_dataset }}[NUMERIC_COLS_TO_SCALE].mean())
5 |
6 |
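A small demonstration of the guard in this template, on made-up values: log1p of a value <= -1 produces -inf (with a RuntimeWarning), which is then replaced by the mean of the original column.

import numpy as np
import pandas as pd

train = pd.DataFrame({"x": [0.0, 1.0, -1.0]})
scaled = np.log1p(train[["x"]])                # log1p(-1.0) == -inf
scaled = scaled.replace([np.inf, -np.inf], np.nan)
scaled = scaled.fillna(train[["x"]].mean())    # mean of the raw column (0.0)
print(scaled["x"].tolist())                    # [0.0, 0.693..., 0.0]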
--------------------------------------------------------------------------------
/sapientml_core/training/augmentation/mutation_results.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from collections import OrderedDict, defaultdict
17 |
18 | import pandas as pd
19 | from sapientml_core import internal_path
20 | from sapientml_core.seeding.predictor import name_to_label_mapping
21 | from sapientml_core.training.project_corpus import ProjectCorpus
22 | from tqdm import tqdm
23 |
24 |
25 | class MutationResult:
26 | """MutationResult class.
27 |
28 | This class loads the mutation results for each pipeline that were already stored in the sapientml_core cache,
29 | combines all the results into a CSV file, and selects the best model.
30 |
31 | """
32 |
33 | def __init__(self, mutation_result_path, project_list):
34 | self.mutation_result_path = mutation_result_path
35 | self.project_list = project_list
36 |
37 | def load_results(self):
38 | """Collects the score for augmented pipelines from exec_info directory.
39 |
40 | Returns
41 | -------
42 | results: defaultdict
43 |
44 | """
45 | results = defaultdict(defaultdict)
46 | models = list(name_to_label_mapping.keys()) + ["original"]
47 | execution_root_dir = internal_path.training_cache / "exec_info"
48 |
49 | for i in tqdm(range(0, len(self.project_list))):
50 | project = self.project_list[i]
51 | project_exec_dir = execution_root_dir / project.notebook_name
52 | project_key = project.file_name
53 | for model in models:
54 | result_file_path = project_exec_dir / model / "stdout.txt"
55 | acc, r2 = 0, 0
56 | if not os.path.exists(result_file_path):
57 | results[project_key][model] = 0
58 | continue
59 | with open(result_file_path, "r", encoding="utf-8") as f:
60 | lines = f.readlines()
61 | for line in lines:
62 | for trail in ["Accuracy: ", "R2: "]:
63 | data = line
64 | if data.count(trail) > 0:
65 | data = data[data.index(trail) + len(trail) :].strip()
66 | if data.count("%") > 0:
67 | data = data[: data.index("%")]
68 | data = float(data) / 100
69 | if trail == "Accuracy: ":
70 | acc = data
71 | if trail == "R2: ":
72 | r2 = data
73 | if project.metric == "accuracy":
74 | results[project_key][model] = round(acc, 5)
75 | elif project.metric == "r2":
76 | results[project_key][model] = round(r2, 5)
77 |
78 | best_models = []
79 | sorted_results = sorted(results[project_key].items(), key=lambda x: x[1], reverse=True)
80 | best_value = 0
81 | for model, value in sorted_results:
82 | if value > 0 and value >= best_value:
83 | best_models.append(model)
84 | best_value = value
85 | else:
86 | break
87 |
88 | results[project_key]["best_models"] = best_models
89 |
90 | return results
91 |
92 |
93 | def main():
94 | """Fetch the augmented pipeline results and store it in mutation_results.csv."""
95 | corpus = ProjectCorpus() # Fetch all project and pipeline details
96 | mutation_result = MutationResult(internal_path.training_cache, corpus.project_list)
97 | results = mutation_result.load_results()
98 | result_list = []
99 | for key, result in results.items():
100 | result["file_name"] = key
101 | result = OrderedDict(result)
102 | result.move_to_end("file_name", last=False)
103 | result_list.append(result)
104 | result_dataframe = pd.DataFrame(result_list)
105 | result_dataframe.to_csv(internal_path.training_cache / "mutation_results.csv", index=False)
106 |
107 |
108 | if __name__ == "__main__":
109 | import argparse
110 |
111 | parser = argparse.ArgumentParser()
112 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.")
113 | args = parser.parse_args()
114 | if args.tag:
115 | internal_path.training_cache = internal_path.training_cache / args.tag
116 |
117 | main()
118 |
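A sketch of the stdout score parsing in load_results() on a fabricated line; the 85.3% figure is illustrative only:

line = "Accuracy: 85.3%"
for trail in ["Accuracy: ", "R2: "]:
    data = line
    if data.count(trail) > 0:
        data = data[data.index(trail) + len(trail):].strip()
        if data.count("%") > 0:
            data = float(data[: data.index("%")]) / 100
        print(trail.strip(), data)  # Accuracy: 0.853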
--------------------------------------------------------------------------------
/sapientml_core/training/dataflowmodel/ast_operation.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Optional, Union
16 |
17 | import libcst as cst
18 | import sapientml.macros as macros
19 | from libcst import RemoveFromParent
20 | from libcst.metadata import ParentNodeProvider, PositionProvider
21 |
22 |
23 | class NameTransformer(cst.CSTTransformer):
24 | METADATA_DEPENDENCIES = (
25 | ParentNodeProvider,
26 | PositionProvider,
27 | )
28 |
29 | def __init__(self, replacement):
30 | self.as_names = {}
31 | self.count = 0
32 | self.replacement = replacement
33 |
34 | def leave_Name(self, original_node, updated_node) -> cst.CSTNode:
35 | source_string = original_node.value
36 | if source_string in self.replacement.keys():
37 | return updated_node.with_changes(value=self.replacement[source_string])
38 | else:
39 | return original_node
40 |
41 | def leave_SimpleString(self, original_node: cst.SimpleString, updated_node: cst.SimpleString) -> cst.CSTNode:
42 | source_string = original_node.value
43 | if source_string in self.replacement.keys():
44 | return updated_node.with_changes(value='"' + self.replacement[source_string] + '"')
45 | else:
46 | return original_node
47 |
48 | def get_LineNumber(self, node):
49 | pos = self.get_metadata(PositionProvider, node).start
50 | return pos.line
51 |
52 |
53 | class ArgumentRemover(cst.CSTTransformer):
54 | METADATA_DEPENDENCIES = (
55 | ParentNodeProvider,
56 | PositionProvider,
57 | )
58 |
59 | def __init__(self, model_name):
60 | self.target = ""
61 | self.model_name = model_name
62 |
63 | def leave_Arg(self, original_node: cst.Arg, updated_node: cst.Arg) -> Union[cst.Arg, cst.RemovalSentinel]:
64 | parent = self.get_metadata(ParentNodeProvider, original_node)
65 | while not isinstance(parent, cst.Call):
66 | parent = self.get_metadata(ParentNodeProvider, parent)
67 |
68 | func = parent.func
69 | name = None
70 | if isinstance(func, cst.Name):
71 | name = func.value
72 | elif isinstance(func, cst.Attribute):
73 | name = func.attr.value
74 | if name == self.model_name:
75 | return RemoveFromParent()
76 | return updated_node
77 |
78 |
79 | class ModelTransformer(cst.CSTTransformer):
80 | METADATA_DEPENDENCIES = (
81 | ParentNodeProvider,
82 | PositionProvider,
83 | )
84 |
85 | def __init__(self, model_name):
86 | self.target = ""
87 | self.model_name = model_name
88 |
89 | def visit_Assign(self, node) -> Optional[bool]:
90 | assigned_target = node.targets[0]
91 | target = assigned_target.target
92 | check = hasattr(target, "value")
93 | if check:
94 | value = node.value
95 | if isinstance(value, cst.Call):
96 | func = value.func
97 | name = None
98 | if isinstance(func, cst.Name):
99 | name = func.value
100 | elif isinstance(func, cst.Attribute):
101 | name = func.attr.value
102 | if name == self.model_name:
103 | self.target = target.value
104 |
105 |
106 | def transform_model_code(source_code, model_label, metric=None):
107 | source_tree = cst.parse_module(source_code)
108 | model_name = model_label.split(":")[2]
109 | transformer = ModelTransformer(model_name)
110 | wrapper = cst.metadata.MetadataWrapper(source_tree)
111 | modified_tree = wrapper.visit(transformer)
112 | code = modified_tree.code.splitlines()
113 | if metric == macros.Metric.AUC or metric == macros.Metric.Gini:
114 | transformed_code = (
115 | code[0]
116 | + "\n"
117 | + transformer.target
118 | + ".fit(__feature_train, __target_train)\n__y_pred = "
119 | + transformer.target
120 | + ".predict_proba(__feature_test)"
121 | )
122 | else:
123 | transformed_code = (
124 | code[0]
125 | + "\n"
126 | + transformer.target
127 | + ".fit(__feature_train, __target_train)\n__y_pred = "
128 | + transformer.target
129 | + ".predict(__feature_test)"
130 | )
131 | return transformed_code
132 |
133 |
134 | def remove_arguments(source_code, model_name):
135 | source_tree = cst.parse_module(source_code)
136 | transformer = ArgumentRemover(model_name)
137 | wrapper = cst.metadata.MetadataWrapper(source_tree)
138 | modified_tree = wrapper.visit(transformer)
139 | return modified_tree.code
140 |
141 |
142 | def replaceString(source_tree, replacement):
143 | transformer = NameTransformer(replacement)
144 | wrapper = cst.metadata.MetadataWrapper(source_tree)
145 | modified_tree = wrapper.visit(transformer)
146 | return modified_tree
147 |
148 |
149 | def construct_tree(notebook_path):
150 | with open(notebook_path, "r", encoding="utf-8") as file:
151 | code_content = file.read()
152 | parts = code_content.split("### Evaluation Template: ")
153 | code_content = parts[0]
154 | source_tree = cst.parse_module(code_content)
155 | return source_tree
156 |
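Hypothetical usage of transform_model_code(), assuming the sapientml packages are installed; the snippet and label below are fabricated but follow the "MODEL:<task>:<ClassName>:<package>" shape the function expects (it reads the class name from the third field):

from sapientml_core.training.dataflowmodel.ast_operation import transform_model_code

source = "model = RandomForestClassifier(n_estimators=100)\n"
label = "MODEL:Classifier:RandomForestClassifier:sklearn"
print(transform_model_code(source, label))
# model = RandomForestClassifier(n_estimators=100)
# model.fit(__feature_train, __target_train)
# __y_pred = model.predict(__feature_test)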
--------------------------------------------------------------------------------
/sapientml_core/training/dataflowmodel/determine_label_order.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import json
17 |
18 | from sapientml_core import internal_path
19 |
20 | LABELS_TO_IGNORE_NOW = {
21 | "PREPROCESS:DeleteColumns:drop:pandas",
22 | "PREPROCESS:Category:map:pandas",
23 | "PREPROCESS:MissingValues:dropna:pandas",
24 | "PREPROCESS:Category:replace:pandas",
25 | "PREPROCESS:FeatureSelection:select_dtypes:pandas",
26 | "PREPROCESS:GenerateColumn:addition:pandas",
27 | }
28 |
29 |
30 | def main():
31 | """Removes duplication of labelling orders from dependent_labels.json file.
32 |
33 | This scripts create the dataflow model, i.e., extracts the order of two APIs A and B if there is any.
34 | There is an order between A --> B if A and B are dependent on each other based on 'dependent_api_extractor.py' and
35 | A is always followed by B in all piplelines and there is NO case in the corpus where B is followed by A.
36 |
37 | """
38 | with open(internal_path.training_cache / "dependent_labels.json", "r", encoding="utf-8") as dependent_api_file:
39 | dependent_labels = json.load(dependent_api_file)
40 |
41 | dependent_order = set()
42 |
43 | for dependent_label_str in dependent_labels.keys():
44 | dep_str_after_bracket_removal = dependent_label_str.replace("[", "").replace("]", "").replace("'", "")
45 | parts = dep_str_after_bracket_removal.split(",")
46 | if (parts[0] in LABELS_TO_IGNORE_NOW) or (parts[1].strip() in LABELS_TO_IGNORE_NOW):
47 | continue
48 | first = parts[0].split(":")[1].strip()
49 | second = parts[1].split(":")[1].strip()
50 | inverse_order = second + "#" + first
51 | if first != second:
52 | if inverse_order in dependent_order:
53 | dependent_order.remove(inverse_order)
54 | else:
55 | dependent_order.add(parts[0].strip() + "#" + parts[1].strip())
56 |
57 | with open(internal_path.training_cache / "label_order.json", "w", encoding="utf-8") as outfile:
58 | json.dump(list(dependent_order), outfile, indent=4)
59 |
60 |
61 | if __name__ == "__main__":
62 | import argparse
63 |
64 | parser = argparse.ArgumentParser()
65 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.")
66 | args = parser.parse_args()
67 | if args.tag:
68 | internal_path.training_cache = internal_path.training_cache / args.tag
69 |
70 | main()
71 |
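The keys of dependent_labels.json are stringified label pairs; a sketch of the bracket-stripping parse above, run on a fabricated key:

key = "['PREPROCESS:MissingValues:fillna:pandas', 'PREPROCESS:Scaling:StandardScaler:sklearn']"
parts = key.replace("[", "").replace("]", "").replace("'", "").split(",")
first = parts[0].split(":")[1].strip()   # 'MissingValues'
second = parts[1].split(":")[1].strip()  # 'Scaling'
print(second + "#" + first)              # the inverse_order checked above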
--------------------------------------------------------------------------------
/sapientml_core/training/denoising/df_collector.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pandas as pd
16 |
17 |
18 | def update_column_names(collector, line_no, obj, obj_name):
19 | """update_column_names function.
20 |
21 | This function call is injected after each statement of the
22 | pipeline during instrumentation to collect the column names of the
23 | dataset at that point in the execution.
24 |
25 | Parameters
26 | ----------
27 | collector : dict
28 | Collection of all the column names.
29 | line_no : int
30 | Line number of the instrumented statement.
31 | obj : dataframe
32 | Dataframe of particular object.
33 | obj_name : str
34 | Name of the object.
35 |
36 | Returns
37 | -------
38 | dict
39 |
40 | """
41 | now_obj = obj
42 | if isinstance(now_obj, pd.Series):
43 | now_obj = now_obj.to_frame()
44 | if isinstance(now_obj, pd.DataFrame):
45 | collector[line_no] = (list(now_obj.columns), obj_name, str(type(now_obj)))
46 | else:
47 | collector[line_no] = (None, obj_name, str(type(now_obj)))
48 | return collector
49 |
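Minimal usage with a toy dataframe, assuming update_column_names is imported from this module:

import pandas as pd

collector = {}
df = pd.DataFrame({"a": [1], "b": [2]})
update_column_names(collector, line_no=3, obj=df, obj_name="df")
update_column_names(collector, line_no=4, obj=42, obj_name="x")
print(collector[3])  # (['a', 'b'], 'df', "<class 'pandas.core.frame.DataFrame'>")
print(collector[4])  # (None, 'x', "<class 'int'>")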
--------------------------------------------------------------------------------
/sapientml_core/training/denoising/static_analysis_of_columns.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import json
17 | from pathlib import Path
18 |
19 | from sapientml.util.logging import setup_logger
20 | from sapientml_core import internal_path
21 | from sapientml_core.training import project_corpus
22 | from sapientml_core.training.denoising import ast_info_collector as collector
23 | from sapientml_core.util import file_util
24 |
25 | logger = setup_logger()
26 |
27 |
28 | def extract(json_metadata_file):
29 | """Extracting the pipeline.
30 |
31 | This function is collecting the pipeline details and extract
32 | the target column based on file data structure.
33 |
34 | Parameters
35 | ----------
36 | json_metadata_file : str
37 | Path to the JSON metadata file containing the pipeline details.
38 |
39 | Returns
40 | -------
41 | str
42 | The extracted target_column_name.
43 |
44 | Raises
45 | ------
46 | Exception
47 | Raised when the metadata file is in an unexpected format.
48 |
49 | """
50 | with open(json_metadata_file, "r", encoding="utf-8") as f:
51 | notebook_info = json.load(f)
52 |
53 | if isinstance(notebook_info, dict):
54 | target_column_name = notebook_info["target_column_name"]
55 | elif isinstance(notebook_info, list):
56 | target_column_name = notebook_info[1]["target_column_name"]
57 | else:
58 | logger.warning("Wrong format: {}".format(json_metadata_file))
59 | raise ValueError("Wrong format: {}".format(json_metadata_file))
60 |
61 | return target_column_name
62 |
63 |
64 | def main(test_mode=False):
65 | """Fetch all the pipeline details from corpus and parse it using libcst library.
66 |
67 | This script performs static analysis of the pipeline to identify
68 | if there is any explicit renaming of the column names or explicit
69 | deletion of columns in the pipeline and create static_info.json file.
70 |
71 | Parameters
72 | ----------
73 | test_mode : bool
74 | This parameter is used for test mode.
75 |
76 | Raises
77 | ------
78 | Exception
79 | Re-raised when reading the dataset or the static
80 | analysis of a pipeline fails.
81 |
82 | """
83 | corpus = project_corpus.ProjectCorpus()
84 | projects = corpus.project_list
85 | static_info_map = {}
86 |
87 | total_number_target_pipelines = len(projects)
88 |
89 | for i in range(0, total_number_target_pipelines):
90 | if test_mode and i > 5:
91 | break
92 | logger.info(f"RUNNING:{i + 1} out of:{total_number_target_pipelines} PIPELINE:{projects[i].pipeline_path}")
93 | project = projects[i]
94 | pipeline = project.pipeline_path
95 | file_name = project.file_name
96 |
97 | static_info = {}
98 | try:
99 | dataset = file_util.read_csv(
100 | Path(project.dataset_path),
101 | Path(project.pipeline_path),
102 | )
103 | except Exception:
104 | raise
105 |
106 | json_meta = pipeline.replace(".py", ".info.json")
107 |
108 | target = extract(json_meta)
109 | source_file = pipeline
110 | with open(source_file, "r", encoding="utf-8") as f:
111 | source = f.read()
112 |
113 | try:
114 | column_api_map = collector.get_column_api_map(source)
115 | except Exception:
116 | raise
117 |
118 | dataset_columns = list(dataset.columns)
119 | dropped_columns = []
120 | renamed_columns = []
121 | for column in column_api_map:
122 | if "drop" in column_api_map[column]:
123 | if column != target and column in dataset_columns:
124 | dropped_columns.append(column)
125 | if "rename" in column_api_map[column]:
126 | renamed_columns.append(column)
127 |
128 | static_info["drop_api"] = dropped_columns
129 | static_info["rename_api"] = renamed_columns
130 | static_info["target"] = target
131 | static_info_map[file_name] = static_info
132 | try:
133 | dataset.drop(dropped_columns, axis=1, inplace=True)
134 | except Exception:
135 | raise
136 |
137 | logger.info(f"Total number of notebooks: {len(static_info_map)}")
138 | with open(internal_path.training_cache / "static_info.json", "w", encoding="utf-8") as f:
139 | json.dump(static_info_map, f, indent=4)
140 |
141 |
142 | if __name__ == "__main__":
143 | import argparse
144 |
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.")
147 | args = parser.parse_args()
148 | if args.tag:
149 | internal_path.training_cache = internal_path.training_cache / args.tag
150 | test_mode = False
151 | main(test_mode)
152 |
--------------------------------------------------------------------------------
/sapientml_core/training/meta_feature_selector.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from sapientml_core import ps_macros
17 | from sapientml_core.design import search_space
18 | from sklearn.tree import DecisionTreeClassifier
19 |
20 |
21 | def select_k_best_features(X, y):
22 | """Select the top k explanatory variables.
23 |
24 | Parameters
25 | ----------
26 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix
27 | The training input samples.
28 | y : ArrayLike = numpy.typing.ArrayLike
29 | The target values
30 |
31 | Returns
32 | -------
33 | list
34 | Returns a list of the top k selected column names.
35 | """
36 | from sklearn.feature_selection import SelectKBest, mutual_info_regression
37 |
38 | # Select top k (=3) features based on mutual info regression
39 | selector = SelectKBest(mutual_info_regression, k=3)
40 | selector.fit(X, y)
41 | return list(X.columns[selector.get_support()])
42 |
43 |
44 | def select_by_rfe(X, y):
45 | """Extract the top N(=n_features_to_select) feature values of importance by RFE(Recursive Feature Elimination).
46 |
47 | Parameters
48 | ----------
49 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix |
50 | ArrayLike = numpy.typing.ArrayLike
51 | The training input samples.
52 | y : ArrayLike = numpy.typing.ArrayLike
53 | The target values.
54 |
55 | Returns
56 | -------
57 | list
58 | Returns a list of selected column names.
59 | """
60 | from sklearn.feature_selection import RFE
61 |
62 | # Select the most important features with RFE, using a DecisionTreeClassifier as the estimator
63 | rfe_selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=2, step=1)
64 | rfe_selector.fit(X, y)
65 | return list(X.columns[rfe_selector.get_support()])
66 |
67 |
68 | def select_from_model(X, y):
69 | """Select features based on importance weights.
70 |
71 | Parameters
72 | ----------
73 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix
74 | The training input samples.
75 | y : None | ArrayLike = numpy.typing.ArrayLike
76 | The target values(integers that correspond to classes in classification, real numbers in regression).
77 |
78 | Returns
79 | -------
80 | list
81 | Returns a list of selected column names.
82 | """
83 | from sklearn.feature_selection import SelectFromModel
84 |
85 | # Select the most important features with SelectFromModel, using a DecisionTreeClassifier as the estimator
86 | sfm_selector = SelectFromModel(estimator=DecisionTreeClassifier())
87 | sfm_selector.fit(X, y)
88 | return list(X.columns[sfm_selector.get_support()])
89 |
90 |
91 | def select_sequentially(X, y):
92 | """Select feature quantity in order and select feature quantity by greedy method.
93 |
94 | Parameters
95 | ----------
96 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix
97 | Training vectors
98 | y : None | ArrayLike = numpy.typing.ArrayLike
99 | Target values. This parameter may be ignored for unsupervised learning.
100 |
101 | Returns
102 | -------
103 | list
104 | Returns a list of selected column names.
105 | """
106 | from sklearn.feature_selection import SequentialFeatureSelector
107 |
108 | # Select the most important features sequentially, using a DecisionTreeClassifier as the estimator
109 | sfs_selector = SequentialFeatureSelector(
110 | estimator=DecisionTreeClassifier(), n_features_to_select=3, cv=10, direction="backward"
111 | )
112 | sfs_selector.fit(X, y)
113 | return list(X.columns[sfs_selector.get_support()])
114 |
115 |
116 | def select_based_on_correlation(data):
117 | """Create correlation maps for learning data.
118 |
119 | Parameters
120 | ----------
121 | data : dataframe
122 | Training data
123 |
124 | Returns
125 | -------
126 | correlation_map : defaultdict(list)
127 | """
128 | from collections import defaultdict
129 |
130 | corr = data.corr(numeric_only=True)
131 | correlation_map = defaultdict(list)
132 | for i in range(len(corr.columns)):
133 | left = corr.columns[i]
134 | for j in range(i):
135 | if corr.iloc[i, j] >= 0.25:
136 | right = corr.columns[j]
137 | if left[0] != right[0]:
138 | correlation_map[left].append(right)
139 |
140 | if len(correlation_map[left]) == 0:
141 | for j in range(i):
142 | if corr.iloc[i, j] >= 0.15:
143 | right = corr.columns[j]
144 | if left[0] != right[0]:
145 | correlation_map[left].append(right)
146 |
147 | if len(correlation_map[left]) == 0:
148 | correlation_map[left] = list(search_space.meta_feature_list)
149 | return correlation_map
150 |
151 |
152 | def select_features(label):
153 | """Return manually selected feature labels.
154 |
155 | Parameters
156 | ----------
157 | label : str
158 |
159 | Returns
160 | -------
161 | selection_model[label] : list
162 | """
163 | selection_model = {
164 | ps_macros.FILL: [ps_macros.MISSING_PRESENCE],
165 | # ps_macros.DROP: [ps_macros.MISSING_PRESENCE],
166 | ps_macros.IN_PLACE_CONVERT: [
167 | ps_macros.CATG_PRESENCE,
168 | # ps_macros.IS_TARGET_STR,
169 | ps_macros.BINARY_CATG_PRESENCE,
170 | ps_macros.SMALL_CATG_PRESENCE,
171 | ps_macros.LARGE_CATG_PRESENCE,
172 | ],
173 | ps_macros.ONE_HOT: [
174 | ps_macros.CATG_PRESENCE,
175 | # ps_macros.IS_TARGET_STR,
176 | ps_macros.BINARY_CATG_PRESENCE,
177 | ps_macros.SMALL_CATG_PRESENCE,
178 | ps_macros.LARGE_CATG_PRESENCE,
179 | ],
180 | ps_macros.VECT: [ps_macros.TEXT_PRESENCE],
181 | ps_macros.MISSING: [ps_macros.MISSING_PRESENCE],
182 | ps_macros.CATG: [ps_macros.CATG_PRESENCE],
183 | ps_macros.SCALING: [
184 | ps_macros.NORMALIZED_MEAN,
185 | ps_macros.NORMALIZED_STD_DEV,
186 | ps_macros.NORMALIZED_VARIATION_ACROSS_COLUMNS,
187 | ],
188 | ps_macros.DATE: [ps_macros.DATE_PRESENCE],
189 | ps_macros.LEMMITIZE: [ps_macros.TEXT_PRESENCE],
190 | ps_macros.BALANCING: [ps_macros.IMBALANCE],
191 | ps_macros.LOG: [
192 | ps_macros.MAX_SKEW,
193 | ],
194 | }
195 | return selection_model[label]
196 |
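A toy call of select_k_best_features(), assuming the function is imported from this module; the frame needs at least three columns since the selector keeps k=3, and the exact picks vary with the mutual-information estimate:

import pandas as pd

X = pd.DataFrame({
    "f1": [0, 1, 2, 3, 4, 5],
    "f2": [5, 4, 3, 2, 1, 0],
    "f3": [1, 1, 0, 0, 1, 0],
    "f4": [0, 0, 0, 1, 1, 1],
})
y = [0, 1, 2, 3, 4, 5]
print(select_k_best_features(X, y))  # three column names, e.g. ['f1', 'f2', 'f4']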
--------------------------------------------------------------------------------
/sapientml_core/training/pp_model_trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import pickle
17 | from collections import OrderedDict, defaultdict
18 | from typing import Literal
19 |
20 | import pandas as pd
21 | from sapientml.util.logging import setup_logger
22 | from sapientml_core import internal_path
23 | from sapientml_core.design import search_space
24 | from sapientml_core.training import meta_feature_selector
25 | from sklearn.tree import DecisionTreeClassifier
26 |
27 | logger = setup_logger()
28 |
29 |
30 | def train_p_model(X, y):
31 | """Build a decision tree classifier from the training set (X, y).
32 |
33 | Parameters
34 | ----------
35 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix |
36 | ArrayLike = numpy.typing.ArrayLike
37 | The training input samples.
38 | y : MatrixLike = np.ndarray | pd.DataFrame | spmatrix |
39 | ArrayLike = numpy.typing.ArrayLike
40 | The target values (class labels) as integers or strings
41 |
42 | Returns
43 | -------
44 | model : DecisionTreeClassifier
45 | Fitted estimator.
46 | """
47 | model = DecisionTreeClassifier(class_weight="balanced", max_depth=3)
48 | model.fit(X, y)
49 | return model
50 |
51 |
52 | def _train_preprocessors(train_data, feature_selection: Literal["select_manually", "customized"]):
53 | logger.info("Training skeleton predictor for preprocessors...")
54 | data = train_data
55 | data.drop(
56 | data.filter(regex="(TEMPLATE|IGNORE|EVAL:|RPEPROCESS:|MODEL:|Unnamed:)").columns,
57 | axis=1,
58 | inplace=True,
59 | )
60 | data["project_target"] = (
61 | data["csv_name"] + "_" + data["target_column_name"].apply(lambda line: "_".join(sorted(eval(line))))
62 | )
63 | all_labels = [v for v in data.columns if v.startswith(("PREPROCESS:"))]
64 | second_to_full_labels = defaultdict(list)
65 | for label in all_labels:
66 | second_to_full_labels["PREPROCESS:" + label.split(":")[1]].append(label)
67 |
68 | pp_models = OrderedDict()
69 |
70 | selected_features_map = meta_feature_selector.select_based_on_correlation(data)
71 |
72 | for _, detail_labels in second_to_full_labels.items():
73 | for label in detail_labels:
74 | logger.debug(label)
75 | main_df = data.copy()
76 | # Feature Selection On
77 | y = main_df[label]
78 | X = main_df[search_space.meta_feature_list]
79 |
80 | if feature_selection == "select_manually":
81 | selected_features = meta_feature_selector.select_features(label)
82 | logger.debug("Selected Features:", selected_features)
83 | X = main_df[selected_features]
84 | elif feature_selection == "customized":
85 | selected_features = selected_features_map[label]
86 | if len(selected_features) == 0:
87 | selected_features = meta_feature_selector.select_sequentially(X, y)
88 | logger.debug("Selected Features:", selected_features)
89 | X = main_df[selected_features]
90 |
91 | pp_model = train_p_model(X, y)
92 | pp_models[label] = (pp_model, selected_features)
93 |
94 | return pp_models
95 |
96 |
97 | def _prepare_model_training_data(raw_meta_feature_train):
98 | # Remove all the unnecessary meta-features
99 | final_meta_features = raw_meta_feature_train[search_space.project_related_metadata + search_space.meta_feature_list].copy()
100 | final_meta_features.fillna(0, inplace=True)
101 | for semantic_label, columns in search_space.label_mapping.items():
102 | try:
103 | final_meta_features[semantic_label] = raw_meta_feature_train[columns].sum(axis=1)
104 | final_meta_features[semantic_label] = final_meta_features[semantic_label].apply(lambda x: 1 if x > 0 else 0)
105 | except KeyError as e:
106 | logger.warning(e)
107 |
108 | return final_meta_features
109 |
110 |
111 | def main():
112 | """This main function preprocesses the learning data and saves fitted estimator for the DecisionTreeClassifier.
113 |
114 | Description of feature_selection : "select_manually" | "customized"
115 | Specify how features are selected.
116 | """
117 | training_data_path = internal_path.training_cache / "pp_metafeatures_training.csv"
118 | # "select_manually" | "customized"
119 | feature_selection = "customized"
120 | raw_meta_feature_train = pd.read_csv(training_data_path)
121 | meta_feature_train = _prepare_model_training_data(raw_meta_feature_train)
122 | pp_models = _train_preprocessors(meta_feature_train, feature_selection)
123 | # Save model
124 | with open(internal_path.training_cache / "pp_models.pkl", "wb") as f:
125 | pickle.dump(pp_models, f)
126 |
127 |
128 | if __name__ == "__main__":
129 | import argparse
130 |
131 | parser = argparse.ArgumentParser()
132 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.")
133 | args = parser.parse_args()
134 | if args.tag:
135 | internal_path.training_cache = internal_path.training_cache / args.tag
136 |
137 | main()
138 |
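How the grouping in _train_preprocessors() buckets detailed labels under their second component, shown on fabricated labels:

from collections import defaultdict

all_labels = ["PREPROCESS:Scaling:StandardScaler:sklearn", "PREPROCESS:Scaling:log:numpy"]
second_to_full_labels = defaultdict(list)
for label in all_labels:
    second_to_full_labels["PREPROCESS:" + label.split(":")[1]].append(label)
print(dict(second_to_full_labels))
# {'PREPROCESS:Scaling': ['PREPROCESS:Scaling:StandardScaler:sklearn',
#                         'PREPROCESS:Scaling:log:numpy']}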
--------------------------------------------------------------------------------
/sapientml_core/training/project.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from dataclasses import dataclass
16 |
17 |
18 | @dataclass
19 | class ProjectInfo:
20 | pipeline_path: str # full path
21 | dataset_path: str # full path
22 | file_name: str # only name of the pipeline
23 | notebook_name: str # only name of the pipeline without extension
24 | accuracy: float
25 | csv_name: str
26 | target_column_name: str
27 | metric: str
28 |
--------------------------------------------------------------------------------
/sapientml_core/training/project_corpus.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import doctest
17 | import json
18 | import re
19 | from pathlib import Path
20 |
21 | from sapientml.util.logging import setup_logger
22 | from sapientml_core import internal_path
23 | from tqdm import tqdm
24 |
25 | from .project import ProjectInfo
26 |
27 | logger = setup_logger()
28 |
29 |
30 | class ProjectCorpus:
31 | def __init__(self, target_project_name_list=None):
32 | self.target_project_name_list = target_project_name_list
33 | self.clean_notebook_dir_path = internal_path.clean_dir
34 | self.dataset_dir_path = internal_path.corpus_path / "dataset"
35 | self.metadata_dir_path = internal_path.corpus_path / "metadata"
36 | self.project_list = self._extract_project_info()
37 |
38 | def _extract_project_info(self):
39 | project_list = []
40 |
41 | if self.target_project_name_list:
42 | pipeline_file_names = [Path(project_path) for project_path in self.target_project_name_list]
43 | else:
44 | pipeline_file_names = Path(self.clean_notebook_dir_path).rglob("*.py")
45 |
46 | for notebook_path in tqdm(list(pipeline_file_names)):
47 | notebook_info_path = notebook_path.with_suffix(".info.json")
48 | notebook_name = notebook_path.stem
49 | logger.debug(f"Extracting Project Info for {notebook_name}")
50 | # Read the target column information
51 | try:
52 | with open(notebook_info_path, "r", encoding="utf-8") as notebook_info_file:
53 | notebook_info = json.load(notebook_info_file)
54 | except Exception:
55 | logger.warning("Could not read JSON info file: {}".format(notebook_info_path))
56 | continue
57 |
58 | if isinstance(notebook_info, list):
59 | notebook_info = notebook_info[1]
60 |
61 | if isinstance(notebook_info, dict):
62 | target_column_name = notebook_info["target_column_name"]
63 | dataset_folder_name = notebook_info["dataset_folder"]
64 | accuracy = notebook_info["accuracy"]
65 | metric = "accuracy"
66 | if accuracy == "N/A":
67 | accuracy = notebook_info["r2"]
68 | metric = "r2"
69 | try:
70 | accuracy = float(accuracy[:-1]) # discarding the percentage (%) sign from the end
71 | except Exception:
72 | accuracy = 0
73 | else:
74 | logger.warning("Wrong format: {}".format(notebook_info_path))
75 | continue
76 |
77 | if isinstance(target_column_name, str):
78 | if target_column_name == "UNKNOWN":
79 | continue
80 | elif isinstance(notebook_info, list):
81 | if target_column_name[0] == "UNKNOWN":
82 | continue
83 | # Read the dataset
84 | project_fqn = notebook_name + ".py"
85 | dataset_paths = [
86 | p
87 | for p in (Path(self.dataset_dir_path) / dataset_folder_name).glob("*")
88 | if re.search(r"/*\.(csv|tsv)", str(p))
89 | ]
90 | if len(dataset_paths) == 0:
91 | logger.warning(
92 | "Could not find CSV/TSV file under {}/{}".format(self.dataset_dir_path, dataset_folder_name)
93 | )
94 | continue
95 |
96 | dataset_path = dataset_paths[0]
97 | dataset_name = dataset_path.stem
98 | if len(dataset_paths) > 1:
99 | logger.warning(
100 | "Found multiple CSV/TSV files under {}. Using {}...".format(
101 | self.clean_notebook_dir_path, dataset_name
102 | )
103 | )
104 |
105 | project_info = ProjectInfo(
106 | str(notebook_path),
107 | str(dataset_path),
108 | project_fqn,
109 | notebook_name,
110 | accuracy,
111 | dataset_name,
112 | target_column_name,
113 | metric,
114 | )
115 | project_list.append(project_info)
116 | return project_list
117 |
118 |
119 | if __name__ == "__main__":
120 | doctest.testmod()
121 |
--------------------------------------------------------------------------------
/sapientml_core/util/file_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023-2024 The SapientML Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import calendar
17 | import datetime
18 | import glob
19 | import json
20 | import os
21 | import time
22 |
23 | import pandas as pd
24 |
25 |
26 | def get_time():
27 | """Returns the current time.
28 |
29 | Returns
30 | ----------
31 | readable : str
32 | Current time in ISO format
33 | """
34 | ts = calendar.timegm(time.gmtime())
35 | readable = datetime.datetime.fromtimestamp(ts).isoformat()
36 | return readable
37 |
38 |
39 | def read_file_in_a_list(file_name):
40 | """Open a file and place it in a list line by line(read().splitlines()).
41 |
42 | Parameters
43 | ----------
44 | file_name : FileDescriptorOrPath
45 | File name.
46 |
47 | Returns
48 | ----------
49 | lines : list[str]
50 | List file contents line by line.
51 | """
52 | with open(file_name, "r", encoding="utf-8") as f:
53 | lines = f.read().splitlines()
54 | return lines
55 |
56 |
57 | def read_file(file_name):
58 | """Open file and read data with read().
59 |
60 | Parameters
61 | ----------
62 | file_name : FileDescriptorOrPath
63 | File name.
64 |
65 | Returns
66 | ----------
67 | lines : str
68 | The entire text file read.
69 | """
70 | with open(file_name, "r", encoding="utf-8") as f:
71 | lines = f.read()
72 | return lines
73 |
74 |
75 | def write_content_to_file(file_name, content):
76 | """write content to file.
77 |
78 | Parameters
79 | ----------
80 | file_name : FileDescriptorOrPath
81 | File name.
82 | content : str
83 | What to write to the file.
84 | """
85 | with open(file_name, "w", encoding="utf-8") as out_file:
86 | out_file.write(content)
87 |
88 |
89 | def get_file_list(path, type):
90 | """Get a list of files of a specified type in a directory.
91 |
92 | Parameters
93 | ----------
94 | path : FileDescriptorOrPath
95 | Directory path.
96 | type : str
97 | File extension.
98 | Returns
99 | ----------
100 | files_with_given_type : list
101 | List of retrieved files.
102 | """
103 | os.chdir(path)
104 | files_with_given_type = []
105 | for file in glob.glob("*." + type):
106 | files_with_given_type.append((path + "/" + file))
107 | return files_with_given_type
108 |
109 |
110 | def load_json(file_name):
111 | """Load json format file.
112 |
113 | Parameters
114 | ----------
115 | file_name : FileDescriptorOrPath
116 | File name.
117 |
118 | Returns
119 | ----------
120 | content : Any
121 | Loaded content.
122 | """
123 | with open(file_name, "r", encoding="utf-8") as input_file:
124 | content = json.load(input_file)
125 | return content
126 |
127 |
128 | def read_csv(csv_path, notebook_path):
129 | """Read a csv file.
130 |
131 | Parameters
132 | ----------
133 | csv_path : FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]
134 |         Path to the CSV file.
135 | notebook_path : pathlib.Path
136 | Notebook Directory Path.
137 |
138 | Returns
139 | ----------
140 | dataset : pd.DataFrame
141 |         Contents of the loaded CSV file.
142 | """
143 |
144 | def read(path, **kwargs):
145 | if str(path).endswith(".csv"):
146 | return pd.read_csv(path, encoding_errors="ignore", on_bad_lines="warn", **kwargs)
147 | return pd.read_table(path, encoding_errors="ignore", on_bad_lines="warn", **kwargs)
148 |
149 | encoding = get_dataset_encoding(notebook_path)
150 | dataset = read(csv_path, encoding=encoding)
151 | num_of_features = dataset.shape[1] - 1
152 | if num_of_features == 0:
153 |         dataset = read(csv_path, encoding=encoding, sep=r"\s+")  # equivalent to delim_whitespace=True, which newer pandas deprecates
154 | num_of_features = dataset.shape[1] - 1
155 | if num_of_features == 0:
156 | dataset = read(csv_path, encoding=encoding, delimiter=";")
157 | num_of_features = dataset.shape[1] - 1
158 | return dataset
159 |
160 |
161 | def get_dataset_encoding(notebook_path):
162 | """Get dataset encoding.
163 |
164 | Parameters
165 | ----------
166 | notebook_path : StrPath | None | BytesPath
167 |         Path to a notebook script (.py file); directory paths yield None.
168 |
169 | Returns
170 | ----------
171 | encoding : str | None
172 | """
173 | if os.path.isdir(notebook_path):
174 | return None
175 | if not str(notebook_path).endswith(".py"):
176 | return None
177 | encoding = get_dataset_file(notebook_path)
178 | if encoding:
179 | return encoding
180 | return None
181 |
182 |
183 | def get_dataset_file(notebook_path):
184 | """Read notebook_path and get encoding.
185 |
186 | Parameters
187 | ----------
188 | notebook_path : str
189 |         Path to the notebook script.
190 |
191 |     Returns
192 |     ----------
193 |     encoding : str | None
194 |         The encoding argument of the first read_csv call found in the
195 |         script, or None when the call has no encoding argument or no
196 |         read_csv call is present.
197 | """
198 |     with open(notebook_path, "r", encoding="utf-8") as f:
199 |         lines = f.readlines()
200 |     encoding = None
201 |     for line in lines:
202 |         # Stop at the first read_csv call, with or without an encoding argument.
203 |         if ".read_csv(" in line:
204 |             if "encoding=" in line:
205 |                 encoding = line.split("encoding=")[1].split(")")[0].split(",")[0][1:-1]
206 |             elif "encoding = " in line:
207 |                 encoding = line.split("encoding = ")[1].split(")")[0].split(",")[0][1:-1]
208 |             else:
209 |                 encoding = None
210 |             return encoding
211 |     return encoding
212 |
--------------------------------------------------------------------------------
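A minimal usage sketch of the file_util helpers above; the dataset and notebook paths here are hypothetical, chosen only for illustration:

    from pathlib import Path

    from sapientml_core.util.file_util import get_dataset_encoding, read_csv

    notebook = Path("notebooks/house_prices.py")  # hypothetical script
    print(get_dataset_encoding(notebook))  # e.g. "utf-8", or None if unspecified

    # read_csv parses with the default delimiter first; only when the result
    # collapses to a single column does it retry with whitespace and then ";".
    df = read_csv("datasets/house_prices.csv", notebook)  # hypothetical CSV
    print(df.shape)

--------------------------------------------------------------------------------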
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/__init__.py
--------------------------------------------------------------------------------
/tests/fixtures/outputs/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
--------------------------------------------------------------------------------
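The two-line .gitignore above is the standard placeholder trick: ignore everything in tests/fixtures/outputs (*) but re-include the .gitignore itself (!.gitignore), so git keeps the otherwise-empty output directory in the repository.
--------------------------------------------------------------------------------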
/tests/fixtures/params/config.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/config.pkl
--------------------------------------------------------------------------------
/tests/fixtures/params/dataset.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/dataset.pkl
--------------------------------------------------------------------------------
/tests/fixtures/params/task.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/task.pkl
--------------------------------------------------------------------------------
/tests/sapientml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/sapientml/__init__.py
--------------------------------------------------------------------------------
/tests/sapientml/conftest.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from unittest import mock
4 |
5 | import pytest
6 |
7 |
8 | @pytest.fixture(scope="session", autouse=True)
9 | def disable_logging():
10 |     logging.disable(logging.FATAL)  # silence all log output for the session
11 |     yield
12 |     logging.disable(logging.NOTSET)  # re-enable logging at teardown
13 |
14 |
15 | @pytest.fixture(scope="function", autouse=True)
16 | def reset_sapientml_logger():
17 |     # FIXME: find a more efficient way to reset a logger
18 | logger = logging.getLogger("sapientml")
19 | logger.handlers.clear()
20 | logger.root.handlers.clear()
21 |
22 |
23 | @pytest.fixture(scope="function", autouse=True)
24 | def path_home(tmp_path):
25 |     # Patch out Path.home() so tests cannot touch the real home directory,
26 |     # and hand each test the pytest-provided temporary directory instead.
27 |     with mock.patch.object(Path, "home"):
28 |         yield Path(tmp_path)
29 |
--------------------------------------------------------------------------------
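All three fixtures above are autouse, so an ordinary test picks them up without naming them; only path_home needs to be requested explicitly when the temporary path itself is wanted. A hypothetical test illustrating the effect:

    import logging


    def test_runs_in_isolation(path_home):
        # disable_logging is active for the whole session, so even this
        # CRITICAL record is suppressed.
        logging.getLogger("sapientml").critical("swallowed by disable_logging")

        # Path.home is patched out while path_home is active, so nothing
        # in the test can write to the real home directory.
        assert path_home.is_dir()

--------------------------------------------------------------------------------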