├── .coveragerc ├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── dependabot.yml │ ├── greetings.yml │ ├── lint.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pysen ├── pyproject.toml └── setup.cfg ├── CODEOWNERS ├── LICENSE ├── pyproject.toml ├── requirements-training.txt ├── sapientml_core ├── __init__.py ├── adaptation │ ├── __init__.py │ ├── artifacts │ │ ├── PY310 │ │ │ └── label_order.json │ │ ├── PY311 │ │ │ └── label_order.json │ │ ├── PY39 │ │ │ └── label_order.json │ │ └── label_order.json │ └── generation │ │ ├── __init__.py │ │ ├── pipeline_template.py │ │ ├── predicate.py │ │ ├── preprocessing_label.py │ │ └── template_based_adaptation.py ├── datastore │ └── localfile │ │ ├── __init__.py │ │ ├── export_modules │ │ ├── sample_dataset.py │ │ └── split_timeseries_dataset.py │ │ ├── generator.py │ │ └── templates │ │ ├── concat_train_validation.py.jinja │ │ ├── drop_ignore_columns.py.jinja │ │ ├── drop_inf_or_nan_rows.py.jinja │ │ ├── load_localfile.py.jinja │ │ ├── load_localfile_predict.py.jinja │ │ ├── load_localfile_train.py.jinja │ │ ├── set_index.py.jinja │ │ ├── set_validation_as_test.py.jinja │ │ ├── split.py.jinja │ │ └── subsample.py.jinja ├── design │ ├── __init__.py │ ├── label_util.py │ ├── pp_component_groups.py │ └── search_space.py ├── enums.py ├── explain │ ├── AutoEDA.py │ ├── AutoVisualization.py │ ├── code_miner.py │ ├── code_template.py │ ├── main.py │ ├── pipeline_explanation.py │ └── templates │ │ └── jupyter_content.json ├── generator.py ├── internal_path.py ├── meta_features.py ├── models │ ├── PY310 │ │ ├── feature_importance.json │ │ ├── mp_model_1.pkl │ │ ├── mp_model_2.pkl │ │ └── pp_models.pkl │ ├── PY311 │ │ ├── feature_importance.json │ │ ├── mp_model_1.pkl │ │ ├── mp_model_2.pkl │ │ └── pp_models.pkl │ ├── PY39 │ │ ├── feature_importance.json │ │ ├── mp_model_1.pkl │ │ ├── mp_model_2.pkl │ │ └── pp_models.pkl │ ├── feature_importance.json │ ├── model_metafeatures_test.csv │ ├── mp_model_1.pkl │ ├── mp_model_2.pkl │ └── pp_models.pkl ├── params.py ├── preprocess │ └── default │ │ ├── __init__.py │ │ ├── generator.py │ │ ├── params.py │ │ └── templates │ │ ├── drop_one_value_columns.py.jinja │ │ ├── handle_inf_columns.py.jinja │ │ ├── handle_iterable_values.py.jinja │ │ ├── handle_japanese_text.py.jinja │ │ ├── handle_mixed_typed_columns.py.jinja │ │ ├── none_has_columns.py.jinja │ │ └── rename_columns.py.jinja ├── ps_macros.py ├── seeding │ ├── __init__.py │ └── predictor.py ├── templates │ ├── explainability_templates │ │ ├── component_description.json │ │ ├── model_explanation.py.jinja │ │ └── preprocessing_explanation.py.jinja │ ├── model_templates │ │ ├── classification_post_process.jinja │ │ ├── hyperparameter_tuning.py.jinja │ │ ├── hyperparameters.py.jinja │ │ ├── hyperparameters_default_value.py.jinja │ │ ├── model.py.jinja │ │ ├── model_predict.py.jinja │ │ ├── model_test.py.jinja │ │ └── model_train.py.jinja │ ├── other_templates │ │ ├── confusion_matrix.py.jinja │ │ ├── drop_columns.py.jinja │ │ ├── evaluation.py.jinja │ │ ├── evaluation_test.py.jinja │ │ ├── hyperparameter_tuning_evaluation.py.jinja │ │ ├── inverse_target.py.jinja │ │ ├── permutation_importance.py.jinja │ │ ├── prediction_result.py.jinja │ │ ├── preprocess_dataset.py.jinja │ │ ├── shap.py.jinja │ │ ├── target_separation_predict.py.jinja │ │ ├── target_separation_test.py.jinja │ │ ├── target_separation_train.py.jinja │ │ └── 
target_separation_validation.py.jinja │ ├── pipeline_predict.py.jinja │ ├── pipeline_test.py.jinja │ ├── pipeline_train.py.jinja │ ├── pipeline_validation.py.jinja │ └── preprocessing_templates │ │ ├── DATE.py.jinja │ │ ├── DATE_predict.jinja │ │ ├── DATE_train.jinja │ │ ├── LabelEncoder.py.jinja │ │ ├── LabelEncoder_predict.py.jinja │ │ ├── LabelEncoder_train.py.jinja │ │ ├── Processing.py.jinja │ │ ├── Processing_predict.py.jinja │ │ ├── Processing_train.py.jinja │ │ ├── SMOTE.py.jinja │ │ ├── STANDARD.py.jinja │ │ ├── STANDARD_predict.py.jinja │ │ ├── STANDARD_train.py.jinja │ │ ├── TfidfVectorizer.py.jinja │ │ ├── TfidfVectorizer_predict.py.jinja │ │ ├── TfidfVectorizer_train.py.jinja │ │ ├── fillna-type-numeric.py.jinja │ │ ├── fillna-type-numeric_predict.py.jinja │ │ ├── fillna-type-numeric_train.py.jinja │ │ ├── fillna-type-string.py.jinja │ │ ├── fillna-type-string_predict.py.jinja │ │ ├── fillna-type-string_train.py.jinja │ │ ├── get_dummies.py.jinja │ │ ├── get_dummies_predict.py.jinja │ │ ├── get_dummies_train.py.jinja │ │ ├── log.py.jinja │ │ ├── log_predict.py.jinja │ │ └── log_train.py.jinja ├── training │ ├── augmentation │ │ ├── mutation_results.py │ │ ├── mutation_runner.py │ │ └── mutator.py │ ├── dataflowmodel │ │ ├── ast_operation.py │ │ ├── dependent_api_extractor.py │ │ └── determine_label_order.py │ ├── denoising │ │ ├── ast_info_collector.py │ │ ├── dataset_snapshot_extractor.py │ │ ├── determine_used_features.py │ │ ├── df_collector.py │ │ └── static_analysis_of_columns.py │ ├── meta_feature_extractor.py │ ├── meta_feature_selector.py │ ├── meta_model_trainer.py │ ├── pp_model_trainer.py │ ├── project.py │ └── project_corpus.py └── util │ └── file_util.py └── tests ├── __init__.py ├── fixtures ├── datasets │ ├── testdata_df.csv │ ├── testdata_df_light.csv │ ├── testdata_test.csv │ ├── testdata_train.csv │ └── testdata_valid.csv ├── outputs │ └── .gitignore └── params │ ├── config.pkl │ ├── dataset.pkl │ └── task.pkl └── sapientml ├── __init__.py ├── conftest.py ├── test_generatedcode.py └── test_generatedcode_additional_patterns.py
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | omit =
 3 |     */.env/*
 4 |     */.venv/*
 5 |     */.cache/*
 6 |     */tmp/*
 7 |     */mining/collector.py
 8 |     */utilities/dataset_utility.py
 9 | 
10 | [report]
11 | exclude_lines =
12 |     pragma: no cover
13 |     if __name__ == .__main__.:
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*.py]
 4 | indent_style = space
 5 | indent_size = 4
 6 | insert_final_newline = true
 7 | trim_trailing_whitespace = true
 8 | end_of_line = lf
 9 | charset = utf-8
10 | 
11 | [*.{json,csv}]
12 | insert_final_newline = unset
13 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Show your code calling `generate_code()`.
16 | 
17 | <details>
18 | <summary>script</summary>
19 | 
20 | ```python
21 | # Paste your code here. The following is an example.
22 | from sapientml import SapientMLGenerator
23 | sml = SapientMLGenerator()
24 | sml.generate_code('your arguments')
25 | ```
26 | </details>
27 | 
28 | 2. Attach the datasets or dataframes input to `generate_code()` if possible.
29 | 3. Show the generated code such as `1_default.py` when it was generated.
30 | 
31 | <details>
32 | <summary>generated code</summary>
33 | 
34 | ```python
35 | # Paste the generated code here.
36 | ```
38 | 39 | 4. Show the messages of SapientML and/or generated code. 40 | 41 | **Expected behavior** 42 | A clear and concise description of what you expected to happen. 43 | 44 | **Environment (please complete the following information):** 45 | - OS: [e.g. Ubuntu 20.04] 46 | - Docker Version (if applicable): [Docker version 20.10.17, build 100c701] 47 | - Python Version: [e.g. 3.9.12] 48 | - SapientML Version: [e.g. 2.3.4] 49 | 50 | 51 | **Additional context** 52 | Add any other context about the problem here. 53 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # To get started with Dependabot version updates, you'll need to specify which 16 | # package ecosystems to update and where the package manifests are located. 
17 | # Please see the documentation for all configuration options:
18 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
19 | 
20 | version: 2
21 | updates:
22 |   - package-ecosystem: "pip" # See documentation for possible values
23 |     directory: "/" # Location of package manifests
24 |     schedule:
25 |       interval: "weekly"
26 | 
--------------------------------------------------------------------------------
/.github/workflows/dependabot.yml:
--------------------------------------------------------------------------------
 1 | name: Dependabot auto approve and merge
 2 | on: pull_request
 3 | 
 4 | permissions:
 5 |   pull-requests: write
 6 |   contents: write
 7 | 
 8 | jobs:
 9 |   dependabot:
10 |     runs-on: ubuntu-latest
11 |     if: github.actor == 'dependabot[bot]'
12 |     steps:
13 |       - name: Dependabot metadata
14 |         id: metadata
15 |         uses: dependabot/fetch-metadata@v1
16 |         with:
17 |           github-token: "${{ secrets.GITHUB_TOKEN }}"
18 |       - name: Approve a PR
19 |         run: gh pr review --approve "$PR_URL"
20 |         env:
21 |           PR_URL: ${{github.event.pull_request.html_url}}
22 |           GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
23 |       - name: Enable auto-merge for Dependabot PRs
24 |         if: steps.metadata.outputs.update-type == 'version-update:semver-patch'
25 |         run: gh pr merge --auto --merge "$PR_URL"
26 |         env:
27 |           PR_URL: ${{github.event.pull_request.html_url}}
28 |           GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
29 | 
--------------------------------------------------------------------------------
/.github/workflows/greetings.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: Greetings
16 | 
17 | on: [pull_request_target, issues]
18 | 
19 | jobs:
20 |   greeting:
21 |     runs-on: ubuntu-latest
22 |     permissions:
23 |       issues: write
24 |       pull-requests: write
25 |     steps:
26 |       - uses: actions/first-interaction@v1
27 |         with:
28 |           repo-token: ${{ secrets.GITHUB_TOKEN }}
29 |           issue-message: "# 🎉 Thanks for submitting the issue to SapientML!!\n\nWe have the [Discord](https://discord.gg/59yshERFD9) server. Please join the server!"
30 |           pr-message: "# 🎉 Thanks for submitting the PR to SapientML!!\n\nHere is the [Contribution Guideline](https://github.com/sapientml/sapientml/blob/main/CONTRIBUTING.md).\nWe would like you to read the document and follow it.\nIf you have any questions or anything to be discussed, please join the [Discord](https://discord.gg/59yshERFD9) server and chat with us.\nThanks again!"
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: Lint
16 | 
17 | on:
18 |   pull_request:
19 |     branches:
20 |       - main
21 | 
22 | env:
23 |   PYTHON_VERSION: "3.10"
24 |   POETRY_VERSION: "1.5.1"
25 |   POETRY_URL: https://install.python-poetry.org
26 | 
27 | jobs:
28 |   test:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Checkout
32 |         uses: actions/checkout@v4
33 |       - name: Cache Packages
34 |         uses: actions/cache@v4
35 |         with:
36 |           path: ~/.local
37 |           key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }}
38 | 
39 |       - name: Set up Python ${{ env.PYTHON_VERSION }}
40 |         uses: actions/setup-python@v4
41 |         with:
42 |           python-version: ${{ env.PYTHON_VERSION }}
43 | 
44 |       - name: Install Dependencies
45 |         run: pip install pysen flake8 black isort==5.12.0
46 | 
47 |       - name: Pysen run lint
48 |         run: pysen run lint
49 | 
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
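
# Overview: on a tag push matching *.*.*, the test and additional_test matrices
# re-run the suite on Python 3.10/3.11 and upload per-test .coverage artifacts;
# report_coverage combines them and uploads the result to CodeCov; release then
# stamps the tag (via `git describe`) into pyproject.toml, builds the package
# with Poetry, creates a GitHub release (marked prerelease for non-X.Y.Z tags),
# and publishes to PyPI.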
14 | 15 | name: Release 16 | 17 | on: 18 | push: 19 | tags: 20 | - '*.*.*' 21 | 22 | env: 23 | POETRY_VERSION: "1.7.1" 24 | POETRY_URL: https://install.python-poetry.org 25 | 26 | jobs: 27 | test: 28 | strategy: 29 | matrix: 30 | version: ["3.10", "3.11"] 31 | test: [test_misc, 32 | test_regressor_works_number, test_regressor_works_with_nosparse, 33 | test_classifier_category_binary_num_noproba, test_classifier_category_binary_num_proba, 34 | test_classifier_category_multi_nonnum_metric_noproba, test_classifier_category_multi_nonnum_metric_proba, 35 | test_classifier_category_binary_boolean_metric_noproba, test_classifier_category_binary_boolean_metric_proba, 36 | test_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_classifier_category_multi_nonnum_noproba_metric_with_proba, 37 | test_classifier_notext_nonegative_explanatry, test_classifier_works_with 38 | ] 39 | runs-on: ubuntu-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v4 43 | 44 | - name: Set up Python ${{ matrix.version }} 45 | uses: actions/setup-python@v4 46 | with: 47 | python-version: ${{ matrix.version }} 48 | 49 | - name: Install Poetry 50 | run: | 51 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 52 | echo "$HOME/.local/bin" >> $GITHUB_PATH 53 | 54 | - name: Install Dependencies 55 | run: poetry install 56 | 57 | - name: Pytest 58 | run: poetry run -- pytest -k ${{ matrix.test }} 59 | 60 | - name: Upload Coverage 61 | uses: actions/upload-artifact@v4 62 | with: 63 | name: ${{ matrix.test }} 64 | include-hidden-files: true 65 | path: .coverage 66 | retention-days: 1 67 | overwrite: true 68 | 69 | 70 | additional_test: 71 | strategy: 72 | matrix: 73 | version: ["3.10", "3.11"] 74 | test: [test_additional_misc, 75 | test_additional_regressor_works_number, test_additional_regressor_works_with_nosparse, 76 | test_additional_classifier_category_binary_nonnum_noproba, test_additional_classifier_category_binary_nonnum_proba, 77 | test_additional_classifier_category_binary_num_noproba, test_additional_classifier_category_binary_num_proba, 78 | test_additional_classifier_category_multi_nonnum_metric_noproba, test_additional_classifier_category_multi_nonnum_metric_proba, 79 | test_additional_classifier_category_multi_num_metric_noproba, test_additional_classifier_category_multi_num_metric_proba, 80 | test_additional_classifier_category_binary_boolean_metric_noproba, test_additional_classifier_category_binary_boolean_metric_proba, 81 | test_additional_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_additional_classifier_category_multi_nonnum_noproba_metric_with_proba, 82 | test_additional_classifier_works_with 83 | ] 84 | runs-on: ubuntu-latest 85 | steps: 86 | - name: Checkout 87 | uses: actions/checkout@v4 88 | 89 | - name: Set up Python ${{ matrix.version }} 90 | uses: actions/setup-python@v4 91 | with: 92 | python-version: ${{ matrix.version }} 93 | 94 | - name: Install Poetry 95 | run: | 96 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 97 | echo "$HOME/.local/bin" >> $GITHUB_PATH 98 | 99 | - name: Install Dependencies 100 | run: poetry install 101 | 102 | - name: Pytest 103 | run: poetry run -- pytest -k ${{ matrix.test }} 104 | 105 | - name: Upload Coverage 106 | uses: actions/upload-artifact@v4 107 | with: 108 | name: ${{ matrix.test }} 109 | include-hidden-files: true 110 | path: .coverage 111 | retention-days: 1 112 | overwrite: true 113 | 114 | report_coverage: 115 | runs-on: 
ubuntu-latest
116 |     needs:
117 |       - test
118 |       - additional_test
119 |     steps:
120 |       - name: Checkout
121 |         uses: actions/checkout@v4
122 | 
123 |       - name: Set up Python 3.11
124 |         uses: actions/setup-python@v4
125 |         with:
126 |           python-version: 3.11
127 | 
128 |       - name: Download Coverage Files
129 |         uses: actions/download-artifact@v4
130 | 
131 |       - name: Install coverage
132 |         run: pip install coverage
133 | 
134 |       - name: Combine Coverage Files
135 |         run: |
136 |           mv --backup=t */.coverage .
137 |           coverage combine -a
138 |           coverage report
139 | 
140 |       - name: Report Coverage to CodeCov
141 |         uses: codecov/codecov-action@v3
142 |         with:
143 |           token: ${{ secrets.CODECOV_TOKEN }}
144 | 
145 |   release:
146 |     name: Release
147 |     runs-on: ubuntu-latest
148 |     steps:
149 |       - name: Checkout
150 |         uses: actions/checkout@v4
151 | 
152 |       - name: Set up Python 3.10
153 |         uses: actions/setup-python@v4
154 |         with:
155 |           python-version: "3.10"
156 | 
157 |       - name: Install Poetry
158 |         run: |
159 |           curl -sSL https://install.python-poetry.org | python - -y
160 | 
161 |       - name: Update PATH
162 |         run: echo "$HOME/.local/bin" >> $GITHUB_PATH
163 | 
164 |       - name: Set Version
165 |         run: |
166 |           SEMVER=$(git describe --exact-match --tags HEAD)
167 |           sed -i "s/\(version *= *\).*/\1\"$SEMVER\"/" pyproject.toml
168 | 
169 |       - name: Build project for distribution
170 |         run: poetry build
171 | 
172 |       - name: Check Version
173 |         id: check-version
174 |         run: |
175 |           [[ "$(poetry version --short)" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] || echo prerelease=true >> $GITHUB_OUTPUT
176 | 
177 |       - name: Create Release
178 |         uses: ncipollo/release-action@v1
179 |         with:
180 |           artifacts: "dist/*"
181 |           token: ${{ secrets.GITHUB_TOKEN }}
182 |           draft: false
183 |           prerelease: ${{ steps.check-version.outputs.prerelease == 'true' }}
184 | 
185 |       - name: Publish to PyPI
186 |         env:
187 |           POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
188 |         run: poetry publish --skip-existing
189 | 
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
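
# Overview: on pull requests targeting main, each named test case runs as its
# own matrix job on Python 3.10 and 3.11; every job uploads its .coverage file
# as an artifact, and report_coverage combines them and uploads the result to
# CodeCov.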
14 | 15 | name: Testing 16 | 17 | on: 18 | pull_request: 19 | branches: 20 | - main 21 | 22 | env: 23 | POETRY_VERSION: "1.5.1" 24 | POETRY_URL: https://install.python-poetry.org 25 | 26 | jobs: 27 | test: 28 | strategy: 29 | matrix: 30 | version: ["3.10", "3.11"] 31 | test: [test_misc, test_regressor_works_number, test_regressor_works_with_nosparse, 32 | test_classifier_category_binary_num_noproba, test_classifier_category_binary_num_proba, 33 | test_classifier_category_multi_nonnum_metric_noproba, test_classifier_category_multi_nonnum_metric_proba, 34 | test_classifier_category_binary_boolean_metric_noproba, test_classifier_category_binary_boolean_metric_proba, 35 | test_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_classifier_category_multi_nonnum_noproba_metric_with_proba, 36 | test_classifier_notext_nonegative_explanatry, test_classifier_works_with, 37 | ] 38 | runs-on: ubuntu-latest 39 | steps: 40 | - name: Checkout 41 | uses: actions/checkout@v4 42 | 43 | - name: Set up Python ${{ matrix.version }} 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: ${{ matrix.version }} 47 | 48 | - name: Install Poetry 49 | run: | 50 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 51 | echo "$HOME/.local/bin" >> $GITHUB_PATH 52 | 53 | - name: Install Dependencies 54 | run: poetry install 55 | 56 | - name: Pytest 57 | run: poetry run -- pytest -k ${{ matrix.test }} 58 | 59 | - name: Upload Coverage 60 | uses: actions/upload-artifact@v4 61 | with: 62 | name: ${{ matrix.test }} 63 | include-hidden-files: true 64 | path: .coverage 65 | retention-days: 1 66 | overwrite: true 67 | 68 | report_coverage: 69 | runs-on: ubuntu-latest 70 | needs: 71 | - test 72 | steps: 73 | - name: Checkout 74 | uses: actions/checkout@v4 75 | 76 | - name: Set up Python 3.11 77 | uses: actions/setup-python@v4 78 | with: 79 | python-version: 3.11 80 | 81 | - name: Download Coverage Files 82 | uses: actions/download-artifact@v4 83 | 84 | - name: Install coverage 85 | run: pip install coverage 86 | 87 | - name: Combine Coverage Files 88 | run: | 89 | mv --backup=t */.coverage . 90 | coverage combine -a 91 | coverage report 92 | 93 | - name: Report Coverage to CodeCov 94 | uses: codecov/codecov-action@v3 95 | with: 96 | token: ${{ secrets.CODECOV_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | poetry.lock -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: pysen 5 | name: Run pysen 6 | entry: pysen run_files lint 7 | language: system 8 | types: [file, python] 9 | -------------------------------------------------------------------------------- /.pysen/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool] 2 | [tool.black] # automatically generated by pysen 3 | # pysen ignores and overwrites any modifications 4 | line-length = 120 5 | target-version = ["py310"] 6 | 7 | [tool.isort] # automatically generated by pysen 8 | # pysen ignores and overwrites any modifications 9 | default_section = "THIRDPARTY" 10 | ensure_newline_before_comments = true 11 | force_grid_wrap = 0 12 | force_single_line = false 13 | include_trailing_comma = true 14 | line_length = 120 15 | multi_line_output = 3 16 | use_parentheses = true 17 | -------------------------------------------------------------------------------- /.pysen/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # automatically generated by pysen 3 | # pysen ignores and overwrites any modifications 4 | # e203: black treats : as a binary operator 5 | # e231: black doesn't put a space after , 6 | # e501: black may exceed the line-length to follow other style rules 7 | # w503 or w504: either one needs to be disabled to select w error codes 8 | ignore = E203,E231,E501,W503 9 | max-line-length = 120 10 | select = B,B950,C,E,F,W 11 | 12 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @sapientml/maintainers 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["The SapientML Authors"] 3 | description = "A SapientML plugin of SapientMLGenerator" 4 | license = "Apache-2.0" 5 | maintainers = [ 6 | "Kosaku Kimura ", 7 | "Akira Ura ", 8 | ] 9 | name = "sapientml-core" 10 | version = "0" 11 | 12 | [tool.poetry.dependencies] 13 | catboost = ">=1.2.3" 14 | imbalanced-learn = ">=0.11,<0.13" 15 | ipykernel = "^6.25.1" 16 | japanize-matplotlib = "^1.1.3" 17 | jinja2 = "^3.1.2" 18 | libcst = "^1.0.1" 19 | lightgbm = "^4.0.0" 20 | nbconvert = "^7.7.4" 21 | nbformat = "^5.9.2" 22 | nltk = "^3.8.1" 23 | numba = ">=0.57.1,<0.61.0" 24 | optuna = ">=3.2,<5.0" 25 | python = ">=3.9,<3.13" 26 | sapientml = "*" 27 | scikit-learn = "1.5.2" 28 | scipy = "^1.11.1" 29 | seaborn = ">=0.12.2,<0.14.0" 30 | shap = ">=0.43,<0.47" 31 | tqdm = "^4.66.1" 32 | xgboost = ">=1.7.6,<3.0.0" 33 | mecab-python3 = "^1.0.6" 34 | ipadic = "^1.0.0" 35 | fasttext-wheel = "^0.9.2" 36 | requests = "^2.31.0" 37 | 38 | [tool.poetry.group.dev.dependencies] 39 | black = ">=23.7,<25.0" 40 | flake8 = ">=6.1,<8.0" 41 | isort = "^5.12.0" 42 | pre-commit = ">=3.3.3,<5.0.0" 43 | pysen = ">=0.10.5,<0.12.0" 44 | pytest = ">=7.4,<9.0" 45 | pytest-cov = ">=4.1,<7.0" 46 | pytest-xdist = "^3.3.1" 47 | 48 | [build-system] 49 | build-backend = "poetry.core.masonry.api" 50 | requires = ["poetry-core>=1.0.0"] 51 | 52 | [tool.poetry.plugins."sapientml.config"] 53 | sapientml = 
"sapientml_core:SapientMLConfig" 54 | 55 | [tool.poetry.plugins."sapientml.pipeline_generator"] 56 | sapientml = "sapientml_core:SapientMLGenerator" 57 | 58 | [tool.poetry.plugins."sapientml.datastore"] 59 | localfile = "sapientml_core.datastore.localfile:LocalFile" 60 | 61 | [tool.poetry.plugins."sapientml.preprocess"] 62 | default = "sapientml_core.preprocess.default:DefaultPreprocess" 63 | 64 | [tool.poetry.plugins."sapientml.export_modules"] 65 | sample-dataset = "sapientml_core.datastore.localfile.export_modules" 66 | 67 | [tool.pysen] 68 | version = "0.11.0" 69 | 70 | [tool.pysen-cli] 71 | settings_dir = ".pysen" 72 | 73 | [tool.pysen.lint] 74 | enable_black = true 75 | enable_flake8 = true 76 | enable_isort = true 77 | enable_mypy = false 78 | line_length = 120 79 | py_version = "py310" 80 | 81 | [tool.pysen.lint.source] 82 | includes = ["sapientml_core/", "tests/"] 83 | 84 | [tool.pytest.ini_options] 85 | addopts = "-s -x --cov=sapientml_core" 86 | testpaths = ["tests"] 87 | -------------------------------------------------------------------------------- /requirements-training.txt: -------------------------------------------------------------------------------- 1 | category-encoders==2.6.4 2 | patsy==0.5.6 3 | statsmodels==0.14.4 4 | tensorflow==2.18.0 5 | wordcloud==1.9.4 -------------------------------------------------------------------------------- /sapientml_core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .generator import SapientMLGenerator 16 | from .params import SapientMLConfig 17 | 18 | __all__ = ["SapientMLGenerator", "SapientMLConfig"] 19 | -------------------------------------------------------------------------------- /sapientml_core/adaptation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/PY310/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 3 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 4 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 5 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 6 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 7 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 9 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 10 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy", 11 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 12 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 13 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 14 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 15 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 16 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 17 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 18 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 20 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 22 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 23 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 24 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 25 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 26 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 27 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas", 28 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 29 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 30 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 31 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/PY311/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 3 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 4 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 5 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 6 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy", 7 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 8 | 
"PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 9 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 10 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 11 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 12 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 13 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas", 14 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 15 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 16 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 17 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 18 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 19 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 20 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 22 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 23 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 24 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 25 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 26 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 27 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 28 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas", 29 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 30 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 31 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/PY39/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 3 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 4 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 5 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 6 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 7 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 9 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 10 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 11 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas", 12 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 13 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 14 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 15 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 16 | 
"PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 17 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 18 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 20 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 21 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 22 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 23 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 24 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas", 25 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 26 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 27 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 28 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 29 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 30 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 31 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 3 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 4 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 5 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy", 6 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 7 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 9 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 10 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 11 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 12 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 13 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 14 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 15 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 16 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 17 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 18 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas", 19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 20 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 22 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 23 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 24 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 25 | 
"PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 26 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 27 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 28 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 29 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 30 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 31 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /sapientml_core/adaptation/generation/predicate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from ...enums import Operator 16 | 17 | 18 | class Predicate: 19 | """A class to represent the predicate. 20 | 21 | This class represents the data structure for loading a decision tree 22 | condition/predicate and provides a function that can evaluate whether 23 | the predicate is true for a particular column. 24 | 25 | """ 26 | 27 | feature_name = "" 28 | _operator = "" 29 | _comparison_value = "" 30 | 31 | def __init__(self, feature_name, operator, comparison_value): 32 | """Constructs all the necessary attributes for the predicate object. 33 | 34 | Parameters 35 | ---------- 36 | feature_name : str 37 | Meta feature name 38 | operator : Operator 39 | comparison_value : np.float 40 | 41 | """ 42 | self.feature_name = feature_name 43 | self._operator = operator 44 | self._comparison_value = comparison_value 45 | 46 | def evaluate_predicate(self, meta_features): 47 | """Evaluate whether the predicate is true for a particular column. 
58 | 
59 |         Parameters
60 |         ----------
61 |         meta_features : dict
62 |             Mapping from meta-feature name to its value for the column.
63 | 
64 |         Returns
65 |         -------
66 |         result : bool
67 |             True if the predicate holds. A missing meta-feature, as well as
68 |             the sentinel values None, 0, and -1, yields False.
69 | 
70 |         """
71 |         try:
72 |             actual_value = meta_features[self.feature_name]
73 |             if actual_value == -1 or actual_value == 0:
74 |                 return False
75 |             if actual_value is None:
76 |                 return False
77 |         except Exception:
78 |             return False
79 | 
80 |         result = False
81 |         if self._operator is Operator.GREATER_THAN:
82 |             result = actual_value > self._comparison_value
83 |         elif self._operator is Operator.GREATER_THAN_OR_EQUAL_TO:
84 |             result = actual_value >= self._comparison_value
85 |         elif self._operator is Operator.EQUAL_TO:
86 |             result = actual_value == self._comparison_value
87 |         elif self._operator is Operator.LESS_THAN:
88 |             result = actual_value < self._comparison_value
89 |         elif self._operator is Operator.LESS_THAN_OR_EQUAL_TO:
90 |             result = actual_value <= self._comparison_value
91 | 
92 |         return result
93 | 
--------------------------------------------------------------------------------
/sapientml_core/adaptation/generation/preprocessing_label.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from ...enums import Operator
16 | from .predicate import Predicate
17 | 
18 | 
19 | class PreprocessingLabel:
20 |     """A class to represent a preprocessing label.
21 | 
22 |     This class identifies the relevant columns in the dataset
23 |     for each feature engineering component.
24 | 
25 |     """
26 | 
27 |     def __init__(self, label_name, meta_features, predicates):
28 |         """Constructs all the necessary attributes for the PreprocessingLabel object.
29 | 
30 |         Parameters
31 |         ----------
32 |         label_name : str
33 |             Component name.
34 |         meta_features : list
35 |             Meta features selected.
36 |         predicates : list
37 |             Predicate specifications, given as dicts with feature_name, operator, and threshold keys.
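38 | 
39 |         Example (an illustrative sketch; the label name mirrors entries in
40 |         label_order.json, while the meta-feature name and predicate values
41 |         are hypothetical; only the feature_name/operator/threshold keys
42 |         match what _build_predicate_objects expects)::
43 | 
44 |             label = PreprocessingLabel(
45 |                 "PREPROCESS:MissingValues:fillna:pandas",
46 |                 ["feature:missing_values_presence"],
47 |                 [{"feature_name": "feature:missing_values_presence",
48 |                   "operator": ">", "threshold": 0.0}],
49 |             )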
50 | 
51 |         """
52 |         self.label_name = label_name
53 |         self.meta_features = meta_features
54 |         self.predicate_objects = list()
55 |         self._build_predicate_objects(predicates)
56 |         self.relevant_columns = list()
57 |         self.components_before = list()
58 |         self.components_after = list()
59 |         self.alternative_components = list()
60 | 
61 |     def __str__(self):
62 |         return self.label_name
63 | 
64 |     def __repr__(self):
65 |         return str(self)
66 | 
67 |     def _build_predicate_objects(self, predicates):
68 |         for pred in predicates:
69 |             feature_name = pred["feature_name"]
70 |             operator = self._get_operator(pred["operator"])
71 |             comparison_value = pred["threshold"]
72 |             p = Predicate(feature_name, operator, comparison_value)
73 |             self.predicate_objects.append(p)
74 | 
75 |     def _get_operator(self, op_string):
76 |         if op_string == ">":
77 |             return Operator.GREATER_THAN
78 |         elif op_string == ">=":
79 |             return Operator.GREATER_THAN_OR_EQUAL_TO
80 |         elif op_string == "<":
81 |             return Operator.LESS_THAN
82 |         elif op_string == "<=":
83 |             return Operator.LESS_THAN_OR_EQUAL_TO
84 |         elif op_string == "==" or op_string == "=":
85 |             return Operator.EQUAL_TO
86 |         else:
87 |             return Operator.NOT_EQUAL_TO
88 | 
89 |     def get_relevant_columns(self, dataset_summary, target, ignore_columns):
90 |         """Identify the columns this preprocessing component should be applied to.
91 | 
92 |         Parameters
93 |         ----------
94 |         dataset_summary : DatasetSummary
95 |             Object of the DatasetSummary class.
96 |         target : list
97 |         ignore_columns : list
98 | 
99 |         Returns
100 |         -------
101 |         rel_columns_list : list
102 |             The relevant column list.
103 | 
104 |         """
105 |         rel_columns_list = []
106 | 
107 |         # approach 1: conjunction: a column is relevant if and only if all of the predicates applicable to that component are true
108 |         # approach 2: disjunction: a column is relevant if and only if at least one of the predicates applicable to that component is true
109 |         approach = 2
110 | 
111 |         for column_name, column in dataset_summary.columns.items():
112 |             if column_name in ignore_columns:
113 |                 continue
114 | 
115 |             # error handling for log transform: don't apply if the column has negative values
116 |             if "PREPROCESS:Scaling:log" in self.label_name:
117 |                 if column.has_negative_value:
118 |                     continue
119 | 
120 |             result = list()  # holds boolean results of all predicates applicable to a column
121 |             for p in self.predicate_objects:
122 |                 # special handling of "target_imbalance_score" feature, since it should only be applied on target column
123 |                 if p.feature_name == "feature:target_imbalance_score":
124 |                     if column_name not in target:
125 |                         result.append(False)
126 |                         continue
127 |                 result.append(p.evaluate_predicate(column.meta_features))
128 | 
129 |             if approach == 1:  # conjunction
130 |                 if all(result):
131 |                     rel_columns_list.append(column_name)
132 |             elif approach == 2:  # disjunction
133 |                 if any(result):
134 |                     rel_columns_list.append(column_name)
135 | 
136 |         return rel_columns_list
137 | 
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .generator import LocalFile, LocalFileConfig 16 | 17 | __all__ = ["LocalFile", "LocalFileConfig"] 18 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/export_modules/sample_dataset.py: -------------------------------------------------------------------------------- 1 | from decimal import ROUND_HALF_UP, Decimal 2 | 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | def _sampled_training(dev_training_dataset, train_size, stratify, task_type) -> pd.DataFrame: 8 | sampled_training_dataset, _ = train_test_split( 9 | dev_training_dataset, 10 | train_size=train_size, 11 | stratify=stratify if task_type == "classification" else None, 12 | ) 13 | return sampled_training_dataset # type: ignore 14 | 15 | 16 | def sample_dataset( 17 | dataframe: pd.DataFrame, 18 | sample_size: int, 19 | target_columns: list[str], 20 | task_type: str, 21 | ) -> pd.DataFrame: 22 | # Sample the training set if the dataset is big 23 | # FIXME 24 | sampled_training_dataset = None 25 | num_of_rows = len(dataframe.index) 26 | if num_of_rows >= sample_size: 27 | rare_labels = [] 28 | dataframe_alltargets = None 29 | if task_type == "classification": 30 | dataframe_alltargets = dataframe[target_columns].astype(str).apply("".join, axis=1) 31 | label_count = dataframe_alltargets.value_counts() 32 | rare_labels = label_count.loc[label_count == 1].index.tolist() 33 | 34 | if rare_labels and dataframe_alltargets is not None: 35 | dataframe_rare = dataframe[dataframe_alltargets.isin(rare_labels)] 36 | rare_index = dataframe_rare.index.values 37 | 38 | dataframe_wo_rare = dataframe.drop(rare_index) 39 | 40 | num_of_labels = [len(dataframe_wo_rare[target].value_counts()) for target in target_columns] 41 | 42 | rare_to_all_ratio = int( 43 | Decimal(sample_size * len(dataframe_rare) / len(dataframe)).quantize( 44 | Decimal("0"), rounding=ROUND_HALF_UP 45 | ) 46 | ) 47 | not_rare_to_all_ratio = int( 48 | Decimal(sample_size * len(dataframe_wo_rare) / len(dataframe)).quantize( 49 | Decimal("0"), rounding=ROUND_HALF_UP 50 | ) 51 | ) 52 | 53 | stratify_wo_rare = None 54 | 55 | if len(dataframe_rare) == len(dataframe): 56 | sampled_training_dataset = _sampled_training(dataframe, sample_size, None, task_type) 57 | 58 | elif rare_to_all_ratio in [0, 1]: 59 | sampled_training_dataset_rare = dataframe_rare 60 | 61 | if max(num_of_labels) >= sample_size: 62 | stratify_wo_rare = None 63 | else: 64 | stratify_wo_rare = dataframe_wo_rare[target_columns] 65 | sampled_training_dataset_wo_rare = _sampled_training( 66 | dataframe_wo_rare, 67 | sample_size - len(sampled_training_dataset_rare), 68 | stratify_wo_rare, 69 | task_type, 70 | ) 71 | 72 | sampled_training_dataset = pd.concat( 73 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore 74 | ) 75 | 76 | elif not_rare_to_all_ratio in [0, 1]: 77 | sampled_training_dataset_wo_rare = dataframe_wo_rare 78 | sampled_training_dataset_rare = _sampled_training( 79 | dataframe_rare, 80 | sample_size - 
len(sampled_training_dataset_wo_rare), 81 | None, 82 | task_type, 83 | ) 84 | 85 | sampled_training_dataset = pd.concat( 86 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore 87 | ) 88 | 89 | else: 90 | if max(num_of_labels) >= sample_size: 91 | stratify_wo_rare = None 92 | else: 93 | stratify_wo_rare = dataframe_wo_rare[target_columns] 94 | 95 | sampled_training_dataset_wo_rare = _sampled_training( 96 | dataframe_wo_rare, not_rare_to_all_ratio, stratify_wo_rare, task_type 97 | ) 98 | sampled_training_dataset_rare = _sampled_training(dataframe_rare, rare_to_all_ratio, None, task_type) 99 | 100 | sampled_training_dataset = pd.concat( 101 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore 102 | ) 103 | 104 | else: 105 | num_of_labels = [len(dataframe[target].value_counts()) for target in target_columns] 106 | if max(num_of_labels) >= sample_size: 107 | stratify_wo_rare = None 108 | else: 109 | stratify_wo_rare = dataframe[target_columns] 110 | 111 | sampled_training_dataset = _sampled_training(dataframe, sample_size, stratify_wo_rare, task_type) 112 | return sampled_training_dataset 113 | else: 114 | return dataframe 115 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/export_modules/split_timeseries_dataset.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import TimeSeriesSplit 2 | 3 | 4 | def split_dataset(dataset, split_column_name, split_num, split_index): 5 | dataset = dataset.sort_values(split_column_name) 6 | splitter = TimeSeriesSplit(n_splits=split_num) 7 | train_idx, test_idx = list(splitter.split(dataset))[split_index] 8 | train_dataset, test_dataset = dataset.iloc[train_idx], dataset.iloc[test_idx] 9 | for col in train_dataset.columns: 10 | if train_dataset[col].isnull().all(): 11 | if test_dataset[col].dtype == float or test_dataset[col].dtype == int: 12 | train_dataset.loc[:, col] = 0 13 | elif test_dataset[col].dtype == object: 14 | train_dataset.loc[:, col] = "" 15 | elif test_dataset[col].dtype == bool: 16 | train_dataset.loc[:, col] = False 17 | return train_dataset, test_dataset 18 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/concat_train_validation.py.jinja: -------------------------------------------------------------------------------- 1 | train_dataset = pd.concat([train_dataset, validation_dataset]).reset_index(drop=True) -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/drop_ignore_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # DROP IGNORED COLUMNS 2 | ignore_columns = {{ ignore_columns }} 3 | 4 | {% if train %} 5 | train_dataset = train_dataset.drop(ignore_columns, axis=1, errors="ignore") 6 | {% endif %} 7 | {% if validation %} 8 | validation_dataset = validation_dataset.drop(ignore_columns, axis=1, errors="ignore") 9 | {% endif %} 10 | {% if test %} 11 | test_dataset = test_dataset.drop(ignore_columns, axis=1, errors="ignore") 12 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/drop_inf_or_nan_rows.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | train_dataset = 
train_dataset[~train_dataset[{{target_columns}}].isin([np.inf, -np.inf, np.nan]).any(axis=1)] 3 | 4 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/load_localfile.py.jinja: -------------------------------------------------------------------------------- 1 | # LOAD DATA 2 | import pandas as pd 3 | 4 | {% if dataset.training_data_path.endswith(".pkl") %} 5 | train_dataset = pd.read_pickle(r"{{ dataset.training_data_path }}") 6 | {% else %} 7 | train_dataset = pd.read_csv(r"{{ dataset.training_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 8 | {% endif %} 9 | 10 | {% if dataset.validation_data_path %} 11 | {% if dataset.validation_data_path.endswith(".pkl") %} 12 | validation_dataset = pd.read_pickle(r"{{ dataset.validation_data_path }}") 13 | {% else %} 14 | validation_dataset = pd.read_csv(r"{{ dataset.validation_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 15 | {% endif %} 16 | {% endif %}{# if dataset.validation_data_path #} 17 | 18 | {% if not validation and dataset.test_data_path %} 19 | {% if dataset.test_data_path.endswith(".pkl") %} 20 | test_dataset = pd.read_pickle(r"{{ dataset.test_data_path }}") 21 | {% else %} 22 | test_dataset = pd.read_csv(r"{{ dataset.test_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 23 | {% endif %} 24 | {% endif %}{# if not validation and dataset.test_data_path #} 25 | 26 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/load_localfile_predict.py.jinja: -------------------------------------------------------------------------------- 1 | # LOAD DATA 2 | import pandas as pd 3 | 4 | {% if dataset.training_data_path.endswith(".pkl") %} 5 | test_dataset = pd.read_pickle("./test.pkl") 6 | {% else %} 7 | test_dataset = pd.read_csv("./test.csv", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 8 | {% endif %} 9 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/load_localfile_train.py.jinja: -------------------------------------------------------------------------------- 1 | # LOAD DATA 2 | import pandas as pd 3 | 4 | {% if dataset.training_data_path.endswith(".pkl") %} 5 | train_dataset = pd.read_pickle("./training.pkl") 6 | {% else %} 7 | train_dataset = pd.read_csv("./training.csv", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 8 | {% endif %} 9 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/set_index.py.jinja: -------------------------------------------------------------------------------- 1 | # SET ID_COLUMNS TO DATAFRAME'S INDEX 2 | id_columns_for_prediction = {{ id_columns_for_prediction }} 3 | test_dataset = test_dataset.set_index(id_columns_for_prediction, drop=False) -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/set_validation_as_test.py.jinja: -------------------------------------------------------------------------------- 1 | test_dataset = validation_dataset -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/split.py.jinja: 
-------------------------------------------------------------------------------- 1 | {% if (validation and (not dataset.validation_data_path)) or ((not validation) and (not dataset.test_data_path)) %} 2 | 3 | # TRAIN-TEST SPLIT 4 | {% if task.split_method == "random" %} 5 | {% if task.split_stratification %} 6 | from sklearn.model_selection import train_test_split 7 | def split_dataset(dataset, train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}): 8 | train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state, stratify=dataset["{{task.target_columns[0]}}"]) 9 | return train_dataset, test_dataset 10 | {% else %} 11 | from sklearn.model_selection import train_test_split 12 | def split_dataset(dataset, train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}): 13 | train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state) 14 | return train_dataset, test_dataset 15 | {% endif %} 16 | {% elif task.split_method == "group" %} 17 | from sklearn.model_selection import GroupShuffleSplit 18 | def split_dataset(dataset, split_column_name="{{ task.split_column_name }}", train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}): 19 | splitter = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_state) 20 | train_idx, test_idx = next(splitter.split(dataset, groups=dataset[split_column_name])) 21 | train_dataset, test_dataset = dataset.iloc[train_idx], dataset.iloc[test_idx] 22 | return train_dataset, test_dataset 23 | {% else %}{# time #} 24 | from lib.split_timeseries_dataset import split_dataset 25 | {% endif %} 26 | {% if not dataset.test_data_path %} 27 | {% if task.split_method == "random" or task.split_method == "group" %} 28 | train_dataset, test_dataset = split_dataset(train_dataset) 29 | {% else %} 30 | train_dataset, test_dataset = split_dataset(train_dataset, split_column_name="{{ task.split_column_name }}", split_num={{ task.time_split_num }}, split_index={{ task.time_split_index}}) 31 | {% endif %} 32 | {% endif %} 33 | {% if validation %} 34 | {% endif %} 35 | {% endif %} 36 | {% if validation and (not dataset.validation_data_path) %} 37 | {% if task.split_method == "random" or task.split_method == "group" %} 38 | train_dataset, validation_dataset = split_dataset(train_dataset) 39 | {% else %} 40 | train_dataset, validation_dataset = split_dataset(train_dataset, split_column_name="{{ task.split_column_name }}", split_num={{ task.time_split_num }}, split_index={{ task.time_split_index}}) 41 | {% endif %} 42 | {% endif %} 43 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/subsample.py.jinja: -------------------------------------------------------------------------------- 1 | # SUBSAMPLE 2 | # If the number of rows of train_dataset is larger than sample_size, sample rows to sample_size for speedup. 
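# sample_dataset (defined in lib/sample_dataset.py, shown above) sets labels that occur
# only once aside before stratified sampling so that stratified train_test_split does not
# fail on singleton classes, and stratifies the remaining rows by the target columns for
# classification tasks.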
3 | from lib.sample_dataset import sample_dataset 4 | train_dataset = sample_dataset( 5 | dataframe=train_dataset, 6 | sample_size={{ sample_size }}, 7 | target_columns={{ task.target_columns }}, 8 | task_type='{{ task.task_type }}' 9 | ) 10 | 11 | -------------------------------------------------------------------------------- /sapientml_core/design/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/design/__init__.py -------------------------------------------------------------------------------- /sapientml_core/design/label_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | name_to_label_mapping = { 17 | "random forest": { 18 | "c": "MODEL:Classifier:RandomForestClassifier:sklearn", 19 | "r": "MODEL:Regressor:RandomForestRegressor:sklearn", 20 | }, 21 | "extra tree": { 22 | "c": "MODEL:Classifier:ExtraTreesClassifier:sklearn", 23 | "r": "MODEL:Regressor:ExtraTreesRegressor:sklearn", 24 | }, 25 | "lightgbm": {"c": "MODEL:Classifier:LGBMClassifier:lightgbm", "r": "MODEL:Regressor:LGBMRegressor:lightgbm"}, 26 | "xgboost": {"c": "MODEL:Classifier:XGBClassifier:xgboost", "r": "MODEL:Regressor:XGBRegressor:xgboost"}, 27 | "catboost": { 28 | "c": "MODEL:Classifier:CatBoostClassifier:catboost", 29 | "r": "MODEL:Regressor:CatBoostRegressor:catboost", 30 | }, 31 | "gradient boosting": { 32 | "c": "MODEL:Classifier:GradientBoostingClassifier:sklearn", 33 | "r": "MODEL:Regressor:GradientBoostingRegressor:sklearn", 34 | }, 35 | "adaboost": {"c": "MODEL:Classifier:AdaBoostClassifier:sklearn", "r": "MODEL:Regressor:AdaBoostRegressor:sklearn"}, 36 | "decision tree": { 37 | "c": "MODEL:Classifier:DecisionTreeClassifier:sklearn", 38 | "r": "MODEL:Regressor:DecisionTreeRegressor:sklearn", 39 | }, 40 | "svm": {"c": "MODEL:Classifier:SVC:sklearn", "r": "MODEL:Regressor:SVR:sklearn"}, 41 | "linear svm": {"c": "MODEL:Classifier:LinearSVC:sklearn", "r": "MODEL:Regressor:LinearSVR:sklearn"}, 42 | "logistic/linear regression": { 43 | "c": "MODEL:Classifier:LogisticRegression:sklearn", 44 | "r": "MODEL:Regressor:LinearRegression:sklearn", 45 | }, 46 | "lasso": {"r": "MODEL:Regressor:Lasso:sklearn"}, 47 | "sgd": {"c": "MODEL:Classifier:SGDClassifier:sklearn", "r": "MODEL:Regressor:SGDRegressor:sklearn"}, 48 | "mlp": {"c": "MODEL:Classifier:MLPClassifier:sklearn", "r": "MODEL:Regressor:MLPRegressor:sklearn"}, 49 | "multinomial nb": {"c": "MODEL:Classifier:MultinomialNB:sklearn"}, 50 | "gaussian nb": {"c": "MODEL:Classifier:GaussianNB:sklearn"}, 51 | "bernoulli nb": {"c": "MODEL:Classifier:BernoulliNB:sklearn"}, 52 | } 53 | 54 | 55 | def map_label_to_name(): 56 | """ 57 | Assign several internal labels to each ML component. 
58 | 59 | Returns 60 | ---------- 61 | label_to_name : dict[str, str] 62 | Assigned result. 63 | """ 64 | label_to_name = {"MODEL:Classifier:LGBMClassifier:lgbm": "lightgbm", "MODEL:Regressor:train:xgboost": "xgboost"} 65 | for k, v in name_to_label_mapping.items(): 66 | for k1, v1 in v.items(): 67 | label_to_name[v1] = k 68 | return label_to_name 69 | -------------------------------------------------------------------------------- /sapientml_core/design/pp_component_groups.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | drop_label_list = [ 17 | "PREPROCESS:MissingValues:dropna:pandas", 18 | "PREPROCESS:MissingValues:notnull:pandas", 19 | "PREPROCESS:MissingValues:isnull:pandas", 20 | ] 21 | filler_label = [ 22 | "PREPROCESS:MissingValues:fillna:pandas", 23 | "PREPROCESS:MissingValues:SimpleImputer:sklearn", 24 | "PREPROCESS:MissingValues:KNNImputer:sklearn", 25 | "PREPROCESS:MissingValues:replace:pandas", 26 | "PREPROCESS:MissingValues:random:custom", 27 | "PREPROCESS:MissingValues:interpolate:sklearn", 28 | ] 29 | in_place_converter = [ 30 | "PREPROCESS:Category:LabelEncoder:sklearn", 31 | "PREPROCESS:Category:factorize:pandas", 32 | "PREPROCESS:Category:replace:pandas", 33 | "PREPROCESS:Category:map:custom", 34 | "PREPROCESS:Category:apply:pandas", 35 | "PREPROCESS:Category:custom:pandas", 36 | ] 37 | one_hot = [ 38 | "PREPROCESS:Category:get_dummies:pandas", 39 | "PREPROCESS:Category:OneHotEncoder:sklearn", 40 | "PREPROCESS:Category:LabelBinarizer:sklearn", 41 | ] 42 | 43 | text_vect = ["PREPROCESS:Text:CountVectorizer:sklearn", "PREPROCESS:Text:TfidfVectorizer:sklearn"] 44 | 45 | scaling = [ 46 | "PREPROCESS:Scaling:STANDARD:sklearn", 47 | "PREPROCESS:Scaling:MIN_MAX:custom", 48 | "PREPROCESS:Scaling:MIN_MAX:sklearn", 49 | "PREPROCESS:Scaling:STANDARD:custom", 50 | "PREPROCESS:Scaling:Robust:sklearn", 51 | "PREPROCESS:Scaling:STANDARD:Pandas", 52 | "PREPROCESS:Scaling:normalize:sklearn", 53 | "PREPROCESS:Scaling:normalize:Pandas", 54 | "PREPROCESS:Scaling:STANDARD:pandas", 55 | ] 56 | 57 | date = [ 58 | "PREPROCESS:GenerateColumn:date:pandas", 59 | "PREPROCESS:GenerateColumn:DATE:pandas", 60 | "PREPROCESS:GenerateColumn:DATE:custom", 61 | ] 62 | 63 | text_processing = [ 64 | "PREPROCESS:Text:lower:pandas", 65 | "PREPROCESS:Text:remove_non_alpha:custom", 66 | "PREPROCESS:Text:tokenize:nltk", 67 | "PREPROCESS:Text:Lemmtize:nltk", 68 | ] 69 | 70 | balancing = [ 71 | "PREPROCESS:Balancing:SMOTE:imblearn", 72 | "PREPROCESS:Balancing:resample:custom", 73 | "PREPROCESS:Balancing:sample:custom", 74 | ] 75 | 76 | log_transform = [ 77 | "PREPROCESS:Scaling:log1p:numpy", 78 | "PREPROCESS:Scaling:power:custom", 79 | "PREPROCESS:Scaling:log:numpy", 80 | "PREPROCESS:Scaling:sqrt:numpy", 81 | "PREPROCESS:Scaling:exp:numpy", 82 | "PREPROCESS:Scaling:log:custom", 83 | "PREPROCESS:Scaling:power_transform:sklearn", 84 | ] 
85 | -------------------------------------------------------------------------------- /sapientml_core/design/search_space.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .. import ps_macros 16 | from .pp_component_groups import ( 17 | balancing, 18 | date, 19 | drop_label_list, 20 | filler_label, 21 | in_place_converter, 22 | log_transform, 23 | one_hot, 24 | scaling, 25 | text_processing, 26 | text_vect, 27 | ) 28 | 29 | target_labels = [ 30 | ps_macros.FILL, 31 | ps_macros.IN_PLACE_CONVERT, 32 | ps_macros.ONE_HOT, 33 | ps_macros.VECT, 34 | ps_macros.DATE, 35 | ps_macros.LEMMITIZE, 36 | ps_macros.BALANCING, 37 | ps_macros.SCALING, 38 | ps_macros.LOG, 39 | ] 40 | 41 | # Manually created semantic labels 42 | # Semantic labels are those that cannot be discriminated by our current list of meta-features 43 | 44 | 45 | label_mapping = { 46 | # macros.DROP: drop_label_list, 47 | ps_macros.FILL: filler_label, 48 | ps_macros.IN_PLACE_CONVERT: in_place_converter, 49 | ps_macros.ONE_HOT: one_hot, 50 | ps_macros.VECT: text_vect, 51 | ps_macros.MISSING: drop_label_list + filler_label, 52 | ps_macros.CATG: in_place_converter + one_hot, 53 | ps_macros.DATE: date, 54 | ps_macros.LEMMITIZE: text_processing, 55 | ps_macros.SCALING: scaling, 56 | ps_macros.BALANCING: balancing, 57 | ps_macros.LOG: log_transform, 58 | } 59 | 60 | project_related_metadata = ["file_name", "notebook_name", "csv_name", "accuracy", "target_column_name"] 61 | 62 | meta_feature_list = [ 63 | ps_macros.CATG_PRESENCE, 64 | ps_macros.TEXT_PRESENCE, 65 | ps_macros.BINARY_CATG_PRESENCE, 66 | ps_macros.SMALL_CATG_PRESENCE, 67 | ps_macros.LARGE_CATG_PRESENCE, 68 | ps_macros.MISSING_PRESENCE, 69 | ps_macros.NORMALIZED_MEAN, 70 | ps_macros.NORMALIZED_STD_DEV, 71 | ps_macros.NORMALIZED_VARIATION_ACROSS_COLUMNS, 72 | ps_macros.DATE_PRESENCE, 73 | ps_macros.IMBALANCE, 74 | ps_macros.MAX_SKEW, 75 | ] 76 | -------------------------------------------------------------------------------- /sapientml_core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import enum 16 | 17 | 18 | # various operators in decision path for FE/pre-processing meta-models. 
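# For example, a decision-path condition such as feature:missing_values_presence <= 0.5
# is parsed into a Predicate carrying Operator.LESS_THAN_OR_EQUAL_TO by _get_operator()
# in the adaptation code shown earlier; the feature name and threshold here are
# illustrative only.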
19 | class Operator(enum.Enum): 20 | EQUAL_TO = enum.auto() 21 | NOT_EQUAL_TO = enum.auto() 22 | GREATER_THAN = enum.auto() 23 | GREATER_THAN_OR_EQUAL_TO = enum.auto() 24 | LESS_THAN = enum.auto() 25 | LESS_THAN_OR_EQUAL_TO = enum.auto() 26 | -------------------------------------------------------------------------------- /sapientml_core/explain/code_template.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import datetime 16 | 17 | 18 | class Code_Template: 19 | """Code Template class.""" 20 | 21 | def __init__(self): 22 | self.str_reverse = {"NOW": str(datetime.datetime.now())} 23 | 24 | def update(self, lines): 25 | """update method. 26 | 27 | Parameters 28 | ---------- 29 | lines : list[str] 30 | Lines of a code block from the Jupyter content template. 31 | 32 | Returns 33 | ------- 34 | out : list[str] 35 | Updated lines of the code block from the Jupyter content template. 36 | 37 | """ 38 | out = [] 39 | for line in lines: 40 | for key in self.str_reverse: 41 | line = line.replace(key, self.str_reverse[key]) 42 | out.append(line) 43 | return out 44 | -------------------------------------------------------------------------------- /sapientml_core/explain/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Literal, Optional 16 | 17 | import pandas as pd 18 | from sapientml.params import CancellationToken 19 | from sapientml.util.logging import setup_logger 20 | 21 | from .AutoEDA import EDA 22 | from .AutoVisualization import AutoVisualization_Class 23 | from .code_miner import Miner 24 | 25 | logger = setup_logger() 26 | 27 | 28 | def process( 29 | visualization: bool, 30 | eda: bool, 31 | dataframe: pd.DataFrame, 32 | script_path: str, 33 | target_columns: list[str], 34 | problem_type: Literal["regression", "classification"], 35 | ignore_columns: Optional[list[str]] = None, 36 | skeleton: Optional[dict] = None, 37 | explanation: Optional[dict] = None, 38 | run_info: Optional[dict] = None, 39 | internal_execution: bool = False, 40 | timeout: int = 0, 41 | cancel: Optional[CancellationToken] = None, 42 | ): 43 | """process function.
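Generate explanation notebooks by combining auto-EDA blocks, optional visualization code, and the mined pipeline script.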
44 | 45 | Parameters 46 | ---------- 47 | visualization : bool 48 | True to generate visualization code; otherwise False. 49 | eda : bool 50 | True to generate auto-EDA blocks; otherwise False. 51 | dataframe : pd.DataFrame 52 | Input dataframe. 53 | script_path : str 54 | Path of the script. 55 | target_columns : list[str] 56 | Names of target columns. 57 | problem_type : Literal["regression", "classification"] 58 | Type of problem: either regression or classification. 59 | ignore_columns : list[str], optional 60 | Column names which must not be used and must be dropped. 61 | skeleton : dict, optional 62 | Probability scores and other details of preprocessing and model components. 63 | explanation : dict, optional 64 | Explanations of the pipelines. 65 | run_info : dict, optional 66 | Execution results, logs, and other information. 67 | internal_execution : bool 68 | True to execute the generated notebooks; otherwise False. 69 | timeout : int 70 | Timeout value for notebook execution. 71 | cancel : CancellationToken, optional 72 | 73 | Returns 74 | ------- 75 | output_files : List[str] 76 | List of generated .ipynb files. 77 | 78 | """ 79 | output_files = None 80 | 81 | if visualization: 82 | # Call AutoVisualization to generate visualization code 83 | AV = AutoVisualization_Class() 84 | visualization_code = AV.AutoVisualization( 85 | df=dataframe, 86 | target_columns=target_columns, 87 | problem_type=problem_type, 88 | ignore_columns=ignore_columns, 89 | ) 90 | else: 91 | visualization_code = None 92 | 93 | if eda: 94 | # Handle list/tuple/dict values in the dataframe. 95 | for col in dataframe.columns: 96 | exist_list_values = [x for x in dataframe[col] if type(x) in [list, tuple, dict]] 97 | if len(exist_list_values) > 0: 98 | dataframe[col] = dataframe[col].fillna("").astype(str) 99 | eda = EDA(dataframe, target_columns, log_level=2) 100 | 101 | eda.check_consistency(convert=False) 102 | 103 | categories, desc = eda.cat_process(threshold=0.01, IQR_activation=True, z_activation=True) 104 | 105 | initial_blocks = eda.description 106 | else: 107 | initial_blocks = [] 108 | 109 | code_miner = Miner( 110 | script_path, 111 | init_blocks=initial_blocks, 112 | visualization_code=visualization_code, 113 | logger=logger, 114 | skeleton=skeleton, 115 | explanation=explanation, 116 | run_info=run_info, 117 | ) 118 | output_files = code_miner.save_all(execution=internal_execution, timeout=timeout, cancel=cancel) 119 | return output_files 120 | -------------------------------------------------------------------------------- /sapientml_core/explain/templates/jupyter_content.json: -------------------------------------------------------------------------------- 1 | {"# BEGIN": [["# Use a generic Kaggle dataset path to start"], []], "#*** PIPELINE ***": [["We have to preprocess the dataset as the first step.", "Then, we will generate a pipeline to train a model."], []], "# LOAD DATA": [["# Input Dataset"], []], "# PREPROCESSING-number": [["# Feature Engineering"], []], "# DETATCH TARGET": [[], []], "# TRAIN TEST SPLIT": [["## Split Train/Test", "We have to separate train and test data before training a model"], []], "# MODEL": [["# Train a Model"], []]} -------------------------------------------------------------------------------- /sapientml_core/internal_path.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from pathlib import Path 17 | 18 | sapientml_core_root = Path(__file__).parents[0] 19 | 20 | adaptation_root_dir = sapientml_core_root / "adaptation" 21 | artifacts_path = adaptation_root_dir / "artifacts" 22 | model_path = sapientml_core_root / "models" 23 | 24 | benchmark_path = sapientml_core_root / "benchmarks" 25 | corpus_path = sapientml_core_root / "corpus" 26 | training_cache = sapientml_core_root / ".cache" 27 | 28 | execution_cache_dir = training_cache / "exec_info" 29 | analysis_dir = training_cache / "analysis" 30 | clean_notebooks_dir_name = "clean-notebooks" 31 | clean_dir = corpus_path / clean_notebooks_dir_name 32 | project_labels_path = corpus_path / "annotated-notebooks" / "annotated-notebooks-1140.csv" 33 | -------------------------------------------------------------------------------- /sapientml_core/models/PY310/mp_model_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY310/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY310/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY311/mp_model_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY311/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY311/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY39/mp_model_1.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY39/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY39/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/models/mp_model_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .generator import DefaultPreprocess 16 | from .params import DefaultPreprocessConfig 17 | 18 | __all__ = ["DefaultPreprocess", "DefaultPreprocessConfig"] 19 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/params.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sapientml.params import Config, String 4 | 5 | 6 | class DefaultPreprocessConfig(Config): 7 | """Configuration arguments for the DefaultPreprocess class. 8 | 9 | Attributes 10 | ---------- 11 | use_pos_list : Optional[list[str]] 12 | List of parts of speech to be used during text analysis. 13 | This variable is used for Japanese text analysis. 14 | Select from the parts of speech below: 15 | "名詞" (noun), "動詞" (verb), "形容詞" (adjective), "形容動詞" (adjectival noun), "副詞" (adverb).
16 | use_word_stemming : bool, default True 17 | Specify whether or not word stemming is used. 18 | This variable is used for Japanese text analysis. 19 | 20 | """ 21 | 22 | use_pos_list: Optional[list[String]] = ["名詞", "動詞", "助動詞", "形容詞", "副詞"] 23 | use_word_stemming: bool = True 24 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/drop_one_value_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # DISCARD COLUMNS WITH ONE VALUE ONLY 2 | cols_one_value_only = {{ cols_one_value_only }} 3 | {% if training %} 4 | train_dataset = train_dataset.drop(cols_one_value_only, axis=1, errors="ignore") 5 | {% endif %} 6 | {% if test %} 7 | test_dataset = test_dataset.drop(cols_one_value_only, axis=1, errors="ignore") 8 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_inf_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # CONVERT INF TO NAN 2 | import numpy as np 3 | cols_inf_values = {{ cols_inf_values }} 4 | {% if training %} 5 | train_dataset[cols_inf_values] = train_dataset[cols_inf_values].replace([-np.inf, np.inf], np.nan) 6 | {% endif %} 7 | {% if test %} 8 | test_dataset[cols_inf_values] = test_dataset[cols_inf_values].replace([-np.inf, np.inf], np.nan) 9 | {% endif %} 10 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_iterable_values.py.jinja: -------------------------------------------------------------------------------- 1 | # HANDLE ITERABLE VALUES IN DATAFRAME 2 | cols_iterable_values = {{ cols_iterable_values }} 3 | for col in cols_iterable_values: 4 | {% if training %} 5 | train_dataset[col] = train_dataset[col].fillna("").astype(str) 6 | {% endif %} 7 | {% if test %} 8 | test_dataset[col] = test_dataset[col].fillna("").astype(str) 9 | {% endif %} 10 | 11 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_japanese_text.py.jinja: -------------------------------------------------------------------------------- 1 | # HANDLE JAPANESE TEXT 2 | import MeCab 3 | import ipadic 4 | tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS) 5 | use_pos_list = {{ config.use_pos_list }} 6 | use_word_stemming = {{ config.use_word_stemming }} 7 | def tokenize(text, use_pos_list, use_word_stemming, tokenizer): 8 | node = tokenizer.parseToNode(text) 9 | terms = [] 10 | while node: 11 | features = node.feature.split(",") 12 | pos = features[0] 13 | if pos != "BOS/EOS": 14 | if use_word_stemming: 15 | term = features[6] 16 | if (pos == "名詞") & (features[1] == "数"): 17 | term = node.surface 18 | else: 19 | term = node.surface 20 | if use_pos_list: 21 | if pos in use_pos_list: 22 | terms.append(term) 23 | else: 24 | terms.append(term) 25 | node = node.next 26 | return " ".join(terms) 27 | cols_japanese_text = {{ cols_japanese_text }} 28 | for col in cols_japanese_text: 29 | {% if training %} 30 | train_dataset[col] = train_dataset[col].fillna("").apply(lambda x: tokenize(x, use_pos_list, use_word_stemming, tokenizer)) 31 | {% endif %} 32 | {% if test %} 33 | test_dataset[col] = test_dataset[col].fillna("").apply(lambda x: tokenize(x, use_pos_list, use_word_stemming, tokenizer)) 34 | {% endif %} 35 | 36 |
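# A minimal sketch of what the tokenize() helper rendered from the template above does for
# a single sentence, assuming mecab-python3 and ipadic are installed; the helper name, the
# sample sentence, and the expected output are illustrative only (actual tokens depend on
# the dictionary version):
import MeCab
import ipadic

tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

def tokenize_nouns_only(text: str) -> str:
    # Keep only terms whose top-level POS is "名詞" (noun), mirroring
    # use_pos_list=["名詞"], with stemming disabled for brevity.
    node = tagger.parseToNode(text)
    terms = []
    while node:
        pos = node.feature.split(",")[0]
        if pos == "名詞":
            terms.append(node.surface)
        node = node.next
    return " ".join(terms)

print(tokenize_nouns_only("私は東京で寿司を食べた"))  # e.g. "私 東京 寿司"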
-------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_mixed_typed_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # HANDLE MIXED TYPE 2 | import numpy as np 3 | cols_numeric_and_string = {{ cols_numeric_and_string}} 4 | for col in cols_numeric_and_string: 5 | {% if training %} 6 | train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan) 7 | train_dataset[col + '__str'] = np.where(train_dataset[col + '__str'].notnull(), train_dataset[col + '__str'].astype(str), np.nan) 8 | train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float) 9 | train_dataset = train_dataset.drop(col, axis=1) 10 | {% endif %} 11 | {% if test %} 12 | test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan) 13 | test_dataset[col + '__str'] = np.where(test_dataset[col + '__str'].notnull(), test_dataset[col + '__str'].astype(str), np.nan) 14 | test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float) 15 | test_dataset = test_dataset.drop(col, axis=1) 16 | {% endif %} 17 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/none_has_columns.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | {% if training %} 3 | train_dataset = train_dataset.replace([None], np.nan) 4 | {% endif %} 5 | {% if test %} 6 | test_dataset = test_dataset.replace([None], np.nan) 7 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/rename_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # Remove special symbols that interfere with visualization and model training 2 | import re 3 | cols_has_symbols = {{ cols_has_symbols }} 4 | inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") 5 | {% if training %} 6 | train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) 7 | {% endif %} 8 | {% if test %} 9 | test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) 10 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/ps_macros.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
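# Canonical label strings ("macros") for preprocessing components and meta-features;
# search_space.py (above) groups the concrete labels mined from notebooks
# (pp_component_groups.py) under these macros.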
14 | 15 | 16 | FILL = "PREPROCESS:MissingValues:fillna:pandas" 17 | IN_PLACE_CONVERT = "PREPROCESS:Category:LabelEncoder:sklearn" 18 | ONE_HOT = "PREPROCESS:Category:get_dummies:pandas" 19 | VECT = "PREPROCESS:Text:TfidfVectorizer:sklearn" 20 | MISSING = "PREPROCESS:MissingValues:all" 21 | CATG = "PREPROCESS:Category:all" 22 | SCALING = "PREPROCESS:Scaling:STANDARD:sklearn" 23 | DATE = "PREPROCESS:GenerateColumn:DATE:pandas" 24 | LEMMITIZE = "PREPROCESS:TextProcessing:Processing:custom" 25 | BALANCING = "PREPROCESS:Balancing:SMOTE:imblearn" 26 | LOG = "PREPROCESS:Scaling:log:custom" 27 | 28 | # Revised meta-features 29 | 30 | CATG_PRESENCE = "feature:str_category_presence" 31 | TEXT_PRESENCE = "feature:str_text_presence" 32 | BINARY_CATG_PRESENCE = "feature:str_category_binary_presence" 33 | SMALL_CATG_PRESENCE = "feature:str_category_small_presence" 34 | LARGE_CATG_PRESENCE = "feature:str_category_large_presence" 35 | DATE_PRESENCE = "feature:str_date_presence" 36 | STR_OTHER = "feature:str_other" 37 | 38 | MISSING_PRESENCE = "feature:missing_values_presence" 39 | DATE_PRESENCE = "feature:str_date_presence" 40 | 41 | NORMALIZED_MEAN = "feature:max_normalized_mean" 42 | NORMALIZED_STD_DEV = "feature:max_normalized_stddev" 43 | NORMALIZED_VARIATION_ACROSS_COLUMNS = "feature:normalized_variation_across_columns" 44 | IMBALANCE = "feature:target_imbalance_score" 45 | MAX_SKEW = "feature:max_skewness" 46 | 47 | 48 | TASK_CLASSIFICATION = "classification" 49 | TASK_REGRESSION = "regression" 50 | -------------------------------------------------------------------------------- /sapientml_core/seeding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /sapientml_core/templates/explainability_templates/model_explanation.py.jinja: -------------------------------------------------------------------------------- 1 | # Component: {{ target_component_name }} 2 | # Efficient Cause: {{ target_component_name }} is required in this pipeline since the dataset has {{ relevant_meta_feature_list }}. -------------------------------------------------------------------------------- /sapientml_core/templates/explainability_templates/preprocessing_explanation.py.jinja: -------------------------------------------------------------------------------- 1 | # Component: {{ target_component_name }} 2 | # Efficient Cause: {{ target_component_name }} is required in this pipeline since the dataset has {{ relevant_meta_feature_list }}. The relevant features are: {{ relevant_column_list }}. 
3 | # Purpose: {{ api_description }} 4 | # Form: 5 | # Input: {{ data_shape }} 6 | # Key hyperparameters used: {{ hyperparameters_description }} 7 | # Alternatives: Although {{ alternative_component_list }} can also be used for this dataset, {{ target_component_name }} is used because it has more {{ relevant_meta_feature_1 }} than {{ relevant_meta_feature_2 }}. 8 | # Order: {{ target_component_name }} should be applied {{ before_or_after }} {{ dependent_component_list }} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/classification_post_process.jinja: -------------------------------------------------------------------------------- 1 | # POST PROCESSING 2 | {% if pipeline.adaptation_metric.startswith("MAP_") %} 3 | y_pred_sorted_index = pd.DataFrame(np.argsort(-y_pred)) 4 | y_pred = y_pred_sorted_index.apply(lambda x: model.classes_[x]).to_numpy() 5 | {% else %} 6 | if np.shape(y_pred)[1] == 2: 7 | y_pred = y_pred[:, 1] 8 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/hyperparameter_tuning.py.jinja: -------------------------------------------------------------------------------- 1 | # HYPERPARAMETER OPTIMIZATION 2 | import optuna 3 | from {{import_library}} import {{ model_name }} 4 | 5 | 6 | # NEED CV: ex.) optuna.integration.OptunaSearchCV() 7 | class Objective(object): 8 | def __init__(self, feature_train, target_train, feature_test, target_test, __random_state): 9 | self.feature_train = feature_train 10 | self.target_train = target_train 11 | self.feature_test = feature_test 12 | self.target_test = target_test 13 | self.__random_state = __random_state 14 | def __call__(self, trial): 15 | def set_hyperparameters(trial): 16 | params = {} 17 | {{ params }} 18 | return params 19 | 20 | # SET DATA 21 | import numpy as np 22 | 23 | if isinstance(self.feature_train, pd.DataFrame): 24 | feature_train = self.feature_train 25 | elif isinstance(self.feature_train, np.ndarray): 26 | feature_train = pd.DataFrame(self.feature_train) 27 | else: 28 | feature_train = pd.DataFrame(self.feature_train.toarray()) 29 | 30 | if isinstance(self.target_train, pd.DataFrame): 31 | target_train = self.target_train 32 | elif isinstance(self.target_train, np.ndarray): 33 | target_train = pd.DataFrame(self.target_train) 34 | else: 35 | target_train = pd.DataFrame(self.target_train.toarray()) 36 | 37 | if isinstance(self.feature_test, pd.DataFrame): 38 | feature_test = self.feature_test 39 | elif isinstance(self.feature_test, np.ndarray): 40 | feature_test = pd.DataFrame(self.feature_test) 41 | else: 42 | feature_test = pd.DataFrame(self.feature_test.toarray()) 43 | 44 | if isinstance(self.target_test, pd.DataFrame): 45 | {% if 'inverse_target' in pipeline.pipeline_json %} 46 | target_test = self.target_test.copy() 47 | {% else %} 48 | target_test = self.target_test 49 | {% endif %} 50 | elif isinstance(self.target_test, np.ndarray): 51 | target_test = pd.DataFrame(self.target_test) 52 | else: 53 | target_test = pd.DataFrame(self.target_test.toarray()) 54 | 55 | # MODEL 56 | params = set_hyperparameters(trial) 57 | {% if flag_no_random_seed_model %} 58 | model = {{ model_name }}(**params) 59 | {% else %} 60 | model = {{ model_name }}(random_state=self.__random_state, **params) 61 | {% endif %} 62 | {% if is_multioutput_regression%} 63 | from sklearn.multioutput import MultiOutputRegressor 64 | 65 | model = MultiOutputRegressor(model) 66 | {% elif 
is_multioutput_classification %} 67 | from sklearn.multioutput import MultiOutputClassifier 68 | 69 | model = MultiOutputClassifier(model) 70 | {% endif %} 71 | {% set xgbclassifier = "XGBClassifier" %} 72 | {% if model_name == xgbclassifier %} 73 | from sklearn.preprocessing import LabelEncoder 74 | 75 | label_encoder = LabelEncoder() 76 | target_train = label_encoder.fit_transform(target_train) 77 | {% endif %} 78 | 79 | {% if pipeline.task.target_columns|length == 1 %} 80 | {% if model_name == xgbclassifier %} 81 | model.fit(feature_train, target_train.ravel()) 82 | {% else %} 83 | model.fit(feature_train, target_train.values.ravel()) 84 | {% endif %} 85 | {% else %} 86 | model.fit(feature_train, target_train) 87 | {% endif %} 88 | {% if flag_predict_proba == False %} 89 | y_pred = model.predict(feature_test) 90 | {% if model_name == xgbclassifier and not flag_predict_proba%} 91 | y_pred = label_encoder.inverse_transform(y_pred) 92 | {% endif %} 93 | {% elif flag_predict_proba == True %} 94 | y_pred = model.predict_proba(feature_test) 95 | {% filter indent(width=8, first=True) %} 96 | {{ binary_classification_snippet }} 97 | {% endfilter %} 98 | {% endif %} 99 | 100 | {% if 'inverse_target' in pipeline.pipeline_json %} 101 | {% filter indent(width=8, first=True) %} 102 | {{ pipeline.pipeline_json['inverse_target_hpo']['code'] }} 103 | {% endfilter %} 104 | {% endif %} 105 | 106 | {{ evaluation }} 107 | 108 | return score 109 | 110 | n_trials = {{ pipeline.config.hyperparameter_tuning_n_trials }} 111 | timeout = {{ timeout }} 112 | random_state = {{ pipeline.config.hyperparameter_tuning_random_state}} 113 | random_state_model = {{ pipeline.config.seed_for_model}} 114 | 115 | {% set maximize_metrics = [macros.Metric.AUC.value, macros.Metric.Accuracy.value, macros.Metric.F1.value, macros.Metric.R2.value, macros.Metric.Gini.value, macros.Metric.ROC_AUC.value] %} 116 | {% set minimize_metrics = [macros.Metric.RMSE.value, macros.Metric.RMSLE.value, macros.Metric.MAE.value, macros.Metric.LogLoss.value] %} 117 | 118 | {% if pipeline.adaptation_metric in maximize_metrics %} 119 | direction = 'maximize' 120 | {% elif pipeline.adaptation_metric in minimize_metrics %} 121 | direction = 'minimize' 122 | {% else %} 123 | direction = 'maximize' 124 | {% endif %} 125 | 126 | study = optuna.create_study(direction=direction, 127 | sampler=optuna.samplers.TPESampler(seed=random_state)) 128 | {{ enqueue_default_hyperparameters }} 129 | study.optimize(Objective(feature_train, target_train, feature_test, target_test, random_state_model), 130 | n_trials=n_trials, 131 | timeout=timeout) 132 | best_params = study.best_params 133 | 134 | print("best params:", best_params) 135 | print("RESULT: {{ pipeline.task.adaptation_metric }}: " + str(study.best_value)) -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/hyperparameters_default_value.py.jinja: -------------------------------------------------------------------------------- 1 | {% if model_name == 'RandomForestClassifier' %} 2 | default_hyperparameters = {'class_weight': None, 'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100, 'oob_score': False} 3 | {% elif model_name == 'RandomForestRegressor' %} 4 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 100, 'oob_score': False} 5 | {% elif model_name == 'ExtraTreesClassifier' %} 6 | default_hyperparameters = {'class_weight': None, 
'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100} 7 | {% elif model_name == 'ExtraTreesRegressor' %} 8 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 100} 9 | {% elif model_name == 'GradientBoostingClassifier' %} 10 | default_hyperparameters = {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0} 11 | {% elif model_name == 'GradientBoostingRegressor' %} 12 | default_hyperparameters = {'alpha': 0.9, 'criterion': 'friedman_mse', 'loss': 'squared_error', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0} 13 | {% elif model_name == 'AdaBoostClassifier' %} 14 | default_hyperparameters = {'algorithm': 'SAMME.R', 'n_estimators': 50} 15 | {% elif model_name == 'AdaBoostRegressor' %} 16 | default_hyperparameters = {'loss': 'linear', 'n_estimators': 50} 17 | {% elif model_name == 'DecisionTreeClassifier' %} 18 | default_hyperparameters = {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1} 19 | {% elif model_name == 'DecisionTreeRegressor' %} 20 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': None, 'min_samples_leaf': 1} 21 | {% elif model_name == 'SVC' %} 22 | default_hyperparameters = {'C': 1.0, 'class_weight': None} 23 | {% elif model_name == 'SVR' %} 24 | default_hyperparameters = {'C': 1.0} 25 | {% elif model_name == 'LinearSVC' %} 26 | default_hyperparameters = {'C': 1.0, 'class_weight': None, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'penalty': 'l2'} 27 | {% elif model_name == 'LinearSVR' %} 28 | default_hyperparameters = {'C': 1.0, 'intercept_scaling': 1.0, 'loss': 'epsilon_insensitive'} 29 | {% elif model_name == 'LogisticRegression' %} 30 | default_hyperparameters = {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'} 31 | {% elif model_name == 'SGDClassifier' %} 32 | default_hyperparameters = {'alpha': 0.0001, 'class_weight': None, 'early_stopping': False, 'loss': 'hinge', 'penalty': 'l2'} 33 | {% elif model_name == 'SGDRegressor' %} 34 | default_hyperparameters = {'alpha': 0.0001, 'loss': 'squared_error', 'penalty': 'l2'} 35 | {% elif model_name == 'Lasso' %} 36 | default_hyperparameters = {'alpha': 1.0} 37 | {% elif model_name == 'MLPClassifier' %} 38 | default_hyperparameters = {'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'} 39 | {% elif model_name == 'MLPRegressor' %} 40 | default_hyperparameters = {'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'} 41 | {% elif model_name == 'LGBMClassifier' or model_name == 'LGBMRegressor' %} 42 | default_hyperparameters = {'class_weight': None, 'colsample_bytree': 1.0, 'min_child_samples': 20, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1.0, 'subsample_freq': 0} 43 | {% elif model_name == 'XGBClassifier' %} 44 | default_hyperparameters = {'colsample_bytree': 1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1} 45 | {% elif model_name == 'XGBRegressor' %} 46 | default_hyperparameters = {'colsample_bytree': 1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1} 47 | {% elif model_name == 'CatBoostClassifier' or model_name == 'CatBoostRegressor' %} 48 | default_hyperparameters = {'boosting_type': 'Plain', 'depth': 6, 'bootstrap_type': 'MVS', 'silent': True} 49 
| {% else %} 50 | default_hyperparameters = {} 51 | {% endif %} 52 | study.enqueue_trial(default_hyperparameters) -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from {{import_library}} import {{ model_name }} 3 | 4 | {% if "CatBoost" in model_name %} 5 | {% set silent="silent=True, " %} 6 | {% else %} 7 | {% set silent="" %} 8 | {% endif %} 9 | {% if model_arg == "HPO_noRandomSeed" %} 10 | model = {{ model_name }}(**best_params) 11 | {% elif model_arg == "HPO_RandomSeed" %} 12 | random_state_model = {{ pipeline.config.seed_for_model}} 13 | model = {{ model_name }}(random_state=random_state_model, **best_params) 14 | {% elif model_arg == "noHPO_noRandomSeed" %} 15 | model = {{ model_name }}({{ silent }}{{ params }}) 16 | {% elif model_arg == "noHPO_RandomSeed" %} 17 | random_state_model = {{ pipeline.config.seed_for_model}} 18 | model = {{ model_name }}({{ silent }}random_state=random_state_model, {{ params }}) 19 | {% endif %} 20 | 21 | {% if is_multioutput_regression%} 22 | from sklearn.multioutput import MultiOutputRegressor 23 | 24 | model = MultiOutputRegressor(model) 25 | {% elif is_multioutput_classification %} 26 | from sklearn.multioutput import MultiOutputClassifier 27 | 28 | model = MultiOutputClassifier(model) 29 | {% endif %} 30 | {% set xgbclassifier = "XGBClassifier" %} 31 | {% if is_multioutput_classification %} 32 | from sklearn.preprocessing import LabelEncoder 33 | label_encoders = {} 34 | for i, column in enumerate(target_train.columns): 35 | le = LabelEncoder() 36 | target_train[column] = le.fit_transform(target_train[column]) 37 | label_encoders[column] = le 38 | {% elif model_name == xgbclassifier %} 39 | from sklearn.preprocessing import LabelEncoder 40 | 41 | label_encoder = LabelEncoder() 42 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS) 43 | {% endif %} 44 | {% if pipeline.task.target_columns|length == 1 %} 45 | model.fit(feature_train, target_train.values.ravel()) 46 | {% else %} 47 | model.fit(feature_train, target_train) 48 | {% endif %} 49 | y_pred = model.predict(feature_test) 50 | {% if flag_predict_proba and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %} 51 | y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1) 52 | {% endif %} 53 | {% if is_multioutput_classification %} 54 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) 55 | for column in TARGET_COLUMNS: 56 | y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int)) 57 | y_pred = y_pred_df 58 | {% elif model_name == xgbclassifier and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %} 59 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1) 60 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | {% set xgbclassifier = "XGBClassifier" %} 4 | 5 | with open('model.pkl', 'rb') as f: 6 | model = pickle.load(f) 7 | 8 | {% if 
(pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT) %} 9 | y_pred = model.predict(feature_test) 10 | {% endif %} 11 | {% if pipeline.adaptation_metric and flag_predict_proba %} 12 | y_prob = model.predict_proba(feature_test) 13 | {% endif %} 14 | {% if model_name == xgbclassifier or is_multioutput_classification %} 15 | with open('target_LabelEncoder.pkl', 'rb') as f: 16 | label_encoder = pickle.load(f) 17 | {% endif %} 18 | {% if is_multioutput_classification %} 19 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) 20 | for column in TARGET_COLUMNS: 21 | y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int)) 22 | y_pred = y_pred_df 23 | {% elif model_name == xgbclassifier and ((pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT)) %} 24 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1) 25 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model_test.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from {{import_library}} import {{ model_name }} 3 | 4 | {% if "CatBoost" in model_name %} 5 | {% set silent="silent=True, " %} 6 | {% else %} 7 | {% set silent="" %} 8 | {% endif %} 9 | {% if model_arg == "HPO_noRandomSeed" %} 10 | model = {{ model_name }}(**best_params) 11 | {% elif model_arg == "HPO_RandomSeed" %} 12 | random_state_model = {{ pipeline.config.seed_for_model}} 13 | model = {{ model_name }}(random_state=random_state_model, **best_params) 14 | {% elif model_arg == "noHPO_noRandomSeed" %} 15 | model = {{ model_name }}({{ silent }}{{ params }}) 16 | {% elif model_arg == "noHPO_RandomSeed" %} 17 | random_state_model = {{ pipeline.config.seed_for_model}} 18 | model = {{ model_name }}({{ silent }}random_state=random_state_model, {{ params }}) 19 | {% endif %} 20 | 21 | {% if is_multioutput_regression%} 22 | from sklearn.multioutput import MultiOutputRegressor 23 | 24 | model = MultiOutputRegressor(model) 25 | {% elif is_multioutput_classification %} 26 | from sklearn.multioutput import MultiOutputClassifier 27 | 28 | model = MultiOutputClassifier(model) 29 | {% endif %} 30 | {% set xgbclassifier = "XGBClassifier" %} 31 | {% if is_multioutput_classification %} 32 | from sklearn.preprocessing import LabelEncoder 33 | label_encoders = {} 34 | for i, column in enumerate(target_train.columns): 35 | le = LabelEncoder() 36 | target_train[column] = le.fit_transform(target_train[column]) 37 | label_encoders[column] = le 38 | {% elif model_name == xgbclassifier %} 39 | from sklearn.preprocessing import LabelEncoder 40 | 41 | label_encoder = LabelEncoder() 42 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS) 43 | {% endif %} 44 | {% if pipeline.task.target_columns|length == 1 %} 45 | model.fit(feature_train, target_train.values.ravel()) 46 | {% else %} 47 | model.fit(feature_train, target_train) 48 | {% endif %} 49 | y_pred = model.predict(feature_test) 50 | 51 | {% if is_multioutput_classification %} 52 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) 53 | for column in TARGET_COLUMNS: 54 | y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int)) 55 | y_pred = y_pred_df 56 | {% elif model_name == xgbclassifier %} 57 | y_pred = 
label_encoder.inverse_transform(y_pred).reshape(-1, 1) 58 | {% endif %} 59 | 60 | {% if pipeline.task.task_type == 'classification' %} 61 | y_prob = model.predict_proba(feature_test) 62 | 63 | # POST PROCESSING 64 | {% if pipeline.adaptation_metric.startswith("MAP_") %} 65 | y_prob_sorted_index = pd.DataFrame(np.argsort(-y_prob)) 66 | y_prob_map_k = y_prob_sorted_index.apply(lambda x: model.classes_[x]).to_numpy() 67 | {% endif %} 68 | 69 | {% if not is_multioutput_classification %} 70 | if np.shape(y_prob)[1] == 2: 71 | y_prob = y_prob[:, 1] 72 | {% endif %} 73 | 74 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model_train.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from {{import_library}} import {{ model_name }} 3 | 4 | {% if model_arg == "HPO_noRandomSeed" %} 5 | model = {{ model_name }}(**best_params) 6 | {% elif model_arg == "HPO_RandomSeed" %} 7 | random_state_model = {{ pipeline.config.seed_for_model}} 8 | model = {{ model_name }}(random_state=random_state_model, **best_params) 9 | {% elif model_arg == "noHPO_noRandomSeed" %} 10 | model = {{ model_name }}({{ params }}) 11 | {% elif model_arg == "noHPO_RandomSeed" %} 12 | random_state_model = {{ pipeline.config.seed_for_model}} 13 | model = {{ model_name }}(random_state=random_state_model, {{ params }}) 14 | {% endif %} 15 | 16 | {% if is_multioutput_regression%} 17 | from sklearn.multioutput import MultiOutputRegressor 18 | 19 | model = MultiOutputRegressor(model) 20 | {% elif is_multioutput_classification %} 21 | from sklearn.multioutput import MultiOutputClassifier 22 | 23 | model = MultiOutputClassifier(model) 24 | {% endif %} 25 | {% set xgbclassifier = "XGBClassifier" %} 26 | {% if is_multioutput_classification %} 27 | from sklearn.preprocessing import LabelEncoder 28 | label_encoders = {} 29 | for i, column in enumerate(target_train.columns): 30 | le = LabelEncoder() 31 | target_train[column] = le.fit_transform(target_train[column]) 32 | label_encoders[column] = le 33 | with open('target_LabelEncoder.pkl', 'wb') as f: 34 | pickle.dump(label_encoders, f) 35 | {% elif model_name == xgbclassifier %} 36 | from sklearn.preprocessing import LabelEncoder 37 | 38 | label_encoder = LabelEncoder() 39 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS) 40 | with open('target_LabelEncoder.pkl', 'wb') as f: 41 | pickle.dump(label_encoder, f) 42 | 43 | {% endif %} 44 | {% if pipeline.task.target_columns|length == 1 %} 45 | model.fit(feature_train, target_train.values.ravel()) 46 | {% else %} 47 | model.fit(feature_train, target_train) 48 | {% endif %} 49 | with open('model.pkl', 'wb') as f: 50 | pickle.dump(model, f) -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/confusion_matrix.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import ConfusionMatrixDisplay 2 | {% if pipeline.task.target_columns|length == 1 %} 3 | ConfusionMatrixDisplay.from_predictions(target_test, y_pred) 4 | {% elif is_multioutput_classification %} 5 | for i, column in enumerate(y_pred.columns): 6 | disp = ConfusionMatrixDisplay.from_predictions(target_test[column], y_pred[column].values) 7 | disp.ax_.set_title(column) 8 | {% else %} 9 | for i, column in enumerate(target_test.columns): 10 | disp = 
ConfusionMatrixDisplay.from_predictions(target_test[column], y_pred[:, i]) 11 | disp.ax_.set_title(column) 12 | {% endif %} 13 | 14 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/drop_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # DISCARD IRRELEVANT COLUMNS 2 | irrelevant_columns = {{ irrelevant_columns }} 3 | {% if train %} 4 | train_dataset = train_dataset.drop(irrelevant_columns, axis=1, errors="ignore") 5 | {% endif %} 6 | {% if test %} 7 | test_dataset = test_dataset.drop(irrelevant_columns, axis=1, errors="ignore") 8 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/evaluation.py.jinja: -------------------------------------------------------------------------------- 1 | {% if pipeline.adaptation_metric == macros.Metric.AUC.value %} 2 | from sklearn.metrics import roc_auc_score 3 | {% if pipeline.task.is_multiclass == True %} 4 | auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 5 | {% else %} 6 | auc = roc_auc_score(target_test, y_pred) 7 | {% endif %} 8 | print('RESULT: AUC Score: ' + str(auc)) 9 | {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (not pipeline.is_multi_class_multi_targets) %} 10 | from sklearn.metrics import accuracy_score 11 | 12 | accuracy = accuracy_score(target_test, y_pred) 13 | print('RESULT: Accuracy: ' + str(accuracy)) 14 | {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (pipeline.is_multi_class_multi_targets) %} 15 | from sklearn.metrics import accuracy_score 16 | 17 | __accs = [] 18 | for i, col in enumerate(target_test.columns): 19 | one_acc = accuracy_score(target_test[col], y_pred[col]) 20 | __accs.append(one_acc) 21 | print(f"RESULT: Accuracy : {str(sum(__accs)/len(__accs))}") 22 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value and not is_multioutput_classification%} 23 | from sklearn import metrics 24 | 25 | f1 = metrics.f1_score(target_test, y_pred, average='macro') 26 | print('RESULT: F1 Score: ' + str(f1)) 27 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value and is_multioutput_classification%} 28 | from sklearn import metrics 29 | 30 | __f1s = [] 31 | for i, col in enumerate(target_test.columns): 32 | one_f1 = metrics.f1_score(target_test[col], y_pred[col], average='macro') 33 | __f1s.append(one_f1) 34 | print(f"RESULT: F1 Score : {str(sum(__f1s)/len(__f1s))}") 35 | {% elif pipeline.adaptation_metric == macros.Metric.R2.value %} 36 | from sklearn import metrics 37 | 38 | r2 = metrics.r2_score(target_test, y_pred) 39 | print('RESULT: R2 Score:', str(r2)) 40 | {% elif pipeline.adaptation_metric == macros.Metric.RMSE.value %} 41 | from sklearn.metrics import mean_squared_error 42 | 43 | rmse = mean_squared_error(target_test, y_pred, squared=False) 44 | print('RESULT: RMSE:', str(rmse)) 45 | {% elif pipeline.adaptation_metric == macros.Metric.RMSLE.value %} 46 | import numpy as np 47 | from sklearn.metrics import mean_squared_log_error 48 | 49 | target_test = np.clip(target_test, 0, None) 50 | y_pred = np.clip(y_pred, 0, None) 51 | rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred)) 52 | print('RESULT: RMSLE:', str(rmsle)) 53 | {% elif pipeline.adaptation_metric == macros.Metric.Gini.value %} 54 | from sklearn.metrics import roc_auc_score 55 | {% if pipeline.task.is_multiclass == True %} 56 | gini = 2 * 
roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1 57 | {% else %} 58 | gini = 2 * roc_auc_score(target_test, y_pred) - 1 59 | {% endif %} 60 | print('RESULT: Gini: ' + str(gini)) 61 | {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %} 62 | from sklearn.metrics import mean_absolute_error 63 | 64 | mae = mean_absolute_error(target_test, y_pred) 65 | print('RESULT: MAE:', str(mae)) 66 | {% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %} 67 | from sklearn.metrics import log_loss 68 | 69 | log_loss_score = log_loss(target_test, y_pred) 70 | print('RESULT: Log Loss:', str(log_loss_score)) 71 | {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %} 72 | from sklearn.metrics import roc_auc_score 73 | {% if pipeline.task.is_multiclass == True %} 74 | __roc_auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 75 | {% else %} 76 | __roc_auc = roc_auc_score(target_test, y_pred) 77 | {% endif %} 78 | print('RESULT: ROC AUC:', str(__roc_auc)) 79 | {% elif pipeline.adaptation_metric == macros.Metric.MCC.value %} 80 | from sklearn.metrics import matthews_corrcoef 81 | 82 | mcc = matthews_corrcoef(target_test, y_pred) 83 | print('RESULT: MCC:', str(mcc)) 84 | {% elif pipeline.adaptation_metric.startswith("MAP_") %} 85 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 86 | def apk(actual, predicted, k): 87 | if len(predicted)>k: 88 | predicted = predicted[:k] 89 | 90 | score = 0.0 91 | num_hits = 0.0 92 | 93 | for i,p in enumerate(predicted): 94 | if p in actual and p not in predicted[:i]: 95 | num_hits += 1.0 96 | score += num_hits / (i+1.0) 97 | 98 | return score / min(len(actual), k) 99 | 100 | def mapk(actual, predicted, k): 101 | """ Computes the mean average precision at k. 102 | 103 | Args: 104 | actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted 105 | predicted (list[list[str]] or ndarray): A list of lists of predicted elements 106 | (In each list, arrange in the order you predicted.) 
107 | k (int): The maximum number of predicted elements 108 | 109 | Returns: 110 | double: The mean average precision at k over the input lists 111 | """ 112 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 113 | 114 | map_k = mapk(target_test.to_numpy(), y_pred, k={{ k }}) 115 | print('RESULT: MAP@K: ' + str(map_k)) 116 | {% elif pipeline.adaptation_metric == macros.Metric.QWK.value %} 117 | from sklearn.metrics import cohen_kappa_score 118 | 119 | qwk = cohen_kappa_score(target_test, y_pred, weights='quadratic') 120 | print('RESULT: QWK:', str(qwk)) 121 | {% elif pipeline.adaptation_metric == macros.Metric.MAPE.value %} 122 | from sklearn.metrics import mean_absolute_percentage_error 123 | 124 | mape = mean_absolute_percentage_error(target_test, y_pred) 125 | print('RESULT: MAPE:', str(mape)) 126 | 127 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %} 128 | from sklearn import metrics 129 | 130 | r2 = metrics.r2_score(target_test, y_pred) 131 | print('RESULT: R2 Score:', str(r2)) 132 | {% else %} 133 | from sklearn import metrics 134 | 135 | f1 = metrics.f1_score(target_test, y_pred, average='macro') 136 | print('RESULT: F1 Score: ' + str(f1)) 137 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/evaluation_test.py.jinja: -------------------------------------------------------------------------------- 1 | {% if pipeline.task.task_type == macros.TASK_CLASSIFICATION %} 2 | 3 | ## Metric: F1 4 | from sklearn.metrics import f1_score 5 | {% if is_multioutput_classification%} 6 | f1_scores = [] 7 | for i, column in enumerate(target_test.columns): 8 | f1_score_value = f1_score(target_test[column], y_pred[column], average='macro') 9 | f1_scores.append(f1_score_value) 10 | average_f1_score = np.mean(f1_scores) 11 | print('RESULT: Average F1 Score:', str(average_f1_score)) 12 | {% else %} 13 | f1 = f1_score(target_test, y_pred, average='macro') 14 | print('RESULT: F1 Score: ' + str(f1)) 15 | {% endif%} 16 | 17 | ## Metric: Accuracy 18 | from sklearn.metrics import accuracy_score 19 | {% if not pipeline.is_multi_class_multi_targets %} 20 | accuracy = accuracy_score(target_test, y_pred) 21 | print('RESULT: Accuracy: ' + str(accuracy)) 22 | {% elif pipeline.is_multi_class_multi_targets %} 23 | __accs = [] 24 | for i, col in enumerate(target_test.columns): 25 | one_acc = accuracy_score(target_test[col], y_pred[col]) 26 | __accs.append(one_acc) 27 | print(f"RESULT: Average Accuracy : {str(sum(__accs)/len(__accs))}") 28 | {% endif %} 29 | 30 | ## Metric: AUC and Gini 31 | from sklearn.metrics import roc_auc_score 32 | {% if is_multioutput_classification %} 33 | auc_scores = [] 34 | gini_scores = [] 35 | for i, column in enumerate(target_test.columns): 36 | if y_prob[i].ndim == 2 and y_prob[i].shape[1] == 2: 37 | auc_score = roc_auc_score(target_test[column], y_prob[i][:, 1]) 38 | elif y_prob[i].ndim == 2: 39 | auc_score = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr") 40 | gini_score = 2 * auc_score - 1 41 | auc_scores.append(auc_score) 42 | gini_scores.append(gini_score) 43 | auc = np.mean(auc_scores) 44 | gini = np.mean(gini_scores) 45 | print('RESULT: Average AUC Score:', str(auc)) 46 | print('RESULT: Average Gini Score:', str(gini)) 47 | {% else %} 48 | {% if pipeline.task.is_multiclass == True %} 49 | auc = roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr") 50 | {% else %} 51 | auc = roc_auc_score(target_test, y_prob) 52 | {% endif %} 53 | gini = 2 
* auc - 1 54 | print('RESULT: AUC Score: ' + str(auc)) 55 | print('RESULT: Gini: ' + str(gini)) 56 | {% endif %} 57 | 58 | ## Metric: Log Loss 59 | from sklearn.metrics import log_loss 60 | {% if is_multioutput_classification %} 61 | log_loss_scores = [] 62 | for i, column in enumerate(target_test.columns): 63 | loss = log_loss(target_test[column], y_prob[i]) 64 | log_loss_scores.append(loss) 65 | avg_log_loss = np.mean(log_loss_scores) 66 | print('RESULT: Average Log Loss:', str(avg_log_loss)) 67 | {% else %} 68 | log_loss_score = log_loss(target_test, y_prob) 69 | print('RESULT: Log Loss:', str(log_loss_score)) 70 | {% endif %} 71 | 72 | {% if not is_multioutput_classification %} 73 | 74 | ## Metric: MCC 75 | from sklearn.metrics import matthews_corrcoef 76 | 77 | mcc = matthews_corrcoef(target_test, y_pred) 78 | print('RESULT: MCC:', str(mcc)) 79 | 80 | ## Metric: QWK 81 | from sklearn.metrics import cohen_kappa_score 82 | 83 | qwk = cohen_kappa_score(target_test, y_pred, weights='quadratic') 84 | print('RESULT: QWK:', str(qwk)) 85 | 86 | {% if pipeline.adaptation_metric.startswith("MAP_") %} 87 | ## Metric: MAP@K 88 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 89 | def apk(actual, predicted, k): 90 | if len(predicted)>k: 91 | predicted = predicted[:k] 92 | 93 | score = 0.0 94 | num_hits = 0.0 95 | 96 | for i,p in enumerate(predicted): 97 | if p in actual and p not in predicted[:i]: 98 | num_hits += 1.0 99 | score += num_hits / (i+1.0) 100 | 101 | return score / min(len(actual), k) 102 | 103 | def mapk(actual, predicted, k): 104 | """ Computes the mean average precision at k. 105 | 106 | Args: 107 | actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted 108 | predicted (list[list[str]] or ndarray): A list of lists of predicted elements 109 | (In each list, arrange in the order you predicted.) 
110 | k (int): The maximum number of predicted elements 111 | 112 | Returns: 113 | double: The mean average precision at k over the input lists 114 | """ 115 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 116 | 117 | map_k = mapk(target_test.to_numpy(), y_prob_map_k, k={{ k }}) 118 | print('RESULT: MAP@K: ' + str(map_k)) 119 | 120 | {% endif %} 121 | {% endif %} 122 | 123 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %} 124 | 125 | ## Metric: R2 126 | from sklearn import metrics 127 | 128 | r2 = metrics.r2_score(target_test, y_pred) 129 | print('RESULT: R2 Score:', str(r2)) 130 | 131 | ## Metric: RMSE 132 | from sklearn.metrics import mean_squared_error 133 | 134 | rmse = mean_squared_error(target_test, y_pred, squared=False) 135 | print('RESULT: RMSE:', str(rmse)) 136 | 137 | ## Metric: RMSLE 138 | import numpy as np 139 | from sklearn.metrics import mean_squared_log_error 140 | 141 | target_test = np.clip(target_test, 0, None) 142 | y_pred = np.clip(y_pred, 0, None) 143 | rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred)) 144 | print('RESULT: RMSLE:', str(rmsle)) 145 | 146 | ## Metric: MAE 147 | from sklearn.metrics import mean_absolute_error 148 | 149 | mae = mean_absolute_error(target_test, y_pred) 150 | print('RESULT: MAE:', str(mae)) 151 | 152 | ## Metric: MAPE 153 | from sklearn.metrics import mean_absolute_percentage_error 154 | 155 | mape = mean_absolute_percentage_error(target_test, y_pred) 156 | print('RESULT: MAPE:', str(mape)) 157 | 158 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/hyperparameter_tuning_evaluation.py.jinja: -------------------------------------------------------------------------------- 1 | {% if pipeline.adaptation_metric == macros.Metric.AUC.value %} 2 | from sklearn.metrics import roc_auc_score 3 | {% if pipeline.task.is_multiclass == True %} 4 | score = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 5 | {% else %} 6 | score = roc_auc_score(target_test, y_pred) 7 | {% endif %} 8 | {% elif pipeline.adaptation_metric == macros.Metric.Accuracy.value %} 9 | from sklearn.metrics import accuracy_score 10 | score = accuracy_score(target_test, y_pred) 11 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value %} 12 | from sklearn import metrics 13 | score = metrics.f1_score(target_test, y_pred, average='macro') 14 | {% elif pipeline.adaptation_metric == macros.Metric.R2.value %} 15 | from sklearn import metrics 16 | score = metrics.r2_score(target_test, y_pred) 17 | {% elif pipeline.adaptation_metric == macros.Metric.RMSE.value %} 18 | from sklearn.metrics import mean_squared_error 19 | score = mean_squared_error(target_test, y_pred, squared=False) 20 | {% elif pipeline.adaptation_metric == macros.Metric.RMSLE.value %} 21 | import numpy as np 22 | from sklearn.metrics import mean_squared_log_error 23 | target_test = np.clip(target_test, 0, None) 24 | y_pred = np.clip(y_pred, 0, None) 25 | score = np.sqrt(mean_squared_log_error(target_test, y_pred)) 26 | {% elif pipeline.adaptation_metric == macros.Metric.Gini.value %} 27 | from sklearn.metrics import roc_auc_score 28 | {% if pipeline.task.is_multiclass == True %} 29 | score = 2 * roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1 30 | {% else %} 31 | score = 2 * roc_auc_score(target_test, y_pred) - 1 32 | {% endif %} 33 | {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %} 34 | from sklearn.metrics import 
mean_absolute_error 35 | score = mean_absolute_error(target_test, y_pred) 36 | {% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %} 37 | from sklearn.metrics import log_loss 38 | score = log_loss(target_test, y_pred) 39 | {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %} 40 | from sklearn.metrics import roc_auc_score 41 | {% if pipeline.task.is_multiclass == True %} 42 | score = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 43 | {% else %} 44 | score = roc_auc_score(target_test, y_pred) 45 | {% endif %} 46 | {% elif pipeline.adaptation_metric.startswith("MAP_") %} 47 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 48 | def apk(actual, predicted, k): 49 | if len(predicted)>k: 50 | predicted = predicted[:k] 51 | 52 | score = 0.0 53 | num_hits = 0.0 54 | 55 | for i,p in enumerate(predicted): 56 | if p in actual and p not in predicted[:i]: 57 | num_hits += 1.0 58 | score += num_hits / (i+1.0) 59 | 60 | return score / min(len(actual), k) 61 | 62 | def mapk(actual, predicted, k): 63 | """ Computes the mean average precision at k. 64 | 65 | Args: 66 | actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted 67 | predicted (list[list[str]] or ndarray): A list of lists of predicted elements 68 | (In each list, arrange in the order you predicted.) 69 | k (int): The maximum number of predicted elements 70 | 71 | Returns: 72 | double: The mean average precision at k over the input lists 73 | """ 74 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 75 | 76 | score = mapk(target_test.to_numpy(), y_pred, k={{ k }}) 77 | {% elif pipeline.adaptation_metric == macros.Metric.MAPE.value %} 78 | from sklearn.metrics import mean_absolute_percentage_error 79 | score = mean_absolute_percentage_error(target_test, y_pred) 80 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %} 81 | from sklearn import metrics 82 | score = metrics.r2_score(target_test, y_pred) 83 | {% else %} 84 | from sklearn import metrics 85 | score = metrics.f1_score(target_test, y_pred, average='macro') 86 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/inverse_target.py.jinja: -------------------------------------------------------------------------------- 1 | # INVERSE TARGET 2 | import numpy as np 3 | 4 | COLS_TO_BE_INVERSED = list(set(NUMERIC_COLS_TO_SCALE) & set(TARGET_COLUMNS)) 5 | {% if flag_hyperparameter_tuning %} 6 | target_test[COLS_TO_BE_INVERSED] = np.expm1(target_test[COLS_TO_BE_INVERSED]) 7 | y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=feature_test.index) 8 | {% else %} 9 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 10 | target_test[COLS_TO_BE_INVERSED] = np.expm1(target_test[COLS_TO_BE_INVERSED]) 11 | y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=test_dataset.index) 12 | {% endif %} 13 | y_pred[COLS_TO_BE_INVERSED] = np.expm1(y_pred[COLS_TO_BE_INVERSED]) 14 | y_pred = y_pred.to_numpy() 15 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/permutation_importance.py.jinja: -------------------------------------------------------------------------------- 1 | # PERMUTATION IMPORTANCE 2 | from sklearn.inspection import permutation_importance 3 | {% if pipeline.task.target_columns|length == 1 %} 4 | {% set TARGET_TRAIN = 'target_train[TARGET_COLUMNS[0]]' %} 5 | {% else %} 6 | {% set TARGET_TRAIN 
= 'target_train' %} 7 | {% endif %} 8 | {% if pipeline.sparse_matrix %} 9 | if len(feature_train.columns) <= 100: 10 | perm = permutation_importance(model, feature_train.sparse.to_dense(), {{ TARGET_TRAIN }}, 11 | n_repeats=5, 12 | random_state=0) 13 | perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) 14 | perm_df.to_csv("./permutation_importance.csv", index=False) 15 | {% else %} 16 | if len(feature_train.columns) <= 100: 17 | perm = permutation_importance(model, feature_train, {{ TARGET_TRAIN }}, 18 | n_repeats=5, 19 | random_state=0) 20 | perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) 21 | perm_df.to_csv("./permutation_importance.csv", index=False) 22 | {% endif %} 23 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/prediction_result.py.jinja: -------------------------------------------------------------------------------- 1 | # OUTPUT PREDICTION 2 | {% set xgbclassifier = "XGBClassifier" %} 3 | {% if pipeline.config.predict_option == macros.PRED_PROBABILITY and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 4 | prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index) 5 | {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 6 | prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) 7 | {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 8 | prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) 9 | {% elif pipeline.config.predict_option is none and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 10 | prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index) 11 | {% elif pipeline.config.predict_option is none and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 12 | prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) 13 | {% elif pipeline.config.predict_option is none and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 14 | prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) 15 | {% elif pipeline.adaptation_metric.startswith("MAP_") %} 16 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 17 | {% if y_prob_map_k is none %} 18 | prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index) 19 | {% elif is_multioutput_classification %} 20 | prediction = y_pred 21 | {% else %} 22 | prediction = pd.DataFrame(y_prob_map_k, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob_map_k.shape[1] 
+ 1)], index=feature_test.index) 23 | {% endif %} 24 | {% else %} 25 | prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) 26 | {% endif %} 27 | prediction.to_csv("./prediction_result.csv") 28 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/preprocess_dataset.py.jinja: -------------------------------------------------------------------------------- 1 | # Export preprocessed dataset 2 | import time 3 | timestamp_str = time.strftime("%Y%m%d_%H%M%S") 4 | preprocess_dataset=pd.concat([pd.concat([feature_train, 5 | target_train], axis=1), 6 | pd.concat([feature_test, 7 | target_test], axis=1)]) 8 | preprocess_dataset.to_pickle(f"./preprocess_dataset_{timestamp_str}.pickle") 9 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/shap.py.jinja: -------------------------------------------------------------------------------- 1 | # Models are restricted because of execution time. 2 | {% set lgbmclassifier = "LGBMClassifier" %} 3 | models_for_shap = ['XGBClassifier', 'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor', 'GradientBoostingClassifier', 'GradientBoostingRegressor'] 4 | if model.__class__.__name__ in models_for_shap: 5 | import shap 6 | feature_shap = feature_train.sample(1000) if feature_train.shape[0] > 1000 else feature_train 7 | {% if model_name == lgbmclassifier %} 8 | explainer = shap.Explainer(model,feature_shap) 9 | {% else %} 10 | explainer = shap.Explainer(model) 11 | {% endif %} 12 | shap_values = explainer(feature_shap) 13 | 14 | # summarize the effects of all the features 15 | shap.plots.beeswarm(shap_values) 16 | 17 | #bar plots 18 | shap.plots.bar(shap_values) -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/target_separation_predict.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 4 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) 5 | target_test = test_dataset[TARGET_COLUMNS].copy() 6 | else: 7 | feature_test = test_dataset 8 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/target_separation_test.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) 4 | target_train = train_dataset[TARGET_COLUMNS].copy() 5 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 6 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) 7 | target_test = test_dataset[TARGET_COLUMNS].copy() 8 | else: 9 | feature_test = test_dataset 10 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/target_separation_train.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) 4 | target_train = train_dataset[TARGET_COLUMNS].copy() -------------------------------------------------------------------------------- 
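A minimal rendering sketch, illustrative only and not a file from this repository: it shows how placeholders such as {{ pipeline.task.target_columns }} in the target-separation templates above are filled. It assumes only the jinja2 package that these *.py.jinja files imply; the SimpleNamespace stand-in and the 'survived' column are made-up examples, whereas the real generator passes sapientml_core params objects.

# Illustrative sketch -- `pipeline` here is an ad-hoc stand-in, not the real params object.
from types import SimpleNamespace

from jinja2 import Template

TEMPLATE = (
    "# DETACH TARGET\n"
    "TARGET_COLUMNS = {{ pipeline.task.target_columns }}\n"
    "feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)\n"
    "target_train = train_dataset[TARGET_COLUMNS].copy()\n"
)

pipeline = SimpleNamespace(task=SimpleNamespace(target_columns=["survived"]))
print(Template(TEMPLATE).render(pipeline=pipeline))
# The list's Python repr is substituted verbatim, so the generated script
# contains the literal line: TARGET_COLUMNS = ['survived']

This is why the templates can splice column lists and config values directly into the generated scripts as Python literals.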
/sapientml_core/templates/other_templates/target_separation_validation.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) 4 | target_train = train_dataset[TARGET_COLUMNS].copy() 5 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) 6 | target_test = test_dataset[TARGET_COLUMNS].copy() 7 | 8 | -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | {% if 'discard_columns' in pipeline_json %} 4 | 5 | {{ pipeline_json['discard_columns']['code_predict'] }} 6 | {% endif %} 7 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 8 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 9 | {% for code in component['code_predict'] %} 10 | 11 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 12 | {{ code }} 13 | {% endfor %} 14 | {% endfor %} 15 | {% endif %} 16 | {% if 'target_separation' in pipeline_json %} 17 | 18 | {{ pipeline_json['target_separation']['code_predict'] }} 19 | {% endif %} 20 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 21 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %} 22 | {% for code in component['code_predict'] %} 23 | 24 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 25 | {{ code }} 26 | {% endfor %} 27 | {% endfor %} 28 | {% endif %} 29 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 30 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 31 | {% for code in component['code_predict'] %} 32 | 33 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 34 | {{ code }} 35 | {% endfor %} 36 | {% endfor %} 37 | {% endif %} 38 | {% if 'model' in pipeline_json %} 39 | 40 | # MODEL 41 | {{ pipeline_json['model']['code_predict'] }} 42 | {% endif %} 43 | {% if 'inverse_target' in pipeline_json %} 44 | 45 | {{ pipeline_json['inverse_target']['code'] }} 46 | {% endif %} 47 | {% if 'evaluation' in pipeline_json %} 48 | 49 | #EVALUATION 50 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 51 | {% filter indent(width=4, first=True) %} 52 | {{ pipeline_json['evaluation']['code_predict'] }} 53 | {% endfilter %} 54 | {% endif %} 55 | {% if 'output_prediction' in pipeline_json %} 56 | 57 | {{ pipeline_json['output_prediction']['code'] }} 58 | {% endif %} 59 | -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_test.py.jinja: -------------------------------------------------------------------------------- 1 | {% if 'discard_columns' in pipeline_json %} 2 | 3 | {{ pipeline_json['discard_columns']['code'] }} 4 | {% endif %} 5 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 6 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 7 | {% for code in component['code'] %} 8 | 9 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 10 | {{ code }} 11 | {% endfor %} 12 | {% endfor %} 13 | {% endif %} 14 | {% if 'target_separation' in pipeline_json %} 15 | 16 | {{ pipeline_json['target_separation']['code_test'] }} 17 | {% endif %} 18 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 19 | {% for component in 
pipeline_json["preprocessing_after_target_separation"].values() %} 20 | {% for code in component['code'] %} 21 | 22 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 23 | {{ code }} 24 | {% endfor %} 25 | {% endfor %} 26 | {% endif %} 27 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 28 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 29 | {% for code in component['code'] %} 30 | 31 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 32 | {{ code }} 33 | {% endfor %} 34 | {% endfor %} 35 | {% endif %} 36 | {% if flag_hyperparameter_tuning %} 37 | 38 | # BEST PARAMETERS IN THE CANDIDATE SCRIPT 39 | # PLEASE SEE THE CANDIDATE SCRIPTS FOR THE HYPERPARAMTER OPTIMIZATION CODE 40 | best_params = study.best_params 41 | {% endif %} 42 | 43 | {% if 'preprocess_dataset' in pipeline_json %} 44 | {{ pipeline_json['preprocess_dataset']['code_test'] }} 45 | 46 | {% endif %} 47 | {% if 'model' in pipeline_json %} 48 | 49 | # MODEL 50 | {{ pipeline_json['model']['code_test'] }} 51 | {% endif %} 52 | {% if 'inverse_target' in pipeline_json %} 53 | 54 | {{ pipeline_json['inverse_target']['code'] }} 55 | {% endif %} 56 | {% if 'evaluation' in pipeline_json %} 57 | 58 | #EVALUATION 59 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 60 | {% filter indent(width=4, first=True) %} 61 | {{ pipeline_json['evaluation']['code_test'] }} 62 | {% endfilter %} 63 | {% endif %} 64 | 65 | {% if 'confusion_matrix' in pipeline_json and pipeline.task.task_type == 'classification'%} 66 | # Confusion Matrix 67 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 68 | {% filter indent(width=4, first=True) %} 69 | {{ pipeline_json['confusion_matrix']['code'] }} 70 | {% endfilter %} 71 | {% endif %} 72 | {% if 'output_prediction' in pipeline_json %} 73 | 74 | {{ pipeline_json['output_prediction']['code'] }} 75 | {% endif %} 76 | {% if 'permutation_importance' in pipeline_json %} 77 | 78 | {{ pipeline_json['permutation_importance']['code'] }} 79 | {% endif %} 80 | 81 | {% if 'shap' in pipeline_json and not pipeline.task.is_multiclass %} 82 | 83 | {{ pipeline_json['shap']['code'] }} 84 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_train.py.jinja: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | {% if 'discard_columns' in pipeline_json %} 4 | 5 | {{ pipeline_json['discard_columns']['code_train'] }} 6 | {% endif %} 7 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 8 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 9 | {% for code in component['code_train'] %} 10 | 11 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 12 | {{ code }} 13 | {% endfor %} 14 | {% endfor %} 15 | {% endif %} 16 | {% if 'target_separation' in pipeline_json %} 17 | 18 | {{ pipeline_json['target_separation']['code_train'] }} 19 | {% endif %} 20 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 21 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %} 22 | {% for code in component['code_train'] %} 23 | 24 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 25 | {{ code }} 26 | {% endfor %} 27 | {% endfor %} 28 | {% endif %} 29 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 30 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 31 | {% for code in 
component['code_train'] %} 32 | 33 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 34 | {{ code }} 35 | {% endfor %} 36 | {% endfor %} 37 | {% endif %} 38 | {% if flag_hyperparameter_tuning %} 39 | 40 | # BEST PARAMETERS IN THE CANDIDATE SCRIPT 41 | # PLEASE SEE THE CANDIDATE SCRIPTS FOR THE HYPERPARAMETER OPTIMIZATION CODE 42 | best_params = study.best_params 43 | {% endif %} 44 | {% if 'model' in pipeline_json %} 45 | 46 | # MODEL 47 | {{ pipeline_json['model']['code_train'] }} 48 | {% endif %} 49 | -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_validation.py.jinja: -------------------------------------------------------------------------------- 1 | {% if 'discard_columns' in pipeline_json %} 2 | 3 | {{ pipeline_json['discard_columns']['code'] }} 4 | {% endif %} 5 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 6 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 7 | {% for code in component['code'] %} 8 | 9 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 10 | {{ code }} 11 | {% endfor %} 12 | {% endfor %} 13 | {% endif %} 14 | {% if 'target_separation' in pipeline_json %} 15 | 16 | {{ pipeline_json['target_separation']['code_validation'] }} 17 | {% endif %} 18 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 19 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %} 20 | {% for code in component['code'] %} 21 | 22 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 23 | {{ code }} 24 | {% endfor %} 25 | {% endfor %} 26 | {% endif %} 27 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 28 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 29 | {% for code in component['code'] %} 30 | 31 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 32 | {{ code }} 33 | {% endfor %} 34 | {% endfor %} 35 | {% endif %} 36 | {% if flag_hyperparameter_tuning %} 37 | 38 | {{ pipeline_json['hyperparameter_optimization']['code'] }} 39 | {% else %} 40 | {% if 'model' in pipeline_json %} 41 | {% if 'preprocess_dataset' in pipeline_json %} 42 | {{ pipeline_json['preprocess_dataset']['code_test'] }} 43 | 44 | {% endif %} 45 | 46 | # MODEL 47 | {{ pipeline_json['model']['code'] }} 48 | {% endif %} 49 | {% if 'inverse_target' in pipeline_json %} 50 | 51 | {{ pipeline_json['inverse_target']['code'] }} 52 | {% endif %} 53 | {% if 'evaluation' in pipeline_json %} 54 | 55 | #EVALUATION 56 | {{ pipeline_json['evaluation']['code_validation'] }} 57 | {% endif %} 58 | {% endif %}{# if flag_hyperparameter_tuning #} 59 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/DATE.py.jinja: -------------------------------------------------------------------------------- 1 | DATE_COLUMNS = {{ columns }} 2 | for _col in DATE_COLUMNS: 3 | train_date_col = pd.to_datetime({{ train_dataset }}[_col], errors='coerce') 4 | {{ train_dataset }}[_col + "_year"] = train_date_col.dt.year.fillna(-1) 5 | {{ train_dataset }}[_col + "_month"] = train_date_col.dt.month.fillna(-1) 6 | {{ train_dataset }}[_col + "_day"] = train_date_col.dt.day.fillna(-1) 7 | {{ train_dataset }}[_col + "_day_of_week"] = train_date_col.dt.dayofweek.fillna(-1) 8 | {{ train_dataset }}.drop(_col, axis=1, inplace=True) 9 | 10 | test_date_col = pd.to_datetime({{ test_dataset }}[_col], errors='coerce') 11 | {{ test_dataset }}[_col + "_year"] = 
test_date_col.dt.year.fillna(-1) 12 | {{ test_dataset }}[_col + "_month"] = test_date_col.dt.month.fillna(-1) 13 | {{ test_dataset }}[_col + "_day"] = test_date_col.dt.day.fillna(-1) 14 | {{ test_dataset }}[_col + "_day_of_week"] = test_date_col.dt.dayofweek.fillna(-1) 15 | {{ test_dataset }}.drop(_col, axis=1, inplace=True) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/DATE_predict.jinja: -------------------------------------------------------------------------------- 1 | DATE_COLUMNS = {{ columns }} 2 | for _col in DATE_COLUMNS: 3 | test_date_col = pd.to_datetime({{ test_dataset }}[_col], errors='coerce') 4 | {{ test_dataset }}[_col + "_year"] = test_date_col.dt.year.fillna(-1) 5 | {{ test_dataset }}[_col + "_month"] = test_date_col.dt.month.fillna(-1) 6 | {{ test_dataset }}[_col + "_day"] = test_date_col.dt.day.fillna(-1) 7 | {{ test_dataset }}[_col + "_day_of_week"] = test_date_col.dt.dayofweek.fillna(-1) 8 | {{ test_dataset }}.drop(_col, axis=1, inplace=True) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/DATE_train.jinja: -------------------------------------------------------------------------------- 1 | DATE_COLUMNS = {{ columns }} 2 | for _col in DATE_COLUMNS: 3 | train_date_col = pd.to_datetime({{ train_dataset }}[_col], errors='coerce') 4 | {{ train_dataset }}[_col + "_year"] = train_date_col.dt.year.fillna(-1) 5 | {{ train_dataset }}[_col + "_month"] = train_date_col.dt.month.fillna(-1) 6 | {{ train_dataset }}[_col + "_day"] = train_date_col.dt.day.fillna(-1) 7 | {{ train_dataset }}[_col + "_day_of_week"] = train_date_col.dt.dayofweek.fillna(-1) 8 | {{ train_dataset }}.drop(_col, axis=1, inplace=True) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/LabelEncoder.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OrdinalEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) 5 | {{ train_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]) 6 | {{ test_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/LabelEncoder_predict.py.jinja: -------------------------------------------------------------------------------- 1 | with open('ordinalEncoder.pkl', 'rb') as f: 2 | ordinal_encoder = pickle.load(f) 3 | 4 | CATEGORICAL_COLS = {{ columns }} 5 | {{ test_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/LabelEncoder_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OrdinalEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) 5 | {{ train_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]) 6 | 7 | with open('ordinalEncoder.pkl', 'wb') as f: 8 | pickle.dump(ordinal_encoder, f) 9 
| -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/Processing.py.jinja: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import nltk 5 | 6 | TEXT_COLUMNS = {{ columns }} 7 | def process_text(__dataset): 8 | for _col in TEXT_COLUMNS: 9 | process_text = [t.lower() for t in __dataset[_col]] 10 | 11 | # strip all punctuation 12 | table = str.maketrans('', '', string.punctuation) 13 | process_text = [t.translate(table) for t in process_text] 14 | 15 | # convert all numbers in text to 'num' 16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text] 17 | __dataset[_col] = process_text 18 | return __dataset 19 | 20 | {{ train_dataset }} = process_text({{ train_dataset }}) 21 | {{ test_dataset }} = process_text({{ test_dataset }}) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/Processing_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import nltk 5 | 6 | TEXT_COLUMNS = {{ columns }} 7 | def process_text(__dataset): 8 | for _col in TEXT_COLUMNS: 9 | process_text = [t.lower() for t in __dataset[_col]] 10 | 11 | # strip all punctuation 12 | table = str.maketrans('', '', string.punctuation) 13 | process_text = [t.translate(table) for t in process_text] 14 | 15 | # convert all numbers in text to 'num' 16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text] 17 | __dataset[_col] = process_text 18 | return __dataset 19 | 20 | {{ test_dataset }} = process_text({{ test_dataset }}) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/Processing_train.py.jinja: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import nltk 5 | 6 | TEXT_COLUMNS = {{ columns }} 7 | def process_text(__dataset): 8 | for _col in TEXT_COLUMNS: 9 | process_text = [t.lower() for t in __dataset[_col]] 10 | 11 | # strip all punctuation 12 | table = str.maketrans('', '', string.punctuation) 13 | process_text = [t.translate(table) for t in process_text] 14 | 15 | # convert all numbers in text to 'num' 16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text] 17 | __dataset[_col] = process_text 18 | return __dataset 19 | 20 | {{ train_dataset }} = process_text({{ train_dataset }}) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/SMOTE.py.jinja: -------------------------------------------------------------------------------- 1 | from imblearn.over_sampling import SMOTE 2 | 3 | smote = SMOTE(random_state=0) 4 | {% if pipeline.sparse_matrix %} 5 | feature_columns = feature_train.columns 6 | feature_train = feature_train.sparse.to_coo() 7 | feature_train, target_train = smote.fit_resample(feature_train, target_train) 8 | feature_train = pd.DataFrame.sparse.from_spmatrix(feature_train, columns=feature_columns) 9 | {% else %} 10 | feature_train, target_train = smote.fit_resample(feature_train, target_train) 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/STANDARD.py.jinja: -------------------------------------------------------------------------------- 1 | from 
sklearn.preprocessing import StandardScaler 2 | 3 | standard_scaler = StandardScaler(with_mean=False) 4 | {% if pipeline.sparse_matrix %} 5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %} 6 | {% else %} 7 | {% set dataframe = "pd.DataFrame" %} 8 | {% endif %} 9 | {{ train_dataset }} = {{ dataframe }}(standard_scaler.fit_transform({{ train_dataset }}), columns={{ train_dataset }}.columns, index={{ train_dataset }}.index) 10 | {{ test_dataset }} = {{ dataframe }}(standard_scaler.transform({{ test_dataset }}), columns={{ test_dataset }}.columns, index={{ test_dataset }}.index) 11 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/STANDARD_predict.py.jinja: -------------------------------------------------------------------------------- 1 | with open('standardScaler.pkl', 'rb') as f: 2 | standard_scaler = pickle.load(f) 3 | 4 | {% if pipeline.sparse_matrix %} 5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %} 6 | {% else %} 7 | {% set dataframe = "pd.DataFrame" %} 8 | {% endif %} 9 | {{ test_dataset }} = {{ dataframe }}(standard_scaler.transform({{ test_dataset }}), columns={{ test_dataset }}.columns, index={{ test_dataset }}.index) 10 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/STANDARD_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | 3 | standard_scaler = StandardScaler(with_mean=False) 4 | {% if pipeline.sparse_matrix %} 5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %} 6 | {% else %} 7 | {% set dataframe = "pd.DataFrame" %} 8 | {% endif %} 9 | {{ train_dataset }} = {{ dataframe }}(standard_scaler.fit_transform({{ train_dataset }}), columns={{ train_dataset }}.columns, index={{ train_dataset }}.index) 10 | 11 | with open('standardScaler.pkl', 'wb') as f: 12 | pickle.dump(standard_scaler, f) 13 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/TfidfVectorizer.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | 3 | TEXT_COLUMNS = {{ columns }} 4 | temp_train_data = {{ train_dataset }}[TEXT_COLUMNS] 5 | temp_test_data = {{ test_dataset }}[TEXT_COLUMNS] 6 | # Make the entire dataframe sparse to avoid it converting into a dense matrix. 
7 | {{ train_dataset }} = {{ train_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 8 | {{ test_dataset }} = {{ test_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 9 | 10 | {% if pipeline.config.use_word_list %} 11 | {% if pipeline.config.use_word_list is mapping %} 12 | # Use only specified words as features for each column 13 | use_word_list = {{ pipeline.config.use_word_list }} 14 | for col, word_list in use_word_list.items(): 15 | word_list = [word.lower() for word in word_list] 16 | word_list = list(set(word_list)) 17 | use_word_list[col] = word_list 18 | for _col in TEXT_COLUMNS: 19 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list.get(_col)) 20 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 21 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 22 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 23 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 24 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 25 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 26 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 27 | {% else %} 28 | # Use only specified words as features 29 | use_word_list = {{ pipeline.config.use_word_list }} 30 | use_word_list = [word.lower() for word in use_word_list] 31 | use_word_list = list(set(use_word_list)) 32 | for _col in TEXT_COLUMNS: 33 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list) 34 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 35 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 36 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 37 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 38 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 39 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 40 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 41 | {% endif %} 42 | {% else %} 43 | for _col in TEXT_COLUMNS: 44 | tfidfvectorizer = TfidfVectorizer(max_features=3000) 45 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 46 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 47 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 48 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 49 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 50 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 51 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 52 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/TfidfVectorizer_predict.py.jinja: -------------------------------------------------------------------------------- 1 | TEXT_COLUMNS = {{ columns }} 2 | temp_test_data = {{ test_dataset }}[TEXT_COLUMNS] 3 | # Make the entire dataframe sparse to avoid it converting into a dense matrix. 
4 | {{ test_dataset }} = {{ test_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 5 | with open('tfidfVectorizer.pkl', 'rb') as f: 6 | vectorizers = pickle.load(f) 7 | for _col in TEXT_COLUMNS: 8 | tfidfvectorizer = vectorizers[_col] 9 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 10 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 11 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 12 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 13 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/TfidfVectorizer_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | 3 | TEXT_COLUMNS = {{ columns }} 4 | temp_train_data = {{ train_dataset }}[TEXT_COLUMNS] 5 | # Make the entire dataframe sparse to avoid it converting into a dense matrix. 6 | {{ train_dataset }} = {{ train_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 7 | vectorizers = {} 8 | 9 | {% if pipeline.config.use_word_list %} 10 | {% if pipeline.config.use_word_list is mapping %} 11 | # Use only specified words as features for each column 12 | use_word_list = {{ pipeline.config.use_word_list }} 13 | for col, word_list in use_word_list.items(): 14 | word_list = [word.lower() for word in word_list] 15 | word_list = list(set(word_list)) 16 | use_word_list[col] = word_list 17 | for _col in TEXT_COLUMNS: 18 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list.get(_col)) 19 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 20 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 21 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 22 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 23 | vectorizers[_col] = tfidfvectorizer 24 | {% else %} 25 | # Use only specified words as features 26 | use_word_list = {{ pipeline.config.use_word_list }} 27 | use_word_list = [word.lower() for word in use_word_list] 28 | use_word_list = list(set(use_word_list)) 29 | for _col in TEXT_COLUMNS: 30 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list) 31 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 32 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 33 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 34 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 35 | vectorizers[_col] = tfidfvectorizer 36 | {% endif %} 37 | {% else %} 38 | for _col in TEXT_COLUMNS: 39 | tfidfvectorizer = TfidfVectorizer(max_features=3000) 40 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 41 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 42 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 43 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 44 | vectorizers[_col] = tfidfvectorizer 45 | {% endif %} 46 | 47 | with open('tfidfVectorizer.pkl', 'wb') as f: 48 | 
pickle.dump(vectorizers, f) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-numeric.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 7 | {{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 8 | {{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 9 | {% endif %} 10 | {% if cols_almost_missing_numeric %} 11 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }} 12 | {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 13 | {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 14 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-numeric_predict.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | with open('simpleimputer-numeric.pkl', 'rb') as f: 3 | simple_imputer = pickle.load(f) 4 | 5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | {{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 7 | {% endif %} 8 | {% if cols_almost_missing_numeric %} 9 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }} 10 | {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 11 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-numeric_train.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 7 | {{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 8 | 9 | with open('simpleimputer-numeric.pkl', 'wb') as f: 10 | pickle.dump(simple_imputer, f) 11 | {% endif %} 12 | {% if cols_almost_missing_numeric %} 13 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }} 14 | {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 15 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 7 | {{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 8 | {{ 
test_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 9 | {% endif %} 10 | {% if cols_almost_missing_string %} 11 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} 12 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 13 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 14 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 15 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 16 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | with open('simpleimputer-string.pkl', 'rb') as f: 3 | simple_imputer = pickle.load(f) 4 | 5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | {{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 7 | {% endif %} 8 | {% if cols_almost_missing_string %} 9 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} 10 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 11 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 12 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 7 | {{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 8 | 9 | with open('simpleimputer-string.pkl', 'wb') as f: 10 | pickle.dump(simple_imputer, f) 11 | {% endif %} 12 | {% if cols_almost_missing_string %} 13 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} 14 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 15 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 16 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/get_dummies.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OneHotEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) 5 | train_encoded = pd.DataFrame(onehot_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ train_dataset }}.index) 6 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, train_encoded ], axis=1) 7 | {{ train_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 8 | test_encoded = pd.DataFrame(onehot_encoder.transform({{ test_dataset 
}}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ test_dataset }}.index) 9 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, test_encoded ], axis=1) 10 | {{ test_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 11 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/get_dummies_predict.py.jinja: -------------------------------------------------------------------------------- 1 | with open('oneHotEncoder.pkl', 'rb') as f: 2 | onehot_encoder = pickle.load(f) 3 | 4 | CATEGORICAL_COLS = {{ columns }} 5 | test_encoded = pd.DataFrame(onehot_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ test_dataset }}.index) 6 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, test_encoded ], axis=1) 7 | {{ test_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 8 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/get_dummies_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OneHotEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) 5 | train_encoded = pd.DataFrame(onehot_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ train_dataset }}.index) 6 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, train_encoded ], axis=1) 7 | {{ train_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 8 | 9 | with open('oneHotEncoder.pkl', 'wb') as f: 10 | pickle.dump(onehot_encoder, f) 11 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/log.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | NUMERIC_COLS_TO_SCALE = {{ columns }} 4 | {{ train_dataset }}[NUMERIC_COLS_TO_SCALE] = np.log1p({{ train_dataset }}[NUMERIC_COLS_TO_SCALE]).replace([np.inf, -np.inf], np.nan).fillna({{ train_dataset }}[NUMERIC_COLS_TO_SCALE].mean()) 5 | 6 | 7 | NUMERIC_COLS_TO_SCALE_FOR_TEST = list(set({{ test_dataset }}.columns) & set(NUMERIC_COLS_TO_SCALE)) 8 | {{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST]).replace([np.inf, -np.inf], np.nan).fillna({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST].mean()) 9 | 10 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/log_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | NUMERIC_COLS_TO_SCALE = {{ columns }} 4 | NUMERIC_COLS_TO_SCALE_FOR_TEST = list(set({{ test_dataset }}.columns) & set(NUMERIC_COLS_TO_SCALE)) 5 | {{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST]).replace([np.inf, -np.inf], np.nan).fillna({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST].mean()) 6 | 7 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/log_train.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | NUMERIC_COLS_TO_SCALE = {{ columns }} 4 | {{
train_dataset }}[NUMERIC_COLS_TO_SCALE] = np.log1p({{ train_dataset }}[NUMERIC_COLS_TO_SCALE]).replace([np.inf, -np.inf], np.nan).fillna({{ train_dataset }}[NUMERIC_COLS_TO_SCALE].mean()) 5 | 6 | -------------------------------------------------------------------------------- /sapientml_core/training/augmentation/mutation_results.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from collections import OrderedDict, defaultdict 17 | 18 | import pandas as pd 19 | from sapientml_core import internal_path 20 | from sapientml_core.seeding.predictor import name_to_label_mapping 21 | from sapientml_core.training.project_corpus import ProjectCorpus 22 | from tqdm import tqdm 23 | 24 | 25 | class MutationResult: 26 |     """MutationResult class. 27 | 28 |     This class loads the mutation results for each pipeline that were previously stored in the sapientml_core cache, 29 |     combines all the results into a single CSV file, and selects the best model. 30 | 31 |     """ 32 | 33 |     def __init__(self, mutation_result_path, project_list): 34 |         self.mutation_result_path = mutation_result_path 35 |         self.project_list = project_list 36 | 37 |     def load_results(self): 38 |         """Collect the scores of the augmented pipelines from the exec_info directory.
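Each model's score is parsed from exec_info/<notebook_name>/<model>/stdout.txt;
for example, a line "Accuracy: 87.5%" is read as 0.875 and a line "R2: 0.91" as 0.91,
while a model whose result file is missing is scored as 0.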
39 | 40 | Returns 41 | ------- 42 | results: defaultdict 43 | 44 | """ 45 | results = defaultdict(defaultdict) 46 | models = list(name_to_label_mapping.keys()) + ["original"] 47 | execution_root_dir = internal_path.training_cache / "exec_info" 48 | 49 | for i in tqdm(range(0, len(self.project_list))): 50 | project = self.project_list[i] 51 | project_exec_dir = execution_root_dir / project.notebook_name 52 | project_key = project.file_name 53 | for model in models: 54 | result_file_path = project_exec_dir / model / "stdout.txt" 55 | acc, r2 = 0, 0 56 | if not os.path.exists(result_file_path): 57 | results[project_key][model] = 0 58 | continue 59 | with open(result_file_path, "r", encoding="utf-8") as f: 60 | lines = f.readlines() 61 | for line in lines: 62 | for trail in ["Accuracy: ", "R2: "]: 63 | data = line 64 | if data.count(trail) > 0: 65 | data = data[data.index(trail) + len(trail) :].strip() 66 | if data.count("%") > 0: 67 | data = data[: data.index("%")] 68 | data = float(data) / 100 69 | if trail == "Accuracy: ": 70 | acc = data 71 | if trail == "R2: ": 72 | r2 = data 73 | if project.metric == "accuracy": 74 | results[project_key][model] = round(acc, 5) 75 | elif project.metric == "r2": 76 | results[project_key][model] = round(r2, 5) 77 | 78 | best_models = [] 79 | sorted_results = sorted(results[project_key].items(), key=lambda x: x[1], reverse=True) 80 | best_value = 0 81 | for model, value in sorted_results: 82 | if value > 0 and value >= best_value: 83 | best_models.append(model) 84 | best_value = value 85 | else: 86 | break 87 | 88 | results[project_key]["best_models"] = best_models 89 | 90 | return results 91 | 92 | 93 | def main(): 94 | """Fetch the augmented pipeline results and store it in mutation_results.csv.""" 95 | corpus = ProjectCorpus() # Fetch all project and pipeline details 96 | mutation_result = MutationResult(internal_path.training_cache, corpus.project_list) 97 | results = mutation_result.load_results() 98 | result_list = [] 99 | for key, result in results.items(): 100 | result["file_name"] = key 101 | result = OrderedDict(result) 102 | result.move_to_end("file_name", last=False) 103 | result_list.append(result) 104 | result_dataframe = pd.DataFrame(result_list) 105 | result_dataframe.to_csv(internal_path.training_cache / "mutation_results.csv", index=False) 106 | 107 | 108 | if __name__ == "__main__": 109 | import argparse 110 | 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 113 | args = parser.parse_args() 114 | if args.tag: 115 | internal_path.training_cache = internal_path.training_cache / args.tag 116 | 117 | main() 118 | -------------------------------------------------------------------------------- /sapientml_core/training/dataflowmodel/ast_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Optional, Union 16 | 17 | import libcst as cst 18 | import sapientml.macros as macros 19 | from libcst import RemoveFromParent 20 | from libcst.metadata import ParentNodeProvider, PositionProvider 21 | 22 | 23 | class NameTransformer(cst.CSTTransformer): 24 | METADATA_DEPENDENCIES = ( 25 | ParentNodeProvider, 26 | PositionProvider, 27 | ) 28 | 29 | def __init__(self, replacement): 30 | self.as_names = {} 31 | self.count = 0 32 | self.replacement = replacement 33 | 34 | def leave_Name(self, original_node, updated_node) -> cst.CSTNode: 35 | source_string = original_node.value 36 | if source_string in self.replacement.keys(): 37 | return updated_node.with_changes(value=self.replacement[source_string]) 38 | else: 39 | return original_node 40 | 41 | def leave_SimpleString(self, original_node: cst.Name, updated_node: cst.Name) -> cst.CSTNode: 42 | source_string = original_node.value 43 | if source_string in self.replacement.keys(): 44 | return updated_node.with_changes(value='"' + self.replacement[source_string] + '"') 45 | else: 46 | return original_node 47 | 48 | def get_LineNumber(self, node): 49 | pos = self.get_metadata(PositionProvider, node).start 50 | return pos.line 51 | 52 | 53 | class ArgumentRemover(cst.CSTTransformer): 54 | METADATA_DEPENDENCIES = ( 55 | ParentNodeProvider, 56 | PositionProvider, 57 | ) 58 | 59 | def __init__(self, model_name): 60 | self.target = "" 61 | self.model_name = model_name 62 | 63 | def leave_Arg(self, original_node: cst.Arg, updated_node: cst.Arg) -> Union[cst.Arg, cst.RemovalSentinel]: 64 | parent = self.get_metadata(ParentNodeProvider, original_node) 65 | while not isinstance(parent, cst.Call): 66 | parent = self.get_metadata(ParentNodeProvider, parent) 67 | 68 | func = parent.func 69 | name = None 70 | if isinstance(func, cst.Name): 71 | name = func.value 72 | elif isinstance(func, cst.Attribute): 73 | name = func.attr.value 74 | if name == self.model_name: 75 | return RemoveFromParent() 76 | return updated_node 77 | 78 | 79 | class ModelTransformer(cst.CSTTransformer): 80 | METADATA_DEPENDENCIES = ( 81 | ParentNodeProvider, 82 | PositionProvider, 83 | ) 84 | 85 | def __init__(self, model_name): 86 | self.target = "" 87 | self.model_name = model_name 88 | 89 | def visit_Assign(self, node) -> Optional[bool]: 90 | assigned_target = node.targets[0] 91 | target = assigned_target.target 92 | check = hasattr(target, "value") 93 | if check: 94 | value = node.value 95 | if isinstance(value, cst.Call): 96 | func = value.func 97 | name = None 98 | if isinstance(func, cst.Name): 99 | name = func.value 100 | elif isinstance(func, cst.Attribute): 101 | name = func.attr.value 102 | if name == self.model_name: 103 | self.target = target.value 104 | 105 | 106 | def transform_model_code(source_code, model_label, metric=None): 107 | source_tree = cst.parse_module(source_code) 108 | model_name = model_label.split(":")[2] 109 | transformer = ModelTransformer(model_name) 110 | wrapper = cst.metadata.MetadataWrapper(source_tree) 111 | modified_tree = wrapper.visit(transformer) 112 | code = modified_tree.code.splitlines() 113 | if metric == macros.Metric.AUC or metric == macros.Metric.Gini: 114 | transformed_code = ( 115 | code[0] 116 | + "\n" 117 | + transformer.target 118 | + ".fit(__feature_train, __target_train)\n__y_pred = " 119 | + transformer.target 120 | + ".predict_proba(__feature_test)" 121 | ) 122 | else: 123 | transformed_code = ( 124 | code[0] 125 | + "\n" 126 | + transformer.target 127 | + ".fit(__feature_train, 
__target_train)\n__y_pred = " 128 |             + transformer.target 129 |             + ".predict(__feature_test)" 130 |         ) 131 |     return transformed_code 132 | 133 | 134 | def remove_arguments(source_code, model_name): 135 |     source_tree = cst.parse_module(source_code) 136 |     transformer = ArgumentRemover(model_name) 137 |     wrapper = cst.metadata.MetadataWrapper(source_tree) 138 |     modified_tree = wrapper.visit(transformer) 139 |     return modified_tree.code 140 | 141 | 142 | def replaceString(source_tree, replacement): 143 |     transformer = NameTransformer(replacement) 144 |     wrapper = cst.metadata.MetadataWrapper(source_tree) 145 |     modified_tree = wrapper.visit(transformer) 146 |     return modified_tree 147 | 148 | 149 | def construct_tree(notebook_path): 150 |     with open(notebook_path, "r", encoding="utf-8") as file: 151 |         code_content = file.read() 152 |     parts = code_content.split("### Evaluation Template: ") 153 |     code_content = parts[0] 154 |     source_tree = cst.parse_module(code_content) 155 |     return source_tree 156 | -------------------------------------------------------------------------------- /sapientml_core/training/dataflowmodel/determine_label_order.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | 18 | from sapientml_core import internal_path 19 | 20 | LABELS_TO_IGNORE_NOW = { 21 |     "PREPROCESS:DeleteColumns:drop:pandas", 22 |     "PREPROCESS:Category:map:pandas", 23 |     "PREPROCESS:MissingValues:dropna:pandas", 24 |     "PREPROCESS:Category:replace:pandas", 25 |     "PREPROCESS:FeatureSelection:select_dtypes:pandas", 26 |     "PREPROCESS:GenerateColumn:addition:pandas", 27 | } 28 | 29 | 30 | def main(): 31 |     """Removes duplicated labelling orders from the dependent_labels.json file. 32 | 33 |     This script creates the dataflow model, i.e., it extracts the order of two APIs A and B if there is any. 34 |     An order A --> B exists if A and B are dependent on each other according to 'dependent_api_extractor.py', 35 |     A is always followed by B in all pipelines, and there is NO case in the corpus where B is followed by A.
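For example, if a label A always appears before a label B in every pipeline where both occur,
the pair is kept as "A#B"; a pair that is also observed in the inverse order is discarded,
and the surviving orders are written to label_order.json.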
36 | 37 | """ 38 | with open(internal_path.training_cache / "dependent_labels.json", "r", encoding="utf-8") as dependent_api_file: 39 | dependent_labels = json.load(dependent_api_file) 40 | 41 | dependent_order = set() 42 | 43 | for dependent_label_str in dependent_labels.keys(): 44 | dep_str_after_bracket_removal = dependent_label_str.replace("[", "").replace("]", "").replace("'", "") 45 | parts = dep_str_after_bracket_removal.split(",") 46 | if (parts[0] in LABELS_TO_IGNORE_NOW) or (parts[1].strip() in LABELS_TO_IGNORE_NOW): 47 | continue 48 | first = parts[0].split(":")[1].strip() 49 | second = parts[1].split(":")[1].strip() 50 | inverse_order = second + "#" + first 51 | if first != second: 52 | if inverse_order in dependent_order: 53 | dependent_order.remove(inverse_order) 54 | else: 55 | dependent_order.add(parts[0].strip() + "#" + parts[1].strip()) 56 | 57 | with open(internal_path.training_cache / "label_order.json", "w", encoding="utf-8") as outfile: 58 | json.dump(list(dependent_order), outfile, indent=4) 59 | 60 | 61 | if __name__ == "__main__": 62 | import argparse 63 | 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 66 | args = parser.parse_args() 67 | if args.tag: 68 | internal_path.training_cache = internal_path.training_cache / args.tag 69 | 70 | main() 71 | -------------------------------------------------------------------------------- /sapientml_core/training/denoising/df_collector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pandas as pd 16 | 17 | 18 | def update_column_names(collector, line_no, obj, obj_name): 19 | """update_column_names function. 20 | 21 | This function is injected after each statement of the 22 | pipeline during instrumentation to collect the column names of the 23 | dataset after each statement. 24 | 25 | Parameters 26 | ---------- 27 | collector : dict 28 | Collection of all the column name. 29 | line_no : int 30 | line_no 31 | obj : dataframe 32 | Dataframe of particular object. 33 | obj_name : str 34 | Name of the object. 
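For example (a sketch), update_column_names(collector, 12, df, "df") records
(list(df.columns), "df", "<class 'pandas.core.frame.DataFrame'>") under key 12;
a pd.Series is converted to a one-column frame first, and an object that is not
a dataframe is recorded with None in place of its column list.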
35 | 36 |     Returns 37 |     ------- 38 |     dict 39 | 40 |     """ 41 |     now_obj = obj 42 |     if isinstance(now_obj, pd.Series): 43 |         now_obj = now_obj.to_frame() 44 |     if isinstance(now_obj, pd.DataFrame): 45 |         collector[line_no] = (list(now_obj.columns), obj_name, str(type(now_obj))) 46 |     else: 47 |         collector[line_no] = (None, obj_name, str(type(now_obj))) 48 |     return collector 49 | -------------------------------------------------------------------------------- /sapientml_core/training/denoising/static_analysis_of_columns.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | from pathlib import Path 18 | 19 | from sapientml.util.logging import setup_logger 20 | from sapientml_core import internal_path 21 | from sapientml_core.training import project_corpus 22 | from sapientml_core.training.denoising import ast_info_collector as collector 23 | from sapientml_core.util import file_util 24 | 25 | logger = setup_logger() 26 | 27 | 28 | def extract(json_metadata_file): 29 |     """Extract the target column name from a pipeline's metadata file. 30 | 31 |     This function reads the pipeline details and extracts 32 |     the target column based on the file's data structure. 33 | 34 |     Parameters 35 |     ---------- 36 |     json_metadata_file : str 37 |         Path to the JSON file containing each pipeline's details. 38 | 39 |     Returns 40 |     ------- 41 |     str 42 |         The target_column_name of the pipeline. 43 | 44 |     Raises 45 |     ------ 46 |     Exception 47 |         If the metadata file has an unexpected format. 48 | 49 |     """ 50 |     with open(json_metadata_file, "r", encoding="utf-8") as f: 51 |         notebook_info = json.load(f) 52 | 53 |     if isinstance(notebook_info, dict): 54 |         target_column_name = notebook_info["target_column_name"] 55 |     elif isinstance(notebook_info, list): 56 |         target_column_name = notebook_info[1]["target_column_name"] 57 |     else: 58 |         logger.warning("Wrong format: {}".format(json_metadata_file)) 59 |         raise 60 | 61 |     return target_column_name 62 | 63 | 64 | def main(test_mode=False): 65 |     """Fetch all the pipeline details from the corpus and parse them using the libcst library. 66 | 67 |     This script performs static analysis of each pipeline to identify 68 |     whether there is any explicit renaming of column names or explicit 69 |     deletion of columns in the pipeline, and creates the static_info.json file. 70 | 71 |     Parameters 72 |     ---------- 73 |     test_mode : bool 74 |         If True, only the first few pipelines are analyzed. 75 | 76 |     Raises 77 |     ------ 78 |     Exception 79 |         If reading a dataset, parsing a pipeline, or dropping 80 |         the identified columns fails.
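For example, the resulting static_info.json maps each pipeline file name to an entry of the form
{"drop_api": ["<dropped column>", ...], "rename_api": ["<renamed column>", ...], "target": "<target column>"}.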
81 | 82 | """ 83 | corpus = project_corpus.ProjectCorpus() 84 | projects = corpus.project_list 85 | static_info_map = {} 86 | 87 | total_number_target_pipelines = len(projects) 88 | 89 | for i in range(0, total_number_target_pipelines): 90 | if test_mode and i > 5: 91 | break 92 | logger.info(f"RUNNING:{i + 1} out of:{total_number_target_pipelines} PIPELINE:{projects[i].pipeline_path}") 93 | project = projects[i] 94 | pipeline = project.pipeline_path 95 | file_name = project.file_name 96 | 97 | static_info = {} 98 | try: 99 | dataset = file_util.read_csv( 100 | Path(project.dataset_path), 101 | Path(project.pipeline_path), 102 | ) 103 | except Exception: 104 | raise 105 | 106 | json_meta = pipeline.replace(".py", ".info.json") 107 | 108 | target = extract(json_meta) 109 | source_file = pipeline 110 | with open(source_file, "r", encoding="utf-8") as f: 111 | source = f.read() 112 | 113 | try: 114 | column_api_map = collector.get_column_api_map(source) 115 | except Exception: 116 | raise 117 | 118 | dataset_columns = list(dataset.columns) 119 | dropped_columns = [] 120 | renamed_columns = [] 121 | for column in column_api_map: 122 | if "drop" in column_api_map[column]: 123 | if column != target and column in dataset_columns: 124 | dropped_columns.append(column) 125 | if "rename" in column_api_map[column]: 126 | renamed_columns.append(column) 127 | 128 | static_info["drop_api"] = dropped_columns 129 | static_info["rename_api"] = renamed_columns 130 | static_info["target"] = target 131 | static_info_map[file_name] = static_info 132 | try: 133 | dataset.drop(dropped_columns, axis=1, inplace=True) 134 | except Exception: 135 | raise 136 | 137 | logger.info(f"Total number of notebooks: {len(static_info_map)}") 138 | with open(internal_path.training_cache / "static_info.json", "w", encoding="utf-8") as f: 139 | json.dump(static_info_map, f, indent=4) 140 | 141 | 142 | if __name__ == "__main__": 143 | import argparse 144 | 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 147 | args = parser.parse_args() 148 | if args.tag: 149 | internal_path.training_cache = internal_path.training_cache / args.tag 150 | test_mode = False 151 | main(test_mode) 152 | -------------------------------------------------------------------------------- /sapientml_core/training/meta_feature_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from sapientml_core import ps_macros 17 | from sapientml_core.design import search_space 18 | from sklearn.tree import DecisionTreeClassifier 19 | 20 | 21 | def select_k_best_features(X, y): 22 | """Select the top k explanatory variables. 23 | 24 | Parameters 25 | ---------- 26 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix 27 | The training input samples. 
28 |     y : ArrayLike = numpy.typing.ArrayLike 29 |         The target values 30 | 31 |     Returns 32 |     ------- 33 |     list 34 |         Returns a list of the top k selected column names. 35 |     """ 36 |     from sklearn.feature_selection import SelectKBest, mutual_info_regression 37 | 38 |     # Select the top k (=3) features based on mutual information regression 39 |     selector = SelectKBest(mutual_info_regression, k=3) 40 |     selector.fit(X, y) 41 |     return list(X.columns[selector.get_support()]) 42 | 43 | 44 | def select_by_rfe(X, y): 45 |     """Extract the top N (=n_features_to_select) most important features by RFE (Recursive Feature Elimination). 46 | 47 |     Parameters 48 |     ---------- 49 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix | 50 |         ArrayLike = numpy.typing.ArrayLike 51 |         The training input samples. 52 |     y : ArrayLike = numpy.typing.ArrayLike 53 |         The target values. 54 | 55 |     Returns 56 |     ------- 57 |     list 58 |         Returns a list of selected column names. 59 |     """ 60 |     from sklearn.feature_selection import RFE 61 | 62 |     # Select the most important features with RFE, using a DecisionTreeClassifier as the estimator 63 |     rfe_selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=2, step=1) 64 |     rfe_selector.fit(X, y) 65 |     return list(X.columns[rfe_selector.get_support()]) 66 | 67 | 68 | def select_from_model(X, y): 69 |     """Select features based on importance weights. 70 | 71 |     Parameters 72 |     ---------- 73 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix 74 |         The training input samples. 75 |     y : None | ArrayLike = numpy.typing.ArrayLike 76 |         The target values (integers that correspond to classes in classification, real numbers in regression). 77 | 78 |     Returns 79 |     ------- 80 |     list 81 |         Returns a list of selected column names. 82 |     """ 83 |     from sklearn.feature_selection import SelectFromModel 84 | 85 |     # Select the most important features with SelectFromModel, using a DecisionTreeClassifier as the estimator 86 |     sfm_selector = SelectFromModel(estimator=DecisionTreeClassifier()) 87 |     sfm_selector.fit(X, y) 88 |     return list(X.columns[sfm_selector.get_support()]) 89 | 90 | 91 | def select_sequentially(X, y): 92 |     """Select features sequentially by a greedy method. 93 | 94 |     Parameters 95 |     ---------- 96 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix 97 |         Training vectors 98 |     y : None | ArrayLike = numpy.typing.ArrayLike 99 |         Target values. This parameter may be ignored for unsupervised learning. 100 | 101 |     Returns 102 |     ------- 103 |     list 104 |         Returns a list of selected column names. 105 |     """ 106 |     from sklearn.feature_selection import SequentialFeatureSelector 107 | 108 |     # Select the most important features by backward sequential selection, using a DecisionTreeClassifier as the estimator 109 |     sfs_selector = SequentialFeatureSelector( 110 |         estimator=DecisionTreeClassifier(), n_features_to_select=3, cv=10, direction="backward" 111 |     ) 112 |     sfs_selector.fit(X, y) 113 |     return list(X.columns[sfs_selector.get_support()]) 114 | 115 | 116 | def select_based_on_correlation(data): 117 |     """Create correlation maps for the training meta-features.
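Two meta-feature columns are linked when their correlation is at least 0.25 and their names
differ in the first character; if no partner qualifies, the threshold is relaxed to 0.15,
and as a last resort the full search_space.meta_feature_list is used for that column.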
118 | 119 | Parameters 120 | ---------- 121 | data : dataframe 122 | Training data 123 | 124 | Returns 125 | ------- 126 | correlation_map : defaultdict(list) 127 | """ 128 | from collections import defaultdict 129 | 130 | corr = data.corr(numeric_only=True) 131 | correlation_map = defaultdict(list) 132 | for i in range(len(corr.columns)): 133 | left = corr.columns[i] 134 | for j in range(i): 135 | if corr.iloc[i, j] >= 0.25: 136 | right = corr.columns[j] 137 | if left[0] != right[0]: 138 | correlation_map[left].append(right) 139 | 140 | if len(correlation_map[left]) == 0: 141 | for j in range(i): 142 | if corr.iloc[i, j] >= 0.15: 143 | right = corr.columns[j] 144 | if left[0] != right[0]: 145 | correlation_map[left].append(right) 146 | 147 | if len(correlation_map[left]) == 0: 148 | correlation_map[left] = list(search_space.meta_feature_list) 149 | return correlation_map 150 | 151 | 152 | def select_features(label): 153 | """Return manually selected feature labels. 154 | 155 | Parameters 156 | ---------- 157 | label : str 158 | 159 | Returns 160 | ------- 161 | selection_model[label] : list 162 | """ 163 | selection_model = { 164 | ps_macros.FILL: [ps_macros.MISSING_PRESENCE], 165 | # ps_macros.DROP: [ps_macros.MISSING_PRESENCE], 166 | ps_macros.IN_PLACE_CONVERT: [ 167 | ps_macros.CATG_PRESENCE, 168 | # ps_macros.IS_TARGET_STR, 169 | ps_macros.BINARY_CATG_PRESENCE, 170 | ps_macros.SMALL_CATG_PRESENCE, 171 | ps_macros.LARGE_CATG_PRESENCE, 172 | ], 173 | ps_macros.ONE_HOT: [ 174 | ps_macros.CATG_PRESENCE, 175 | # ps_macros.IS_TARGET_STR, 176 | ps_macros.BINARY_CATG_PRESENCE, 177 | ps_macros.SMALL_CATG_PRESENCE, 178 | ps_macros.LARGE_CATG_PRESENCE, 179 | ], 180 | ps_macros.VECT: [ps_macros.TEXT_PRESENCE], 181 | ps_macros.MISSING: [ps_macros.MISSING_PRESENCE], 182 | ps_macros.CATG: [ps_macros.CATG_PRESENCE], 183 | ps_macros.SCALING: [ 184 | ps_macros.NORMALIZED_MEAN, 185 | ps_macros.NORMALIZED_STD_DEV, 186 | ps_macros.NORMALIZED_VARIATION_ACROSS_COLUMNS, 187 | ], 188 | ps_macros.DATE: [ps_macros.DATE_PRESENCE], 189 | ps_macros.LEMMITIZE: [ps_macros.TEXT_PRESENCE], 190 | ps_macros.BALANCING: [ps_macros.IMBALANCE], 191 | ps_macros.LOG: [ 192 | ps_macros.MAX_SKEW, 193 | ], 194 | } 195 | return selection_model[label] 196 | -------------------------------------------------------------------------------- /sapientml_core/training/pp_model_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import pickle 17 | from collections import OrderedDict, defaultdict 18 | from typing import Literal 19 | 20 | import pandas as pd 21 | from sapientml.util.logging import setup_logger 22 | from sapientml_core import internal_path 23 | from sapientml_core.design import search_space 24 | from sapientml_core.training import meta_feature_selector 25 | from sklearn.tree import DecisionTreeClassifier 26 | 27 | logger = setup_logger() 28 | 29 | 30 | def train_p_model(X, y): 31 |     """Build a decision tree classifier from the training set (X, y). 32 | 33 |     Parameters 34 |     ---------- 35 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix | 36 |         ArrayLike = numpy.typing.ArrayLike 37 |         The training input samples. 38 |     y : MatrixLike = np.ndarray | pd.DataFrame | spmatrix | 39 |         ArrayLike = numpy.typing.ArrayLike 40 |         The target values (class labels) as integers or strings 41 | 42 |     Returns 43 |     ------- 44 |     model : DecisionTreeClassifier 45 |         Fitted estimator. 46 |     """ 47 |     model = DecisionTreeClassifier(class_weight="balanced", max_depth=3) 48 |     model.fit(X, y) 49 |     return model 50 | 51 | 52 | def _train_preprocessors(train_data, feature_selection: Literal["select_manually", "customized"]): 53 |     logger.info("Training skeleton predictor for preprocessors...") 54 |     data = train_data 55 |     data.drop( 56 |         data.filter(regex="(TEMPLATE|IGNORE|EVAL:|RPEPROCESS:|MODEL:|Unnamed:)").columns, 57 |         axis=1, 58 |         inplace=True, 59 |     ) 60 |     data["project_target"] = ( 61 |         data["csv_name"] + "_" + data["target_column_name"].apply(lambda line: "_".join(sorted(eval(line)))) 62 |     ) 63 |     all_labels = [v for v in data.columns if v.startswith(("PREPROCESS:"))] 64 |     second_to_full_labels = defaultdict(list) 65 |     for label in all_labels: 66 |         second_to_full_labels["PREPROCESS:" + label.split(":")[1]].append(label) 67 | 68 |     pp_models = OrderedDict() 69 | 70 |     selected_features_map = meta_feature_selector.select_based_on_correlation(data) 71 | 72 |     for _, detail_labels in second_to_full_labels.items(): 73 |         for label in detail_labels: 74 |             logger.debug(label) 75 |             main_df = data.copy() 76 |             # Feature Selection On 77 |             y = main_df[label] 78 |             X = main_df[search_space.meta_feature_list] 79 | 80 |             if feature_selection == "select_manually": 81 |                 selected_features = meta_feature_selector.select_features(label) 82 |                 logger.debug("Selected Features: %s", selected_features) 83 |                 X = main_df[selected_features] 84 |             elif feature_selection == "customized": 85 |                 selected_features = selected_features_map[label] 86 |                 if len(selected_features) == 0: 87 |                     selected_features = meta_feature_selector.select_sequentially(X, y) 88 |                 logger.debug("Selected Features: %s", selected_features) 89 |                 X = main_df[selected_features] 90 | 91 |             pp_model = train_p_model(X, y) 92 |             pp_models[label] = (pp_model, selected_features) 93 | 94 |     return pp_models 95 | 96 | 97 | def _prepare_model_training_data(raw_meta_feature_train): 98 |     # Remove all the unnecessary meta-features 99 |     final_meta_features = raw_meta_feature_train[search_space.project_related_metadata + search_space.meta_feature_list] 100 |     final_meta_features.fillna(0, inplace=True) 101 |     for semantic_label, columns in search_space.label_mapping.items(): 102 |         try: 103 |             final_meta_features[semantic_label] = raw_meta_feature_train[columns].sum(axis=1) 104 |             final_meta_features[semantic_label] = final_meta_features[semantic_label].apply(lambda x: 1 if x > 0 else 0) 105 |         except KeyError as e: 106 |             logger.warning(e) 107 | 108 |     return final_meta_features 109 | 110 | 111 | def main(): 
"""This main function preprocesses the learning data and saves fitted estimator for the DecisionTreeClassifier. 113 | 114 | Description of feature_selection : "select_manually" | "customized" 115 | Specify how features are selected. 116 | """ 117 | training_data_path = internal_path.training_cache / "pp_metafeatures_training.csv" 118 | # "select_manually" | "customized" 119 | feature_selection = "customized" 120 | raw_meta_feature_train = pd.read_csv(training_data_path) 121 | meta_feature_train = _prepare_model_training_data(raw_meta_feature_train) 122 | pp_models = _train_preprocessors(meta_feature_train, feature_selection) 123 | # Save model 124 | with open(internal_path.training_cache / "pp_models.pkl", "wb") as f: 125 | pickle.dump(pp_models, f) 126 | 127 | 128 | if __name__ == "__main__": 129 | import argparse 130 | 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 133 | args = parser.parse_args() 134 | if args.tag: 135 | internal_path.training_cache = internal_path.training_cache / args.tag 136 | 137 | main() 138 | -------------------------------------------------------------------------------- /sapientml_core/training/project.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class ProjectInfo: 20 | pipeline_path: str # full path 21 | dataset_path: str # full path 22 | file_name: str # only name of the pipeline 23 | notebook_name: str # only name of the pipeline without extension 24 | accuracy: float 25 | csv_name: str 26 | target_column_name: str 27 | metric: str 28 | -------------------------------------------------------------------------------- /sapientml_core/training/project_corpus.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import doctest 17 | import json 18 | import re 19 | from pathlib import Path 20 | 21 | from sapientml.util.logging import setup_logger 22 | from sapientml_core import internal_path 23 | from tqdm import tqdm 24 | 25 | from .project import ProjectInfo 26 | 27 | logger = setup_logger() 28 | 29 | 30 | class ProjectCorpus: 31 | def __init__(self, target_project_name_list=None): 32 | self.target_project_name_list = target_project_name_list 33 | self.clean_notebook_dir_path = internal_path.clean_dir 34 | self.dataset_dir_path = internal_path.corpus_path / "dataset" 35 | self.metadata_dir_path = internal_path.corpus_path / "metadata" 36 | self.project_list = self._extract_project_info() 37 | 38 | def _extract_project_info(self): 39 | project_list = [] 40 | 41 | if self.target_project_name_list: 42 | pipeline_file_names = [Path(project_path) for project_path in self.target_project_name_list] 43 | else: 44 | pipeline_file_names = Path(self.clean_notebook_dir_path).rglob("*.py") 45 | 46 | for notebook_path in tqdm(list(pipeline_file_names)): 47 | notebook_info_path = notebook_path.with_suffix(".info.json") 48 | notebook_name = notebook_path.stem 49 | logger.debug(f"Extracting Project Info for {notebook_name}") 50 | # Read the target column information 51 | try: 52 | with open(notebook_info_path, "r", encoding="utf-8") as notebook_info_file: 53 | notebook_info = json.load(notebook_info_file) 54 | except Exception: 55 | logger.warning("Could not read JSON info file: {}".format(notebook_info_path)) 56 | continue 57 | 58 | if isinstance(notebook_info, list): 59 | notebook_info = notebook_info[1] 60 | 61 | if isinstance(notebook_info, dict): 62 | target_column_name = notebook_info["target_column_name"] 63 | dataset_folder_name = notebook_info["dataset_folder"] 64 | accuracy = notebook_info["accuracy"] 65 | metric = "accuracy" 66 | if accuracy == "N/A": 67 | accuracy = notebook_info["r2"] 68 | metric = "r2" 69 | try: 70 | accuracy = float(accuracy[:-1]) # discarding the percentage (%) sign from the end 71 | except Exception: 72 | accuracy = 0 73 | else: 74 | logger.warning("Wrong format: {}".format(notebook_info_path)) 75 | continue 76 | 77 | if isinstance(target_column_name, str): 78 | if target_column_name == "UNKNOWN": 79 | continue 80 | elif isinstance(notebook_info, list): 81 | if target_column_name[0] == "UNKNOWN": 82 | continue 83 | # Read the dataset 84 | project_fqn = notebook_name + ".py" 85 | dataset_paths = [ 86 | p 87 | for p in (Path(self.dataset_dir_path) / dataset_folder_name).glob("*") 88 | if re.search(r"/*\.(csv|tsv)", str(p)) 89 | ] 90 | if len(dataset_paths) == 0: 91 | logger.warning( 92 | "Could not find CSV/TSV file under {}/{}".format(self.dataset_dir_path, dataset_folder_name) 93 | ) 94 | continue 95 | 96 | dataset_path = dataset_paths[0] 97 | dataset_name = dataset_path.stem 98 | if len(dataset_paths) > 1: 99 | logger.warning( 100 | "Found multiple CSV/TSV files under {}. 
Using {}...".format( 101 | self.clean_notebook_dir_path, dataset_name 102 | ) 103 | ) 104 | 105 | project_info = ProjectInfo( 106 | str(notebook_path), 107 | str(dataset_path), 108 | project_fqn, 109 | notebook_name, 110 | accuracy, 111 | dataset_name, 112 | target_column_name, 113 | metric, 114 | ) 115 | project_list.append(project_info) 116 | return project_list 117 | 118 | 119 | if __name__ == "__main__": 120 | doctest.testmod() 121 | -------------------------------------------------------------------------------- /sapientml_core/util/file_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import calendar 17 | import datetime 18 | import glob 19 | import json 20 | import os 21 | import time 22 | 23 | import pandas as pd 24 | 25 | 26 | def get_time(): 27 | """Returns the current time. 28 | 29 | Returns 30 | ---------- 31 | readable : str 32 | Current time in ISO format 33 | """ 34 | ts = calendar.timegm(time.gmtime()) 35 | readable = datetime.datetime.fromtimestamp(ts).isoformat() 36 | return readable 37 | 38 | 39 | def read_file_in_a_list(file_name): 40 | """Open a file and place it in a list line by line(read().splitlines()). 41 | 42 | Parameters 43 | ---------- 44 | file_name : FileDescriptorOrPath 45 | File name. 46 | 47 | Returns 48 | ---------- 49 | lines : list[str] 50 | List file contents line by line. 51 | """ 52 | with open(file_name, "r", encoding="utf-8") as f: 53 | lines = f.read().splitlines() 54 | return lines 55 | 56 | 57 | def read_file(file_name): 58 | """Open file and read data with read(). 59 | 60 | Parameters 61 | ---------- 62 | file_name : FileDescriptorOrPath 63 | File name. 64 | 65 | Returns 66 | ---------- 67 | lines : str 68 | The entire text file read. 69 | """ 70 | with open(file_name, "r", encoding="utf-8") as f: 71 | lines = f.read() 72 | return lines 73 | 74 | 75 | def write_content_to_file(file_name, content): 76 | """write content to file. 77 | 78 | Parameters 79 | ---------- 80 | file_name : FileDescriptorOrPath 81 | File name. 82 | content : str 83 | What to write to the file. 84 | """ 85 | with open(file_name, "w", encoding="utf-8") as out_file: 86 | out_file.write(content) 87 | 88 | 89 | def get_file_list(path, type): 90 | """Get a list of files of a specified type in a directory. 91 | 92 | Parameters 93 | ---------- 94 | path : FileDescriptorOrPath 95 | Directory path. 96 | type : str 97 | File extension. 98 | Returns 99 | ---------- 100 | files_with_given_type : list 101 | List of retrieved files. 102 | """ 103 | os.chdir(path) 104 | files_with_given_type = [] 105 | for file in glob.glob("*." + type): 106 | files_with_given_type.append((path + "/" + file)) 107 | return files_with_given_type 108 | 109 | 110 | def load_json(file_name): 111 | """Load json format file. 112 | 113 | Parameters 114 | ---------- 115 | file_name : FileDescriptorOrPath 116 | File name. 
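For example (a sketch), load_json("some_pipeline.info.json") returns the parsed
content, e.g. a dict for a top-level JSON object or a list for a top-level array.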
117 | 118 |     Returns 119 |     ---------- 120 |     content : Any 121 |         Loaded content. 122 |     """ 123 |     with open(file_name, "r", encoding="utf-8") as input_file: 124 |         content = json.load(input_file) 125 |     return content 126 | 127 | 128 | def read_csv(csv_path, notebook_path): 129 |     """Read a csv file. 130 | 131 |     Parameters 132 |     ---------- 133 |     csv_path : FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] 134 |         Csv file Path. 135 |     notebook_path : pathlib.Path 136 |         Notebook Directory Path. 137 | 138 |     Returns 139 |     ---------- 140 |     dataset : pd.DataFrame 141 |         Contents of the loaded csv. 142 |     """ 143 | 144 |     def read(path, **kwargs): 145 |         if str(path).endswith(".csv"): 146 |             return pd.read_csv(path, encoding_errors="ignore", on_bad_lines="warn", **kwargs) 147 |         return pd.read_table(path, encoding_errors="ignore", on_bad_lines="warn", **kwargs) 148 | 149 |     encoding = get_dataset_encoding(notebook_path) 150 |     dataset = read(csv_path, encoding=encoding) 151 |     num_of_features = dataset.shape[1] - 1 152 |     if num_of_features == 0: 153 |         dataset = read(csv_path, encoding=encoding, delim_whitespace=True) 154 |         num_of_features = dataset.shape[1] - 1 155 |         if num_of_features == 0: 156 |             dataset = read(csv_path, encoding=encoding, delimiter=";") 157 |             num_of_features = dataset.shape[1] - 1 158 |     return dataset 159 | 160 | 161 | def get_dataset_encoding(notebook_path): 162 |     """Get dataset encoding. 163 | 164 |     Parameters 165 |     ---------- 166 |     notebook_path : StrPath | None | BytesPath 167 |         Directory path of notebooks. 168 | 169 |     Returns 170 |     ---------- 171 |     encoding : str | None 172 |     """ 173 |     if os.path.isdir(notebook_path): 174 |         return None 175 |     if not str(notebook_path).endswith(".py"): 176 |         return None 177 |     encoding = get_dataset_file(notebook_path) 178 |     if encoding: 179 |         return encoding 180 |     return None 181 | 182 | 183 | def get_dataset_file(notebook_path): 184 |     """Read notebook_path and get the encoding used for its dataset. 185 | 186 |     Parameters 187 |     ---------- 188 |     notebook_path : str 189 |         File name. 190 | 191 |     Returns 192 |     ---------- 193 |     encoding : str | None 194 |         The encoding= value found in the pipeline's read_csv call, 195 |         or None if the pipeline does not specify one. 196 | 
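For example, a pipeline line such as
    df = pd.read_csv("train.csv", encoding="utf-8")
yields "utf-8", while a read_csv call that does not pass encoding= yields None.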
197 | """ 198 | f = open(notebook_path, "r", encoding="utf-8") 199 | lines = f.readlines() 200 | f.close() 201 | encoding = None 202 | for index in range(len(lines)): 203 | if ".read_csv(" in lines[index]: 204 | if "encoding=" in lines[index]: 205 | encoding = lines[index].split("encoding=")[1].split(")")[0].split(",")[0][1:-1] 206 | elif "encoding = " in lines[index]: 207 | encoding = lines[index].split("encoding = ")[1].split(")")[0].split(",")[0][1:-1] 208 | else: 209 | encoding = None 210 | return encoding 211 | return encoding 212 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/outputs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tests/fixtures/params/config.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/config.pkl -------------------------------------------------------------------------------- /tests/fixtures/params/dataset.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/dataset.pkl -------------------------------------------------------------------------------- /tests/fixtures/params/task.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/task.pkl -------------------------------------------------------------------------------- /tests/sapientml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/sapientml/__init__.py -------------------------------------------------------------------------------- /tests/sapientml/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from unittest import mock 4 | 5 | import pytest 6 | 7 | 8 | @pytest.fixture(scope="session", autouse=True) 9 | def disable_logging(): 10 | logging.disable(logging.FATAL) 11 | yield 12 | logging.disable(logging.NOTSET) 13 | 14 | 15 | @pytest.fixture(scope="function", autouse=True) 16 | def reset_sapientml_logger(): 17 | # FIXME: more efficient way to reset a logger 18 | logger = logging.getLogger("sapientml") 19 | logger.handlers.clear() 20 | logger.root.handlers.clear() 21 | 22 | 23 | @pytest.fixture(scope="function", autouse=True) 24 | def path_home(tmp_path): 25 | with mock.patch.object(Path, "home"): 26 | yield Path(tmp_path) 27 | --------------------------------------------------------------------------------