├── .coveragerc ├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── dependabot.yml │ ├── greetings.yml │ ├── lint.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pysen ├── pyproject.toml └── setup.cfg ├── CODEOWNERS ├── LICENSE ├── pyproject.toml ├── requirements-training.txt ├── sapientml_core ├── __init__.py ├── adaptation │ ├── __init__.py │ ├── artifacts │ │ ├── PY310 │ │ │ └── label_order.json │ │ ├── PY311 │ │ │ └── label_order.json │ │ ├── PY39 │ │ │ └── label_order.json │ │ └── label_order.json │ └── generation │ │ ├── __init__.py │ │ ├── pipeline_template.py │ │ ├── predicate.py │ │ ├── preprocessing_label.py │ │ └── template_based_adaptation.py ├── datastore │ └── localfile │ │ ├── __init__.py │ │ ├── export_modules │ │ ├── sample_dataset.py │ │ └── split_timeseries_dataset.py │ │ ├── generator.py │ │ └── templates │ │ ├── concat_train_validation.py.jinja │ │ ├── drop_ignore_columns.py.jinja │ │ ├── drop_inf_or_nan_rows.py.jinja │ │ ├── load_localfile.py.jinja │ │ ├── load_localfile_predict.py.jinja │ │ ├── load_localfile_train.py.jinja │ │ ├── set_index.py.jinja │ │ ├── set_validation_as_test.py.jinja │ │ ├── split.py.jinja │ │ └── subsample.py.jinja ├── design │ ├── __init__.py │ ├── label_util.py │ ├── pp_component_groups.py │ └── search_space.py ├── enums.py ├── explain │ ├── AutoEDA.py │ ├── AutoVisualization.py │ ├── code_miner.py │ ├── code_template.py │ ├── main.py │ ├── pipeline_explanation.py │ └── templates │ │ └── jupyter_content.json ├── generator.py ├── internal_path.py ├── meta_features.py ├── models │ ├── PY310 │ │ ├── feature_importance.json │ │ ├── mp_model_1.pkl │ │ ├── mp_model_2.pkl │ │ └── pp_models.pkl │ ├── PY311 │ │ ├── feature_importance.json │ │ ├── mp_model_1.pkl │ │ ├── mp_model_2.pkl │ │ └── pp_models.pkl │ ├── PY39 │ │ ├── feature_importance.json │ │ ├── mp_model_1.pkl │ │ ├── mp_model_2.pkl │ │ └── pp_models.pkl │ ├── feature_importance.json │ ├── model_metafeatures_test.csv │ ├── mp_model_1.pkl │ ├── mp_model_2.pkl │ └── pp_models.pkl ├── params.py ├── preprocess │ └── default │ │ ├── __init__.py │ │ ├── generator.py │ │ ├── params.py │ │ └── templates │ │ ├── drop_one_value_columns.py.jinja │ │ ├── handle_inf_columns.py.jinja │ │ ├── handle_iterable_values.py.jinja │ │ ├── handle_japanese_text.py.jinja │ │ ├── handle_mixed_typed_columns.py.jinja │ │ ├── none_has_columns.py.jinja │ │ └── rename_columns.py.jinja ├── ps_macros.py ├── seeding │ ├── __init__.py │ └── predictor.py ├── templates │ ├── explainability_templates │ │ ├── component_description.json │ │ ├── model_explanation.py.jinja │ │ └── preprocessing_explanation.py.jinja │ ├── model_templates │ │ ├── classification_post_process.jinja │ │ ├── hyperparameter_tuning.py.jinja │ │ ├── hyperparameters.py.jinja │ │ ├── hyperparameters_default_value.py.jinja │ │ ├── model.py.jinja │ │ ├── model_predict.py.jinja │ │ ├── model_test.py.jinja │ │ └── model_train.py.jinja │ ├── other_templates │ │ ├── confusion_matrix.py.jinja │ │ ├── drop_columns.py.jinja │ │ ├── evaluation.py.jinja │ │ ├── evaluation_test.py.jinja │ │ ├── hyperparameter_tuning_evaluation.py.jinja │ │ ├── inverse_target.py.jinja │ │ ├── permutation_importance.py.jinja │ │ ├── prediction_result.py.jinja │ │ ├── preprocess_dataset.py.jinja │ │ ├── shap.py.jinja │ │ ├── target_separation_predict.py.jinja │ │ ├── target_separation_test.py.jinja │ │ ├── target_separation_train.py.jinja │ │ └── 
target_separation_validation.py.jinja │ ├── pipeline_predict.py.jinja │ ├── pipeline_test.py.jinja │ ├── pipeline_train.py.jinja │ ├── pipeline_validation.py.jinja │ └── preprocessing_templates │ │ ├── DATE.py.jinja │ │ ├── DATE_predict.jinja │ │ ├── DATE_train.jinja │ │ ├── LabelEncoder.py.jinja │ │ ├── LabelEncoder_predict.py.jinja │ │ ├── LabelEncoder_train.py.jinja │ │ ├── Processing.py.jinja │ │ ├── Processing_predict.py.jinja │ │ ├── Processing_train.py.jinja │ │ ├── SMOTE.py.jinja │ │ ├── STANDARD.py.jinja │ │ ├── STANDARD_predict.py.jinja │ │ ├── STANDARD_train.py.jinja │ │ ├── TfidfVectorizer.py.jinja │ │ ├── TfidfVectorizer_predict.py.jinja │ │ ├── TfidfVectorizer_train.py.jinja │ │ ├── fillna-type-numeric.py.jinja │ │ ├── fillna-type-numeric_predict.py.jinja │ │ ├── fillna-type-numeric_train.py.jinja │ │ ├── fillna-type-string.py.jinja │ │ ├── fillna-type-string_predict.py.jinja │ │ ├── fillna-type-string_train.py.jinja │ │ ├── get_dummies.py.jinja │ │ ├── get_dummies_predict.py.jinja │ │ ├── get_dummies_train.py.jinja │ │ ├── log.py.jinja │ │ ├── log_predict.py.jinja │ │ └── log_train.py.jinja ├── training │ ├── augmentation │ │ ├── mutation_results.py │ │ ├── mutation_runner.py │ │ └── mutator.py │ ├── dataflowmodel │ │ ├── ast_operation.py │ │ ├── dependent_api_extractor.py │ │ └── determine_label_order.py │ ├── denoising │ │ ├── ast_info_collector.py │ │ ├── dataset_snapshot_extractor.py │ │ ├── determine_used_features.py │ │ ├── df_collector.py │ │ └── static_analysis_of_columns.py │ ├── meta_feature_extractor.py │ ├── meta_feature_selector.py │ ├── meta_model_trainer.py │ ├── pp_model_trainer.py │ ├── project.py │ └── project_corpus.py └── util │ └── file_util.py └── tests ├── __init__.py ├── fixtures ├── datasets │ ├── testdata_df.csv │ ├── testdata_df_light.csv │ ├── testdata_test.csv │ ├── testdata_train.csv │ └── testdata_valid.csv ├── outputs │ └── .gitignore └── params │ ├── config.pkl │ ├── dataset.pkl │ └── task.pkl └── sapientml ├── __init__.py ├── conftest.py ├── test_generatedcode.py └── test_generatedcode_additional_patterns.py
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | omit =
 3 |     */.env/*
 4 |     */.venv/*
 5 |     */.cache/*
 6 |     */tmp/*
 7 |     */mining/collector.py
 8 |     */utilities/dataset_utility.py
 9 | 
10 | [report]
11 | exclude_lines =
12 |     pragma: no cover
13 |     if __name__ == .__main__.:
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*.py]
 4 | indent_style = space
 5 | indent_size = 4
 6 | insert_final_newline = true
 7 | trim_trailing_whitespace = true
 8 | end_of_line = lf
 9 | charset = utf-8
10 | 
11 | [*.{json,csv}]
12 | insert_final_newline = unset
13 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Show your code calling `generate_code()`.
16 | 
17 | <details>
18 | <summary>script</summary>
19 | 
20 | ```python
21 | # Paste your code here. The following is an example.
22 | from sapientml import SapientMLGenerator
23 | sml = SapientMLGenerator()
24 | sml.generate_code('your arguments')
25 | ```
26 | </details>
27 | 
28 | 2. Attach the datasets or dataframes input to `generate_code()` if possible.
29 | 3. Show the generated code such as `1_default.py` when it was generated.
30 | 
31 | <details>
32 | <summary>generated code</summary>
33 | 
34 | ```python
35 | # Paste the generated code here.
36 | ```
38 | 39 | 4. Show the messages of SapientML and/or generated code. 40 | 41 | **Expected behavior** 42 | A clear and concise description of what you expected to happen. 43 | 44 | **Environment (please complete the following information):** 45 | - OS: [e.g. Ubuntu 20.04] 46 | - Docker Version (if applicable): [Docker version 20.10.17, build 100c701] 47 | - Python Version: [e.g. 3.9.12] 48 | - SapientML Version: [e.g. 2.3.4] 49 | 50 | 51 | **Additional context** 52 | Add any other context about the problem here. 53 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # To get started with Dependabot version updates, you'll need to specify which 16 | # package ecosystems to update and where the package manifests are located. 
17 | # Please see the documentation for all configuration options:
18 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
19 | 
20 | version: 2
21 | updates:
22 |   - package-ecosystem: "pip" # See documentation for possible values
23 |     directory: "/" # Location of package manifests
24 |     schedule:
25 |       interval: "weekly"
26 | 
--------------------------------------------------------------------------------
/.github/workflows/dependabot.yml:
--------------------------------------------------------------------------------
 1 | name: Dependabot auto approve and merge
 2 | on: pull_request
 3 | 
 4 | permissions:
 5 |   pull-requests: write
 6 |   contents: write
 7 | 
 8 | jobs:
 9 |   dependabot:
10 |     runs-on: ubuntu-latest
11 |     if: github.actor == 'dependabot[bot]'
12 |     steps:
13 |       - name: Dependabot metadata
14 |         id: metadata
15 |         uses: dependabot/fetch-metadata@v1
16 |         with:
17 |           github-token: "${{ secrets.GITHUB_TOKEN }}"
18 |       - name: Approve a PR
19 |         run: gh pr review --approve "$PR_URL"
20 |         env:
21 |           PR_URL: ${{github.event.pull_request.html_url}}
22 |           GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
23 |       - name: Enable auto-merge for Dependabot PRs
24 |         if: steps.metadata.outputs.update-type == 'version-update:semver-patch'
25 |         run: gh pr merge --auto --merge "$PR_URL"
26 |         env:
27 |           PR_URL: ${{github.event.pull_request.html_url}}
28 |           GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
29 | 
--------------------------------------------------------------------------------
/.github/workflows/greetings.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: Greetings
16 | 
17 | on: [pull_request_target, issues]
18 | 
19 | jobs:
20 |   greeting:
21 |     runs-on: ubuntu-latest
22 |     permissions:
23 |       issues: write
24 |       pull-requests: write
25 |     steps:
26 |       - uses: actions/first-interaction@v1
27 |         with:
28 |           repo-token: ${{ secrets.GITHUB_TOKEN }}
29 |           issue-message: "# 🎉 Thanks for submitting the issue to SapientML!!\n\nWe have the [Discord](https://discord.gg/59yshERFD9) server. Please join the server!"
30 |           pr-message: "# 🎉 Thanks for submitting the PR to SapientML!!\n\nHere is the [Contribution Guideline](https://github.com/sapientml/sapientml/blob/main/CONTRIBUTING.md).\nWe would like you to read the document and follow it.\nIf you have any questions or anything to be discussed, please join the [Discord](https://discord.gg/59yshERFD9) server and chat with us.\nThanks again!"
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: Lint
16 | 
17 | on:
18 |   pull_request:
19 |     branches:
20 |       - main
21 | 
22 | env:
23 |   PYTHON_VERSION: "3.10"
24 |   POETRY_VERSION: "1.5.1"
25 |   POETRY_URL: https://install.python-poetry.org
26 | 
27 | jobs:
28 |   test:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Checkout
32 |         uses: actions/checkout@v4
33 |       - name: Cache Packages
34 |         uses: actions/cache@v4
35 |         with:
36 |           path: ~/.local
37 |           key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }}
38 | 
39 |       - name: Set up Python ${{ env.PYTHON_VERSION }}
40 |         uses: actions/setup-python@v4
41 |         with:
42 |           python-version: ${{ env.PYTHON_VERSION }}
43 | 
44 |       - name: Install Dependencies
45 |         run: pip install pysen flake8 black isort==5.12.0
46 | 
47 |       - name: Pysen run lint
48 |         run: pysen run lint
49 | 
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
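
# Overview: on a tag push matching *.*.*, the test and additional_test matrices
# re-run the suite on Python 3.10/3.11 and upload per-test .coverage artifacts;
# report_coverage combines them and uploads the result to CodeCov; release then
# stamps the tag (via `git describe`) into pyproject.toml, builds the package
# with Poetry, creates a GitHub release (marked prerelease for non-X.Y.Z tags),
# and publishes to PyPI.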
14 | 15 | name: Release 16 | 17 | on: 18 | push: 19 | tags: 20 | - '*.*.*' 21 | 22 | env: 23 | POETRY_VERSION: "1.7.1" 24 | POETRY_URL: https://install.python-poetry.org 25 | 26 | jobs: 27 | test: 28 | strategy: 29 | matrix: 30 | version: ["3.10", "3.11"] 31 | test: [test_misc, 32 | test_regressor_works_number, test_regressor_works_with_nosparse, 33 | test_classifier_category_binary_num_noproba, test_classifier_category_binary_num_proba, 34 | test_classifier_category_multi_nonnum_metric_noproba, test_classifier_category_multi_nonnum_metric_proba, 35 | test_classifier_category_binary_boolean_metric_noproba, test_classifier_category_binary_boolean_metric_proba, 36 | test_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_classifier_category_multi_nonnum_noproba_metric_with_proba, 37 | test_classifier_notext_nonegative_explanatry, test_classifier_works_with 38 | ] 39 | runs-on: ubuntu-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v4 43 | 44 | - name: Set up Python ${{ matrix.version }} 45 | uses: actions/setup-python@v4 46 | with: 47 | python-version: ${{ matrix.version }} 48 | 49 | - name: Install Poetry 50 | run: | 51 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 52 | echo "$HOME/.local/bin" >> $GITHUB_PATH 53 | 54 | - name: Install Dependencies 55 | run: poetry install 56 | 57 | - name: Pytest 58 | run: poetry run -- pytest -k ${{ matrix.test }} 59 | 60 | - name: Upload Coverage 61 | uses: actions/upload-artifact@v4 62 | with: 63 | name: ${{ matrix.test }} 64 | include-hidden-files: true 65 | path: .coverage 66 | retention-days: 1 67 | overwrite: true 68 | 69 | 70 | additional_test: 71 | strategy: 72 | matrix: 73 | version: ["3.10", "3.11"] 74 | test: [test_additional_misc, 75 | test_additional_regressor_works_number, test_additional_regressor_works_with_nosparse, 76 | test_additional_classifier_category_binary_nonnum_noproba, test_additional_classifier_category_binary_nonnum_proba, 77 | test_additional_classifier_category_binary_num_noproba, test_additional_classifier_category_binary_num_proba, 78 | test_additional_classifier_category_multi_nonnum_metric_noproba, test_additional_classifier_category_multi_nonnum_metric_proba, 79 | test_additional_classifier_category_multi_num_metric_noproba, test_additional_classifier_category_multi_num_metric_proba, 80 | test_additional_classifier_category_binary_boolean_metric_noproba, test_additional_classifier_category_binary_boolean_metric_proba, 81 | test_additional_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_additional_classifier_category_multi_nonnum_noproba_metric_with_proba, 82 | test_additional_classifier_works_with 83 | ] 84 | runs-on: ubuntu-latest 85 | steps: 86 | - name: Checkout 87 | uses: actions/checkout@v4 88 | 89 | - name: Set up Python ${{ matrix.version }} 90 | uses: actions/setup-python@v4 91 | with: 92 | python-version: ${{ matrix.version }} 93 | 94 | - name: Install Poetry 95 | run: | 96 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 97 | echo "$HOME/.local/bin" >> $GITHUB_PATH 98 | 99 | - name: Install Dependencies 100 | run: poetry install 101 | 102 | - name: Pytest 103 | run: poetry run -- pytest -k ${{ matrix.test }} 104 | 105 | - name: Upload Coverage 106 | uses: actions/upload-artifact@v4 107 | with: 108 | name: ${{ matrix.test }} 109 | include-hidden-files: true 110 | path: .coverage 111 | retention-days: 1 112 | overwrite: true 113 | 114 | report_coverage: 115 | runs-on: 
ubuntu-latest
116 |     needs:
117 |       - test
118 |       - additional_test
119 |     steps:
120 |       - name: Checkout
121 |         uses: actions/checkout@v4
122 | 
123 |       - name: Set up Python 3.11
124 |         uses: actions/setup-python@v4
125 |         with:
126 |           python-version: 3.11
127 | 
128 |       - name: Download Coverage Files
129 |         uses: actions/download-artifact@v4
130 | 
131 |       - name: Install coverage
132 |         run: pip install coverage
133 | 
134 |       - name: Combine Coverage Files
135 |         run: |
136 |           mv --backup=t */.coverage .
137 |           coverage combine -a
138 |           coverage report
139 | 
140 |       - name: Report Coverage to CodeCov
141 |         uses: codecov/codecov-action@v3
142 |         with:
143 |           token: ${{ secrets.CODECOV_TOKEN }}
144 | 
145 |   release:
146 |     name: Release
147 |     runs-on: ubuntu-latest
148 |     steps:
149 |       - name: Checkout
150 |         uses: actions/checkout@v4
151 | 
152 |       - name: Set up Python 3.10
153 |         uses: actions/setup-python@v4
154 |         with:
155 |           python-version: "3.10"
156 | 
157 |       - name: Install Poetry
158 |         run: |
159 |           curl -sSL https://install.python-poetry.org | python - -y
160 | 
161 |       - name: Update PATH
162 |         run: echo "$HOME/.local/bin" >> $GITHUB_PATH
163 | 
164 |       - name: Set Version
165 |         run: |
166 |           SEMVER=$(git describe --exact-match --tags HEAD)
167 |           sed -i "s/\(version *= *\).*/\1\"$SEMVER\"/" pyproject.toml
168 | 
169 |       - name: Build project for distribution
170 |         run: poetry build
171 | 
172 |       - name: Check Version
173 |         id: check-version
174 |         run: |
175 |           [[ "$(poetry version --short)" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] || echo prerelease=true >> $GITHUB_OUTPUT
176 | 
177 |       - name: Create Release
178 |         uses: ncipollo/release-action@v1
179 |         with:
180 |           artifacts: "dist/*"
181 |           token: ${{ secrets.GITHUB_TOKEN }}
182 |           draft: false
183 |           prerelease: ${{ steps.check-version.outputs.prerelease == 'true' }}
184 | 
185 |       - name: Publish to PyPI
186 |         env:
187 |           POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
188 |         run: poetry publish --skip-existing
189 | 
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
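
# Overview: on pull requests targeting main, each named test case runs as its
# own matrix job on Python 3.10 and 3.11; every job uploads its .coverage file
# as an artifact, and report_coverage combines them and uploads the result to
# CodeCov.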
14 | 15 | name: Testing 16 | 17 | on: 18 | pull_request: 19 | branches: 20 | - main 21 | 22 | env: 23 | POETRY_VERSION: "1.5.1" 24 | POETRY_URL: https://install.python-poetry.org 25 | 26 | jobs: 27 | test: 28 | strategy: 29 | matrix: 30 | version: ["3.10", "3.11"] 31 | test: [test_misc, test_regressor_works_number, test_regressor_works_with_nosparse, 32 | test_classifier_category_binary_num_noproba, test_classifier_category_binary_num_proba, 33 | test_classifier_category_multi_nonnum_metric_noproba, test_classifier_category_multi_nonnum_metric_proba, 34 | test_classifier_category_binary_boolean_metric_noproba, test_classifier_category_binary_boolean_metric_proba, 35 | test_classifier_category_binary_num_use_proba_with_metric_default_noproba, test_classifier_category_multi_nonnum_noproba_metric_with_proba, 36 | test_classifier_notext_nonegative_explanatry, test_classifier_works_with, 37 | ] 38 | runs-on: ubuntu-latest 39 | steps: 40 | - name: Checkout 41 | uses: actions/checkout@v4 42 | 43 | - name: Set up Python ${{ matrix.version }} 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: ${{ matrix.version }} 47 | 48 | - name: Install Poetry 49 | run: | 50 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 51 | echo "$HOME/.local/bin" >> $GITHUB_PATH 52 | 53 | - name: Install Dependencies 54 | run: poetry install 55 | 56 | - name: Pytest 57 | run: poetry run -- pytest -k ${{ matrix.test }} 58 | 59 | - name: Upload Coverage 60 | uses: actions/upload-artifact@v4 61 | with: 62 | name: ${{ matrix.test }} 63 | include-hidden-files: true 64 | path: .coverage 65 | retention-days: 1 66 | overwrite: true 67 | 68 | report_coverage: 69 | runs-on: ubuntu-latest 70 | needs: 71 | - test 72 | steps: 73 | - name: Checkout 74 | uses: actions/checkout@v4 75 | 76 | - name: Set up Python 3.11 77 | uses: actions/setup-python@v4 78 | with: 79 | python-version: 3.11 80 | 81 | - name: Download Coverage Files 82 | uses: actions/download-artifact@v4 83 | 84 | - name: Install coverage 85 | run: pip install coverage 86 | 87 | - name: Combine Coverage Files 88 | run: | 89 | mv --backup=t */.coverage . 90 | coverage combine -a 91 | coverage report 92 | 93 | - name: Report Coverage to CodeCov 94 | uses: codecov/codecov-action@v3 95 | with: 96 | token: ${{ secrets.CODECOV_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | poetry.lock -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: pysen 5 | name: Run pysen 6 | entry: pysen run_files lint 7 | language: system 8 | types: [file, python] 9 | -------------------------------------------------------------------------------- /.pysen/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool] 2 | [tool.black] # automatically generated by pysen 3 | # pysen ignores and overwrites any modifications 4 | line-length = 120 5 | target-version = ["py310"] 6 | 7 | [tool.isort] # automatically generated by pysen 8 | # pysen ignores and overwrites any modifications 9 | default_section = "THIRDPARTY" 10 | ensure_newline_before_comments = true 11 | force_grid_wrap = 0 12 | force_single_line = false 13 | include_trailing_comma = true 14 | line_length = 120 15 | multi_line_output = 3 16 | use_parentheses = true 17 | -------------------------------------------------------------------------------- /.pysen/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # automatically generated by pysen 3 | # pysen ignores and overwrites any modifications 4 | # e203: black treats : as a binary operator 5 | # e231: black doesn't put a space after , 6 | # e501: black may exceed the line-length to follow other style rules 7 | # w503 or w504: either one needs to be disabled to select w error codes 8 | ignore = E203,E231,E501,W503 9 | max-line-length = 120 10 | select = B,B950,C,E,F,W 11 | 12 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @sapientml/maintainers 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["The SapientML Authors"] 3 | description = "A SapientML plugin of SapientMLGenerator" 4 | license = "Apache-2.0" 5 | maintainers = [ 6 | "Kosaku Kimura ", 7 | "Akira Ura ", 8 | ] 9 | name = "sapientml-core" 10 | version = "0" 11 | 12 | [tool.poetry.dependencies] 13 | catboost = ">=1.2.3" 14 | imbalanced-learn = ">=0.11,<0.13" 15 | ipykernel = "^6.25.1" 16 | japanize-matplotlib = "^1.1.3" 17 | jinja2 = "^3.1.2" 18 | libcst = "^1.0.1" 19 | lightgbm = "^4.0.0" 20 | nbconvert = "^7.7.4" 21 | nbformat = "^5.9.2" 22 | nltk = "^3.8.1" 23 | numba = ">=0.57.1,<0.61.0" 24 | optuna = ">=3.2,<5.0" 25 | python = ">=3.9,<3.13" 26 | sapientml = "*" 27 | scikit-learn = "1.5.2" 28 | scipy = "^1.11.1" 29 | seaborn = ">=0.12.2,<0.14.0" 30 | shap = ">=0.43,<0.47" 31 | tqdm = "^4.66.1" 32 | xgboost = ">=1.7.6,<3.0.0" 33 | mecab-python3 = "^1.0.6" 34 | ipadic = "^1.0.0" 35 | fasttext-wheel = "^0.9.2" 36 | requests = "^2.31.0" 37 | 38 | [tool.poetry.group.dev.dependencies] 39 | black = ">=23.7,<25.0" 40 | flake8 = ">=6.1,<8.0" 41 | isort = "^5.12.0" 42 | pre-commit = ">=3.3.3,<5.0.0" 43 | pysen = ">=0.10.5,<0.12.0" 44 | pytest = ">=7.4,<9.0" 45 | pytest-cov = ">=4.1,<7.0" 46 | pytest-xdist = "^3.3.1" 47 | 48 | [build-system] 49 | build-backend = "poetry.core.masonry.api" 50 | requires = ["poetry-core>=1.0.0"] 51 | 52 | [tool.poetry.plugins."sapientml.config"] 53 | sapientml = 
"sapientml_core:SapientMLConfig" 54 | 55 | [tool.poetry.plugins."sapientml.pipeline_generator"] 56 | sapientml = "sapientml_core:SapientMLGenerator" 57 | 58 | [tool.poetry.plugins."sapientml.datastore"] 59 | localfile = "sapientml_core.datastore.localfile:LocalFile" 60 | 61 | [tool.poetry.plugins."sapientml.preprocess"] 62 | default = "sapientml_core.preprocess.default:DefaultPreprocess" 63 | 64 | [tool.poetry.plugins."sapientml.export_modules"] 65 | sample-dataset = "sapientml_core.datastore.localfile.export_modules" 66 | 67 | [tool.pysen] 68 | version = "0.11.0" 69 | 70 | [tool.pysen-cli] 71 | settings_dir = ".pysen" 72 | 73 | [tool.pysen.lint] 74 | enable_black = true 75 | enable_flake8 = true 76 | enable_isort = true 77 | enable_mypy = false 78 | line_length = 120 79 | py_version = "py310" 80 | 81 | [tool.pysen.lint.source] 82 | includes = ["sapientml_core/", "tests/"] 83 | 84 | [tool.pytest.ini_options] 85 | addopts = "-s -x --cov=sapientml_core" 86 | testpaths = ["tests"] 87 | -------------------------------------------------------------------------------- /requirements-training.txt: -------------------------------------------------------------------------------- 1 | category-encoders==2.6.4 2 | patsy==0.5.6 3 | statsmodels==0.14.4 4 | tensorflow==2.18.0 5 | wordcloud==1.9.4 -------------------------------------------------------------------------------- /sapientml_core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .generator import SapientMLGenerator 16 | from .params import SapientMLConfig 17 | 18 | __all__ = ["SapientMLGenerator", "SapientMLConfig"] 19 | -------------------------------------------------------------------------------- /sapientml_core/adaptation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/PY310/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 3 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 4 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 5 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 6 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 7 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 9 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 10 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy", 11 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 12 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 13 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 14 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 15 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 16 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 17 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 18 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 20 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 22 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 23 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 24 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 25 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 26 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 27 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas", 28 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 29 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 30 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 31 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/PY311/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 3 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 4 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 5 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 6 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy", 7 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 8 | 
"PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 9 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 10 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 11 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 12 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 13 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas", 14 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 15 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 16 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 17 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 18 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 19 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 20 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 22 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 23 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 24 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 25 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 26 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 27 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 28 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas", 29 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 30 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 31 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/PY39/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 3 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 4 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 5 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 6 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 7 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 9 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 10 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 11 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas", 12 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 13 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 14 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 15 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 16 | 
"PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 17 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 18 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 20 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 21 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 22 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 23 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 24 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas", 25 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 26 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 27 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 28 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 29 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 30 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 31 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/artifacts/label_order.json: -------------------------------------------------------------------------------- 1 | [ 2 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:TypeChange:astype:pandas", 3 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:MissingValues:fillna:pandas", 4 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:Category:get_dummies:pandas", 5 | "PREPROCESS:Outlier:Quantile:custom#PREPROCESS:Scaling:log:numpy", 6 | "PREPROCESS:Column_Rename:rename:pandas#PREPROCESS:Category:get_dummies:pandas", 7 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 8 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 9 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:TypeChange:astype:pandas", 10 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Category:get_dummies:pandas", 11 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:groupby:pandas", 12 | "PREPROCESS:Category:get_dummies:pandas#PREPROCESS:FeatureSelection:columns:custom", 13 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:Category:get_dummies:pandas", 14 | "PREPROCESS:Scaling:log1p:numpy#PREPROCESS:TypeChange:astype:pandas", 15 | "PREPROCESS:Scaling:log:numpy#PREPROCESS:FeatureSelection:corr:custom", 16 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:pandas", 17 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Column_Rename:rename:pandas", 18 | "PREPROCESS:GenerateColumn:median:pandas#PREPROCESS:MissingValues:fillna:pandas", 19 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:GenerateColumn:round:pandas", 20 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Category:get_dummies:pandas", 21 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:replace:pandas", 22 | "PREPROCESS:Filtering:conditional:pandas#PREPROCESS:MissingValues:fillna:pandas", 23 | "PREPROCESS:ConvertStr2Date:to_datetime:pandas#PREPROCESS:GenerateColumn:DATE:custom", 24 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Text:float:custom", 25 | 
"PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:Str2str:frequency:custom", 26 | "PREPROCESS:MissingValues:replace:pandas#PREPROCESS:TypeChange:astype:pandas", 27 | "PREPROCESS:MissingValues:fillna:pandas#PREPROCESS:TypeChange:astype:pandas", 28 | "PREPROCESS:TypeChange:astype:pandas#PREPROCESS:GenerateColumn:date:pandas", 29 | "PREPROCESS:FeatureSelection:corr:custom#PREPROCESS:Scaling:log:numpy", 30 | "PREPROCESS:MissingValues:interpolate:sklearn#PREPROCESS:CONVERT_NUM2NUM:where:numpy", 31 | "PREPROCESS:GenerateColumn:groupby:pandas#PREPROCESS:Column_Rename:rename:pandas" 32 | ] -------------------------------------------------------------------------------- /sapientml_core/adaptation/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /sapientml_core/adaptation/generation/predicate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from ...enums import Operator 16 | 17 | 18 | class Predicate: 19 | """A class to represent the predicate. 20 | 21 | This class represents the data structure for loading a decision tree 22 | condition/predicate and provides a function that can evaluate whether 23 | the predicate is true for a particular column. 24 | 25 | """ 26 | 27 | feature_name = "" 28 | _operator = "" 29 | _comparison_value = "" 30 | 31 | def __init__(self, feature_name, operator, comparison_value): 32 | """Constructs all the necessary attributes for the predicate object. 33 | 34 | Parameters 35 | ---------- 36 | feature_name : str 37 | Meta feature name 38 | operator : Operator 39 | comparison_value : np.float 40 | 41 | """ 42 | self.feature_name = feature_name 43 | self._operator = operator 44 | self._comparison_value = comparison_value 45 | 46 | def evaluate_predicate(self, meta_features): 47 | """Evaluate whether the predicate is true for a particular column. 
58 | 
59 |         Parameters
60 |         ----------
61 |         meta_features : dict
62 |             Mapping from meta-feature name to its value for the column.
63 | 
64 |         Returns
65 |         -------
66 |         result : bool
67 |             True if the predicate holds. A missing meta-feature, as well as
68 |             the sentinel values None, 0, and -1, yields False.
69 | 
70 |         """
71 |         try:
72 |             actual_value = meta_features[self.feature_name]
73 |             if actual_value == -1 or actual_value == 0:
74 |                 return False
75 |             if actual_value is None:
76 |                 return False
77 |         except Exception:
78 |             return False
79 | 
80 |         result = False
81 |         if self._operator is Operator.GREATER_THAN:
82 |             result = actual_value > self._comparison_value
83 |         elif self._operator is Operator.GREATER_THAN_OR_EQUAL_TO:
84 |             result = actual_value >= self._comparison_value
85 |         elif self._operator is Operator.EQUAL_TO:
86 |             result = actual_value == self._comparison_value
87 |         elif self._operator is Operator.LESS_THAN:
88 |             result = actual_value < self._comparison_value
89 |         elif self._operator is Operator.LESS_THAN_OR_EQUAL_TO:
90 |             result = actual_value <= self._comparison_value
91 | 
92 |         return result
93 | 
--------------------------------------------------------------------------------
/sapientml_core/adaptation/generation/preprocessing_label.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from ...enums import Operator
16 | from .predicate import Predicate
17 | 
18 | 
19 | class PreprocessingLabel:
20 |     """A class to represent a preprocessing label.
21 | 
22 |     This class identifies the relevant columns in the dataset
23 |     for each feature engineering component.
24 | 
25 |     """
26 | 
27 |     def __init__(self, label_name, meta_features, predicates):
28 |         """Constructs all the necessary attributes for the PreprocessingLabel object.
29 | 
30 |         Parameters
31 |         ----------
32 |         label_name : str
33 |             Component name.
34 |         meta_features : list
35 |             Meta features selected.
36 |         predicates : list
37 |             Predicate specifications, given as dicts with feature_name, operator, and threshold keys.
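38 | 
39 |         Example (an illustrative sketch; the label name mirrors entries in
40 |         label_order.json, while the meta-feature name and predicate values
41 |         are hypothetical; only the feature_name/operator/threshold keys
42 |         match what _build_predicate_objects expects)::
43 | 
44 |             label = PreprocessingLabel(
45 |                 "PREPROCESS:MissingValues:fillna:pandas",
46 |                 ["feature:missing_values_presence"],
47 |                 [{"feature_name": "feature:missing_values_presence",
48 |                   "operator": ">", "threshold": 0.0}],
49 |             )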
50 | 
51 |         """
52 |         self.label_name = label_name
53 |         self.meta_features = meta_features
54 |         self.predicate_objects = list()
55 |         self._build_predicate_objects(predicates)
56 |         self.relevant_columns = list()
57 |         self.components_before = list()
58 |         self.components_after = list()
59 |         self.alternative_components = list()
60 | 
61 |     def __str__(self):
62 |         return self.label_name
63 | 
64 |     def __repr__(self):
65 |         return str(self)
66 | 
67 |     def _build_predicate_objects(self, predicates):
68 |         for pred in predicates:
69 |             feature_name = pred["feature_name"]
70 |             operator = self._get_operator(pred["operator"])
71 |             comparison_value = pred["threshold"]
72 |             p = Predicate(feature_name, operator, comparison_value)
73 |             self.predicate_objects.append(p)
74 | 
75 |     def _get_operator(self, op_string):
76 |         if op_string == ">":
77 |             return Operator.GREATER_THAN
78 |         elif op_string == ">=":
79 |             return Operator.GREATER_THAN_OR_EQUAL_TO
80 |         elif op_string == "<":
81 |             return Operator.LESS_THAN
82 |         elif op_string == "<=":
83 |             return Operator.LESS_THAN_OR_EQUAL_TO
84 |         elif op_string == "==" or op_string == "=":
85 |             return Operator.EQUAL_TO
86 |         else:
87 |             return Operator.NOT_EQUAL_TO
88 | 
89 |     def get_relevant_columns(self, dataset_summary, target, ignore_columns):
90 |         """Identify the columns this preprocessing component should be applied to.
91 | 
92 |         Parameters
93 |         ----------
94 |         dataset_summary : DatasetSummary
95 |             Object of the DatasetSummary class.
96 |         target : list
97 |         ignore_columns : list
98 | 
99 |         Returns
100 |         -------
101 |         rel_columns_list : list
102 |             The relevant column list.
103 | 
104 |         """
105 |         rel_columns_list = []
106 | 
107 |         # approach 1: conjunction: a column is relevant if and only if all of the predicates applicable to that component are true
108 |         # approach 2: disjunction: a column is relevant if and only if at least one of the predicates applicable to that component is true
109 |         approach = 2
110 | 
111 |         for column_name, column in dataset_summary.columns.items():
112 |             if column_name in ignore_columns:
113 |                 continue
114 | 
115 |             # error handling for log transform: don't apply if the column has negative values
116 |             if "PREPROCESS:Scaling:log" in self.label_name:
117 |                 if column.has_negative_value:
118 |                     continue
119 | 
120 |             result = list()  # holds boolean results of all predicates applicable to a column
121 |             for p in self.predicate_objects:
122 |                 # special handling of "target_imbalance_score" feature, since it should only be applied on target column
123 |                 if p.feature_name == "feature:target_imbalance_score":
124 |                     if column_name not in target:
125 |                         result.append(False)
126 |                         continue
127 |                 result.append(p.evaluate_predicate(column.meta_features))
128 | 
129 |             if approach == 1:  # conjunction
130 |                 if all(result):
131 |                     rel_columns_list.append(column_name)
132 |             elif approach == 2:  # disjunction
133 |                 if any(result):
134 |                     rel_columns_list.append(column_name)
135 | 
136 |         return rel_columns_list
137 | 
--------------------------------------------------------------------------------
/sapientml_core/datastore/localfile/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023-2024 The SapientML Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .generator import LocalFile, LocalFileConfig 16 | 17 | __all__ = ["LocalFile", "LocalFileConfig"] 18 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/export_modules/sample_dataset.py: -------------------------------------------------------------------------------- 1 | from decimal import ROUND_HALF_UP, Decimal 2 | 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | def _sampled_training(dev_training_dataset, train_size, stratify, task_type) -> pd.DataFrame: 8 | sampled_training_dataset, _ = train_test_split( 9 | dev_training_dataset, 10 | train_size=train_size, 11 | stratify=stratify if task_type == "classification" else None, 12 | ) 13 | return sampled_training_dataset # type: ignore 14 | 15 | 16 | def sample_dataset( 17 | dataframe: pd.DataFrame, 18 | sample_size: int, 19 | target_columns: list[str], 20 | task_type: str, 21 | ) -> pd.DataFrame: 22 | # Sample the training set if the dataset is big 23 | # FIXME 24 | sampled_training_dataset = None 25 | num_of_rows = len(dataframe.index) 26 | if num_of_rows >= sample_size: 27 | rare_labels = [] 28 | dataframe_alltargets = None 29 | if task_type == "classification": 30 | dataframe_alltargets = dataframe[target_columns].astype(str).apply("".join, axis=1) 31 | label_count = dataframe_alltargets.value_counts() 32 | rare_labels = label_count.loc[label_count == 1].index.tolist() 33 | 34 | if rare_labels and dataframe_alltargets is not None: 35 | dataframe_rare = dataframe[dataframe_alltargets.isin(rare_labels)] 36 | rare_index = dataframe_rare.index.values 37 | 38 | dataframe_wo_rare = dataframe.drop(rare_index) 39 | 40 | num_of_labels = [len(dataframe_wo_rare[target].value_counts()) for target in target_columns] 41 | 42 | rare_to_all_ratio = int( 43 | Decimal(sample_size * len(dataframe_rare) / len(dataframe)).quantize( 44 | Decimal("0"), rounding=ROUND_HALF_UP 45 | ) 46 | ) 47 | not_rare_to_all_ratio = int( 48 | Decimal(sample_size * len(dataframe_wo_rare) / len(dataframe)).quantize( 49 | Decimal("0"), rounding=ROUND_HALF_UP 50 | ) 51 | ) 52 | 53 | stratify_wo_rare = None 54 | 55 | if len(dataframe_rare) == len(dataframe): 56 | sampled_training_dataset = _sampled_training(dataframe, sample_size, None, task_type) 57 | 58 | elif rare_to_all_ratio in [0, 1]: 59 | sampled_training_dataset_rare = dataframe_rare 60 | 61 | if max(num_of_labels) >= sample_size: 62 | stratify_wo_rare = None 63 | else: 64 | stratify_wo_rare = dataframe_wo_rare[target_columns] 65 | sampled_training_dataset_wo_rare = _sampled_training( 66 | dataframe_wo_rare, 67 | sample_size - len(sampled_training_dataset_rare), 68 | stratify_wo_rare, 69 | task_type, 70 | ) 71 | 72 | sampled_training_dataset = pd.concat( 73 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore 74 | ) 75 | 76 | elif not_rare_to_all_ratio in [0, 1]: 77 | sampled_training_dataset_wo_rare = dataframe_wo_rare 78 | sampled_training_dataset_rare = _sampled_training( 79 | dataframe_rare, 80 | sample_size - 
len(sampled_training_dataset_wo_rare), 81 | None, 82 | task_type, 83 | ) 84 | 85 | sampled_training_dataset = pd.concat( 86 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore 87 | ) 88 | 89 | else: 90 | if max(num_of_labels) >= sample_size: 91 | stratify_wo_rare = None 92 | else: 93 | stratify_wo_rare = dataframe_wo_rare[target_columns] 94 | 95 | sampled_training_dataset_wo_rare = _sampled_training( 96 | dataframe_wo_rare, not_rare_to_all_ratio, stratify_wo_rare, task_type 97 | ) 98 | sampled_training_dataset_rare = _sampled_training(dataframe_rare, rare_to_all_ratio, None, task_type) 99 | 100 | sampled_training_dataset = pd.concat( 101 | [sampled_training_dataset_wo_rare, sampled_training_dataset_rare] # type: ignore 102 | ) 103 | 104 | else: 105 | num_of_labels = [len(dataframe[target].value_counts()) for target in target_columns] 106 | if max(num_of_labels) >= sample_size: 107 | stratify_wo_rare = None 108 | else: 109 | stratify_wo_rare = dataframe[target_columns] 110 | 111 | sampled_training_dataset = _sampled_training(dataframe, sample_size, stratify_wo_rare, task_type) 112 | return sampled_training_dataset 113 | else: 114 | return dataframe 115 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/export_modules/split_timeseries_dataset.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import TimeSeriesSplit 2 | 3 | 4 | def split_dataset(dataset, split_column_name, split_num, split_index): 5 | dataset = dataset.sort_values(split_column_name) 6 | splitter = TimeSeriesSplit(n_splits=split_num) 7 | train_idx, test_idx = list(splitter.split(dataset))[split_index] 8 | train_dataset, test_dataset = dataset.iloc[train_idx], dataset.iloc[test_idx] 9 | for col in train_dataset.columns: 10 | if train_dataset[col].isnull().all(): 11 | if test_dataset[col].dtype == float or test_dataset[col].dtype == int: 12 | train_dataset.loc[:, col] = 0 13 | elif test_dataset[col].dtype == object: 14 | train_dataset.loc[:, col] = "" 15 | elif test_dataset[col].dtype == bool: 16 | train_dataset.loc[:, col] = False 17 | return train_dataset, test_dataset 18 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/concat_train_validation.py.jinja: -------------------------------------------------------------------------------- 1 | train_dataset = pd.concat([train_dataset, validation_dataset]).reset_index(drop=True) -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/drop_ignore_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # DROP IGNORED COLUMNS 2 | ignore_columns = {{ ignore_columns }} 3 | 4 | {% if train %} 5 | train_dataset = train_dataset.drop(ignore_columns, axis=1, errors="ignore") 6 | {% endif %} 7 | {% if validation %} 8 | validation_dataset = validation_dataset.drop(ignore_columns, axis=1, errors="ignore") 9 | {% endif %} 10 | {% if test %} 11 | test_dataset = test_dataset.drop(ignore_columns, axis=1, errors="ignore") 12 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/drop_inf_or_nan_rows.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | train_dataset = 
train_dataset[~train_dataset[{{target_columns}}].isin([np.inf, -np.inf, np.nan]).any(axis=1)] 3 | 4 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/load_localfile.py.jinja: -------------------------------------------------------------------------------- 1 | # LOAD DATA 2 | import pandas as pd 3 | 4 | {% if dataset.training_data_path.endswith(".pkl") %} 5 | train_dataset = pd.read_pickle(r"{{ dataset.training_data_path }}") 6 | {% else %} 7 | train_dataset = pd.read_csv(r"{{ dataset.training_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 8 | {% endif %} 9 | 10 | {% if dataset.validation_data_path %} 11 | {% if dataset.validation_data_path.endswith(".pkl") %} 12 | validation_dataset = pd.read_pickle(r"{{ dataset.validation_data_path }}") 13 | {% else %} 14 | validation_dataset = pd.read_csv(r"{{ dataset.validation_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 15 | {% endif %} 16 | {% endif %}{# if dataset.validation_data_path #} 17 | 18 | {% if not validation and dataset.test_data_path %} 19 | {% if dataset.test_data_path.endswith(".pkl") %} 20 | test_dataset = pd.read_pickle(r"{{ dataset.test_data_path }}") 21 | {% else %} 22 | test_dataset = pd.read_csv(r"{{ dataset.test_data_path }}", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 23 | {% endif %} 24 | {% endif %}{# if not validation and dataset.test_data_path #} 25 | 26 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/load_localfile_predict.py.jinja: -------------------------------------------------------------------------------- 1 | # LOAD DATA 2 | import pandas as pd 3 | 4 | {% if dataset.training_data_path.endswith(".pkl") %} 5 | test_dataset = pd.read_pickle("./test.pkl") 6 | {% else %} 7 | test_dataset = pd.read_csv("./test.csv", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 8 | {% endif %} 9 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/load_localfile_train.py.jinja: -------------------------------------------------------------------------------- 1 | # LOAD DATA 2 | import pandas as pd 3 | 4 | {% if dataset.training_data_path.endswith(".pkl") %} 5 | train_dataset = pd.read_pickle("./training.pkl") 6 | {% else %} 7 | train_dataset = pd.read_csv("./training.csv", encoding="{{ dataset.csv_encoding }}", delimiter="{{ dataset.csv_delimiter }}") 8 | {% endif %} 9 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/set_index.py.jinja: -------------------------------------------------------------------------------- 1 | # SET ID_COLUMNS TO DATAFRAME'S INDEX 2 | id_columns_for_prediction = {{ id_columns_for_prediction }} 3 | test_dataset = test_dataset.set_index(id_columns_for_prediction, drop=False) -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/set_validation_as_test.py.jinja: -------------------------------------------------------------------------------- 1 | test_dataset = validation_dataset -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/split.py.jinja: 
-------------------------------------------------------------------------------- 1 | {% if (validation and (not dataset.validation_data_path)) or ((not validation) and (not dataset.test_data_path)) %} 2 | 3 | # TRAIN-TEST SPLIT 4 | {% if task.split_method == "random" %} 5 | {% if task.split_stratification %} 6 | from sklearn.model_selection import train_test_split 7 | def split_dataset(dataset, train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}): 8 | train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state, stratify=dataset["{{task.target_columns[0]}}"]) 9 | return train_dataset, test_dataset 10 | {% else %} 11 | from sklearn.model_selection import train_test_split 12 | def split_dataset(dataset, train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}): 13 | train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=random_state) 14 | return train_dataset, test_dataset 15 | {% endif %} 16 | {% elif task.split_method == "group" %} 17 | from sklearn.model_selection import GroupShuffleSplit 18 | def split_dataset(dataset, split_column_name="{{ task.split_column_name }}", train_size={{ task.split_train_size }}, random_state={{ task.split_seed }}): 19 | splitter = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_state) 20 | train_idx, test_idx = next(splitter.split(dataset, groups=dataset[split_column_name])) 21 | train_dataset, test_dataset = dataset.iloc[train_idx], dataset.iloc[test_idx] 22 | return train_dataset, test_dataset 23 | {% else %}{# time #} 24 | from lib.split_timeseries_dataset import split_dataset 25 | {% endif %} 26 | {% if not dataset.test_data_path %} 27 | {% if task.split_method == "random" or task.split_method == "group" %} 28 | train_dataset, test_dataset = split_dataset(train_dataset) 29 | {% else %} 30 | train_dataset, test_dataset = split_dataset(train_dataset, split_column_name="{{ task.split_column_name }}", split_num={{ task.time_split_num }}, split_index={{ task.time_split_index}}) 31 | {% endif %} 32 | {% endif %} 33 | {% if validation %} 34 | {% endif %} 35 | {% endif %} 36 | {% if validation and (not dataset.validation_data_path) %} 37 | {% if task.split_method == "random" or task.split_method == "group" %} 38 | train_dataset, validation_dataset = split_dataset(train_dataset) 39 | {% else %} 40 | train_dataset, validation_dataset = split_dataset(train_dataset, split_column_name="{{ task.split_column_name }}", split_num={{ task.time_split_num }}, split_index={{ task.time_split_index}}) 41 | {% endif %} 42 | {% endif %} 43 | -------------------------------------------------------------------------------- /sapientml_core/datastore/localfile/templates/subsample.py.jinja: -------------------------------------------------------------------------------- 1 | # SUBSAMPLE 2 | # If the number of rows of train_dataset is larger than sample_size, sample rows to sample_size for speedup. 
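# sample_dataset (defined in lib/sample_dataset.py, shown above) sets labels that occur
# only once aside before stratified sampling so that stratified train_test_split does not
# fail on singleton classes, and stratifies the remaining rows by the target columns for
# classification tasks.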
3 | from lib.sample_dataset import sample_dataset 4 | train_dataset = sample_dataset( 5 | dataframe=train_dataset, 6 | sample_size={{ sample_size }}, 7 | target_columns={{ task.target_columns }}, 8 | task_type='{{ task.task_type }}' 9 | ) 10 | 11 | -------------------------------------------------------------------------------- /sapientml_core/design/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/design/__init__.py -------------------------------------------------------------------------------- /sapientml_core/design/label_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | name_to_label_mapping = { 17 | "random forest": { 18 | "c": "MODEL:Classifier:RandomForestClassifier:sklearn", 19 | "r": "MODEL:Regressor:RandomForestRegressor:sklearn", 20 | }, 21 | "extra tree": { 22 | "c": "MODEL:Classifier:ExtraTreesClassifier:sklearn", 23 | "r": "MODEL:Regressor:ExtraTreesRegressor:sklearn", 24 | }, 25 | "lightgbm": {"c": "MODEL:Classifier:LGBMClassifier:lightgbm", "r": "MODEL:Regressor:LGBMRegressor:lightgbm"}, 26 | "xgboost": {"c": "MODEL:Classifier:XGBClassifier:xgboost", "r": "MODEL:Regressor:XGBRegressor:xgboost"}, 27 | "catboost": { 28 | "c": "MODEL:Classifier:CatBoostClassifier:catboost", 29 | "r": "MODEL:Regressor:CatBoostRegressor:catboost", 30 | }, 31 | "gradient boosting": { 32 | "c": "MODEL:Classifier:GradientBoostingClassifier:sklearn", 33 | "r": "MODEL:Regressor:GradientBoostingRegressor:sklearn", 34 | }, 35 | "adaboost": {"c": "MODEL:Classifier:AdaBoostClassifier:sklearn", "r": "MODEL:Regressor:AdaBoostRegressor:sklearn"}, 36 | "decision tree": { 37 | "c": "MODEL:Classifier:DecisionTreeClassifier:sklearn", 38 | "r": "MODEL:Regressor:DecisionTreeRegressor:sklearn", 39 | }, 40 | "svm": {"c": "MODEL:Classifier:SVC:sklearn", "r": "MODEL:Regressor:SVR:sklearn"}, 41 | "linear svm": {"c": "MODEL:Classifier:LinearSVC:sklearn", "r": "MODEL:Regressor:LinearSVR:sklearn"}, 42 | "logistic/linear regression": { 43 | "c": "MODEL:Classifier:LogisticRegression:sklearn", 44 | "r": "MODEL:Regressor:LinearRegression:sklearn", 45 | }, 46 | "lasso": {"r": "MODEL:Regressor:Lasso:sklearn"}, 47 | "sgd": {"c": "MODEL:Classifier:SGDClassifier:sklearn", "r": "MODEL:Regressor:SGDRegressor:sklearn"}, 48 | "mlp": {"c": "MODEL:Classifier:MLPClassifier:sklearn", "r": "MODEL:Regressor:MLPRegressor:sklearn"}, 49 | "multinomial nb": {"c": "MODEL:Classifier:MultinomialNB:sklearn"}, 50 | "gaussian nb": {"c": "MODEL:Classifier:GaussianNB:sklearn"}, 51 | "bernoulli nb": {"c": "MODEL:Classifier:BernoulliNB:sklearn"}, 52 | } 53 | 54 | 55 | def map_label_to_name(): 56 | """ 57 | Assign several internal labels to each ML component. 
58 | 59 | Returns 60 | ---------- 61 | label_to_name : dict[str, str] 62 | Assigned result. 63 | """ 64 | label_to_name = {"MODEL:Classifier:LGBMClassifier:lgbm": "lightgbm", "MODEL:Regressor:train:xgboost": "xgboost"} 65 | for k, v in name_to_label_mapping.items(): 66 | for k1, v1 in v.items(): 67 | label_to_name[v1] = k 68 | return label_to_name 69 | -------------------------------------------------------------------------------- /sapientml_core/design/pp_component_groups.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | drop_label_list = [ 17 | "PREPROCESS:MissingValues:dropna:pandas", 18 | "PREPROCESS:MissingValues:notnull:pandas", 19 | "PREPROCESS:MissingValues:isnull:pandas", 20 | ] 21 | filler_label = [ 22 | "PREPROCESS:MissingValues:fillna:pandas", 23 | "PREPROCESS:MissingValues:SimpleImputer:sklearn", 24 | "PREPROCESS:MissingValues:KNNImputer:sklearn", 25 | "PREPROCESS:MissingValues:replace:pandas", 26 | "PREPROCESS:MissingValues:random:custom", 27 | "PREPROCESS:MissingValues:interpolate:sklearn", 28 | ] 29 | in_place_converter = [ 30 | "PREPROCESS:Category:LabelEncoder:sklearn", 31 | "PREPROCESS:Category:factorize:pandas", 32 | "PREPROCESS:Category:replace:pandas", 33 | "PREPROCESS:Category:map:custom", 34 | "PREPROCESS:Category:apply:pandas", 35 | "PREPROCESS:Category:custom:pandas", 36 | ] 37 | one_hot = [ 38 | "PREPROCESS:Category:get_dummies:pandas", 39 | "PREPROCESS:Category:OneHotEncoder:sklearn", 40 | "PREPROCESS:Category:LabelBinarizer:sklearn", 41 | ] 42 | 43 | text_vect = ["PREPROCESS:Text:CountVectorizer:sklearn", "PREPROCESS:Text:TfidfVectorizer:sklearn"] 44 | 45 | scaling = [ 46 | "PREPROCESS:Scaling:STANDARD:sklearn", 47 | "PREPROCESS:Scaling:MIN_MAX:custom", 48 | "PREPROCESS:Scaling:MIN_MAX:sklearn", 49 | "PREPROCESS:Scaling:STANDARD:custom", 50 | "PREPROCESS:Scaling:Robust:sklearn", 51 | "PREPROCESS:Scaling:STANDARD:Pandas", 52 | "PREPROCESS:Scaling:normalize:sklearn", 53 | "PREPROCESS:Scaling:normalize:Pandas", 54 | "PREPROCESS:Scaling:STANDARD:pandas", 55 | ] 56 | 57 | date = [ 58 | "PREPROCESS:GenerateColumn:date:pandas", 59 | "PREPROCESS:GenerateColumn:DATE:pandas", 60 | "PREPROCESS:GenerateColumn:DATE:custom", 61 | ] 62 | 63 | text_processing = [ 64 | "PREPROCESS:Text:lower:pandas", 65 | "PREPROCESS:Text:remove_non_alpha:custom", 66 | "PREPROCESS:Text:tokenize:nltk", 67 | "PREPROCESS:Text:Lemmtize:nltk", 68 | ] 69 | 70 | balancing = [ 71 | "PREPROCESS:Balancing:SMOTE:imblearn", 72 | "PREPROCESS:Balancing:resample:custom", 73 | "PREPROCESS:Balancing:sample:custom", 74 | ] 75 | 76 | log_transform = [ 77 | "PREPROCESS:Scaling:log1p:numpy", 78 | "PREPROCESS:Scaling:power:custom", 79 | "PREPROCESS:Scaling:log:numpy", 80 | "PREPROCESS:Scaling:sqrt:numpy", 81 | "PREPROCESS:Scaling:exp:numpy", 82 | "PREPROCESS:Scaling:log:custom", 83 | "PREPROCESS:Scaling:power_transform:sklearn", 84 | ] 
85 | -------------------------------------------------------------------------------- /sapientml_core/design/search_space.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .. import ps_macros 16 | from .pp_component_groups import ( 17 | balancing, 18 | date, 19 | drop_label_list, 20 | filler_label, 21 | in_place_converter, 22 | log_transform, 23 | one_hot, 24 | scaling, 25 | text_processing, 26 | text_vect, 27 | ) 28 | 29 | target_labels = [ 30 | ps_macros.FILL, 31 | ps_macros.IN_PLACE_CONVERT, 32 | ps_macros.ONE_HOT, 33 | ps_macros.VECT, 34 | ps_macros.DATE, 35 | ps_macros.LEMMITIZE, 36 | ps_macros.BALANCING, 37 | ps_macros.SCALING, 38 | ps_macros.LOG, 39 | ] 40 | 41 | # Manually created semantic labels 42 | # Semantic labels are those that cannot be discriminated by our current list of meta-features 43 | 44 | 45 | label_mapping = { 46 | # macros.DROP: drop_label_list, 47 | ps_macros.FILL: filler_label, 48 | ps_macros.IN_PLACE_CONVERT: in_place_converter, 49 | ps_macros.ONE_HOT: one_hot, 50 | ps_macros.VECT: text_vect, 51 | ps_macros.MISSING: drop_label_list + filler_label, 52 | ps_macros.CATG: in_place_converter + one_hot, 53 | ps_macros.DATE: date, 54 | ps_macros.LEMMITIZE: text_processing, 55 | ps_macros.SCALING: scaling, 56 | ps_macros.BALANCING: balancing, 57 | ps_macros.LOG: log_transform, 58 | } 59 | 60 | project_related_metadata = ["file_name", "notebook_name", "csv_name", "accuracy", "target_column_name"] 61 | 62 | meta_feature_list = [ 63 | ps_macros.CATG_PRESENCE, 64 | ps_macros.TEXT_PRESENCE, 65 | ps_macros.BINARY_CATG_PRESENCE, 66 | ps_macros.SMALL_CATG_PRESENCE, 67 | ps_macros.LARGE_CATG_PRESENCE, 68 | ps_macros.MISSING_PRESENCE, 69 | ps_macros.NORMALIZED_MEAN, 70 | ps_macros.NORMALIZED_STD_DEV, 71 | ps_macros.NORMALIZED_VARIATION_ACROSS_COLUMNS, 72 | ps_macros.DATE_PRESENCE, 73 | ps_macros.IMBALANCE, 74 | ps_macros.MAX_SKEW, 75 | ] 76 | -------------------------------------------------------------------------------- /sapientml_core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import enum 16 | 17 | 18 | # various operators in decision path for FE/pre-processing meta-models. 
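# For example, a decision-path condition such as feature:missing_values_presence <= 0.5
# is parsed into a Predicate carrying Operator.LESS_THAN_OR_EQUAL_TO by _get_operator()
# in the adaptation code shown earlier; the feature name and threshold here are
# illustrative only.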
19 | class Operator(enum.Enum): 20 | EQUAL_TO = enum.auto() 21 | NOT_EQUAL_TO = enum.auto() 22 | GREATER_THAN = enum.auto() 23 | GREATER_THAN_OR_EQUAL_TO = enum.auto() 24 | LESS_THAN = enum.auto() 25 | LESS_THAN_OR_EQUAL_TO = enum.auto() 26 | -------------------------------------------------------------------------------- /sapientml_core/explain/code_template.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import datetime 16 | 17 | 18 | class Code_Template: 19 | """Code Template class.""" 20 | 21 | def __init__(self): 22 | self.str_reverse = {"NOW": str(datetime.datetime.now())} 23 | 24 | def update(self, lines): 25 | """update method. 26 | 27 | Parameters 28 | ---------- 29 | lines : list[str] 30 | Lines of a code block from the Jupyter content template. 31 | 32 | Returns 33 | ------- 34 | out : list[str] 35 | Updated lines of the code block from the Jupyter content template. 36 | 37 | """ 38 | out = [] 39 | for line in lines: 40 | for key in self.str_reverse: 41 | line = line.replace(key, self.str_reverse[key]) 42 | out.append(line) 43 | return out 44 | -------------------------------------------------------------------------------- /sapientml_core/explain/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Literal, Optional 16 | 17 | import pandas as pd 18 | from sapientml.params import CancellationToken 19 | from sapientml.util.logging import setup_logger 20 | 21 | from .AutoEDA import EDA 22 | from .AutoVisualization import AutoVisualization_Class 23 | from .code_miner import Miner 24 | 25 | logger = setup_logger() 26 | 27 | 28 | def process( 29 | visualization: bool, 30 | eda: bool, 31 | dataframe: pd.DataFrame, 32 | script_path: str, 33 | target_columns: list[str], 34 | problem_type: Literal["regression", "classification"], 35 | ignore_columns: Optional[list[str]] = None, 36 | skeleton: Optional[dict] = None, 37 | explanation: Optional[dict] = None, 38 | run_info: Optional[dict] = None, 39 | internal_execution: bool = False, 40 | timeout: int = 0, 41 | cancel: Optional[CancellationToken] = None, 42 | ): 43 | """process function.
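Generate explanation notebooks by combining auto-EDA blocks, optional visualization code, and the mined pipeline script.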
44 | 45 | Parameters 46 | ---------- 47 | visualization : bool 48 | True to generate visualization code; otherwise False. 49 | eda : bool 50 | True to generate auto-EDA blocks; otherwise False. 51 | dataframe : pd.DataFrame 52 | Input dataframe. 53 | script_path : str 54 | Path of the script. 55 | target_columns : list[str] 56 | Names of target columns. 57 | problem_type : Literal["regression", "classification"] 58 | Type of problem: either regression or classification. 59 | ignore_columns : list[str], optional 60 | Column names which must not be used and must be dropped. 61 | skeleton : dict, optional 62 | Probability scores and other details of preprocessing and model components. 63 | explanation : dict, optional 64 | Explanations of the pipelines. 65 | run_info : dict, optional 66 | Execution results, logs, and other information. 67 | internal_execution : bool 68 | True to execute the generated notebooks; otherwise False. 69 | timeout : int 70 | Timeout value for notebook execution. 71 | cancel : CancellationToken, optional 72 | 73 | Returns 74 | ------- 75 | output_files : List[str] 76 | List of generated .ipynb files. 77 | 78 | """ 79 | output_files = None 80 | 81 | if visualization: 82 | # Call AutoVisualization to generate visualization code 83 | AV = AutoVisualization_Class() 84 | visualization_code = AV.AutoVisualization( 85 | df=dataframe, 86 | target_columns=target_columns, 87 | problem_type=problem_type, 88 | ignore_columns=ignore_columns, 89 | ) 90 | else: 91 | visualization_code = None 92 | 93 | if eda: 94 | # Handle list/tuple/dict values in the dataframe. 95 | for col in dataframe.columns: 96 | exist_list_values = [x for x in dataframe[col] if type(x) in [list, tuple, dict]] 97 | if len(exist_list_values) > 0: 98 | dataframe[col] = dataframe[col].fillna("").astype(str) 99 | eda = EDA(dataframe, target_columns, log_level=2) 100 | 101 | eda.check_consistency(convert=False) 102 | 103 | categories, desc = eda.cat_process(threshold=0.01, IQR_activation=True, z_activation=True) 104 | 105 | initial_blocks = eda.description 106 | else: 107 | initial_blocks = [] 108 | 109 | code_miner = Miner( 110 | script_path, 111 | init_blocks=initial_blocks, 112 | visualization_code=visualization_code, 113 | logger=logger, 114 | skeleton=skeleton, 115 | explanation=explanation, 116 | run_info=run_info, 117 | ) 118 | output_files = code_miner.save_all(execution=internal_execution, timeout=timeout, cancel=cancel) 119 | return output_files 120 | -------------------------------------------------------------------------------- /sapientml_core/explain/templates/jupyter_content.json: -------------------------------------------------------------------------------- 1 | {"# BEGIN": [["# Use a generic Kaggle dataset path to start"], []], "#*** PIPELINE ***": [["We have to preprocess the dataset as the first step.", "Then, we will generate a pipeline to train a model."], []], "# LOAD DATA": [["# Input Dataset"], []], "# PREPROCESSING-number": [["# Feature Engineering"], []], "# DETATCH TARGET": [[], []], "# TRAIN TEST SPLIT": [["## Split Train/Test", "We have to separate train and test data before training a model"], []], "# MODEL": [["# Train a Model"], []]} -------------------------------------------------------------------------------- /sapientml_core/internal_path.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from pathlib import Path 17 | 18 | sapientml_core_root = Path(__file__).parents[0] 19 | 20 | adaptation_root_dir = sapientml_core_root / "adaptation" 21 | artifacts_path = adaptation_root_dir / "artifacts" 22 | model_path = sapientml_core_root / "models" 23 | 24 | benchmark_path = sapientml_core_root / "benchmarks" 25 | corpus_path = sapientml_core_root / "corpus" 26 | training_cache = sapientml_core_root / ".cache" 27 | 28 | execution_cache_dir = training_cache / "exec_info" 29 | analysis_dir = training_cache / "analysis" 30 | clean_notebooks_dir_name = "clean-notebooks" 31 | clean_dir = corpus_path / clean_notebooks_dir_name 32 | project_labels_path = corpus_path / "annotated-notebooks" / "annotated-notebooks-1140.csv" 33 | -------------------------------------------------------------------------------- /sapientml_core/models/PY310/mp_model_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY310/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY310/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY310/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY311/mp_model_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY311/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY311/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY311/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY39/mp_model_1.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY39/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/PY39/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/PY39/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/models/mp_model_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/mp_model_1.pkl -------------------------------------------------------------------------------- /sapientml_core/models/mp_model_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/mp_model_2.pkl -------------------------------------------------------------------------------- /sapientml_core/models/pp_models.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/sapientml_core/models/pp_models.pkl -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .generator import DefaultPreprocess 16 | from .params import DefaultPreprocessConfig 17 | 18 | __all__ = ["DefaultPreprocess", "DefaultPreprocessConfig"] 19 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/params.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sapientml.params import Config, String 4 | 5 | 6 | class DefaultPreprocessConfig(Config): 7 | """Configuration arguments for the DefaultPreprocess class. 8 | 9 | Attributes 10 | ---------- 11 | use_pos_list : Optional[list[str]] 12 | List of parts of speech to be used during text analysis. 13 | This variable is used for Japanese text analysis. 14 | Select from the parts of speech below: 15 | "名詞" (noun), "動詞" (verb), "形容詞" (adjective), "形容動詞" (adjectival noun), "副詞" (adverb).
16 | use_word_stemming : bool, default True 17 | Specify whether or not word stemming is used. 18 | This variable is used for Japanese text analysis. 19 | 20 | """ 21 | 22 | use_pos_list: Optional[list[String]] = ["名詞", "動詞", "助動詞", "形容詞", "副詞"] 23 | use_word_stemming: bool = True 24 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/drop_one_value_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # DISCARD COLUMNS WITH ONE VALUE ONLY 2 | cols_one_value_only = {{ cols_one_value_only }} 3 | {% if training %} 4 | train_dataset = train_dataset.drop(cols_one_value_only, axis=1, errors="ignore") 5 | {% endif %} 6 | {% if test %} 7 | test_dataset = test_dataset.drop(cols_one_value_only, axis=1, errors="ignore") 8 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_inf_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # CONVERT INF TO NAN 2 | import numpy as np 3 | cols_inf_values = {{ cols_inf_values }} 4 | {% if training %} 5 | train_dataset[cols_inf_values] = train_dataset[cols_inf_values].replace([-np.inf, np.inf], np.nan) 6 | {% endif %} 7 | {% if test %} 8 | test_dataset[cols_inf_values] = test_dataset[cols_inf_values].replace([-np.inf, np.inf], np.nan) 9 | {% endif %} 10 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_iterable_values.py.jinja: -------------------------------------------------------------------------------- 1 | # HANDLE ITERABLE VALUES IN DATAFRAME 2 | cols_iterable_values = {{ cols_iterable_values }} 3 | for col in cols_iterable_values: 4 | {% if training %} 5 | train_dataset[col] = train_dataset[col].fillna("").astype(str) 6 | {% endif %} 7 | {% if test %} 8 | test_dataset[col] = test_dataset[col].fillna("").astype(str) 9 | {% endif %} 10 | 11 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_japanese_text.py.jinja: -------------------------------------------------------------------------------- 1 | # HANDLE JAPANESE TEXT 2 | import MeCab 3 | import ipadic 4 | tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS) 5 | use_pos_list = {{ config.use_pos_list }} 6 | use_word_stemming = {{ config.use_word_stemming }} 7 | def tokenize(text, use_pos_list, use_word_stemming, tokenizer): 8 | node = tokenizer.parseToNode(text) 9 | terms = [] 10 | while node: 11 | features = node.feature.split(",") 12 | pos = features[0] 13 | if pos != "BOS/EOS": 14 | if use_word_stemming: 15 | term = features[6] 16 | if (pos == "名詞") & (features[1] == "数"): 17 | term = node.surface 18 | else: 19 | term = node.surface 20 | if use_pos_list: 21 | if pos in use_pos_list: 22 | terms.append(term) 23 | else: 24 | terms.append(term) 25 | node = node.next 26 | return " ".join(terms) 27 | cols_japanese_text = {{ cols_japanese_text }} 28 | for col in cols_japanese_text: 29 | {% if training %} 30 | train_dataset[col] = train_dataset[col].fillna("").apply(lambda x: tokenize(x, use_pos_list, use_word_stemming, tokenizer)) 31 | {% endif %} 32 | {% if test %} 33 | test_dataset[col] = test_dataset[col].fillna("").apply(lambda x: tokenize(x, use_pos_list, use_word_stemming, tokenizer)) 34 | {% endif %} 35 | 36 |
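# A minimal sketch of what the tokenize() helper rendered from the template above does for
# a single sentence, assuming mecab-python3 and ipadic are installed; the helper name, the
# sample sentence, and the expected output are illustrative only (actual tokens depend on
# the dictionary version):
import MeCab
import ipadic

tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

def tokenize_nouns_only(text: str) -> str:
    # Keep only terms whose top-level POS is "名詞" (noun), mirroring
    # use_pos_list=["名詞"], with stemming disabled for brevity.
    node = tagger.parseToNode(text)
    terms = []
    while node:
        pos = node.feature.split(",")[0]
        if pos == "名詞":
            terms.append(node.surface)
        node = node.next
    return " ".join(terms)

print(tokenize_nouns_only("私は東京で寿司を食べた"))  # e.g. "私 東京 寿司"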
-------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/handle_mixed_typed_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # HANDLE MIXED TYPE 2 | import numpy as np 3 | cols_numeric_and_string = {{ cols_numeric_and_string}} 4 | for col in cols_numeric_and_string: 5 | {% if training %} 6 | train_dataset[col + '__str'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), train_dataset[col], np.nan) 7 | train_dataset[col + '__str'] = np.where(train_dataset[col + '__str'].notnull(), train_dataset[col + '__str'].astype(str), np.nan) 8 | train_dataset[col + '__num'] = np.where(pd.to_numeric(train_dataset[col], errors='coerce').isnull(), np.nan, train_dataset[col]).astype(float) 9 | train_dataset = train_dataset.drop(col, axis=1) 10 | {% endif %} 11 | {% if test %} 12 | test_dataset[col + '__str'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), test_dataset[col], np.nan) 13 | test_dataset[col + '__str'] = np.where(test_dataset[col + '__str'].notnull(), test_dataset[col + '__str'].astype(str), np.nan) 14 | test_dataset[col + '__num'] = np.where(pd.to_numeric(test_dataset[col], errors='coerce').isnull(), np.nan, test_dataset[col]).astype(float) 15 | test_dataset = test_dataset.drop(col, axis=1) 16 | {% endif %} 17 | -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/none_has_columns.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | {% if training %} 3 | train_dataset = train_dataset.replace([None], np.nan) 4 | {% endif %} 5 | {% if test %} 6 | test_dataset = test_dataset.replace([None], np.nan) 7 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/preprocess/default/templates/rename_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # Remove special symbols that interfere with visualization and model training 2 | import re 3 | cols_has_symbols = {{ cols_has_symbols }} 4 | inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+") 5 | {% if training %} 6 | train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) 7 | {% endif %} 8 | {% if test %} 9 | test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col) 10 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/ps_macros.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
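# Canonical label strings ("macros") for preprocessing components and meta-features;
# search_space.py (above) groups the concrete labels mined from notebooks
# (pp_component_groups.py) under these macros.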
14 | 15 | 16 | FILL = "PREPROCESS:MissingValues:fillna:pandas" 17 | IN_PLACE_CONVERT = "PREPROCESS:Category:LabelEncoder:sklearn" 18 | ONE_HOT = "PREPROCESS:Category:get_dummies:pandas" 19 | VECT = "PREPROCESS:Text:TfidfVectorizer:sklearn" 20 | MISSING = "PREPROCESS:MissingValues:all" 21 | CATG = "PREPROCESS:Category:all" 22 | SCALING = "PREPROCESS:Scaling:STANDARD:sklearn" 23 | DATE = "PREPROCESS:GenerateColumn:DATE:pandas" 24 | LEMMITIZE = "PREPROCESS:TextProcessing:Processing:custom" 25 | BALANCING = "PREPROCESS:Balancing:SMOTE:imblearn" 26 | LOG = "PREPROCESS:Scaling:log:custom" 27 | 28 | # Revised meta-features 29 | 30 | CATG_PRESENCE = "feature:str_category_presence" 31 | TEXT_PRESENCE = "feature:str_text_presence" 32 | BINARY_CATG_PRESENCE = "feature:str_category_binary_presence" 33 | SMALL_CATG_PRESENCE = "feature:str_category_small_presence" 34 | LARGE_CATG_PRESENCE = "feature:str_category_large_presence" 35 | DATE_PRESENCE = "feature:str_date_presence" 36 | STR_OTHER = "feature:str_other" 37 | 38 | MISSING_PRESENCE = "feature:missing_values_presence" 39 | DATE_PRESENCE = "feature:str_date_presence" 40 | 41 | NORMALIZED_MEAN = "feature:max_normalized_mean" 42 | NORMALIZED_STD_DEV = "feature:max_normalized_stddev" 43 | NORMALIZED_VARIATION_ACROSS_COLUMNS = "feature:normalized_variation_across_columns" 44 | IMBALANCE = "feature:target_imbalance_score" 45 | MAX_SKEW = "feature:max_skewness" 46 | 47 | 48 | TASK_CLASSIFICATION = "classification" 49 | TASK_REGRESSION = "regression" 50 | -------------------------------------------------------------------------------- /sapientml_core/seeding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /sapientml_core/templates/explainability_templates/model_explanation.py.jinja: -------------------------------------------------------------------------------- 1 | # Component: {{ target_component_name }} 2 | # Efficient Cause: {{ target_component_name }} is required in this pipeline since the dataset has {{ relevant_meta_feature_list }}. -------------------------------------------------------------------------------- /sapientml_core/templates/explainability_templates/preprocessing_explanation.py.jinja: -------------------------------------------------------------------------------- 1 | # Component: {{ target_component_name }} 2 | # Efficient Cause: {{ target_component_name }} is required in this pipeline since the dataset has {{ relevant_meta_feature_list }}. The relevant features are: {{ relevant_column_list }}. 
3 | # Purpose: {{ api_description }} 4 | # Form: 5 | # Input: {{ data_shape }} 6 | # Key hyperparameters used: {{ hyperparameters_description }} 7 | # Alternatives: Although {{ alternative_component_list }} can also be used for this dataset, {{ target_component_name }} is used because it has more {{ relevant_meta_feature_1 }} than {{ relevant_meta_feature_2 }}. 8 | # Order: {{ target_component_name }} should be applied {{ before_or_after }} {{ dependent_component_list }} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/classification_post_process.jinja: -------------------------------------------------------------------------------- 1 | # POST PROCESSING 2 | {% if pipeline.adaptation_metric.startswith("MAP_") %} 3 | y_pred_sorted_index = pd.DataFrame(np.argsort(-y_pred)) 4 | y_pred = y_pred_sorted_index.apply(lambda x: model.classes_[x]).to_numpy() 5 | {% else %} 6 | if np.shape(y_pred)[1] == 2: 7 | y_pred = y_pred[:, 1] 8 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/hyperparameter_tuning.py.jinja: -------------------------------------------------------------------------------- 1 | # HYPERPARAMETER OPTIMIZATION 2 | import optuna 3 | from {{import_library}} import {{ model_name }} 4 | 5 | 6 | # NEED CV: ex.) optuna.integration.OptunaSearchCV() 7 | class Objective(object): 8 | def __init__(self, feature_train, target_train, feature_test, target_test, __random_state): 9 | self.feature_train = feature_train 10 | self.target_train = target_train 11 | self.feature_test = feature_test 12 | self.target_test = target_test 13 | self.__random_state = __random_state 14 | def __call__(self, trial): 15 | def set_hyperparameters(trial): 16 | params = {} 17 | {{ params }} 18 | return params 19 | 20 | # SET DATA 21 | import numpy as np 22 | 23 | if isinstance(self.feature_train, pd.DataFrame): 24 | feature_train = self.feature_train 25 | elif isinstance(self.feature_train, np.ndarray): 26 | feature_train = pd.DataFrame(self.feature_train) 27 | else: 28 | feature_train = pd.DataFrame(self.feature_train.toarray()) 29 | 30 | if isinstance(self.target_train, pd.DataFrame): 31 | target_train = self.target_train 32 | elif isinstance(self.target_train, np.ndarray): 33 | target_train = pd.DataFrame(self.target_train) 34 | else: 35 | target_train = pd.DataFrame(self.target_train.toarray()) 36 | 37 | if isinstance(self.feature_test, pd.DataFrame): 38 | feature_test = self.feature_test 39 | elif isinstance(self.feature_test, np.ndarray): 40 | feature_test = pd.DataFrame(self.feature_test) 41 | else: 42 | feature_test = pd.DataFrame(self.feature_test.toarray()) 43 | 44 | if isinstance(self.target_test, pd.DataFrame): 45 | {% if 'inverse_target' in pipeline.pipeline_json %} 46 | target_test = self.target_test.copy() 47 | {% else %} 48 | target_test = self.target_test 49 | {% endif %} 50 | elif isinstance(self.target_test, np.ndarray): 51 | target_test = pd.DataFrame(self.target_test) 52 | else: 53 | target_test = pd.DataFrame(self.target_test.toarray()) 54 | 55 | # MODEL 56 | params = set_hyperparameters(trial) 57 | {% if flag_no_random_seed_model %} 58 | model = {{ model_name }}(**params) 59 | {% else %} 60 | model = {{ model_name }}(random_state=self.__random_state, **params) 61 | {% endif %} 62 | {% if is_multioutput_regression%} 63 | from sklearn.multioutput import MultiOutputRegressor 64 | 65 | model = MultiOutputRegressor(model) 66 | {% elif 
is_multioutput_classification %} 67 | from sklearn.multioutput import MultiOutputClassifier 68 | 69 | model = MultiOutputClassifier(model) 70 | {% endif %} 71 | {% set xgbclassifier = "XGBClassifier" %} 72 | {% if model_name == xgbclassifier %} 73 | from sklearn.preprocessing import LabelEncoder 74 | 75 | label_encoder = LabelEncoder() 76 | target_train = label_encoder.fit_transform(target_train) 77 | {% endif %} 78 | 79 | {% if pipeline.task.target_columns|length == 1 %} 80 | {% if model_name == xgbclassifier %} 81 | model.fit(feature_train, target_train.ravel()) 82 | {% else %} 83 | model.fit(feature_train, target_train.values.ravel()) 84 | {% endif %} 85 | {% else %} 86 | model.fit(feature_train, target_train) 87 | {% endif %} 88 | {% if flag_predict_proba == False %} 89 | y_pred = model.predict(feature_test) 90 | {% if model_name == xgbclassifier and not flag_predict_proba%} 91 | y_pred = label_encoder.inverse_transform(y_pred) 92 | {% endif %} 93 | {% elif flag_predict_proba == True %} 94 | y_pred = model.predict_proba(feature_test) 95 | {% filter indent(width=8, first=True) %} 96 | {{ binary_classification_snippet }} 97 | {% endfilter %} 98 | {% endif %} 99 | 100 | {% if 'inverse_target' in pipeline.pipeline_json %} 101 | {% filter indent(width=8, first=True) %} 102 | {{ pipeline.pipeline_json['inverse_target_hpo']['code'] }} 103 | {% endfilter %} 104 | {% endif %} 105 | 106 | {{ evaluation }} 107 | 108 | return score 109 | 110 | n_trials = {{ pipeline.config.hyperparameter_tuning_n_trials }} 111 | timeout = {{ timeout }} 112 | random_state = {{ pipeline.config.hyperparameter_tuning_random_state}} 113 | random_state_model = {{ pipeline.config.seed_for_model}} 114 | 115 | {% set maximize_metrics = [macros.Metric.AUC.value, macros.Metric.Accuracy.value, macros.Metric.F1.value, macros.Metric.R2.value, macros.Metric.Gini.value, macros.Metric.ROC_AUC.value] %} 116 | {% set minimize_metrics = [macros.Metric.RMSE.value, macros.Metric.RMSLE.value, macros.Metric.MAE.value, macros.Metric.LogLoss.value] %} 117 | 118 | {% if pipeline.adaptation_metric in maximize_metrics %} 119 | direction = 'maximize' 120 | {% elif pipeline.adaptation_metric in minimize_metrics %} 121 | direction = 'minimize' 122 | {% else %} 123 | direction = 'maximize' 124 | {% endif %} 125 | 126 | study = optuna.create_study(direction=direction, 127 | sampler=optuna.samplers.TPESampler(seed=random_state)) 128 | {{ enqueue_default_hyperparameters }} 129 | study.optimize(Objective(feature_train, target_train, feature_test, target_test, random_state_model), 130 | n_trials=n_trials, 131 | timeout=timeout) 132 | best_params = study.best_params 133 | 134 | print("best params:", best_params) 135 | print("RESULT: {{ pipeline.task.adaptation_metric }}: " + str(study.best_value)) -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/hyperparameters_default_value.py.jinja: -------------------------------------------------------------------------------- 1 | {% if model_name == 'RandomForestClassifier' %} 2 | default_hyperparameters = {'class_weight': None, 'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100, 'oob_score': False} 3 | {% elif model_name == 'RandomForestRegressor' %} 4 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 100, 'oob_score': False} 5 | {% elif model_name == 'ExtraTreesClassifier' %} 6 | default_hyperparameters = {'class_weight': None, 
'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100} 7 | {% elif model_name == 'ExtraTreesRegressor' %} 8 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': 1.0, 'min_samples_leaf': 1, 'n_estimators': 100} 9 | {% elif model_name == 'GradientBoostingClassifier' %} 10 | default_hyperparameters = {'criterion': 'friedman_mse', 'loss': 'log_loss', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0} 11 | {% elif model_name == 'GradientBoostingRegressor' %} 12 | default_hyperparameters = {'alpha': 0.9, 'criterion': 'friedman_mse', 'loss': 'squared_error', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 100, 'subsample': 1.0} 13 | {% elif model_name == 'AdaBoostClassifier' %} 14 | default_hyperparameters = {'algorithm': 'SAMME.R', 'n_estimators': 50} 15 | {% elif model_name == 'AdaBoostRegressor' %} 16 | default_hyperparameters = {'loss': 'linear', 'n_estimators': 50} 17 | {% elif model_name == 'DecisionTreeClassifier' %} 18 | default_hyperparameters = {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1} 19 | {% elif model_name == 'DecisionTreeRegressor' %} 20 | default_hyperparameters = {'criterion': 'squared_error', 'max_features': None, 'min_samples_leaf': 1} 21 | {% elif model_name == 'SVC' %} 22 | default_hyperparameters = {'C': 1.0, 'class_weight': None} 23 | {% elif model_name == 'SVR' %} 24 | default_hyperparameters = {'C': 1.0} 25 | {% elif model_name == 'LinearSVC' %} 26 | default_hyperparameters = {'C': 1.0, 'class_weight': None, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'penalty': 'l2'} 27 | {% elif model_name == 'LinearSVR' %} 28 | default_hyperparameters = {'C': 1.0, 'intercept_scaling': 1.0, 'loss': 'epsilon_insensitive'} 29 | {% elif model_name == 'LogisticRegression' %} 30 | default_hyperparameters = {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'} 31 | {% elif model_name == 'SGDClassifier' %} 32 | default_hyperparameters = {'alpha': 0.0001, 'class_weight': None, 'early_stopping': False, 'loss': 'hinge', 'penalty': 'l2'} 33 | {% elif model_name == 'SGDRegressor' %} 34 | default_hyperparameters = {'alpha': 0.0001, 'loss': 'squared_error', 'penalty': 'l2'} 35 | {% elif model_name == 'Lasso' %} 36 | default_hyperparameters = {'alpha': 1.0} 37 | {% elif model_name == 'MLPClassifier' %} 38 | default_hyperparameters = {'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'} 39 | {% elif model_name == 'MLPRegressor' %} 40 | default_hyperparameters = {'activation': 'relu', 'alpha': 0.0001, 'solver': 'adam'} 41 | {% elif model_name == 'LGBMClassifier' or model_name == 'LGBMRegressor' %} 42 | default_hyperparameters = {'class_weight': None, 'colsample_bytree': 1.0, 'min_child_samples': 20, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1.0, 'subsample_freq': 0} 43 | {% elif model_name == 'XGBClassifier' %} 44 | default_hyperparameters = {'colsample_bytree': 1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1} 45 | {% elif model_name == 'XGBRegressor' %} 46 | default_hyperparameters = {'colsample_bytree': 1, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 1} 47 | {% elif model_name == 'CatBoostClassifier' or model_name == 'CatBoostRegressor' %} 48 | default_hyperparameters = {'boosting_type': 'Plain', 'depth': 6, 'bootstrap_type': 'MVS', 'silent': True} 49 
| {% else %} 50 | default_hyperparameters = {} 51 | {% endif %} 52 | study.enqueue_trial(default_hyperparameters) -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from {{import_library}} import {{ model_name }} 3 | 4 | {% if "CatBoost" in model_name %} 5 | {% set silent="silent=True, " %} 6 | {% else %} 7 | {% set silent="" %} 8 | {% endif %} 9 | {% if model_arg == "HPO_noRandomSeed" %} 10 | model = {{ model_name }}(**best_params) 11 | {% elif model_arg == "HPO_RandomSeed" %} 12 | random_state_model = {{ pipeline.config.seed_for_model}} 13 | model = {{ model_name }}(random_state=random_state_model, **best_params) 14 | {% elif model_arg == "noHPO_noRandomSeed" %} 15 | model = {{ model_name }}({{ silent }}{{ params }}) 16 | {% elif model_arg == "noHPO_RandomSeed" %} 17 | random_state_model = {{ pipeline.config.seed_for_model}} 18 | model = {{ model_name }}({{ silent }}random_state=random_state_model, {{ params }}) 19 | {% endif %} 20 | 21 | {% if is_multioutput_regression%} 22 | from sklearn.multioutput import MultiOutputRegressor 23 | 24 | model = MultiOutputRegressor(model) 25 | {% elif is_multioutput_classification %} 26 | from sklearn.multioutput import MultiOutputClassifier 27 | 28 | model = MultiOutputClassifier(model) 29 | {% endif %} 30 | {% set xgbclassifier = "XGBClassifier" %} 31 | {% if is_multioutput_classification %} 32 | from sklearn.preprocessing import LabelEncoder 33 | label_encoders = {} 34 | for i, column in enumerate(target_train.columns): 35 | le = LabelEncoder() 36 | target_train[column] = le.fit_transform(target_train[column]) 37 | label_encoders[column] = le 38 | {% elif model_name == xgbclassifier %} 39 | from sklearn.preprocessing import LabelEncoder 40 | 41 | label_encoder = LabelEncoder() 42 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS) 43 | {% endif %} 44 | {% if pipeline.task.target_columns|length == 1 %} 45 | model.fit(feature_train, target_train.values.ravel()) 46 | {% else %} 47 | model.fit(feature_train, target_train) 48 | {% endif %} 49 | y_pred = model.predict(feature_test) 50 | {% if flag_predict_proba and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %} 51 | y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1) 52 | {% endif %} 53 | {% if is_multioutput_classification %} 54 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) 55 | for column in TARGET_COLUMNS: 56 | y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int)) 57 | y_pred = y_pred_df 58 | {% elif model_name == xgbclassifier and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %} 59 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1) 60 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | {% set xgbclassifier = "XGBClassifier" %} 4 | 5 | with open('model.pkl', 'rb') as f: 6 | model = pickle.load(f) 7 | 8 | {% if 
(pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT) %} 9 | y_pred = model.predict(feature_test) 10 | {% endif %} 11 | {% if pipeline.adaptation_metric and flag_predict_proba %} 12 | y_prob = model.predict_proba(feature_test) 13 | {% endif %} 14 | {% if model_name == xgbclassifier or is_multioutput_classification %} 15 | with open('target_LabelEncoder.pkl', 'rb') as f: 16 | label_encoder = pickle.load(f) 17 | {% endif %} 18 | {% if is_multioutput_classification %} 19 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) 20 | for column in TARGET_COLUMNS: 21 | y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int)) 22 | y_pred = y_pred_df 23 | {% elif model_name == xgbclassifier and ((pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT)) %} 24 | y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1) 25 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model_test.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from {{import_library}} import {{ model_name }} 3 | 4 | {% if "CatBoost" in model_name %} 5 | {% set silent="silent=True, " %} 6 | {% else %} 7 | {% set silent="" %} 8 | {% endif %} 9 | {% if model_arg == "HPO_noRandomSeed" %} 10 | model = {{ model_name }}(**best_params) 11 | {% elif model_arg == "HPO_RandomSeed" %} 12 | random_state_model = {{ pipeline.config.seed_for_model}} 13 | model = {{ model_name }}(random_state=random_state_model, **best_params) 14 | {% elif model_arg == "noHPO_noRandomSeed" %} 15 | model = {{ model_name }}({{ silent }}{{ params }}) 16 | {% elif model_arg == "noHPO_RandomSeed" %} 17 | random_state_model = {{ pipeline.config.seed_for_model}} 18 | model = {{ model_name }}({{ silent }}random_state=random_state_model, {{ params }}) 19 | {% endif %} 20 | 21 | {% if is_multioutput_regression%} 22 | from sklearn.multioutput import MultiOutputRegressor 23 | 24 | model = MultiOutputRegressor(model) 25 | {% elif is_multioutput_classification %} 26 | from sklearn.multioutput import MultiOutputClassifier 27 | 28 | model = MultiOutputClassifier(model) 29 | {% endif %} 30 | {% set xgbclassifier = "XGBClassifier" %} 31 | {% if is_multioutput_classification %} 32 | from sklearn.preprocessing import LabelEncoder 33 | label_encoders = {} 34 | for i, column in enumerate(target_train.columns): 35 | le = LabelEncoder() 36 | target_train[column] = le.fit_transform(target_train[column]) 37 | label_encoders[column] = le 38 | {% elif model_name == xgbclassifier %} 39 | from sklearn.preprocessing import LabelEncoder 40 | 41 | label_encoder = LabelEncoder() 42 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS) 43 | {% endif %} 44 | {% if pipeline.task.target_columns|length == 1 %} 45 | model.fit(feature_train, target_train.values.ravel()) 46 | {% else %} 47 | model.fit(feature_train, target_train) 48 | {% endif %} 49 | y_pred = model.predict(feature_test) 50 | 51 | {% if is_multioutput_classification %} 52 | y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS) 53 | for column in TARGET_COLUMNS: 54 | y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int)) 55 | y_pred = y_pred_df 56 | {% elif model_name == xgbclassifier %} 57 | y_pred = 
label_encoder.inverse_transform(y_pred).reshape(-1, 1) 58 | {% endif %} 59 | 60 | {% if pipeline.task.task_type == 'classification' %} 61 | y_prob = model.predict_proba(feature_test) 62 | 63 | # POST PROCESSING 64 | {% if pipeline.adaptation_metric.startswith("MAP_") %} 65 | y_prob_sorted_index = pd.DataFrame(np.argsort(-y_prob)) 66 | y_prob_map_k = y_prob_sorted_index.apply(lambda x: model.classes_[x]).to_numpy() 67 | {% endif %} 68 | 69 | {% if not is_multioutput_classification %} 70 | if np.shape(y_prob)[1] == 2: 71 | y_prob = y_prob[:, 1] 72 | {% endif %} 73 | 74 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/model_templates/model_train.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from {{import_library}} import {{ model_name }} 3 | 4 | {% if model_arg == "HPO_noRandomSeed" %} 5 | model = {{ model_name }}(**best_params) 6 | {% elif model_arg == "HPO_RandomSeed" %} 7 | random_state_model = {{ pipeline.config.seed_for_model}} 8 | model = {{ model_name }}(random_state=random_state_model, **best_params) 9 | {% elif model_arg == "noHPO_noRandomSeed" %} 10 | model = {{ model_name }}({{ params }}) 11 | {% elif model_arg == "noHPO_RandomSeed" %} 12 | random_state_model = {{ pipeline.config.seed_for_model}} 13 | model = {{ model_name }}(random_state=random_state_model, {{ params }}) 14 | {% endif %} 15 | 16 | {% if is_multioutput_regression%} 17 | from sklearn.multioutput import MultiOutputRegressor 18 | 19 | model = MultiOutputRegressor(model) 20 | {% elif is_multioutput_classification %} 21 | from sklearn.multioutput import MultiOutputClassifier 22 | 23 | model = MultiOutputClassifier(model) 24 | {% endif %} 25 | {% set xgbclassifier = "XGBClassifier" %} 26 | {% if is_multioutput_classification %} 27 | from sklearn.preprocessing import LabelEncoder 28 | label_encoders = {} 29 | for i, column in enumerate(target_train.columns): 30 | le = LabelEncoder() 31 | target_train[column] = le.fit_transform(target_train[column]) 32 | label_encoders[column] = le 33 | with open('target_LabelEncoder.pkl', 'wb') as f: 34 | pickle.dump(label_encoders, f) 35 | {% elif model_name == xgbclassifier %} 36 | from sklearn.preprocessing import LabelEncoder 37 | 38 | label_encoder = LabelEncoder() 39 | target_train = pd.DataFrame(label_encoder.fit_transform(target_train), columns=TARGET_COLUMNS) 40 | with open('target_LabelEncoder.pkl', 'wb') as f: 41 | pickle.dump(label_encoder, f) 42 | 43 | {% endif %} 44 | {% if pipeline.task.target_columns|length == 1 %} 45 | model.fit(feature_train, target_train.values.ravel()) 46 | {% else %} 47 | model.fit(feature_train, target_train) 48 | {% endif %} 49 | with open('model.pkl', 'wb') as f: 50 | pickle.dump(model, f) -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/confusion_matrix.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import ConfusionMatrixDisplay 2 | {% if pipeline.task.target_columns|length == 1 %} 3 | ConfusionMatrixDisplay.from_predictions(target_test, y_pred) 4 | {% elif is_multioutput_classification %} 5 | for i, column in enumerate(y_pred.columns): 6 | disp = ConfusionMatrixDisplay.from_predictions(target_test[column], y_pred[column].values) 7 | disp.ax_.set_title(column) 8 | {% else %} 9 | for i, column in enumerate(target_test.columns): 10 | disp = 
ConfusionMatrixDisplay.from_predictions(target_test[column], y_pred[:, i]) 11 | disp.ax_.set_title(column) 12 | {% endif %} 13 | 14 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/drop_columns.py.jinja: -------------------------------------------------------------------------------- 1 | # DISCARD IRRELEVANT COLUMNS 2 | irrelevant_columns = {{ irrelevant_columns }} 3 | {% if train %} 4 | train_dataset = train_dataset.drop(irrelevant_columns, axis=1, errors="ignore") 5 | {% endif %} 6 | {% if test %} 7 | test_dataset = test_dataset.drop(irrelevant_columns, axis=1, errors="ignore") 8 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/evaluation.py.jinja: -------------------------------------------------------------------------------- 1 | {% if pipeline.adaptation_metric == macros.Metric.AUC.value %} 2 | from sklearn.metrics import roc_auc_score 3 | {% if pipeline.task.is_multiclass == True %} 4 | auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 5 | {% else %} 6 | auc = roc_auc_score(target_test, y_pred) 7 | {% endif %} 8 | print('RESULT: AUC Score: ' + str(auc)) 9 | {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (not pipeline.is_multi_class_multi_targets) %} 10 | from sklearn.metrics import accuracy_score 11 | 12 | accuracy = accuracy_score(target_test, y_pred) 13 | print('RESULT: Accuracy: ' + str(accuracy)) 14 | {% elif (pipeline.adaptation_metric == macros.Metric.Accuracy.value) and (pipeline.is_multi_class_multi_targets) %} 15 | from sklearn.metrics import accuracy_score 16 | 17 | __accs = [] 18 | for i, col in enumerate(target_test.columns): 19 | one_acc = accuracy_score(target_test[col], y_pred[col]) 20 | __accs.append(one_acc) 21 | print(f"RESULT: Accuracy : {str(sum(__accs)/len(__accs))}") 22 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value and not is_multioutput_classification%} 23 | from sklearn import metrics 24 | 25 | f1 = metrics.f1_score(target_test, y_pred, average='macro') 26 | print('RESULT: F1 Score: ' + str(f1)) 27 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value and is_multioutput_classification%} 28 | from sklearn import metrics 29 | 30 | __f1s = [] 31 | for i, col in enumerate(target_test.columns): 32 | one_f1 = metrics.f1_score(target_test[col], y_pred[col], average='macro') 33 | __f1s.append(one_f1) 34 | print(f"RESULT: F1 Score : {str(sum(__f1s)/len(__f1s))}") 35 | {% elif pipeline.adaptation_metric == macros.Metric.R2.value %} 36 | from sklearn import metrics 37 | 38 | r2 = metrics.r2_score(target_test, y_pred) 39 | print('RESULT: R2 Score:', str(r2)) 40 | {% elif pipeline.adaptation_metric == macros.Metric.RMSE.value %} 41 | from sklearn.metrics import mean_squared_error 42 | 43 | rmse = mean_squared_error(target_test, y_pred, squared=False) 44 | print('RESULT: RMSE:', str(rmse)) 45 | {% elif pipeline.adaptation_metric == macros.Metric.RMSLE.value %} 46 | import numpy as np 47 | from sklearn.metrics import mean_squared_log_error 48 | 49 | target_test = np.clip(target_test, 0, None) 50 | y_pred = np.clip(y_pred, 0, None) 51 | rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred)) 52 | print('RESULT: RMSLE:', str(rmsle)) 53 | {% elif pipeline.adaptation_metric == macros.Metric.Gini.value %} 54 | from sklearn.metrics import roc_auc_score 55 | {% if pipeline.task.is_multiclass == True %} 56 | gini = 2 * 
roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1 57 | {% else %} 58 | gini = 2 * roc_auc_score(target_test, y_pred) - 1 59 | {% endif %} 60 | print('RESULT: Gini: ' + str(gini)) 61 | {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %} 62 | from sklearn.metrics import mean_absolute_error 63 | 64 | mae = mean_absolute_error(target_test, y_pred) 65 | print('RESULT: MAE:', str(mae)) 66 | {% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %} 67 | from sklearn.metrics import log_loss 68 | 69 | log_loss_score = log_loss(target_test, y_pred) 70 | print('RESULT: Log Loss:', str(log_loss_score)) 71 | {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %} 72 | from sklearn.metrics import roc_auc_score 73 | {% if pipeline.task.is_multiclass == True %} 74 | __roc_auc = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 75 | {% else %} 76 | __roc_auc = roc_auc_score(target_test, y_pred) 77 | {% endif %} 78 | print('RESULT: ROC AUC:', str(__roc_auc)) 79 | {% elif pipeline.adaptation_metric == macros.Metric.MCC.value %} 80 | from sklearn.metrics import matthews_corrcoef 81 | 82 | mcc = matthews_corrcoef(target_test, y_pred) 83 | print('RESULT: MCC:', str(mcc)) 84 | {% elif pipeline.adaptation_metric.startswith("MAP_") %} 85 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 86 | def apk(actual, predicted, k): 87 | if len(predicted)>k: 88 | predicted = predicted[:k] 89 | 90 | score = 0.0 91 | num_hits = 0.0 92 | 93 | for i,p in enumerate(predicted): 94 | if p in actual and p not in predicted[:i]: 95 | num_hits += 1.0 96 | score += num_hits / (i+1.0) 97 | 98 | return score / min(len(actual), k) 99 | 100 | def mapk(actual, predicted, k): 101 | """ Computes the mean average precision at k. 102 | 103 | Args: 104 | actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted 105 | predicted (list[list[str]] or ndarray): A list of lists of predicted elements 106 | (In each list, arrange in the order you predicted.) 
107 | k (int): The maximum number of predicted elements 108 | 109 | Returns: 110 | double: The mean average precision at k over the input lists 111 | """ 112 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 113 | 114 | map_k = mapk(target_test.to_numpy(), y_pred, k={{ k }}) 115 | print('RESULT: MAP@K: ' + str(map_k)) 116 | {% elif pipeline.adaptation_metric == macros.Metric.QWK.value %} 117 | from sklearn.metrics import cohen_kappa_score 118 | 119 | qwk = cohen_kappa_score(target_test, y_pred, weights='quadratic') 120 | print('RESULT: QWK:', str(qwk)) 121 | {% elif pipeline.adaptation_metric == macros.Metric.MAPE.value %} 122 | from sklearn.metrics import mean_absolute_percentage_error 123 | 124 | mape = mean_absolute_percentage_error(target_test, y_pred) 125 | print('RESULT: MAPE:', str(mape)) 126 | 127 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %} 128 | from sklearn import metrics 129 | 130 | r2 = metrics.r2_score(target_test, y_pred) 131 | print('RESULT: R2 Score:', str(r2)) 132 | {% else %} 133 | from sklearn import metrics 134 | 135 | f1 = metrics.f1_score(target_test, y_pred, average='macro') 136 | print('RESULT: F1 Score: ' + str(f1)) 137 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/evaluation_test.py.jinja: -------------------------------------------------------------------------------- 1 | {% if pipeline.task.task_type == macros.TASK_CLASSIFICATION %} 2 | 3 | ## Metric: F1 4 | from sklearn.metrics import f1_score 5 | {% if is_multioutput_classification%} 6 | f1_scores = [] 7 | for i, column in enumerate(target_test.columns): 8 | f1_score_value = f1_score(target_test[column], y_pred[column], average='macro') 9 | f1_scores.append(f1_score_value) 10 | average_f1_score = np.mean(f1_scores) 11 | print('RESULT: Average F1 Score:', str(average_f1_score)) 12 | {% else %} 13 | f1 = f1_score(target_test, y_pred, average='macro') 14 | print('RESULT: F1 Score: ' + str(f1)) 15 | {% endif%} 16 | 17 | ## Metric: Accuracy 18 | from sklearn.metrics import accuracy_score 19 | {% if not pipeline.is_multi_class_multi_targets %} 20 | accuracy = accuracy_score(target_test, y_pred) 21 | print('RESULT: Accuracy: ' + str(accuracy)) 22 | {% elif pipeline.is_multi_class_multi_targets %} 23 | __accs = [] 24 | for i, col in enumerate(target_test.columns): 25 | one_acc = accuracy_score(target_test[col], y_pred[col]) 26 | __accs.append(one_acc) 27 | print(f"RESULT: Average Accuracy : {str(sum(__accs)/len(__accs))}") 28 | {% endif %} 29 | 30 | ## Metric: AUC and Gini 31 | from sklearn.metrics import roc_auc_score 32 | {% if is_multioutput_classification %} 33 | auc_scores = [] 34 | gini_scores = [] 35 | for i, column in enumerate(target_test.columns): 36 | if y_prob[i].ndim == 2 and y_prob[i].shape[1] == 2: 37 | auc_score = roc_auc_score(target_test[column], y_prob[i][:, 1]) 38 | elif y_prob[i].ndim == 2: 39 | auc_score = roc_auc_score(target_test[column], y_prob[i], multi_class="ovr") 40 | gini_score = 2 * auc_score - 1 41 | auc_scores.append(auc_score) 42 | gini_scores.append(gini_score) 43 | auc = np.mean(auc_scores) 44 | gini = np.mean(gini_scores) 45 | print('RESULT: Average AUC Score:', str(auc)) 46 | print('RESULT: Average Gini Score:', str(gini)) 47 | {% else %} 48 | {% if pipeline.task.is_multiclass == True %} 49 | auc = roc_auc_score(target_test.values.ravel(), y_prob, multi_class="ovr") 50 | {% else %} 51 | auc = roc_auc_score(target_test, y_prob) 52 | {% endif %} 53 | gini = 2 
* auc - 1 54 | print('RESULT: AUC Score: ' + str(auc)) 55 | print('RESULT: Gini: ' + str(gini)) 56 | {% endif %} 57 | 58 | ## Metric: Log Loss 59 | from sklearn.metrics import log_loss 60 | {% if is_multioutput_classification %} 61 | log_loss_scores = [] 62 | for i, column in enumerate(target_test.columns): 63 | loss = log_loss(target_test[column], y_prob[i]) 64 | log_loss_scores.append(loss) 65 | avg_log_loss = np.mean(log_loss_scores) 66 | print('RESULT: Average Log Loss:', str(avg_log_loss)) 67 | {% else %} 68 | log_loss_score = log_loss(target_test, y_prob) 69 | print('RESULT: Log Loss:', str(log_loss_score)) 70 | {% endif %} 71 | 72 | {% if not is_multioutput_classification %} 73 | 74 | ## Metric: MCC 75 | from sklearn.metrics import matthews_corrcoef 76 | 77 | mcc = matthews_corrcoef(target_test, y_pred) 78 | print('RESULT: MCC:', str(mcc)) 79 | 80 | ## Metric: QWK 81 | from sklearn.metrics import cohen_kappa_score 82 | 83 | qwk = cohen_kappa_score(target_test, y_pred, weights='quadratic') 84 | print('RESULT: QWK:', str(qwk)) 85 | 86 | {% if pipeline.adaptation_metric.startswith("MAP_") %} 87 | ## Metric: MAP@K 88 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 89 | def apk(actual, predicted, k): 90 | if len(predicted)>k: 91 | predicted = predicted[:k] 92 | 93 | score = 0.0 94 | num_hits = 0.0 95 | 96 | for i,p in enumerate(predicted): 97 | if p in actual and p not in predicted[:i]: 98 | num_hits += 1.0 99 | score += num_hits / (i+1.0) 100 | 101 | return score / min(len(actual), k) 102 | 103 | def mapk(actual, predicted, k): 104 | """ Computes the mean average precision at k. 105 | 106 | Args: 107 | actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted 108 | predicted (list[list[str]] or ndarray): A list of lists of predicted elements 109 | (In each list, arrange in the order you predicted.) 
110 | k (int): The maximum number of predicted elements 111 | 112 | Returns: 113 | double: The mean average precision at k over the input lists 114 | """ 115 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 116 | 117 | map_k = mapk(target_test.to_numpy(), y_prob_map_k, k={{ k }}) 118 | print('RESULT: MAP@K: ' + str(map_k)) 119 | 120 | {% endif %} 121 | {% endif %} 122 | 123 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %} 124 | 125 | ## Metric: R2 126 | from sklearn import metrics 127 | 128 | r2 = metrics.r2_score(target_test, y_pred) 129 | print('RESULT: R2 Score:', str(r2)) 130 | 131 | ## Metric: RMSE 132 | from sklearn.metrics import mean_squared_error 133 | 134 | rmse = mean_squared_error(target_test, y_pred, squared=False) 135 | print('RESULT: RMSE:', str(rmse)) 136 | 137 | ## Metric: RMSLE 138 | import numpy as np 139 | from sklearn.metrics import mean_squared_log_error 140 | 141 | target_test = np.clip(target_test, 0, None) 142 | y_pred = np.clip(y_pred, 0, None) 143 | rmsle = np.sqrt(mean_squared_log_error(target_test, y_pred)) 144 | print('RESULT: RMSLE:', str(rmsle)) 145 | 146 | ## Metric: MAE 147 | from sklearn.metrics import mean_absolute_error 148 | 149 | mae = mean_absolute_error(target_test, y_pred) 150 | print('RESULT: MAE:', str(mae)) 151 | 152 | ## Metric: MAPE 153 | from sklearn.metrics import mean_absolute_percentage_error 154 | 155 | mape = mean_absolute_percentage_error(target_test, y_pred) 156 | print('RESULT: MAPE:', str(mape)) 157 | 158 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/hyperparameter_tuning_evaluation.py.jinja: -------------------------------------------------------------------------------- 1 | {% if pipeline.adaptation_metric == macros.Metric.AUC.value %} 2 | from sklearn.metrics import roc_auc_score 3 | {% if pipeline.task.is_multiclass == True %} 4 | score = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 5 | {% else %} 6 | score = roc_auc_score(target_test, y_pred) 7 | {% endif %} 8 | {% elif pipeline.adaptation_metric == macros.Metric.Accuracy.value %} 9 | from sklearn.metrics import accuracy_score 10 | score = accuracy_score(target_test, y_pred) 11 | {% elif pipeline.adaptation_metric == macros.Metric.F1.value %} 12 | from sklearn import metrics 13 | score = metrics.f1_score(target_test, y_pred, average='macro') 14 | {% elif pipeline.adaptation_metric == macros.Metric.R2.value %} 15 | from sklearn import metrics 16 | score = metrics.r2_score(target_test, y_pred) 17 | {% elif pipeline.adaptation_metric == macros.Metric.RMSE.value %} 18 | from sklearn.metrics import mean_squared_error 19 | score = mean_squared_error(target_test, y_pred, squared=False) 20 | {% elif pipeline.adaptation_metric == macros.Metric.RMSLE.value %} 21 | import numpy as np 22 | from sklearn.metrics import mean_squared_log_error 23 | target_test = np.clip(target_test, 0, None) 24 | y_pred = np.clip(y_pred, 0, None) 25 | score = np.sqrt(mean_squared_log_error(target_test, y_pred)) 26 | {% elif pipeline.adaptation_metric == macros.Metric.Gini.value %} 27 | from sklearn.metrics import roc_auc_score 28 | {% if pipeline.task.is_multiclass == True %} 29 | score = 2 * roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") - 1 30 | {% else %} 31 | score = 2 * roc_auc_score(target_test, y_pred) - 1 32 | {% endif %} 33 | {% elif pipeline.adaptation_metric == macros.Metric.MAE.value %} 34 | from sklearn.metrics import 
mean_absolute_error 35 | score = mean_absolute_error(target_test, y_pred) 36 | {% elif pipeline.adaptation_metric == macros.Metric.LogLoss.value %} 37 | from sklearn.metrics import log_loss 38 | score = log_loss(target_test, y_pred) 39 | {% elif pipeline.adaptation_metric == macros.Metric.ROC_AUC.value %} 40 | from sklearn.metrics import roc_auc_score 41 | {% if pipeline.task.is_multiclass == True %} 42 | score = roc_auc_score(target_test.values.ravel(), y_pred, multi_class="ovr") 43 | {% else %} 44 | score = roc_auc_score(target_test, y_pred) 45 | {% endif %} 46 | {% elif pipeline.adaptation_metric.startswith("MAP_") %} 47 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 48 | def apk(actual, predicted, k): 49 | if len(predicted)>k: 50 | predicted = predicted[:k] 51 | 52 | score = 0.0 53 | num_hits = 0.0 54 | 55 | for i,p in enumerate(predicted): 56 | if p in actual and p not in predicted[:i]: 57 | num_hits += 1.0 58 | score += num_hits / (i+1.0) 59 | 60 | return score / min(len(actual), k) 61 | 62 | def mapk(actual, predicted, k): 63 | """ Computes the mean average precision at k. 64 | 65 | Args: 66 | actual (list[list[str]] or ndarray): A list of lists of elements that are to be predicted 67 | predicted (list[list[str]] or ndarray): A list of lists of predicted elements 68 | (In each list, arrange in the order you predicted.) 69 | k (int): The maximum number of predicted elements 70 | 71 | Returns: 72 | double: The mean average precision at k over the input lists 73 | """ 74 | return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]) 75 | 76 | score = mapk(target_test.to_numpy(), y_pred, k={{ k }}) 77 | {% elif pipeline.adaptation_metric == macros.Metric.MAPE.value %} 78 | from sklearn.metrics import mean_absolute_percentage_error 79 | score = mean_absolute_percentage_error(target_test, y_pred) 80 | {% elif pipeline.task.task_type == macros.TASK_REGRESSION %} 81 | from sklearn import metrics 82 | score = metrics.r2_score(target_test, y_pred) 83 | {% else %} 84 | from sklearn import metrics 85 | score = metrics.f1_score(target_test, y_pred, average='macro') 86 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/inverse_target.py.jinja: -------------------------------------------------------------------------------- 1 | # INVERSE TARGET 2 | import numpy as np 3 | 4 | COLS_TO_BE_INVERSED = list(set(NUMERIC_COLS_TO_SCALE) & set(TARGET_COLUMNS)) 5 | {% if flag_hyperparameter_tuning %} 6 | target_test[COLS_TO_BE_INVERSED] = np.expm1(target_test[COLS_TO_BE_INVERSED]) 7 | y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=feature_test.index) 8 | {% else %} 9 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 10 | target_test[COLS_TO_BE_INVERSED] = np.expm1(target_test[COLS_TO_BE_INVERSED]) 11 | y_pred = pd.DataFrame(data=y_pred, columns=TARGET_COLUMNS, index=test_dataset.index) 12 | {% endif %} 13 | y_pred[COLS_TO_BE_INVERSED] = np.expm1(y_pred[COLS_TO_BE_INVERSED]) 14 | y_pred = y_pred.to_numpy() 15 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/permutation_importance.py.jinja: -------------------------------------------------------------------------------- 1 | # PERMUTATION IMPORTANCE 2 | from sklearn.inspection import permutation_importance 3 | {% if pipeline.task.target_columns|length == 1 %} 4 | {% set TARGET_TRAIN = 'target_train[TARGET_COLUMNS[0]]' %} 5 | {% else %} 6 | {% set TARGET_TRAIN 
= 'target_train' %} 7 | {% endif %} 8 | {% if pipeline.sparse_matrix %} 9 | if len(feature_train.columns) <= 100: 10 | perm = permutation_importance(model, feature_train.sparse.to_dense(), {{ TARGET_TRAIN }}, 11 | n_repeats=5, 12 | random_state=0) 13 | perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) 14 | perm_df.to_csv("./permutation_importance.csv", index=False) 15 | {% else %} 16 | if len(feature_train.columns) <= 100: 17 | perm = permutation_importance(model, feature_train, {{ TARGET_TRAIN }}, 18 | n_repeats=5, 19 | random_state=0) 20 | perm_df = pd.DataFrame({"feature": feature_train.columns, "importance": perm.importances_mean}) 21 | perm_df.to_csv("./permutation_importance.csv", index=False) 22 | {% endif %} 23 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/prediction_result.py.jinja: -------------------------------------------------------------------------------- 1 | # OUTPUT PREDICTION 2 | {% set xgbclassifier = "XGBClassifier" %} 3 | {% if pipeline.config.predict_option == macros.PRED_PROBABILITY and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 4 | prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index) 5 | {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 6 | prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) 7 | {% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 8 | prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) 9 | {% elif pipeline.config.predict_option is none and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 10 | prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index) 11 | {% elif pipeline.config.predict_option is none and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 12 | prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index) 13 | {% elif pipeline.config.predict_option is none and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%} 14 | prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index) 15 | {% elif pipeline.adaptation_metric.startswith("MAP_") %} 16 | {% set k = pipeline.adaptation_metric.split("_")[1] %} 17 | {% if y_prob_map_k is none %} 18 | prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index) 19 | {% elif is_multioutput_classification %} 20 | prediction = y_pred 21 | {% else %} 22 | prediction = pd.DataFrame(y_prob_map_k, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob_map_k.shape[1] 
+ 1)], index=feature_test.index) 23 | {% endif %} 24 | {% else %} 25 | prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index) 26 | {% endif %} 27 | prediction.to_csv("./prediction_result.csv") 28 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/preprocess_dataset.py.jinja: -------------------------------------------------------------------------------- 1 | # Export preprocessed dataset 2 | import time 3 | timestamp_str = time.strftime("%Y%m%d_%H%M%S") 4 | preprocess_dataset=pd.concat([pd.concat([feature_train, 5 | target_train], axis=1), 6 | pd.concat([feature_test, 7 | target_test], axis=1)]) 8 | preprocess_dataset.to_pickle(f"./preprocess_dataset_{timestamp_str}.pickle") 9 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/shap.py.jinja: -------------------------------------------------------------------------------- 1 | # Models are restricted because of execution time. 2 | {% set lgbmclassifier = "LGBMClassifier" %} 3 | models_for_shap = ['XGBClassifier', 'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor', 'GradientBoostingClassifier', 'GradientBoostingRegressor'] 4 | if model.__class__.__name__ in models_for_shap: 5 | import shap 6 | feature_shap = feature_train.sample(1000) if feature_train.shape[0] > 1000 else feature_train 7 | {% if model_name == lgbmclassifier %} 8 | explainer = shap.Explainer(model,feature_shap) 9 | {% else %} 10 | explainer = shap.Explainer(model) 11 | {% endif %} 12 | shap_values = explainer(feature_shap) 13 | 14 | # summarize the effects of all the features 15 | shap.plots.beeswarm(shap_values) 16 | 17 | #bar plots 18 | shap.plots.bar(shap_values) -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/target_separation_predict.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 4 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) 5 | target_test = test_dataset[TARGET_COLUMNS].copy() 6 | else: 7 | feature_test = test_dataset 8 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/target_separation_test.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) 4 | target_train = train_dataset[TARGET_COLUMNS].copy() 5 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 6 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) 7 | target_test = test_dataset[TARGET_COLUMNS].copy() 8 | else: 9 | feature_test = test_dataset 10 | -------------------------------------------------------------------------------- /sapientml_core/templates/other_templates/target_separation_train.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) 4 | target_train = train_dataset[TARGET_COLUMNS].copy() -------------------------------------------------------------------------------- 
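A minimal rendering sketch, illustrative only and not a file from this repository: it shows how placeholders such as {{ pipeline.task.target_columns }} in the target-separation templates above are filled. It assumes only the jinja2 package that these *.py.jinja files imply; the SimpleNamespace stand-in and the 'survived' column are made-up examples, whereas the real generator passes sapientml_core params objects.

# Illustrative sketch -- `pipeline` here is an ad-hoc stand-in, not the real params object.
from types import SimpleNamespace

from jinja2 import Template

TEMPLATE = (
    "# DETACH TARGET\n"
    "TARGET_COLUMNS = {{ pipeline.task.target_columns }}\n"
    "feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1)\n"
    "target_train = train_dataset[TARGET_COLUMNS].copy()\n"
)

pipeline = SimpleNamespace(task=SimpleNamespace(target_columns=["survived"]))
print(Template(TEMPLATE).render(pipeline=pipeline))
# The list's Python repr is substituted verbatim, so the generated script
# contains the literal line: TARGET_COLUMNS = ['survived']

This is why the templates can splice column lists and config values directly into the generated scripts as Python literals.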
/sapientml_core/templates/other_templates/target_separation_validation.py.jinja: -------------------------------------------------------------------------------- 1 | # DETACH TARGET 2 | TARGET_COLUMNS = {{ pipeline.task.target_columns }} 3 | feature_train = train_dataset.drop(TARGET_COLUMNS, axis=1) 4 | target_train = train_dataset[TARGET_COLUMNS].copy() 5 | feature_test = test_dataset.drop(TARGET_COLUMNS, axis=1) 6 | target_test = test_dataset[TARGET_COLUMNS].copy() 7 | 8 | -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | {% if 'discard_columns' in pipeline_json %} 4 | 5 | {{ pipeline_json['discard_columns']['code_predict'] }} 6 | {% endif %} 7 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 8 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 9 | {% for code in component['code_predict'] %} 10 | 11 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 12 | {{ code }} 13 | {% endfor %} 14 | {% endfor %} 15 | {% endif %} 16 | {% if 'target_separation' in pipeline_json %} 17 | 18 | {{ pipeline_json['target_separation']['code_predict'] }} 19 | {% endif %} 20 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 21 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %} 22 | {% for code in component['code_predict'] %} 23 | 24 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 25 | {{ code }} 26 | {% endfor %} 27 | {% endfor %} 28 | {% endif %} 29 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 30 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 31 | {% for code in component['code_predict'] %} 32 | 33 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 34 | {{ code }} 35 | {% endfor %} 36 | {% endfor %} 37 | {% endif %} 38 | {% if 'model' in pipeline_json %} 39 | 40 | # MODEL 41 | {{ pipeline_json['model']['code_predict'] }} 42 | {% endif %} 43 | {% if 'inverse_target' in pipeline_json %} 44 | 45 | {{ pipeline_json['inverse_target']['code'] }} 46 | {% endif %} 47 | {% if 'evaluation' in pipeline_json %} 48 | 49 | #EVALUATION 50 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 51 | {% filter indent(width=4, first=True) %} 52 | {{ pipeline_json['evaluation']['code_predict'] }} 53 | {% endfilter %} 54 | {% endif %} 55 | {% if 'output_prediction' in pipeline_json %} 56 | 57 | {{ pipeline_json['output_prediction']['code'] }} 58 | {% endif %} 59 | -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_test.py.jinja: -------------------------------------------------------------------------------- 1 | {% if 'discard_columns' in pipeline_json %} 2 | 3 | {{ pipeline_json['discard_columns']['code'] }} 4 | {% endif %} 5 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 6 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 7 | {% for code in component['code'] %} 8 | 9 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 10 | {{ code }} 11 | {% endfor %} 12 | {% endfor %} 13 | {% endif %} 14 | {% if 'target_separation' in pipeline_json %} 15 | 16 | {{ pipeline_json['target_separation']['code_test'] }} 17 | {% endif %} 18 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 19 | {% for component in 
pipeline_json["preprocessing_after_target_separation"].values() %} 20 | {% for code in component['code'] %} 21 | 22 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 23 | {{ code }} 24 | {% endfor %} 25 | {% endfor %} 26 | {% endif %} 27 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 28 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 29 | {% for code in component['code'] %} 30 | 31 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 32 | {{ code }} 33 | {% endfor %} 34 | {% endfor %} 35 | {% endif %} 36 | {% if flag_hyperparameter_tuning %} 37 | 38 | # BEST PARAMETERS IN THE CANDIDATE SCRIPT 39 | # PLEASE SEE THE CANDIDATE SCRIPTS FOR THE HYPERPARAMTER OPTIMIZATION CODE 40 | best_params = study.best_params 41 | {% endif %} 42 | 43 | {% if 'preprocess_dataset' in pipeline_json %} 44 | {{ pipeline_json['preprocess_dataset']['code_test'] }} 45 | 46 | {% endif %} 47 | {% if 'model' in pipeline_json %} 48 | 49 | # MODEL 50 | {{ pipeline_json['model']['code_test'] }} 51 | {% endif %} 52 | {% if 'inverse_target' in pipeline_json %} 53 | 54 | {{ pipeline_json['inverse_target']['code'] }} 55 | {% endif %} 56 | {% if 'evaluation' in pipeline_json %} 57 | 58 | #EVALUATION 59 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 60 | {% filter indent(width=4, first=True) %} 61 | {{ pipeline_json['evaluation']['code_test'] }} 62 | {% endfilter %} 63 | {% endif %} 64 | 65 | {% if 'confusion_matrix' in pipeline_json and pipeline.task.task_type == 'classification'%} 66 | # Confusion Matrix 67 | if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()): 68 | {% filter indent(width=4, first=True) %} 69 | {{ pipeline_json['confusion_matrix']['code'] }} 70 | {% endfilter %} 71 | {% endif %} 72 | {% if 'output_prediction' in pipeline_json %} 73 | 74 | {{ pipeline_json['output_prediction']['code'] }} 75 | {% endif %} 76 | {% if 'permutation_importance' in pipeline_json %} 77 | 78 | {{ pipeline_json['permutation_importance']['code'] }} 79 | {% endif %} 80 | 81 | {% if 'shap' in pipeline_json and not pipeline.task.is_multiclass %} 82 | 83 | {{ pipeline_json['shap']['code'] }} 84 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_train.py.jinja: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | {% if 'discard_columns' in pipeline_json %} 4 | 5 | {{ pipeline_json['discard_columns']['code_train'] }} 6 | {% endif %} 7 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 8 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 9 | {% for code in component['code_train'] %} 10 | 11 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 12 | {{ code }} 13 | {% endfor %} 14 | {% endfor %} 15 | {% endif %} 16 | {% if 'target_separation' in pipeline_json %} 17 | 18 | {{ pipeline_json['target_separation']['code_train'] }} 19 | {% endif %} 20 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 21 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %} 22 | {% for code in component['code_train'] %} 23 | 24 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 25 | {{ code }} 26 | {% endfor %} 27 | {% endfor %} 28 | {% endif %} 29 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 30 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 31 | {% for code in 
component['code_train'] %} 32 | 33 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 34 | {{ code }} 35 | {% endfor %} 36 | {% endfor %} 37 | {% endif %} 38 | {% if flag_hyperparameter_tuning %} 39 | 40 | # BEST PARAMETERS IN THE CANDIDATE SCRIPT 41 | # PLEASE SEE THE CANDIDATE SCRIPTS FOR THE HYPERPARAMETER OPTIMIZATION CODE 42 | best_params = study.best_params 43 | {% endif %} 44 | {% if 'model' in pipeline_json %} 45 | 46 | # MODEL 47 | {{ pipeline_json['model']['code_train'] }} 48 | {% endif %} 49 | -------------------------------------------------------------------------------- /sapientml_core/templates/pipeline_validation.py.jinja: -------------------------------------------------------------------------------- 1 | {% if 'discard_columns' in pipeline_json %} 2 | 3 | {{ pipeline_json['discard_columns']['code'] }} 4 | {% endif %} 5 | {% if 'preprocessing_before_target_separation' in pipeline_json %} 6 | {% for component in pipeline_json["preprocessing_before_target_separation"].values() %} 7 | {% for code in component['code'] %} 8 | 9 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 10 | {{ code }} 11 | {% endfor %} 12 | {% endfor %} 13 | {% endif %} 14 | {% if 'target_separation' in pipeline_json %} 15 | 16 | {{ pipeline_json['target_separation']['code_validation'] }} 17 | {% endif %} 18 | {% if 'preprocessing_after_target_separation' in pipeline_json %} 19 | {% for component in pipeline_json["preprocessing_after_target_separation"].values() %} 20 | {% for code in component['code'] %} 21 | 22 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 23 | {{ code }} 24 | {% endfor %} 25 | {% endfor %} 26 | {% endif %} 27 | {% if 'preprocessing_after_train_test_split' in pipeline_json %} 28 | {% for component in pipeline_json["preprocessing_after_train_test_split"].values() %} 29 | {% for code in component['code'] %} 30 | 31 | # PREPROCESSING-{{ component['id'] + loop.index0 }} 32 | {{ code }} 33 | {% endfor %} 34 | {% endfor %} 35 | {% endif %} 36 | {% if flag_hyperparameter_tuning %} 37 | 38 | {{ pipeline_json['hyperparameter_optimization']['code'] }} 39 | {% else %} 40 | {% if 'model' in pipeline_json %} 41 | {% if 'preprocess_dataset' in pipeline_json %} 42 | {{ pipeline_json['preprocess_dataset']['code_test'] }} 43 | 44 | {% endif %} 45 | 46 | # MODEL 47 | {{ pipeline_json['model']['code'] }} 48 | {% endif %} 49 | {% if 'inverse_target' in pipeline_json %} 50 | 51 | {{ pipeline_json['inverse_target']['code'] }} 52 | {% endif %} 53 | {% if 'evaluation' in pipeline_json %} 54 | 55 | #EVALUATION 56 | {{ pipeline_json['evaluation']['code_validation'] }} 57 | {% endif %} 58 | {% endif %}{# if flag_hyperparameter_tuning #} 59 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/DATE.py.jinja: -------------------------------------------------------------------------------- 1 | DATE_COLUMNS = {{ columns }} 2 | for _col in DATE_COLUMNS: 3 | train_date_col = pd.to_datetime({{ train_dataset }}[_col], errors='coerce') 4 | {{ train_dataset }}[_col + "_year"] = train_date_col.dt.year.fillna(-1) 5 | {{ train_dataset }}[_col + "_month"] = train_date_col.dt.month.fillna(-1) 6 | {{ train_dataset }}[_col + "_day"] = train_date_col.dt.day.fillna(-1) 7 | {{ train_dataset }}[_col + "_day_of_week"] = train_date_col.dt.dayofweek.fillna(-1) 8 | {{ train_dataset }}.drop(_col, axis=1, inplace=True) 9 | 10 | test_date_col = pd.to_datetime({{ test_dataset }}[_col], errors='coerce') 11 | {{ test_dataset }}[_col + "_year"] = 
test_date_col.dt.year.fillna(-1) 12 | {{ test_dataset }}[_col + "_month"] = test_date_col.dt.month.fillna(-1) 13 | {{ test_dataset }}[_col + "_day"] = test_date_col.dt.day.fillna(-1) 14 | {{ test_dataset }}[_col + "_day_of_week"] = test_date_col.dt.dayofweek.fillna(-1) 15 | {{ test_dataset }}.drop(_col, axis=1, inplace=True) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/DATE_predict.jinja: -------------------------------------------------------------------------------- 1 | DATE_COLUMNS = {{ columns }} 2 | for _col in DATE_COLUMNS: 3 | test_date_col = pd.to_datetime({{ test_dataset }}[_col], errors='coerce') 4 | {{ test_dataset }}[_col + "_year"] = test_date_col.dt.year.fillna(-1) 5 | {{ test_dataset }}[_col + "_month"] = test_date_col.dt.month.fillna(-1) 6 | {{ test_dataset }}[_col + "_day"] = test_date_col.dt.day.fillna(-1) 7 | {{ test_dataset }}[_col + "_day_of_week"] = test_date_col.dt.dayofweek.fillna(-1) 8 | {{ test_dataset }}.drop(_col, axis=1, inplace=True) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/DATE_train.jinja: -------------------------------------------------------------------------------- 1 | DATE_COLUMNS = {{ columns }} 2 | for _col in DATE_COLUMNS: 3 | train_date_col = pd.to_datetime({{ train_dataset }}[_col], errors='coerce') 4 | {{ train_dataset }}[_col + "_year"] = train_date_col.dt.year.fillna(-1) 5 | {{ train_dataset }}[_col + "_month"] = train_date_col.dt.month.fillna(-1) 6 | {{ train_dataset }}[_col + "_day"] = train_date_col.dt.day.fillna(-1) 7 | {{ train_dataset }}[_col + "_day_of_week"] = train_date_col.dt.dayofweek.fillna(-1) 8 | {{ train_dataset }}.drop(_col, axis=1, inplace=True) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/LabelEncoder.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OrdinalEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) 5 | {{ train_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]) 6 | {{ test_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/LabelEncoder_predict.py.jinja: -------------------------------------------------------------------------------- 1 | with open('ordinalEncoder.pkl', 'rb') as f: 2 | ordinal_encoder = pickle.load(f) 3 | 4 | CATEGORICAL_COLS = {{ columns }} 5 | {{ test_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/LabelEncoder_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OrdinalEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) 5 | {{ train_dataset }}[CATEGORICAL_COLS] = ordinal_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]) 6 | 7 | with open('ordinalEncoder.pkl', 'wb') as f: 8 | pickle.dump(ordinal_encoder, f) 9 
| -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/Processing.py.jinja: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import nltk 5 | 6 | TEXT_COLUMNS = {{ columns }} 7 | def process_text(__dataset): 8 | for _col in TEXT_COLUMNS: 9 | process_text = [t.lower() for t in __dataset[_col]] 10 | 11 | # strip all punctuation 12 | table = str.maketrans('', '', string.punctuation) 13 | process_text = [t.translate(table) for t in process_text] 14 | 15 | # convert all numbers in text to 'num' 16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text] 17 | __dataset[_col] = process_text 18 | return __dataset 19 | 20 | {{ train_dataset }} = process_text({{ train_dataset }}) 21 | {{ test_dataset }} = process_text({{ test_dataset }}) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/Processing_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import nltk 5 | 6 | TEXT_COLUMNS = {{ columns }} 7 | def process_text(__dataset): 8 | for _col in TEXT_COLUMNS: 9 | process_text = [t.lower() for t in __dataset[_col]] 10 | 11 | # strip all punctuation 12 | table = str.maketrans('', '', string.punctuation) 13 | process_text = [t.translate(table) for t in process_text] 14 | 15 | # convert all numbers in text to 'num' 16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text] 17 | __dataset[_col] = process_text 18 | return __dataset 19 | 20 | {{ test_dataset }} = process_text({{ test_dataset }}) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/Processing_train.py.jinja: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import nltk 5 | 6 | TEXT_COLUMNS = {{ columns }} 7 | def process_text(__dataset): 8 | for _col in TEXT_COLUMNS: 9 | process_text = [t.lower() for t in __dataset[_col]] 10 | 11 | # strip all punctuation 12 | table = str.maketrans('', '', string.punctuation) 13 | process_text = [t.translate(table) for t in process_text] 14 | 15 | # convert all numbers in text to 'num' 16 | process_text = [re.sub(r'\d+', 'num', t) for t in process_text] 17 | __dataset[_col] = process_text 18 | return __dataset 19 | 20 | {{ train_dataset }} = process_text({{ train_dataset }}) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/SMOTE.py.jinja: -------------------------------------------------------------------------------- 1 | from imblearn.over_sampling import SMOTE 2 | 3 | smote = SMOTE(random_state=0) 4 | {% if pipeline.sparse_matrix %} 5 | feature_columns = feature_train.columns 6 | feature_train = feature_train.sparse.to_coo() 7 | feature_train, target_train = smote.fit_resample(feature_train, target_train) 8 | feature_train = pd.DataFrame.sparse.from_spmatrix(feature_train, columns=feature_columns) 9 | {% else %} 10 | feature_train, target_train = smote.fit_resample(feature_train, target_train) 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/STANDARD.py.jinja: -------------------------------------------------------------------------------- 1 | from 
sklearn.preprocessing import StandardScaler 2 | 3 | standard_scaler = StandardScaler(with_mean=False) 4 | {% if pipeline.sparse_matrix %} 5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %} 6 | {% else %} 7 | {% set dataframe = "pd.DataFrame" %} 8 | {% endif %} 9 | {{ train_dataset }} = {{ dataframe }}(standard_scaler.fit_transform({{ train_dataset }}), columns={{ train_dataset }}.columns, index={{ train_dataset }}.index) 10 | {{ test_dataset }} = {{ dataframe }}(standard_scaler.transform({{ test_dataset }}), columns={{ test_dataset }}.columns, index={{ test_dataset }}.index) 11 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/STANDARD_predict.py.jinja: -------------------------------------------------------------------------------- 1 | with open('standardScaler.pkl', 'rb') as f: 2 | standard_scaler = pickle.load(f) 3 | 4 | {% if pipeline.sparse_matrix %} 5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %} 6 | {% else %} 7 | {% set dataframe = "pd.DataFrame" %} 8 | {% endif %} 9 | {{ test_dataset }} = {{ dataframe }}(standard_scaler.transform({{ test_dataset }}), columns={{ test_dataset }}.columns, index={{ test_dataset }}.index) 10 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/STANDARD_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | 3 | standard_scaler = StandardScaler(with_mean=False) 4 | {% if pipeline.sparse_matrix %} 5 | {% set dataframe = "pd.DataFrame.sparse.from_spmatrix" %} 6 | {% else %} 7 | {% set dataframe = "pd.DataFrame" %} 8 | {% endif %} 9 | {{ train_dataset }} = {{ dataframe }}(standard_scaler.fit_transform({{ train_dataset }}), columns={{ train_dataset }}.columns, index={{ train_dataset }}.index) 10 | 11 | with open('standardScaler.pkl', 'wb') as f: 12 | pickle.dump(standard_scaler, f) 13 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/TfidfVectorizer.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | 3 | TEXT_COLUMNS = {{ columns }} 4 | temp_train_data = {{ train_dataset }}[TEXT_COLUMNS] 5 | temp_test_data = {{ test_dataset }}[TEXT_COLUMNS] 6 | # Make the entire dataframe sparse to avoid it converting into a dense matrix. 
7 | {{ train_dataset }} = {{ train_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 8 | {{ test_dataset }} = {{ test_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 9 | 10 | {% if pipeline.config.use_word_list %} 11 | {% if pipeline.config.use_word_list is mapping %} 12 | # Use only specified words as features for each column 13 | use_word_list = {{ pipeline.config.use_word_list }} 14 | for col, word_list in use_word_list.items(): 15 | word_list = [word.lower() for word in word_list] 16 | word_list = list(set(word_list)) 17 | use_word_list[col] = word_list 18 | for _col in TEXT_COLUMNS: 19 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list.get(_col)) 20 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 21 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 22 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 23 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 24 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 25 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 26 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 27 | {% else %} 28 | # Use only specified words as features 29 | use_word_list = {{ pipeline.config.use_word_list }} 30 | use_word_list = [word.lower() for word in use_word_list] 31 | use_word_list = list(set(use_word_list)) 32 | for _col in TEXT_COLUMNS: 33 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list) 34 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 35 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 36 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 37 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 38 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 39 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 40 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 41 | {% endif %} 42 | {% else %} 43 | for _col in TEXT_COLUMNS: 44 | tfidfvectorizer = TfidfVectorizer(max_features=3000) 45 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 46 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 47 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 48 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 49 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 50 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 51 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 52 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/TfidfVectorizer_predict.py.jinja: -------------------------------------------------------------------------------- 1 | TEXT_COLUMNS = {{ columns }} 2 | temp_test_data = {{ test_dataset }}[TEXT_COLUMNS] 3 | # Make the entire dataframe sparse to avoid it converting into a dense matrix. 
4 | {{ test_dataset }} = {{ test_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 5 | with open('tfidfVectorizer.pkl', 'rb') as f: 6 | vectorizers = pickle.load(f) 7 | for _col in TEXT_COLUMNS: 8 | tfidfvectorizer = vectorizers[_col] 9 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 10 | vector_test = tfidfvectorizer.transform(temp_test_data[_col]) 11 | vector_test = pd.DataFrame.sparse.from_spmatrix(vector_test, columns=feature_names, index=temp_test_data.index) 12 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, vector_test], axis=1) 13 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/TfidfVectorizer_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | 3 | TEXT_COLUMNS = {{ columns }} 4 | temp_train_data = {{ train_dataset }}[TEXT_COLUMNS] 5 | # Make the entire dataframe sparse to avoid it converting into a dense matrix. 6 | {{ train_dataset }} = {{ train_dataset }}.drop(TEXT_COLUMNS, axis=1).astype(pd.SparseDtype('float64', 0)) 7 | vectorizers = {} 8 | 9 | {% if pipeline.config.use_word_list %} 10 | {% if pipeline.config.use_word_list is mapping %} 11 | # Use only specified words as features for each column 12 | use_word_list = {{ pipeline.config.use_word_list }} 13 | for col, word_list in use_word_list.items(): 14 | word_list = [word.lower() for word in word_list] 15 | word_list = list(set(word_list)) 16 | use_word_list[col] = word_list 17 | for _col in TEXT_COLUMNS: 18 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list.get(_col)) 19 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 20 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 21 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 22 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 23 | vectorizers[_col] = tfidfvectorizer 24 | {% else %} 25 | # Use only specified words as features 26 | use_word_list = {{ pipeline.config.use_word_list }} 27 | use_word_list = [word.lower() for word in use_word_list] 28 | use_word_list = list(set(use_word_list)) 29 | for _col in TEXT_COLUMNS: 30 | tfidfvectorizer = TfidfVectorizer(max_features=3000, vocabulary=use_word_list) 31 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 32 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 33 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 34 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 35 | vectorizers[_col] = tfidfvectorizer 36 | {% endif %} 37 | {% else %} 38 | for _col in TEXT_COLUMNS: 39 | tfidfvectorizer = TfidfVectorizer(max_features=3000) 40 | vector_train = tfidfvectorizer.fit_transform(temp_train_data[_col]) 41 | feature_names = ['_'.join([_col, name]) for name in tfidfvectorizer.get_feature_names_out()] 42 | vector_train = pd.DataFrame.sparse.from_spmatrix(vector_train, columns=feature_names, index=temp_train_data.index) 43 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, vector_train], axis=1) 44 | vectorizers[_col] = tfidfvectorizer 45 | {% endif %} 46 | 47 | with open('tfidfVectorizer.pkl', 'wb') as f: 48 | 
pickle.dump(vectorizers, f) -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-numeric.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 7 | {{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 8 | {{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 9 | {% endif %} 10 | {% if cols_almost_missing_numeric %} 11 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }} 12 | {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 13 | {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 14 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-numeric_predict.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | with open('simpleimputer-numeric.pkl', 'rb') as f: 3 | simple_imputer = pickle.load(f) 4 | 5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | {{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 7 | {% endif %} 8 | {% if cols_almost_missing_numeric %} 9 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }} 10 | {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ test_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 11 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-numeric_train.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | NUMERIC_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 7 | {{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[NUMERIC_COLS_WITH_MISSING_VALUES]) 8 | 9 | with open('simpleimputer-numeric.pkl', 'wb') as f: 10 | pickle.dump(simple_imputer, f) 11 | {% endif %} 12 | {% if cols_almost_missing_numeric %} 13 | NUMERIC_ALMOST_MISSING_COLS = {{ cols_almost_missing_numeric }} 14 | {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS] = {{ train_dataset }}[NUMERIC_ALMOST_MISSING_COLS].fillna(0) 15 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-string.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 7 | {{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 8 | {{ 
test_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 9 | {% endif %} 10 | {% if cols_almost_missing_string %} 11 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} 12 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 13 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 14 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 15 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 16 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-string_predict.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | with open('simpleimputer-string.pkl', 'rb') as f: 3 | simple_imputer = pickle.load(f) 4 | 5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | {{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.transform({{ test_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 7 | {% endif %} 8 | {% if cols_almost_missing_string %} 9 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} 10 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 11 | {{ test_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ test_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 12 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/fillna-type-string_train.py.jinja: -------------------------------------------------------------------------------- 1 | {% if columns %} 2 | import numpy as np 3 | from sklearn.impute import SimpleImputer 4 | 5 | STRING_COLS_WITH_MISSING_VALUES = {{ columns }} 6 | simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 7 | {{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES] = simple_imputer.fit_transform({{ train_dataset }}[STRING_COLS_WITH_MISSING_VALUES]) 8 | 9 | with open('simpleimputer-string.pkl', 'wb') as f: 10 | pickle.dump(simple_imputer, f) 11 | {% endif %} 12 | {% if cols_almost_missing_string %} 13 | STRING_ALMOST_MISSING_COLS = {{ cols_almost_missing_string }} 14 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].astype(str) 15 | {{ train_dataset }}[STRING_ALMOST_MISSING_COLS] = {{ train_dataset }}[STRING_ALMOST_MISSING_COLS].fillna('') 16 | {% endif %} -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/get_dummies.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OneHotEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) 5 | train_encoded = pd.DataFrame(onehot_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ train_dataset }}.index) 6 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, train_encoded ], axis=1) 7 | {{ train_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 8 | test_encoded = pd.DataFrame(onehot_encoder.transform({{ test_dataset 
}}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ test_dataset }}.index) 9 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, test_encoded ], axis=1) 10 | {{ test_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 11 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/get_dummies_predict.py.jinja: -------------------------------------------------------------------------------- 1 | with open('oneHotEncoder.pkl', 'rb') as f: 2 | onehot_encoder = pickle.load(f) 3 | 4 | CATEGORICAL_COLS = {{ columns }} 5 | test_encoded = pd.DataFrame(onehot_encoder.transform({{ test_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ test_dataset }}.index) 6 | {{ test_dataset }} = pd.concat([{{ test_dataset }}, test_encoded ], axis=1) 7 | {{ test_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 8 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/get_dummies_train.py.jinja: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import OneHotEncoder 2 | 3 | CATEGORICAL_COLS = {{ columns }} 4 | onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) 5 | train_encoded = pd.DataFrame(onehot_encoder.fit_transform({{ train_dataset }}[CATEGORICAL_COLS]), columns=onehot_encoder.get_feature_names_out(), index={{ train_dataset }}.index) 6 | {{ train_dataset }} = pd.concat([{{ train_dataset }}, train_encoded ], axis=1) 7 | {{ train_dataset }}.drop(CATEGORICAL_COLS, axis=1, inplace=True) 8 | 9 | with open('oneHotEncoder.pkl', 'wb') as f: 10 | pickle.dump(onehot_encoder, f) 11 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/log.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | NUMERIC_COLS_TO_SCALE = {{ columns }} 4 | {{ train_dataset }}[NUMERIC_COLS_TO_SCALE] = np.log1p({{ train_dataset }}[NUMERIC_COLS_TO_SCALE]).replace([np.inf, -np.inf], np.nan).fillna({{ train_dataset }}[NUMERIC_COLS_TO_SCALE].mean()) 5 | 6 | 7 | NUMERIC_COLS_TO_SCALE_FOR_TEST = list(set({{ test_dataset }}.columns) & set(NUMERIC_COLS_TO_SCALE)) 8 | {{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST]).replace([np.inf, -np.inf], np.nan).fillna({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST].mean()) 9 | 10 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/log_predict.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | NUMERIC_COLS_TO_SCALE = {{ columns }} 4 | NUMERIC_COLS_TO_SCALE_FOR_TEST = list(set({{ test_dataset }}.columns) & set(NUMERIC_COLS_TO_SCALE)) 5 | {{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST] = np.log1p({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST]).replace([np.inf, -np.inf], np.nan).fillna({{ test_dataset }}[NUMERIC_COLS_TO_SCALE_FOR_TEST].mean()) 6 | 7 | -------------------------------------------------------------------------------- /sapientml_core/templates/preprocessing_templates/log_train.py.jinja: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | NUMERIC_COLS_TO_SCALE = {{ columns }} 4 | {{
train_dataset }}[NUMERIC_COLS_TO_SCALE] = np.log1p({{ train_dataset }}[NUMERIC_COLS_TO_SCALE]).replace([np.inf, -np.inf], np.nan).fillna({{ train_dataset }}[NUMERIC_COLS_TO_SCALE].mean()) 5 | 6 | -------------------------------------------------------------------------------- /sapientml_core/training/augmentation/mutation_results.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from collections import OrderedDict, defaultdict 17 | 18 | import pandas as pd 19 | from sapientml_core import internal_path 20 | from sapientml_core.seeding.predictor import name_to_label_mapping 21 | from sapientml_core.training.project_corpus import ProjectCorpus 22 | from tqdm import tqdm 23 | 24 | 25 | class MutationResult: 26 |     """MutationResult class. 27 | 28 |     This class loads the mutation results for each pipeline that were previously stored in the sapientml_core cache, 29 |     combines all the results into a single CSV file, and selects the best model. 30 | 31 |     """ 32 | 33 |     def __init__(self, mutation_result_path, project_list): 34 |         self.mutation_result_path = mutation_result_path 35 |         self.project_list = project_list 36 | 37 |     def load_results(self): 38 |         """Collect the scores of the augmented pipelines from the exec_info directory.
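Each model's score is parsed from exec_info/<notebook_name>/<model>/stdout.txt;
for example, a line "Accuracy: 87.5%" is read as 0.875 and a line "R2: 0.91" as 0.91,
while a model whose result file is missing is scored as 0.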
39 | 40 | Returns 41 | ------- 42 | results: defaultdict 43 | 44 | """ 45 | results = defaultdict(defaultdict) 46 | models = list(name_to_label_mapping.keys()) + ["original"] 47 | execution_root_dir = internal_path.training_cache / "exec_info" 48 | 49 | for i in tqdm(range(0, len(self.project_list))): 50 | project = self.project_list[i] 51 | project_exec_dir = execution_root_dir / project.notebook_name 52 | project_key = project.file_name 53 | for model in models: 54 | result_file_path = project_exec_dir / model / "stdout.txt" 55 | acc, r2 = 0, 0 56 | if not os.path.exists(result_file_path): 57 | results[project_key][model] = 0 58 | continue 59 | with open(result_file_path, "r", encoding="utf-8") as f: 60 | lines = f.readlines() 61 | for line in lines: 62 | for trail in ["Accuracy: ", "R2: "]: 63 | data = line 64 | if data.count(trail) > 0: 65 | data = data[data.index(trail) + len(trail) :].strip() 66 | if data.count("%") > 0: 67 | data = data[: data.index("%")] 68 | data = float(data) / 100 69 | if trail == "Accuracy: ": 70 | acc = data 71 | if trail == "R2: ": 72 | r2 = data 73 | if project.metric == "accuracy": 74 | results[project_key][model] = round(acc, 5) 75 | elif project.metric == "r2": 76 | results[project_key][model] = round(r2, 5) 77 | 78 | best_models = [] 79 | sorted_results = sorted(results[project_key].items(), key=lambda x: x[1], reverse=True) 80 | best_value = 0 81 | for model, value in sorted_results: 82 | if value > 0 and value >= best_value: 83 | best_models.append(model) 84 | best_value = value 85 | else: 86 | break 87 | 88 | results[project_key]["best_models"] = best_models 89 | 90 | return results 91 | 92 | 93 | def main(): 94 | """Fetch the augmented pipeline results and store it in mutation_results.csv.""" 95 | corpus = ProjectCorpus() # Fetch all project and pipeline details 96 | mutation_result = MutationResult(internal_path.training_cache, corpus.project_list) 97 | results = mutation_result.load_results() 98 | result_list = [] 99 | for key, result in results.items(): 100 | result["file_name"] = key 101 | result = OrderedDict(result) 102 | result.move_to_end("file_name", last=False) 103 | result_list.append(result) 104 | result_dataframe = pd.DataFrame(result_list) 105 | result_dataframe.to_csv(internal_path.training_cache / "mutation_results.csv", index=False) 106 | 107 | 108 | if __name__ == "__main__": 109 | import argparse 110 | 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 113 | args = parser.parse_args() 114 | if args.tag: 115 | internal_path.training_cache = internal_path.training_cache / args.tag 116 | 117 | main() 118 | -------------------------------------------------------------------------------- /sapientml_core/training/dataflowmodel/ast_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import Optional, Union 16 | 17 | import libcst as cst 18 | import sapientml.macros as macros 19 | from libcst import RemoveFromParent 20 | from libcst.metadata import ParentNodeProvider, PositionProvider 21 | 22 | 23 | class NameTransformer(cst.CSTTransformer): 24 | METADATA_DEPENDENCIES = ( 25 | ParentNodeProvider, 26 | PositionProvider, 27 | ) 28 | 29 | def __init__(self, replacement): 30 | self.as_names = {} 31 | self.count = 0 32 | self.replacement = replacement 33 | 34 | def leave_Name(self, original_node, updated_node) -> cst.CSTNode: 35 | source_string = original_node.value 36 | if source_string in self.replacement.keys(): 37 | return updated_node.with_changes(value=self.replacement[source_string]) 38 | else: 39 | return original_node 40 | 41 | def leave_SimpleString(self, original_node: cst.Name, updated_node: cst.Name) -> cst.CSTNode: 42 | source_string = original_node.value 43 | if source_string in self.replacement.keys(): 44 | return updated_node.with_changes(value='"' + self.replacement[source_string] + '"') 45 | else: 46 | return original_node 47 | 48 | def get_LineNumber(self, node): 49 | pos = self.get_metadata(PositionProvider, node).start 50 | return pos.line 51 | 52 | 53 | class ArgumentRemover(cst.CSTTransformer): 54 | METADATA_DEPENDENCIES = ( 55 | ParentNodeProvider, 56 | PositionProvider, 57 | ) 58 | 59 | def __init__(self, model_name): 60 | self.target = "" 61 | self.model_name = model_name 62 | 63 | def leave_Arg(self, original_node: cst.Arg, updated_node: cst.Arg) -> Union[cst.Arg, cst.RemovalSentinel]: 64 | parent = self.get_metadata(ParentNodeProvider, original_node) 65 | while not isinstance(parent, cst.Call): 66 | parent = self.get_metadata(ParentNodeProvider, parent) 67 | 68 | func = parent.func 69 | name = None 70 | if isinstance(func, cst.Name): 71 | name = func.value 72 | elif isinstance(func, cst.Attribute): 73 | name = func.attr.value 74 | if name == self.model_name: 75 | return RemoveFromParent() 76 | return updated_node 77 | 78 | 79 | class ModelTransformer(cst.CSTTransformer): 80 | METADATA_DEPENDENCIES = ( 81 | ParentNodeProvider, 82 | PositionProvider, 83 | ) 84 | 85 | def __init__(self, model_name): 86 | self.target = "" 87 | self.model_name = model_name 88 | 89 | def visit_Assign(self, node) -> Optional[bool]: 90 | assigned_target = node.targets[0] 91 | target = assigned_target.target 92 | check = hasattr(target, "value") 93 | if check: 94 | value = node.value 95 | if isinstance(value, cst.Call): 96 | func = value.func 97 | name = None 98 | if isinstance(func, cst.Name): 99 | name = func.value 100 | elif isinstance(func, cst.Attribute): 101 | name = func.attr.value 102 | if name == self.model_name: 103 | self.target = target.value 104 | 105 | 106 | def transform_model_code(source_code, model_label, metric=None): 107 | source_tree = cst.parse_module(source_code) 108 | model_name = model_label.split(":")[2] 109 | transformer = ModelTransformer(model_name) 110 | wrapper = cst.metadata.MetadataWrapper(source_tree) 111 | modified_tree = wrapper.visit(transformer) 112 | code = modified_tree.code.splitlines() 113 | if metric == macros.Metric.AUC or metric == macros.Metric.Gini: 114 | transformed_code = ( 115 | code[0] 116 | + "\n" 117 | + transformer.target 118 | + ".fit(__feature_train, __target_train)\n__y_pred = " 119 | + transformer.target 120 | + ".predict_proba(__feature_test)" 121 | ) 122 | else: 123 | transformed_code = ( 124 | code[0] 125 | + "\n" 126 | + transformer.target 127 | + ".fit(__feature_train, 
__target_train)\n__y_pred = " 128 |             + transformer.target 129 |             + ".predict(__feature_test)" 130 |         ) 131 |     return transformed_code 132 | 133 | 134 | def remove_arguments(source_code, model_name): 135 |     source_tree = cst.parse_module(source_code) 136 |     transformer = ArgumentRemover(model_name) 137 |     wrapper = cst.metadata.MetadataWrapper(source_tree) 138 |     modified_tree = wrapper.visit(transformer) 139 |     return modified_tree.code 140 | 141 | 142 | def replaceString(source_tree, replacement): 143 |     transformer = NameTransformer(replacement) 144 |     wrapper = cst.metadata.MetadataWrapper(source_tree) 145 |     modified_tree = wrapper.visit(transformer) 146 |     return modified_tree 147 | 148 | 149 | def construct_tree(notebook_path): 150 |     with open(notebook_path, "r", encoding="utf-8") as file: 151 |         code_content = file.read() 152 |     parts = code_content.split("### Evaluation Template: ") 153 |     code_content = parts[0] 154 |     source_tree = cst.parse_module(code_content) 155 |     return source_tree 156 | -------------------------------------------------------------------------------- /sapientml_core/training/dataflowmodel/determine_label_order.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | 18 | from sapientml_core import internal_path 19 | 20 | LABELS_TO_IGNORE_NOW = { 21 |     "PREPROCESS:DeleteColumns:drop:pandas", 22 |     "PREPROCESS:Category:map:pandas", 23 |     "PREPROCESS:MissingValues:dropna:pandas", 24 |     "PREPROCESS:Category:replace:pandas", 25 |     "PREPROCESS:FeatureSelection:select_dtypes:pandas", 26 |     "PREPROCESS:GenerateColumn:addition:pandas", 27 | } 28 | 29 | 30 | def main(): 31 |     """Removes duplicated labelling orders from the dependent_labels.json file. 32 | 33 |     This script creates the dataflow model, i.e., it extracts the order of two APIs A and B if there is any. 34 |     An order A --> B exists if A and B are dependent on each other according to 'dependent_api_extractor.py', 35 |     A is always followed by B in all pipelines, and there is NO case in the corpus where B is followed by A.
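For example, if a label A always appears before a label B in every pipeline where both occur,
the pair is kept as "A#B"; a pair that is also observed in the inverse order is discarded,
and the surviving orders are written to label_order.json.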
36 | 37 | """ 38 | with open(internal_path.training_cache / "dependent_labels.json", "r", encoding="utf-8") as dependent_api_file: 39 | dependent_labels = json.load(dependent_api_file) 40 | 41 | dependent_order = set() 42 | 43 | for dependent_label_str in dependent_labels.keys(): 44 | dep_str_after_bracket_removal = dependent_label_str.replace("[", "").replace("]", "").replace("'", "") 45 | parts = dep_str_after_bracket_removal.split(",") 46 | if (parts[0] in LABELS_TO_IGNORE_NOW) or (parts[1].strip() in LABELS_TO_IGNORE_NOW): 47 | continue 48 | first = parts[0].split(":")[1].strip() 49 | second = parts[1].split(":")[1].strip() 50 | inverse_order = second + "#" + first 51 | if first != second: 52 | if inverse_order in dependent_order: 53 | dependent_order.remove(inverse_order) 54 | else: 55 | dependent_order.add(parts[0].strip() + "#" + parts[1].strip()) 56 | 57 | with open(internal_path.training_cache / "label_order.json", "w", encoding="utf-8") as outfile: 58 | json.dump(list(dependent_order), outfile, indent=4) 59 | 60 | 61 | if __name__ == "__main__": 62 | import argparse 63 | 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 66 | args = parser.parse_args() 67 | if args.tag: 68 | internal_path.training_cache = internal_path.training_cache / args.tag 69 | 70 | main() 71 | -------------------------------------------------------------------------------- /sapientml_core/training/denoising/df_collector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pandas as pd 16 | 17 | 18 | def update_column_names(collector, line_no, obj, obj_name): 19 | """update_column_names function. 20 | 21 | This function is injected after each statement of the 22 | pipeline during instrumentation to collect the column names of the 23 | dataset after each statement. 24 | 25 | Parameters 26 | ---------- 27 | collector : dict 28 | Collection of all the column name. 29 | line_no : int 30 | line_no 31 | obj : dataframe 32 | Dataframe of particular object. 33 | obj_name : str 34 | Name of the object. 
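For example (a sketch), update_column_names(collector, 12, df, "df") records
(list(df.columns), "df", "<class 'pandas.core.frame.DataFrame'>") under key 12;
a pd.Series is converted to a one-column frame first, and an object that is not
a dataframe is recorded with None in place of its column list.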
35 | 36 |     Returns 37 |     ------- 38 |     dict 39 | 40 |     """ 41 |     now_obj = obj 42 |     if isinstance(now_obj, pd.Series): 43 |         now_obj = now_obj.to_frame() 44 |     if isinstance(now_obj, pd.DataFrame): 45 |         collector[line_no] = (list(now_obj.columns), obj_name, str(type(now_obj))) 46 |     else: 47 |         collector[line_no] = (None, obj_name, str(type(now_obj))) 48 |     return collector 49 | -------------------------------------------------------------------------------- /sapientml_core/training/denoising/static_analysis_of_columns.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | #     http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | from pathlib import Path 18 | 19 | from sapientml.util.logging import setup_logger 20 | from sapientml_core import internal_path 21 | from sapientml_core.training import project_corpus 22 | from sapientml_core.training.denoising import ast_info_collector as collector 23 | from sapientml_core.util import file_util 24 | 25 | logger = setup_logger() 26 | 27 | 28 | def extract(json_metadata_file): 29 |     """Extract the target column name from a pipeline's metadata file. 30 | 31 |     This function reads the pipeline details and extracts 32 |     the target column based on the file's data structure. 33 | 34 |     Parameters 35 |     ---------- 36 |     json_metadata_file : str 37 |         Path to the JSON file containing each pipeline's details. 38 | 39 |     Returns 40 |     ------- 41 |     str 42 |         The target_column_name of the pipeline. 43 | 44 |     Raises 45 |     ------ 46 |     Exception 47 |         If the metadata file has an unexpected format. 48 | 49 |     """ 50 |     with open(json_metadata_file, "r", encoding="utf-8") as f: 51 |         notebook_info = json.load(f) 52 | 53 |     if isinstance(notebook_info, dict): 54 |         target_column_name = notebook_info["target_column_name"] 55 |     elif isinstance(notebook_info, list): 56 |         target_column_name = notebook_info[1]["target_column_name"] 57 |     else: 58 |         logger.warning("Wrong format: {}".format(json_metadata_file)) 59 |         raise 60 | 61 |     return target_column_name 62 | 63 | 64 | def main(test_mode=False): 65 |     """Fetch all the pipeline details from the corpus and parse them using the libcst library. 66 | 67 |     This script performs static analysis of each pipeline to identify 68 |     whether there is any explicit renaming of column names or explicit 69 |     deletion of columns in the pipeline, and creates the static_info.json file. 70 | 71 |     Parameters 72 |     ---------- 73 |     test_mode : bool 74 |         If True, only the first few pipelines are analyzed. 75 | 76 |     Raises 77 |     ------ 78 |     Exception 79 |         If reading a dataset, parsing a pipeline, or dropping 80 |         the identified columns fails.
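For example, the resulting static_info.json maps each pipeline file name to an entry of the form
{"drop_api": ["<dropped column>", ...], "rename_api": ["<renamed column>", ...], "target": "<target column>"}.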
81 | 82 | """ 83 | corpus = project_corpus.ProjectCorpus() 84 | projects = corpus.project_list 85 | static_info_map = {} 86 | 87 | total_number_target_pipelines = len(projects) 88 | 89 | for i in range(0, total_number_target_pipelines): 90 | if test_mode and i > 5: 91 | break 92 | logger.info(f"RUNNING:{i + 1} out of:{total_number_target_pipelines} PIPELINE:{projects[i].pipeline_path}") 93 | project = projects[i] 94 | pipeline = project.pipeline_path 95 | file_name = project.file_name 96 | 97 | static_info = {} 98 | try: 99 | dataset = file_util.read_csv( 100 | Path(project.dataset_path), 101 | Path(project.pipeline_path), 102 | ) 103 | except Exception: 104 | raise 105 | 106 | json_meta = pipeline.replace(".py", ".info.json") 107 | 108 | target = extract(json_meta) 109 | source_file = pipeline 110 | with open(source_file, "r", encoding="utf-8") as f: 111 | source = f.read() 112 | 113 | try: 114 | column_api_map = collector.get_column_api_map(source) 115 | except Exception: 116 | raise 117 | 118 | dataset_columns = list(dataset.columns) 119 | dropped_columns = [] 120 | renamed_columns = [] 121 | for column in column_api_map: 122 | if "drop" in column_api_map[column]: 123 | if column != target and column in dataset_columns: 124 | dropped_columns.append(column) 125 | if "rename" in column_api_map[column]: 126 | renamed_columns.append(column) 127 | 128 | static_info["drop_api"] = dropped_columns 129 | static_info["rename_api"] = renamed_columns 130 | static_info["target"] = target 131 | static_info_map[file_name] = static_info 132 | try: 133 | dataset.drop(dropped_columns, axis=1, inplace=True) 134 | except Exception: 135 | raise 136 | 137 | logger.info(f"Total number of notebooks: {len(static_info_map)}") 138 | with open(internal_path.training_cache / "static_info.json", "w", encoding="utf-8") as f: 139 | json.dump(static_info_map, f, indent=4) 140 | 141 | 142 | if __name__ == "__main__": 143 | import argparse 144 | 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 147 | args = parser.parse_args() 148 | if args.tag: 149 | internal_path.training_cache = internal_path.training_cache / args.tag 150 | test_mode = False 151 | main(test_mode) 152 | -------------------------------------------------------------------------------- /sapientml_core/training/meta_feature_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from sapientml_core import ps_macros 17 | from sapientml_core.design import search_space 18 | from sklearn.tree import DecisionTreeClassifier 19 | 20 | 21 | def select_k_best_features(X, y): 22 | """Select the top k explanatory variables. 23 | 24 | Parameters 25 | ---------- 26 | X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix 27 | The training input samples. 
28 |     y : ArrayLike = numpy.typing.ArrayLike 29 |         The target values 30 | 31 |     Returns 32 |     ------- 33 |     list 34 |         Returns a list of the top k selected column names. 35 |     """ 36 |     from sklearn.feature_selection import SelectKBest, mutual_info_regression 37 | 38 |     # Select the top k (=3) features based on mutual information regression 39 |     selector = SelectKBest(mutual_info_regression, k=3) 40 |     selector.fit(X, y) 41 |     return list(X.columns[selector.get_support()]) 42 | 43 | 44 | def select_by_rfe(X, y): 45 |     """Extract the top N (=n_features_to_select) most important features by RFE (Recursive Feature Elimination). 46 | 47 |     Parameters 48 |     ---------- 49 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix | 50 |         ArrayLike = numpy.typing.ArrayLike 51 |         The training input samples. 52 |     y : ArrayLike = numpy.typing.ArrayLike 53 |         The target values. 54 | 55 |     Returns 56 |     ------- 57 |     list 58 |         Returns a list of selected column names. 59 |     """ 60 |     from sklearn.feature_selection import RFE 61 | 62 |     # Select the most important features with RFE, using a DecisionTreeClassifier as the estimator 63 |     rfe_selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=2, step=1) 64 |     rfe_selector.fit(X, y) 65 |     return list(X.columns[rfe_selector.get_support()]) 66 | 67 | 68 | def select_from_model(X, y): 69 |     """Select features based on importance weights. 70 | 71 |     Parameters 72 |     ---------- 73 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix 74 |         The training input samples. 75 |     y : None | ArrayLike = numpy.typing.ArrayLike 76 |         The target values (integers that correspond to classes in classification, real numbers in regression). 77 | 78 |     Returns 79 |     ------- 80 |     list 81 |         Returns a list of selected column names. 82 |     """ 83 |     from sklearn.feature_selection import SelectFromModel 84 | 85 |     # Select the most important features with SelectFromModel, using a DecisionTreeClassifier as the estimator 86 |     sfm_selector = SelectFromModel(estimator=DecisionTreeClassifier()) 87 |     sfm_selector.fit(X, y) 88 |     return list(X.columns[sfm_selector.get_support()]) 89 | 90 | 91 | def select_sequentially(X, y): 92 |     """Select features sequentially by a greedy method. 93 | 94 |     Parameters 95 |     ---------- 96 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix 97 |         Training vectors 98 |     y : None | ArrayLike = numpy.typing.ArrayLike 99 |         Target values. This parameter may be ignored for unsupervised learning. 100 | 101 |     Returns 102 |     ------- 103 |     list 104 |         Returns a list of selected column names. 105 |     """ 106 |     from sklearn.feature_selection import SequentialFeatureSelector 107 | 108 |     # Select the most important features by backward sequential selection, using a DecisionTreeClassifier as the estimator 109 |     sfs_selector = SequentialFeatureSelector( 110 |         estimator=DecisionTreeClassifier(), n_features_to_select=3, cv=10, direction="backward" 111 |     ) 112 |     sfs_selector.fit(X, y) 113 |     return list(X.columns[sfs_selector.get_support()]) 114 | 115 | 116 | def select_based_on_correlation(data): 117 |     """Create correlation maps for the training meta-features.
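Two meta-feature columns are linked when their correlation is at least 0.25 and their names
differ in the first character; if no partner qualifies, the threshold is relaxed to 0.15,
and as a last resort the full search_space.meta_feature_list is used for that column.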
118 | 119 | Parameters 120 | ---------- 121 | data : dataframe 122 | Training data 123 | 124 | Returns 125 | ------- 126 | correlation_map : defaultdict(list) 127 | """ 128 | from collections import defaultdict 129 | 130 | corr = data.corr(numeric_only=True) 131 | correlation_map = defaultdict(list) 132 | for i in range(len(corr.columns)): 133 | left = corr.columns[i] 134 | for j in range(i): 135 | if corr.iloc[i, j] >= 0.25: 136 | right = corr.columns[j] 137 | if left[0] != right[0]: 138 | correlation_map[left].append(right) 139 | 140 | if len(correlation_map[left]) == 0: 141 | for j in range(i): 142 | if corr.iloc[i, j] >= 0.15: 143 | right = corr.columns[j] 144 | if left[0] != right[0]: 145 | correlation_map[left].append(right) 146 | 147 | if len(correlation_map[left]) == 0: 148 | correlation_map[left] = list(search_space.meta_feature_list) 149 | return correlation_map 150 | 151 | 152 | def select_features(label): 153 | """Return manually selected feature labels. 154 | 155 | Parameters 156 | ---------- 157 | label : str 158 | 159 | Returns 160 | ------- 161 | selection_model[label] : list 162 | """ 163 | selection_model = { 164 | ps_macros.FILL: [ps_macros.MISSING_PRESENCE], 165 | # ps_macros.DROP: [ps_macros.MISSING_PRESENCE], 166 | ps_macros.IN_PLACE_CONVERT: [ 167 | ps_macros.CATG_PRESENCE, 168 | # ps_macros.IS_TARGET_STR, 169 | ps_macros.BINARY_CATG_PRESENCE, 170 | ps_macros.SMALL_CATG_PRESENCE, 171 | ps_macros.LARGE_CATG_PRESENCE, 172 | ], 173 | ps_macros.ONE_HOT: [ 174 | ps_macros.CATG_PRESENCE, 175 | # ps_macros.IS_TARGET_STR, 176 | ps_macros.BINARY_CATG_PRESENCE, 177 | ps_macros.SMALL_CATG_PRESENCE, 178 | ps_macros.LARGE_CATG_PRESENCE, 179 | ], 180 | ps_macros.VECT: [ps_macros.TEXT_PRESENCE], 181 | ps_macros.MISSING: [ps_macros.MISSING_PRESENCE], 182 | ps_macros.CATG: [ps_macros.CATG_PRESENCE], 183 | ps_macros.SCALING: [ 184 | ps_macros.NORMALIZED_MEAN, 185 | ps_macros.NORMALIZED_STD_DEV, 186 | ps_macros.NORMALIZED_VARIATION_ACROSS_COLUMNS, 187 | ], 188 | ps_macros.DATE: [ps_macros.DATE_PRESENCE], 189 | ps_macros.LEMMITIZE: [ps_macros.TEXT_PRESENCE], 190 | ps_macros.BALANCING: [ps_macros.IMBALANCE], 191 | ps_macros.LOG: [ 192 | ps_macros.MAX_SKEW, 193 | ], 194 | } 195 | return selection_model[label] 196 | -------------------------------------------------------------------------------- /sapientml_core/training/pp_model_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import pickle 17 | from collections import OrderedDict, defaultdict 18 | from typing import Literal 19 | 20 | import pandas as pd 21 | from sapientml.util.logging import setup_logger 22 | from sapientml_core import internal_path 23 | from sapientml_core.design import search_space 24 | from sapientml_core.training import meta_feature_selector 25 | from sklearn.tree import DecisionTreeClassifier 26 | 27 | logger = setup_logger() 28 | 29 | 30 | def train_p_model(X, y): 31 |     """Build a decision tree classifier from the training set (X, y). 32 | 33 |     Parameters 34 |     ---------- 35 |     X : MatrixLike = np.ndarray | pd.DataFrame | spmatrix | 36 |         ArrayLike = numpy.typing.ArrayLike 37 |         The training input samples. 38 |     y : MatrixLike = np.ndarray | pd.DataFrame | spmatrix | 39 |         ArrayLike = numpy.typing.ArrayLike 40 |         The target values (class labels) as integers or strings 41 | 42 |     Returns 43 |     ------- 44 |     model : DecisionTreeClassifier 45 |         Fitted estimator. 46 |     """ 47 |     model = DecisionTreeClassifier(class_weight="balanced", max_depth=3) 48 |     model.fit(X, y) 49 |     return model 50 | 51 | 52 | def _train_preprocessors(train_data, feature_selection: Literal["select_manually", "customized"]): 53 |     logger.info("Training skeleton predictor for preprocessors...") 54 |     data = train_data 55 |     data.drop( 56 |         data.filter(regex="(TEMPLATE|IGNORE|EVAL:|RPEPROCESS:|MODEL:|Unnamed:)").columns, 57 |         axis=1, 58 |         inplace=True, 59 |     ) 60 |     data["project_target"] = ( 61 |         data["csv_name"] + "_" + data["target_column_name"].apply(lambda line: "_".join(sorted(eval(line)))) 62 |     ) 63 |     all_labels = [v for v in data.columns if v.startswith(("PREPROCESS:"))] 64 |     second_to_full_labels = defaultdict(list) 65 |     for label in all_labels: 66 |         second_to_full_labels["PREPROCESS:" + label.split(":")[1]].append(label) 67 | 68 |     pp_models = OrderedDict() 69 | 70 |     selected_features_map = meta_feature_selector.select_based_on_correlation(data) 71 | 72 |     for _, detail_labels in second_to_full_labels.items(): 73 |         for label in detail_labels: 74 |             logger.debug(label) 75 |             main_df = data.copy() 76 |             # Feature Selection On 77 |             y = main_df[label] 78 |             X = main_df[search_space.meta_feature_list] 79 | 80 |             if feature_selection == "select_manually": 81 |                 selected_features = meta_feature_selector.select_features(label) 82 |                 logger.debug("Selected Features: %s", selected_features) 83 |                 X = main_df[selected_features] 84 |             elif feature_selection == "customized": 85 |                 selected_features = selected_features_map[label] 86 |                 if len(selected_features) == 0: 87 |                     selected_features = meta_feature_selector.select_sequentially(X, y) 88 |                 logger.debug("Selected Features: %s", selected_features) 89 |                 X = main_df[selected_features] 90 | 91 |             pp_model = train_p_model(X, y) 92 |             pp_models[label] = (pp_model, selected_features) 93 | 94 |     return pp_models 95 | 96 | 97 | def _prepare_model_training_data(raw_meta_feature_train): 98 |     # Remove all the unnecessary meta-features 99 |     final_meta_features = raw_meta_feature_train[search_space.project_related_metadata + search_space.meta_feature_list] 100 |     final_meta_features.fillna(0, inplace=True) 101 |     for semantic_label, columns in search_space.label_mapping.items(): 102 |         try: 103 |             final_meta_features[semantic_label] = raw_meta_feature_train[columns].sum(axis=1) 104 |             final_meta_features[semantic_label] = final_meta_features[semantic_label].apply(lambda x: 1 if x > 0 else 0) 105 |         except KeyError as e: 106 |             logger.warning(e) 107 | 108 |     return final_meta_features 109 | 110 | 111 | def main(): 
"""This main function preprocesses the learning data and saves fitted estimator for the DecisionTreeClassifier. 113 | 114 | Description of feature_selection : "select_manually" | "customized" 115 | Specify how features are selected. 116 | """ 117 | training_data_path = internal_path.training_cache / "pp_metafeatures_training.csv" 118 | # "select_manually" | "customized" 119 | feature_selection = "customized" 120 | raw_meta_feature_train = pd.read_csv(training_data_path) 121 | meta_feature_train = _prepare_model_training_data(raw_meta_feature_train) 122 | pp_models = _train_preprocessors(meta_feature_train, feature_selection) 123 | # Save model 124 | with open(internal_path.training_cache / "pp_models.pkl", "wb") as f: 125 | pickle.dump(pp_models, f) 126 | 127 | 128 | if __name__ == "__main__": 129 | import argparse 130 | 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument("--tag", type=str, help="Tag for output files and dirs.") 133 | args = parser.parse_args() 134 | if args.tag: 135 | internal_path.training_cache = internal_path.training_cache / args.tag 136 | 137 | main() 138 | -------------------------------------------------------------------------------- /sapientml_core/training/project.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class ProjectInfo: 20 | pipeline_path: str # full path 21 | dataset_path: str # full path 22 | file_name: str # only name of the pipeline 23 | notebook_name: str # only name of the pipeline without extension 24 | accuracy: float 25 | csv_name: str 26 | target_column_name: str 27 | metric: str 28 | -------------------------------------------------------------------------------- /sapientml_core/training/project_corpus.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import doctest 17 | import json 18 | import re 19 | from pathlib import Path 20 | 21 | from sapientml.util.logging import setup_logger 22 | from sapientml_core import internal_path 23 | from tqdm import tqdm 24 | 25 | from .project import ProjectInfo 26 | 27 | logger = setup_logger() 28 | 29 | 30 | class ProjectCorpus: 31 | def __init__(self, target_project_name_list=None): 32 | self.target_project_name_list = target_project_name_list 33 | self.clean_notebook_dir_path = internal_path.clean_dir 34 | self.dataset_dir_path = internal_path.corpus_path / "dataset" 35 | self.metadata_dir_path = internal_path.corpus_path / "metadata" 36 | self.project_list = self._extract_project_info() 37 | 38 | def _extract_project_info(self): 39 | project_list = [] 40 | 41 | if self.target_project_name_list: 42 | pipeline_file_names = [Path(project_path) for project_path in self.target_project_name_list] 43 | else: 44 | pipeline_file_names = Path(self.clean_notebook_dir_path).rglob("*.py") 45 | 46 | for notebook_path in tqdm(list(pipeline_file_names)): 47 | notebook_info_path = notebook_path.with_suffix(".info.json") 48 | notebook_name = notebook_path.stem 49 | logger.debug(f"Extracting Project Info for {notebook_name}") 50 | # Read the target column information 51 | try: 52 | with open(notebook_info_path, "r", encoding="utf-8") as notebook_info_file: 53 | notebook_info = json.load(notebook_info_file) 54 | except Exception: 55 | logger.warning("Could not read JSON info file: {}".format(notebook_info_path)) 56 | continue 57 | 58 | if isinstance(notebook_info, list): 59 | notebook_info = notebook_info[1] 60 | 61 | if isinstance(notebook_info, dict): 62 | target_column_name = notebook_info["target_column_name"] 63 | dataset_folder_name = notebook_info["dataset_folder"] 64 | accuracy = notebook_info["accuracy"] 65 | metric = "accuracy" 66 | if accuracy == "N/A": 67 | accuracy = notebook_info["r2"] 68 | metric = "r2" 69 | try: 70 | accuracy = float(accuracy[:-1]) # discarding the percentage (%) sign from the end 71 | except Exception: 72 | accuracy = 0 73 | else: 74 | logger.warning("Wrong format: {}".format(notebook_info_path)) 75 | continue 76 | 77 | if isinstance(target_column_name, str): 78 | if target_column_name == "UNKNOWN": 79 | continue 80 | elif isinstance(notebook_info, list): 81 | if target_column_name[0] == "UNKNOWN": 82 | continue 83 | # Read the dataset 84 | project_fqn = notebook_name + ".py" 85 | dataset_paths = [ 86 | p 87 | for p in (Path(self.dataset_dir_path) / dataset_folder_name).glob("*") 88 | if re.search(r"/*\.(csv|tsv)", str(p)) 89 | ] 90 | if len(dataset_paths) == 0: 91 | logger.warning( 92 | "Could not find CSV/TSV file under {}/{}".format(self.dataset_dir_path, dataset_folder_name) 93 | ) 94 | continue 95 | 96 | dataset_path = dataset_paths[0] 97 | dataset_name = dataset_path.stem 98 | if len(dataset_paths) > 1: 99 | logger.warning( 100 | "Found multiple CSV/TSV files under {}. 
Using {}...".format( 101 | self.clean_notebook_dir_path, dataset_name 102 | ) 103 | ) 104 | 105 | project_info = ProjectInfo( 106 | str(notebook_path), 107 | str(dataset_path), 108 | project_fqn, 109 | notebook_name, 110 | accuracy, 111 | dataset_name, 112 | target_column_name, 113 | metric, 114 | ) 115 | project_list.append(project_info) 116 | return project_list 117 | 118 | 119 | if __name__ == "__main__": 120 | doctest.testmod() 121 | -------------------------------------------------------------------------------- /sapientml_core/util/file_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2024 The SapientML Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import calendar 17 | import datetime 18 | import glob 19 | import json 20 | import os 21 | import time 22 | 23 | import pandas as pd 24 | 25 | 26 | def get_time(): 27 | """Returns the current time. 28 | 29 | Returns 30 | ---------- 31 | readable : str 32 | Current time in ISO format 33 | """ 34 | ts = calendar.timegm(time.gmtime()) 35 | readable = datetime.datetime.fromtimestamp(ts).isoformat() 36 | return readable 37 | 38 | 39 | def read_file_in_a_list(file_name): 40 | """Open a file and place it in a list line by line(read().splitlines()). 41 | 42 | Parameters 43 | ---------- 44 | file_name : FileDescriptorOrPath 45 | File name. 46 | 47 | Returns 48 | ---------- 49 | lines : list[str] 50 | List file contents line by line. 51 | """ 52 | with open(file_name, "r", encoding="utf-8") as f: 53 | lines = f.read().splitlines() 54 | return lines 55 | 56 | 57 | def read_file(file_name): 58 | """Open file and read data with read(). 59 | 60 | Parameters 61 | ---------- 62 | file_name : FileDescriptorOrPath 63 | File name. 64 | 65 | Returns 66 | ---------- 67 | lines : str 68 | The entire text file read. 69 | """ 70 | with open(file_name, "r", encoding="utf-8") as f: 71 | lines = f.read() 72 | return lines 73 | 74 | 75 | def write_content_to_file(file_name, content): 76 | """write content to file. 77 | 78 | Parameters 79 | ---------- 80 | file_name : FileDescriptorOrPath 81 | File name. 82 | content : str 83 | What to write to the file. 84 | """ 85 | with open(file_name, "w", encoding="utf-8") as out_file: 86 | out_file.write(content) 87 | 88 | 89 | def get_file_list(path, type): 90 | """Get a list of files of a specified type in a directory. 91 | 92 | Parameters 93 | ---------- 94 | path : FileDescriptorOrPath 95 | Directory path. 96 | type : str 97 | File extension. 98 | Returns 99 | ---------- 100 | files_with_given_type : list 101 | List of retrieved files. 102 | """ 103 | os.chdir(path) 104 | files_with_given_type = [] 105 | for file in glob.glob("*." + type): 106 | files_with_given_type.append((path + "/" + file)) 107 | return files_with_given_type 108 | 109 | 110 | def load_json(file_name): 111 | """Load json format file. 112 | 113 | Parameters 114 | ---------- 115 | file_name : FileDescriptorOrPath 116 | File name. 
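For example (a sketch), load_json("some_pipeline.info.json") returns the parsed
content, e.g. a dict for a top-level JSON object or a list for a top-level array.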
117 | 118 |     Returns 119 |     ---------- 120 |     content : Any 121 |         Loaded content. 122 |     """ 123 |     with open(file_name, "r", encoding="utf-8") as input_file: 124 |         content = json.load(input_file) 125 |     return content 126 | 127 | 128 | def read_csv(csv_path, notebook_path): 129 |     """Read a csv file. 130 | 131 |     Parameters 132 |     ---------- 133 |     csv_path : FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] 134 |         Csv file Path. 135 |     notebook_path : pathlib.Path 136 |         Notebook Directory Path. 137 | 138 |     Returns 139 |     ---------- 140 |     dataset : pd.DataFrame 141 |         Contents of the loaded csv. 142 |     """ 143 | 144 |     def read(path, **kwargs): 145 |         if str(path).endswith(".csv"): 146 |             return pd.read_csv(path, encoding_errors="ignore", on_bad_lines="warn", **kwargs) 147 |         return pd.read_table(path, encoding_errors="ignore", on_bad_lines="warn", **kwargs) 148 | 149 |     encoding = get_dataset_encoding(notebook_path) 150 |     dataset = read(csv_path, encoding=encoding) 151 |     num_of_features = dataset.shape[1] - 1 152 |     if num_of_features == 0: 153 |         dataset = read(csv_path, encoding=encoding, delim_whitespace=True) 154 |         num_of_features = dataset.shape[1] - 1 155 |         if num_of_features == 0: 156 |             dataset = read(csv_path, encoding=encoding, delimiter=";") 157 |             num_of_features = dataset.shape[1] - 1 158 |     return dataset 159 | 160 | 161 | def get_dataset_encoding(notebook_path): 162 |     """Get dataset encoding. 163 | 164 |     Parameters 165 |     ---------- 166 |     notebook_path : StrPath | None | BytesPath 167 |         Directory path of notebooks. 168 | 169 |     Returns 170 |     ---------- 171 |     encoding : str | None 172 |     """ 173 |     if os.path.isdir(notebook_path): 174 |         return None 175 |     if not str(notebook_path).endswith(".py"): 176 |         return None 177 |     encoding = get_dataset_file(notebook_path) 178 |     if encoding: 179 |         return encoding 180 |     return None 181 | 182 | 183 | def get_dataset_file(notebook_path): 184 |     """Read notebook_path and get the encoding used for its dataset. 185 | 186 |     Parameters 187 |     ---------- 188 |     notebook_path : str 189 |         File name. 190 | 191 |     Returns 192 |     ---------- 193 |     encoding : str | None 194 |         The encoding= value found in the pipeline's read_csv call, 195 |         or None if the pipeline does not specify one. 196 | 
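For example, a pipeline line such as
    df = pd.read_csv("train.csv", encoding="utf-8")
yields "utf-8", while a read_csv call that does not pass encoding= yields None.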
197 | """ 198 | f = open(notebook_path, "r", encoding="utf-8") 199 | lines = f.readlines() 200 | f.close() 201 | encoding = None 202 | for index in range(len(lines)): 203 | if ".read_csv(" in lines[index]: 204 | if "encoding=" in lines[index]: 205 | encoding = lines[index].split("encoding=")[1].split(")")[0].split(",")[0][1:-1] 206 | elif "encoding = " in lines[index]: 207 | encoding = lines[index].split("encoding = ")[1].split(")")[0].split(",")[0][1:-1] 208 | else: 209 | encoding = None 210 | return encoding 211 | return encoding 212 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/outputs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /tests/fixtures/params/config.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/config.pkl -------------------------------------------------------------------------------- /tests/fixtures/params/dataset.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/dataset.pkl -------------------------------------------------------------------------------- /tests/fixtures/params/task.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/fixtures/params/task.pkl -------------------------------------------------------------------------------- /tests/sapientml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapientml/core/e2b66db5d3b0462959dbb2a0e719ca2e2f7af42a/tests/sapientml/__init__.py -------------------------------------------------------------------------------- /tests/sapientml/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from unittest import mock 4 | 5 | import pytest 6 | 7 | 8 | @pytest.fixture(scope="session", autouse=True) 9 | def disable_logging(): 10 | logging.disable(logging.FATAL) 11 | yield 12 | logging.disable(logging.NOTSET) 13 | 14 | 15 | @pytest.fixture(scope="function", autouse=True) 16 | def reset_sapientml_logger(): 17 | # FIXME: more efficient way to reset a logger 18 | logger = logging.getLogger("sapientml") 19 | logger.handlers.clear() 20 | logger.root.handlers.clear() 21 | 22 | 23 | @pytest.fixture(scope="function", autouse=True) 24 | def path_home(tmp_path): 25 | with mock.patch.object(Path, "home"): 26 | yield Path(tmp_path) 27 | --------------------------------------------------------------------------------