├── .github
└── workflows
│ ├── docs.yml
│ ├── publish_package.yml
│ └── tests.yml
├── .gitignore
├── ISSUE_TEMPLATE.md
├── LICENSE
├── PULL_REQUEST_TEMPLATE.md
├── README.md
├── Tutorial
├── 1_Using_TPOT.ipynb
├── 2_Search_Spaces.ipynb
├── 3_Feature_Set_Selector.ipynb
├── 4_Genetic_Feature_Selection.ipynb
├── 5_GraphPipeline.ipynb
├── 6_Symbolic_Regression_and_Classification.ipynb
├── 7_dask_parallelization.ipynb
├── 8_SH_and_cv_early_pruning.ipynb
├── 9_Genetic_Algorithm_Overview.ipynb
├── amltk_search_space_parser_example.ipynb
└── simple_fss.csv
├── docs
├── Tutorial
├── archived
│ ├── api.md
│ ├── assets
│ │ ├── favicon.ico
│ │ └── tpot-logo.jpg
│ ├── citing.md
│ ├── contributing.md
│ ├── css
│ │ └── archived.css
│ ├── examples.md
│ ├── index.md
│ ├── installing.md
│ ├── related.md
│ ├── releases.md
│ ├── support.md
│ └── using.md
├── assets
│ ├── favicon.ico
│ └── tpot-logo.jpg
├── cite.md
├── contribute.md
├── css
│ └── extra.css
├── index.md
├── installation.md
├── related.md
├── requirements_docs.txt
├── scripts
│ ├── build_docs_sources.sh
│ ├── build_mkdocs.sh
│ └── build_tutorial_toc_not_used.sh
├── support.md
├── tpot_api
│ ├── classifier.md
│ ├── estimator.md
│ └── regressor.md
└── using.md
├── mkdocs_archived.yml
├── pyproject.toml
├── requirements_dev.txt
├── setup.cfg
├── setup.py
├── tox.ini
└── tpot
├── __init__.py
├── _version.py
├── builtin_modules
├── __init__.py
├── arithmetictransformer.py
├── column_one_hot_encoder.py
├── estimatortransformer.py
├── feature_encoding_frequency_selector.py
├── feature_set_selector.py
├── feature_transformers.py
├── genetic_encoders.py
├── imputer.py
├── nn.py
├── passkbinsdiscretizer.py
├── passthrough.py
├── tests
│ └── feature_set_selector_tests.py
└── zero_count.py
├── config
├── __init__.py
├── autoqtl_builtins.py
├── classifiers.py
├── classifiers_sklearnex.py
├── get_configspace.py
├── imputers.py
├── mdr_configs.py
├── regressors.py
├── regressors_sklearnex.py
├── selectors.py
├── special_configs.py
├── template_search_spaces.py
├── tests
│ ├── __init__.py
│ └── test_get_configspace.py
└── transformers.py
├── evolvers
├── __init__.py
├── base_evolver.py
└── steady_state_evolver.py
├── graphsklearn.py
├── individual.py
├── logbook.py
├── objectives
├── __init__.py
├── average_path_length.py
├── complexity.py
├── number_of_leaves.py
├── number_of_nodes.py
└── tests
│ ├── test_complexity_objective.py
│ └── test_number_of_nodes.py
├── old_config_utils
├── __init__.py
└── old_config_utils.py
├── population.py
├── search_spaces
├── __init__.py
├── base.py
├── graph_utils.py
├── nodes
│ ├── __init__.py
│ ├── estimator_node.py
│ ├── estimator_node_gradual.py
│ ├── fss_node.py
│ └── genetic_feature_selection.py
├── pipelines
│ ├── __init__.py
│ ├── choice.py
│ ├── dynamic_linear.py
│ ├── dynamicunion.py
│ ├── graph.py
│ ├── sequential.py
│ ├── tests
│ │ └── test_graphspace.py
│ ├── tree.py
│ ├── union.py
│ └── wrapper.py
├── tests
│ └── test_search_spaces.py
└── tuple_index.py
├── selectors
├── __init__.py
├── lexicase_selection.py
├── map_elites_selection.py
├── max_weighted_average_selector.py
├── nsgaii.py
├── random_selector.py
├── tournament_selection.py
└── tournament_selection_dominated.py
├── tests
├── __init__.py
├── conftest.py
├── test_estimators.py
└── test_hello_world.py
├── tpot_estimator
├── __init__.py
├── cross_val_utils.py
├── estimator.py
├── estimator_utils.py
├── steady_state_estimator.py
├── templates
│ ├── __init__.py
│ ├── tpot_autoimputer.py
│ └── tpottemplates.py
└── tests
│ ├── __init__.py
│ └── test_estimator_utils.py
└── utils
├── __init__.py
├── amltk_parser.py
├── eval_utils.py
└── utils.py
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Docs Build
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | build_docs:
10 | runs-on: ubuntu-latest
11 | env:
12 | GIT_COMMITTER_NAME: "Doc Build Bot"
13 | GIT_COMMITTER_EMAIL: "jay-m-dev@users.noreply.github.com"
14 | steps:
15 | - name: Checkout code
16 | uses: actions/checkout@v2
17 |
18 | - name: Set up Python
19 | uses: actions/setup-python@v2
20 | with:
21 | python-version: '3.10'
22 |
23 | - name: Cache dependencies
24 | uses: actions/cache@v3
25 | with:
26 | path: ~/.cache/pip
27 | key: ${{ runner.os }}-pip-${{ hashFiles('docs/requirements_docs.txt') }}
28 | restore-keys: |
29 | ${{ runner.os }}-pip-
30 |
31 | - name: Install dependencies
32 | run: |
33 | pip install --upgrade pip
34 | pip install .
35 | pip install -r docs/requirements_docs.txt
36 |
37 | # - name: Convert notebooks to HTML
38 | # # if: ${{ github.event_name == 'push' && contains(github.event.head_commit.modified, 'Tutorial/') && contains(github.event.head_commit.modified, '.ipynb') }}
39 | # run: |
40 | # # jupyter nbconvert --to html --allow-errors --no-input --show-input --template classic --output-dir docs/tutorial Tutorial/*.ipynb
41 | # jupyter nbconvert --to html --allow-errors --template classic --output-dir docs/tutorial Tutorial/*.ipynb
42 |
43 | # - name: Build Tutorial Table of Contents
44 | # run: |
45 | # bash docs/scripts/build_tutorial_toc.sh
46 |
47 | - name: Build Documentation sources
48 | run: |
49 | bash docs/scripts/build_docs_sources.sh
50 |
51 | - name: Build mkdocs.yml
52 | run: |
53 | bash docs/scripts/build_mkdocs.sh
54 |
55 | - name: Checkout gh-pages
56 | run: |
57 | git fetch origin gh-pages
58 | git checkout gh-pages || git checkout --orphan gh-pages
59 | git pull origin gh-pages || echo "No remote changes to pull"
60 | git checkout main # Switch back before continuing
61 |
62 | - name: Build and Deploy Latest Docs
63 | run: |
64 | mike deploy --push --branch gh-pages latest
65 |
66 | - name: Build and Deploy Archived Docs
67 | run: |
68 | mike deploy --config-file mkdocs_archived.yml --push --branch gh-pages archived
69 |
70 | - name: Set Default Version
71 | run: |
72 | mike set-default latest --push --branch gh-pages
73 |
74 | - name: Create alias for Latest Docs
75 | run: |
76 | mike alias latest stable --push --branch gh-pages
77 |
--------------------------------------------------------------------------------
/.github/workflows/publish_package.yml:
--------------------------------------------------------------------------------
1 | name: Publish Package
2 |
3 | on:
4 | release:
5 | types: [published]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | build-and-publish-pypi:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout code
13 | uses: actions/checkout@v2
14 |
15 | - name: Setup Python
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: '3.10'
19 |
20 | - name: Install dependencies
21 | run: |
22 | python -m pip install --upgrade pip
23 | pip install setuptools wheel twine
24 |
25 | - name: Build package
26 | run: python setup.py sdist bdist_wheel
27 |
28 | - name: Upload to PyPI
29 | env:
30 | TWINE_USERNAME: __token__
31 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
32 | run: twine upload dist/*
33 |
34 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | - push
5 | - pull_request
6 |
7 | jobs:
8 | test:
9 | runs-on: ${{ matrix.os }}
10 | strategy:
11 | matrix:
12 | os: [ubuntu-latest]
13 | python-version: ['3.10', '3.11', '3.12', '3.13']
14 |
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install tox tox-gh-actions
25 | - name: Test with tox
26 | run: tox
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .pytest_cache/
3 | TPOT.egg-info
4 | TPOT.egg-info
5 | *.tar.gz
6 | *.pkl
7 | *.json
8 | joblib/
9 | cache_folder/
10 | dask-worker-space/
11 | .tox/
12 | *.egg-info/
13 | .coverage
14 | target/
15 | .venv/
16 | build/*
17 | *.egg
18 | *.coverage*
19 | docs/documentation/
20 | mkdocs.yml
--------------------------------------------------------------------------------
/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | [provide general introduction to the issue and why it is relevant to this repository]
2 |
3 | ## Context of the issue
4 |
5 | [provide more detailed introduction to the issue itself and why it is relevant]
6 |
7 | [the remaining entries are only necessary if you are reporting a bug]
8 |
9 | ## Process to reproduce the issue
10 |
11 | [an ordered list of the steps needed to find and reproduce the issue; see the example below. A minimal reproducible example would be ideal, i.e. the minimum amount of code necessary to reproduce the issue.]
12 |
13 | 1. User creates TPOT instance
14 | 2. User calls TPOT `fit()` function with training data
15 | 3. TPOT crashes with a `KeyError` after 5 generations
16 |
17 | ## Expected result
18 |
19 | [describe what you would expect to have resulted from this process]
20 |
21 | ## Current result
22 |
23 | [describe what you currently experience from this process, and thereby explain the bug]
24 |
25 | ## Possible fix
26 |
27 | [not necessary, but suggest fixes or reasons for the bug]
28 |
29 | ## `name of issue` screenshot
30 |
31 | [if relevant, include a screenshot]
32 |
--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | [please review the [Contribution Guidelines](http://epistasislab.github.io/tpot/contributing/) prior to submitting your pull request. go ahead and delete this line if you've already reviewed said guidelines.]
2 |
3 | ## What does this PR do?
4 |
5 |
6 |
7 | ## Where should the reviewer start?
8 |
9 |
10 |
11 | ## How should this PR be tested?
12 |
13 |
14 |
15 | ## Any background context you want to provide?
16 |
17 |
18 |
19 | ## What are the relevant issues?
20 |
21 | [you can link directly to issues by entering # then the number of the issue]
22 |
23 | ## Screenshots (if appropriate)
24 |
25 |
26 |
27 | ## Questions:
28 |
29 | - Do the docs need to be updated?
30 | - Does this PR add new (Python) dependencies?
31 |
--------------------------------------------------------------------------------
/Tutorial/simple_fss.csv:
--------------------------------------------------------------------------------
1 | one,a,b,c
2 | two,d,e,f
3 | three,g,h,i
--------------------------------------------------------------------------------
/docs/Tutorial:
--------------------------------------------------------------------------------
1 | ../Tutorial
--------------------------------------------------------------------------------
/docs/archived/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/archived/assets/favicon.ico
--------------------------------------------------------------------------------
/docs/archived/assets/tpot-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/archived/assets/tpot-logo.jpg
--------------------------------------------------------------------------------
/docs/archived/citing.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
⚠️ Warning
4 |
This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.
5 |
6 |
7 |
8 | # Citing TPOT
9 |
10 | If you use TPOT in a scientific publication, please consider citing at least one of the following papers:
11 |
12 |
13 | Trang T. Le, Weixuan Fu and Jason H. Moore (2020). [Scaling tree-based automated machine learning to biomedical big data with a feature set selector](https://academic.oup.com/bioinformatics/article/36/1/250/5511404). *Bioinformatics*. 36(1): 250-256.
14 |
15 | BibTeX entry:
16 |
17 | ```bibtex
18 | @article{le2020scaling,
19 | title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector},
20 | author={Le, Trang T and Fu, Weixuan and Moore, Jason H},
21 | journal={Bioinformatics},
22 | volume={36},
23 | number={1},
24 | pages={250--256},
25 | year={2020},
26 | publisher={Oxford University Press}
27 | }
28 | ```
29 |
30 |
31 |
32 | Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). [Automating biomedical data science through tree-based pipeline optimization](http://link.springer.com/chapter/10.1007/978-3-319-31204-0_9). *Applications of Evolutionary Computation*, pages 123-137.
33 |
34 | BibTeX entry:
35 |
36 | ```bibtex
37 | @inbook{Olson2016EvoBio,
38 | author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.},
39 | editor={Squillero, Giovanni and Burelli, Paolo},
40 | chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization},
41 | title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I},
42 | year={2016},
43 | publisher={Springer International Publishing},
44 | pages={123--137},
45 | isbn={978-3-319-31204-0},
46 | doi={10.1007/978-3-319-31204-0_9},
47 | url={http://dx.doi.org/10.1007/978-3-319-31204-0_9}
48 | }
49 | ```
50 |
51 | Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science
52 |
53 | Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). [Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science](http://dl.acm.org/citation.cfm?id=2908918). *Proceedings of GECCO 2016*, pages 485-492.
54 |
55 | BibTeX entry:
56 |
57 | ```bibtex
58 | @inproceedings{OlsonGECCO2016,
59 | author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.},
60 | title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science},
61 | booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016},
62 | series = {GECCO '16},
63 | year = {2016},
64 | isbn = {978-1-4503-4206-3},
65 | location = {Denver, Colorado, USA},
66 | pages = {485--492},
67 | numpages = {8},
68 | url = {http://doi.acm.org/10.1145/2908812.2908918},
69 | doi = {10.1145/2908812.2908918},
70 | acmid = {2908918},
71 | publisher = {ACM},
72 | address = {New York, NY, USA},
73 | }
74 | ```
75 |
76 | Alternatively, you can cite the repository directly with the following DOI:
77 |
78 | [DOI](https://zenodo.org/badge/latestdoi/20747/rhiever/tpot)
79 |
--------------------------------------------------------------------------------
/docs/archived/css/archived.css:
--------------------------------------------------------------------------------
1 | .md-grid {
2 | max-width: 100%;
3 | }
--------------------------------------------------------------------------------
/docs/archived/index.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
⚠️ Warning
4 |
This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | Consider TPOT your **Data Science Assistant**. TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data.
23 |
24 |
25 |
26 |
27 |
28 |
29 | An example machine learning pipeline
30 |
31 |
32 |
33 |
34 | Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there.
35 |
36 |
37 |
38 |
39 |
40 |
41 | An example TPOT pipeline
42 |
43 |
44 |
45 |
46 | TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway.
47 |
48 | **TPOT is still under active development** and we encourage you to check back on this repository regularly for updates.
49 |
--------------------------------------------------------------------------------
/docs/archived/installing.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
⚠️ Warning
4 |
This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.
5 |
6 |
7 |
8 | # Installation
9 |
10 | TPOT is built on top of several existing Python libraries, including:
11 |
12 | * [NumPy](http://www.numpy.org/)
13 |
14 | * [SciPy](https://www.scipy.org/)
15 |
16 | * [scikit-learn](http://www.scikit-learn.org/)
17 |
18 | * [DEAP](https://github.com/DEAP/deap)
19 |
20 | * [update_checker](https://github.com/bboe/update_checker)
21 |
22 | * [tqdm](https://github.com/tqdm/tqdm)
23 |
24 | * [stopit](https://github.com/glenfant/stopit)
25 |
26 | * [pandas](http://pandas.pydata.org)
27 |
28 | * [joblib](https://joblib.readthedocs.io/en/latest/)
29 |
30 | * [xgboost](https://xgboost.readthedocs.io/en/latest/)
31 |
32 | Most of the necessary Python packages can be installed via the [Anaconda Python distribution](https://www.anaconda.com/products/individual), which we strongly recommend that you use. **Support for Python 3.4 and below has been officially dropped since version 0.11.0.**
33 |
34 |
35 | You can install TPOT using `pip` or `conda-forge`.
36 |
37 | ## pip
38 |
39 | NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command:
40 |
41 | ```Shell
42 | conda install numpy scipy scikit-learn pandas joblib pytorch
43 | ```
44 |
45 | DEAP, update_checker, tqdm, stopit and xgboost can be installed with `pip` via the command:
46 |
47 | ```Shell
48 | pip install deap update_checker tqdm stopit xgboost
49 | ```
50 |
51 | **Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors.** If you have issues installing XGBoost, check the [XGBoost installation documentation](http://xgboost.readthedocs.io/en/latest/build.html).
52 |
53 | If you plan to use [Dask](http://dask.pydata.org/en/latest/) for parallel training, make sure to install [dask[delayed] and dask[dataframe]](https://docs.dask.org/en/latest/install.html) and [dask_ml](https://dask-ml.readthedocs.io/en/latest/install.html). **Note that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0.**
54 |
55 | ```Shell
56 | pip install "dask[delayed]" "dask[dataframe]" dask-ml "fsspec>=0.3.3" "distributed>=2.10.0"
57 | ```
58 |
59 | If you plan to use the [TPOT-MDR configuration](https://arxiv.org/abs/1702.01780), make sure to install [scikit-mdr](https://github.com/EpistasisLab/scikit-mdr) and [scikit-rebate](https://github.com/EpistasisLab/scikit-rebate):
60 |
61 | ```Shell
62 | pip install scikit-mdr skrebate
63 | ```
64 |
65 | To enable support for [PyTorch](https://pytorch.org/)-based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly.
66 |
67 | We recommend following [PyTorch's installation instructions](https://pytorch.org/get-started/locally/) customized for your operating system and Python distribution.
68 |
69 | Finally, to install TPOT itself, run the following command:
70 |
71 | ```Shell
72 | pip install tpot
73 | ```
74 |
75 | ## conda-forge
76 |
77 | To install tpot and its core dependencies you can use:
78 |
79 | ```Shell
80 | conda install -c conda-forge tpot
81 | ```
82 |
83 | To install additional dependencies you can use:
84 |
85 | ```Shell
86 | conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate
87 | ```
88 |
89 | As mentioned above, we recommend following [PyTorch's installation instructions](https://pytorch.org/get-started/locally/) for installing it to enable support for [PyTorch](https://pytorch.org/)-based neural networks (TPOT-NN).
90 |
91 | ## Installation for using TPOT-cuML configuration
92 |
93 | With "TPOT cuML" configuration (see built-in configurations), TPOT will search over a restricted configuration using the GPU-accelerated estimators in [RAPIDS cuML](https://github.com/rapidsai/cuml) and [DMLC XGBoost](https://github.com/dmlc/xgboost). **This configuration requires an NVIDIA Pascal architecture or better GPU with [compute capability 6.0+](https://developer.nvidia.com/cuda-gpus), and that the library cuML is installed.** With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the `TPOTClassifier` and `TPOTRegressor`.
94 |
95 | Please download this conda environment yml file to install TPOT for using TPOT-cuML configuration.
96 |
97 | ```
98 | conda env create -f tpot-cuml.yml -n tpot-cuml
99 | conda activate tpot-cuml
100 | ```
101 |
102 |
103 | ## Installation problems
104 |
105 | Please [file a new issue](https://github.com/EpistasisLab/tpot/issues/new) if you run into installation problems.
106 |
--------------------------------------------------------------------------------
/docs/archived/related.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
⚠️ Warning
4 |
This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.
5 |
6 |
7 |
8 | Other Automated Machine Learning (AutoML) tools and related projects:
9 |
10 |
11 |
12 | Name |
13 | Language |
14 | License |
15 | Description |
16 |
17 |
18 | Auto-WEKA |
19 | Java |
20 | GPL-v3 |
21 | Automated model selection and hyper-parameter tuning for Weka models. |
22 |
23 |
24 | auto-sklearn |
25 | Python |
26 | BSD-3-Clause |
27 | An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. |
28 |
29 |
30 | auto_ml |
31 | Python |
32 | MIT |
33 | Automated machine learning for analytics & production. Supports manual feature type declarations. |
34 |
35 |
36 | H2O AutoML |
37 | Java with Python, Scala & R APIs and web GUI |
38 | Apache 2.0 |
39 | Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. |
40 |
41 |
42 | devol |
43 | Python |
44 | MIT |
45 | Automated deep neural network design via genetic programming. |
46 |
47 |
48 | MLBox |
49 | Python |
50 | BSD-3-Clause |
51 | Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. |
52 |
53 |
54 | Recipe |
55 | C |
56 | GPL-v3 |
57 | Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. |
58 |
59 |
60 | Xcessiv |
61 | Python |
62 | Apache 2.0 |
63 | A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python. |
64 |
65 |
66 | GAMA |
67 | Python |
68 | Apache 2.0 |
69 | Machine-learning pipeline optimization through asynchronous evaluation based genetic programming. |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/docs/archived/support.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
⚠️ Warning
4 |
This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.
5 |
6 |
7 |
8 | TPOT was developed in the [Computational Genetics Lab](http://epistasis.org/) at the [University of Pennsylvania](https://www.upenn.edu/) with funding from the [NIH](http://www.nih.gov/) under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project.
9 |
10 | The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.
11 |
--------------------------------------------------------------------------------
/docs/assets/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/assets/favicon.ico
--------------------------------------------------------------------------------
/docs/assets/tpot-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/assets/tpot-logo.jpg
--------------------------------------------------------------------------------
/docs/cite.md:
--------------------------------------------------------------------------------
1 | # Citing TPOT
2 | If you use TPOT in a scientific publication, please consider citing at least one of the following papers:
3 |
4 | Trang T. Le, Weixuan Fu and Jason H. Moore (2020). [Scaling tree-based automated machine learning to biomedical big data with a feature set selector](https://academic.oup.com/bioinformatics/article/36/1/250/5511404). *Bioinformatics*. 36(1): 250-256.
5 |
6 | BibTeX entry:
7 |
8 | ```bibtex
9 | @article{le2020scaling,
10 | title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector},
11 | author={Le, Trang T and Fu, Weixuan and Moore, Jason H},
12 | journal={Bioinformatics},
13 | volume={36},
14 | number={1},
15 | pages={250--256},
16 | year={2020},
17 | publisher={Oxford University Press}
18 | }
19 | ```
20 |
21 |
22 | Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). [Automating biomedical data science through tree-based pipeline optimization](http://link.springer.com/chapter/10.1007/978-3-319-31204-0_9). *Applications of Evolutionary Computation*, pages 123-137.
23 |
24 | BibTeX entry:
25 |
26 | ```bibtex
27 | @inbook{Olson2016EvoBio,
28 | author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.},
29 | editor={Squillero, Giovanni and Burelli, Paolo},
30 | chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization},
31 | title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I},
32 | year={2016},
33 | publisher={Springer International Publishing},
34 | pages={123--137},
35 | isbn={978-3-319-31204-0},
36 | doi={10.1007/978-3-319-31204-0_9},
37 | url={http://dx.doi.org/10.1007/978-3-319-31204-0_9}
38 | }
39 | ```
40 |
41 | Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). [Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science](http://dl.acm.org/citation.cfm?id=2908918). *Proceedings of GECCO 2016*, pages 485-492.
42 |
43 | BibTeX entry:
44 |
45 | ```bibtex
46 | @inproceedings{OlsonGECCO2016,
47 | author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. and Moore, Jason H.},
48 | title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science},
49 | booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016},
50 | series = {GECCO '16},
51 | year = {2016},
52 | isbn = {978-1-4503-4206-3},
53 | location = {Denver, Colorado, USA},
54 | pages = {485--492},
55 | numpages = {8},
56 | url = {http://doi.acm.org/10.1145/2908812.2908918},
57 | doi = {10.1145/2908812.2908918},
58 | acmid = {2908918},
59 | publisher = {ACM},
60 | address = {New York, NY, USA},
61 | }
62 | ```
--------------------------------------------------------------------------------
/docs/contribute.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it.
4 |
5 | # Contribution Guide
6 |
7 | We welcome you to [check the existing issues](https://github.com/EpistasisLab/tpot/issues/) for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please [file a new issue](https://github.com/EpistasisLab/tpot/issues/new) so we can discuss it.
8 |
9 | ## Project layout
10 |
11 | The latest stable release of TPOT is on the [main branch](https://github.com/EpistasisLab/tpot/tree/main), whereas the latest version of TPOT in development is on the [development branch](https://github.com/EpistasisLab/tpot/tree/dev). Make sure you are looking at and working on the correct branch if you're looking to contribute code.
12 |
13 | In terms of directory structure:
14 |
15 | * All of TPOT's code sources are in the `tpot` directory
16 | * The documentation sources are in the `docs` directory
17 | * Images in the documentation are in the `docs/assets` directory
18 | * Tutorials for TPOT are in the `Tutorial` directory
19 | * Unit tests for TPOT are in the `tests` directories inside the `tpot` package (for example, `tpot/tests`)
20 |
21 | Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the `dev` branch.
22 |
23 | ## How to contribute
24 |
25 | The preferred way to contribute to TPOT is to fork the
26 | [main repository](https://github.com/EpistasisLab/tpot/) on
27 | GitHub:
28 |
29 | 1. Fork the [project repository](https://github.com/EpistasisLab/tpot):
30 | click on the 'Fork' button near the top of the page. This creates
31 | a copy of the code under your account on the GitHub server.
32 |
33 | 2. Clone this copy to your local disk:
34 |
35 | $ git clone git@github.com:YourUsername/tpot.git
36 | $ cd tpot
37 |
38 | 3. Create a branch to hold your changes:
39 |
40 | $ git checkout -b my-contribution
41 |
42 | 4. Make sure your local environment is set up correctly for development. Installation instructions are almost identical to [the user instructions](installation.md) except that TPOT should *not* be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the `pytest` package into your development environment so that you can test changes locally.
43 |
44 | $ conda install pytest
45 |
46 | 5. Start making changes on your newly created branch, remembering to never work on the ``main`` branch! Work on this copy on your computer using Git to do the version control.
47 |
48 |
49 | 6. Check your changes haven't broken any existing tests and pass all your new tests. Navigate the terminal into the `tpot/tpot/` folder and run the command `pytest` to start all tests. (note, you must have the `pytest` package installed within your dev environment for this to work):
50 |
51 | $ pytest
52 |
53 | 7. When you're done editing and local testing, run:
54 |
55 | $ git add modified_files
56 | $ git commit
57 |
58 | to record your changes in Git, then push them to GitHub with:
59 |
60 | $ git push -u origin my-contribution
61 |
62 | Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the `dev` branch, as the `main` branch is reserved for the latest stable release. This will trigger the CI server to check that all of the project's unit tests pass and send an email to the maintainers.
63 |
64 | (If any of the above seems like magic to you, then look up the
65 | [Git documentation](http://git-scm.com/documentation) on the web.)
66 |
67 | ## Before submitting your pull request
68 |
69 | Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes.
70 |
71 | If your contribution changes TPOT in any way:
72 |
73 | * Update the [documentation](https://github.com/EpistasisLab/tpot/tree/main/docs) so all of your changes are reflected there.
74 |
75 | * Update the [README](https://github.com/EpistasisLab/tpot/blob/main/README.md) if anything there has changed.
76 |
77 | If your contribution involves any code changes:
78 |
79 | * Update the [project unit tests](https://github.com/EpistasisLab/tpot/tree/main/tpot/tests) to test your code changes.
80 |
81 | * Make sure that your code is properly commented with [docstrings](https://www.python.org/dev/peps/pep-0257/) and comments explaining your rationale behind non-obvious coding practices.
82 |
83 |
84 | If your contribution requires a new library dependency:
85 |
86 | * Double-check that the new dependency is easy to install via `pip` or Anaconda. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install.
87 |
88 |
89 | ## After submitting your pull request
90 |
91 | After submitting your pull request, GitHub will automatically run unit tests on your changes and make sure that your updated code builds and runs. We also use services that automatically check code quality and test coverage.
92 |
93 | Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors.
94 |
--------------------------------------------------------------------------------
/docs/css/extra.css:
--------------------------------------------------------------------------------
1 | .md-grid {
2 | max-width: 100%;
3 | }
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | {%
2 | include-markdown "../README.md"
3 | %}
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | TPOT requires a working installation of Python.
4 |
5 | ### Creating a conda environment (optional)
6 |
7 | We recommend using conda environments for installing TPOT, though it would work equally well if manually installed without it.
8 |
9 | [More information on making anaconda environments found here.](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html)
10 |
11 | ```
12 | conda create --name tpotenv python=3.13
13 | conda activate tpotenv
14 | ```
15 |
16 | ### Note for M1 Mac or other Arm-based CPU users
17 |
18 | You need to install the lightgbm package directly from conda using the following command before installing TPOT.
19 |
20 | This is to ensure that you get the version that is compatible with your system.
21 |
22 | ```
23 | conda install --yes -c conda-forge 'lightgbm>=3.3.3'
24 | ```
25 |
26 | ### Developer/Latest Branch Installation
27 |
28 |
29 | ```
30 | pip install -e /path/to/tpotrepo
31 | ```
32 |
33 | If you downloaded with git clone, then the repository folder will be named TPOT. (Note: this folder is the one that includes setup.py inside of it and not the folder of the same name inside it).
34 | If you downloaded as a zip, the folder may be called tpot-main.
35 |
--------------------------------------------------------------------------------
/docs/related.md:
--------------------------------------------------------------------------------
1 | Other Automated Machine Learning (AutoML) tools and related projects:
2 |
3 |
4 |
5 | Name |
6 | Language |
7 | License |
8 | Description |
9 |
10 |
11 | Auto-WEKA |
12 | Java |
13 | GPL-v3 |
14 | Automated model selection and hyper-parameter tuning for Weka models. |
15 |
16 |
17 | auto-sklearn |
18 | Python |
19 | BSD-3-Clause |
20 | An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. |
21 |
22 |
23 | auto_ml |
24 | Python |
25 | MIT |
26 | Automated machine learning for analytics & production. Supports manual feature type declarations. |
27 |
28 |
29 | H2O AutoML |
30 | Java with Python, Scala & R APIs and web GUI |
31 | Apache 2.0 |
32 | Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. |
33 |
34 |
35 | devol |
36 | Python |
37 | MIT |
38 | Automated deep neural network design via genetic programming. |
39 |
40 |
41 | MLBox |
42 | Python |
43 | BSD-3-Clause |
44 | Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. |
45 |
46 |
47 | Recipe |
48 | C |
49 | GPL-v3 |
50 | Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. |
51 |
52 |
53 | Xcessiv |
54 | Python |
55 | Apache 2.0 |
56 | A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python. |
57 |
58 |
59 | GAMA |
60 | Python |
61 | Apache 2.0 |
62 | Machine-learning pipeline optimization through asynchronous evaluation based genetic programming. |
63 |
64 |
65 | PyMoo |
66 | Python |
67 | Apache 2.0 |
68 | Multi-objective optimization in Python. |
69 |
70 |
71 | Karoo GP |
72 | Python |
73 | MIT |
74 | A Python based genetic programming application suite with support for symbolic regression and classification. |
75 |
76 |
77 | MABE |
78 | C++ |
79 | See here |
80 | A Python based genetic programming application suite with support for symbolic regression and classification. |
81 |
82 |
83 | SBBFramework |
84 | Python |
85 | BSD-2-Clause |
86 | Python implementation of Symbiotic Bid-Based (SBB) framework for problem decomposition using Genetic Programming (GP). |
87 |
88 |
89 | Tiny GP |
90 | Python |
91 | GPL-v3 |
92 | A minimalistic program implementing Koza-style (tree-based) genetic programming to solve a symbolic regression problem. |
93 |
94 |
95 | Baikal |
96 | Python |
97 | BSD-3-Clause |
98 | A graph-based functional API for building complex scikit-learn pipelines. |
99 |
100 |
101 | skdag |
102 | Python |
103 | MIT |
104 | A more flexible alternative to scikit-learn Pipelines. |
105 |
106 |
107 | d6tflow |
108 | Python |
109 | MIT |
110 | A python library which makes building complex data science workflows easy, fast and intuitive. |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/docs/requirements_docs.txt:
--------------------------------------------------------------------------------
1 | griffe==1.3.1
2 | mike==2.1.3
3 | mkdocs==1.6.1
4 | mkdocs-include-markdown-plugin==6.2.2
5 | mkdocs-jupyter==0.25.0
6 | mkdocs-material==9.5.35
7 | mkdocstrings==0.26.1
8 | mkdocstrings-python==1.11.1
9 | nbconvert==7.16.5
10 |
--------------------------------------------------------------------------------
/docs/scripts/build_docs_sources.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
# Recursively walk a source directory and, for every Python module found,
# write a markdown stub under docs/documentation/ whose only content is a
# "::: dotted.module.path" line.
#
# Skipped files: __init__.py, graph_utils.py, and any path containing "test".
#
# Arguments:
#   $1 - directory to scan, given as a relative path (e.g. "tpot")
function iterate_files() {
    local directory="$1"
    # Declared local so the recursive calls do not share/clobber state.
    local base_dir="docs/documentation"
    local file directories file_name md_file include_line

    for file in "$directory"/*; do
        if [ -f "$file" ] && [[ "$file" == *.py ]] && [ "$(basename "$file")" != "__init__.py" ] && \
            ! echo "$file" | grep -q "test" && [ "$(basename "$file")" != "graph_utils.py" ]; then
            directories="$base_dir/$(dirname "$file")"
            file_name=$(basename "$file")
            md_file="$directories/${file_name%.*}.md"

            # Quoted so that paths containing spaces are not word-split.
            mkdir -p "$directories" && touch "$md_file"

            # Turn "pkg/sub/module.py" into the dotted path "pkg.sub.module".
            include_line=$(dirname "$file")
            include_line="${include_line//\//.}.${file_name%.*}"
            echo "::: $include_line" > "$md_file"

        elif [ -d "$file" ]; then
            iterate_files "$file"
        fi
    done
}

iterate_files "tpot"
26 |
--------------------------------------------------------------------------------
/docs/scripts/build_mkdocs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cat > mkdocs.yml <> mkdocs.yml
79 | echo " - tpot_api/estimator.md" >> mkdocs.yml
80 | echo " - tpot_api/classifier.md" >> mkdocs.yml
81 | echo " - tpot_api/regressor.md" >> mkdocs.yml
82 | echo " - Examples:" >> mkdocs.yml
# Add one navigation entry per tutorial notebook found in docs/Tutorial.
for file in docs/Tutorial/*.ipynb; do
    # Quote "$file" so notebook names containing spaces are handled correctly.
    base=$(basename "$file" .ipynb)
    echo " - Tutorial/$base.ipynb" >> mkdocs.yml
done
87 | echo " - Documentation:" >> mkdocs.yml
# Append a nested listing of a directory tree to mkdocs.yml.
#
# For the given directory every *.md file is listed first; afterwards each
# sub-directory is written as a "name:" heading and processed recursively.
# Each entry is indented by two spaces per "/" contained in its path.
#
# Arguments:
#   $1 - directory to list (e.g. "docs/documentation")
function iterate_source_files() {
    local directory="$1"
    # Local so that recursive invocations cannot clobber the caller's values.
    local file slash_count num_spaces spaces last_dir

    # Pass 1: markdown files directly inside this directory.
    for file in "$directory"/*; do
        if [ -f "$file" ] && [[ "$file" == *.md ]]; then
            slash_count=$(echo "$file" | grep -o '/' | wc -l)
            num_spaces=$((slash_count * 2))
            # Explicit empty string argument: pad "" to the requested width.
            spaces=$(printf "%*s" "$num_spaces" "")
            # ${file#*/} drops the leading path component (up to the first "/").
            echo "$spaces- ${file#*/}" >> mkdocs.yml
        fi
    done

    # Pass 2: sub-directories, each as a heading followed by its own listing.
    for file in "$directory"/*; do
        if [ -d "$file" ]; then
            slash_count=$(echo "$file" | grep -o '/' | wc -l)
            num_spaces=$((slash_count * 2))
            spaces=$(printf "%*s" "$num_spaces" "")
            last_dir=$(basename "$file")
            echo "$spaces- $last_dir:" >> mkdocs.yml
            iterate_source_files "$file"
        fi
    done
}
iterate_source_files "docs/documentation"
112 | # make these static instead
113 | # for file in docs/*.md; do
114 | # base=$(basename $file .md)
115 | # if [ "$base" == "index" ]; then
116 | # continue
117 | # fi
118 | # echo " - $base.md" >> mkdocs.yml
119 | # done
120 | echo " - contribute.md" >> mkdocs.yml
121 | echo " - cite.md" >> mkdocs.yml
122 | echo " - support.md" >> mkdocs.yml
123 | echo " - related.md" >> mkdocs.yml
124 | # moved to the top
125 | # # test docstring
126 | # # echo " - Tutorials:" >> mkdocs.yml
127 | # for file in docs/tutorial/*.ipynb; do
128 | # base=$(basename $file .ipynb)
129 | # echo " - tutorial/$base.ipynb" >> mkdocs.yml
130 | # done
--------------------------------------------------------------------------------
/docs/scripts/build_tutorial_toc_not_used.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for file in docs/tutorial/*.html; do
4 | base=$(basename "$file" .html)
5 | echo "" > "docs/tutorial/$base.md"
6 | done
7 |
--------------------------------------------------------------------------------
/docs/support.md:
--------------------------------------------------------------------------------
1 | # Support
2 |
3 | TPOT was developed in the [Artificial Intelligence Innovation (A2I) Lab](http://epistasis.org/) at Cedars-Sinai with funding from the [NIH](http://www.nih.gov/) under grants U01 AG066833 and R01 LM010098. We are incredibly grateful for the support of the NIH and Cedars-Sinai during the development of this project.
4 |
5 | The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project.
--------------------------------------------------------------------------------
/docs/tpot_api/classifier.md:
--------------------------------------------------------------------------------
1 | ::: tpot.tpot_estimator.templates.tpottemplates.TPOTClassifier
--------------------------------------------------------------------------------
/docs/tpot_api/estimator.md:
--------------------------------------------------------------------------------
1 | ::: tpot.tpot_estimator.estimator
--------------------------------------------------------------------------------
/docs/tpot_api/regressor.md:
--------------------------------------------------------------------------------
1 | ::: tpot.tpot_estimator.templates.tpottemplates.TPOTRegressor
--------------------------------------------------------------------------------
/docs/using.md:
--------------------------------------------------------------------------------
1 | # Using TPOT
2 | See the Tutorials Folder for more instructions and examples.
3 |
4 | ## Best Practices
5 |
6 | ### 1
7 | TPOT uses dask for parallel processing. When Python is parallelized, each module is imported within each process. Therefore it is important to protect all code within an `if __name__ == "__main__":` block when running TPOT from a script. This is not required when running TPOT from a notebook.
8 |
9 | For example:
10 |
11 | ```
12 | #my_analysis.py
13 |
14 | import tpot
15 | if __name__ == "__main__":
16 | X, y = load_my_data()
17 | est = tpot.TPOTClassifier()
18 | est.fit(X,y)
19 | #rest of analysis
20 | ```
21 |
22 | ### 2
23 |
24 | When designing custom objective functions, avoid the use of global variables.
25 |
26 | Don't Do:
27 | ```
28 | global_X = [[1,2],[4,5]]
29 | global_y = [0,1]
30 | def foo(est):
31 | return my_scorer(est, X=global_X, y=global_y)
32 |
33 | ```
34 |
35 | Instead use a partial
36 |
37 | ```
38 | from functools import partial
39 |
40 | def foo_scorer(est, X, y):
41 | return my_scorer(est, X, y)
42 |
43 | if __name__=='__main__':
44 | X = [[1,2],[4,5]]
45 | y = [0,1]
46 | final_scorer = partial(foo_scorer, X=X, y=y)
47 | ```
48 |
49 | Similarly when using lambda functions.
50 |
51 | Don't Do:
52 |
53 | ```
54 | def new_objective(est, a, b):
55 | #definition
56 |
57 | a = 100
58 | b = 20
59 | bad_function = lambda est : new_objective(est=est, a=a, b=b)
60 | ```
61 |
62 | Do:
63 | ```
64 | def new_objective(est, a, b):
65 | #definition
66 |
67 | a = 100
68 | b = 20
69 | good_function = lambda est, a=a, b=b : new_objective(est=est, a=a, b=b)
70 | ```
71 |
72 | ## Tips
73 |
74 | TPOT will not check if your data is correctly formatted. It will assume that you have passed in operators that can handle the type of data that was passed in. For instance, if you pass in a pandas dataframe with categorical features and missing data, then you should also include in your configuration operators that can handle those features of the data. Alternatively, if you pass in `preprocessing = True`, TPOT will impute missing values, one hot encode categorical features, then standardize the data. (Note that this is currently fitted and transformed on the entire training set before splitting for CV. Later there will be an option to apply per fold, and have the parameters be learnable.)
75 |
76 | Setting `verbose` to 5 can be helpful during debugging as it will print out the error generated by failing pipelines.
--------------------------------------------------------------------------------
/mkdocs_archived.yml:
--------------------------------------------------------------------------------
1 | site_name: TPOT
2 | site_url: http://epistasislab.github.io/tpot
3 | site_author: Randal S. Olson
4 | site_description: Documentation for TPOT, a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.
5 |
6 | repo_url: https://github.com/epistasislab/tpot
7 | edit_uri: edit/master/docs/archived/
8 | docs_dir: docs/archived/
9 | site_dir: target/archived_site
10 | #theme: readthedocs
11 | theme:
12 | name: material
13 | logo: assets/tpot-logo.jpg
14 | favicon: assets/favicon.ico
15 | features:
16 | - toc.integrate
17 | - navigation.top
18 | palette:
19 | # light mode
20 | - scheme: default
21 | primary: grey
22 | toggle:
23 | icon: material/brightness-7
24 | name: Switch to dark mode
25 |
26 | # dark mode
27 | - scheme: slate
28 | primary: grey
29 | toggle:
30 | icon: material/brightness-4
31 | name: Switch to light mode
32 |
33 | extra:
34 | version:
35 | provider: mike
36 |
37 | extra_css:
38 | - css/archived.css
39 |
40 | markdown_extensions:
41 | - tables
42 | - fenced_code
43 | - pymdownx.highlight:
44 | anchor_linenums: true
45 | - pymdownx.inlinehilite
46 | - pymdownx.snippets
47 | - pymdownx.superfences
48 |
49 | plugins:
50 | - include-markdown
51 |
52 | copyright: Developed by Randal S. Olson and others at the University of Pennsylvania
53 |
54 | nav:
55 | - Home: index.md
56 | - Installation: installing.md
57 | - Using TPOT: using.md
58 | - TPOT API: api.md
59 | - Examples: examples.md
60 | - Contributing: contributing.md
61 | - Release Notes: releases.md
62 | - Citing TPOT: citing.md
63 | - Support: support.md
64 | - Related: related.md
65 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.pytest.ini_options]
6 | addopts = "--cov=tpot"
7 | testpaths = [
8 | "tpot/tests",
9 | ]
10 |
11 | [tool.mypy]
12 | mypy_path = "tpot"
13 | check_untyped_defs = true
14 | disallow_any_generics = true
15 | ignore_missing_imports = true
16 | no_implicit_optional = true
17 | show_error_codes = true
18 | strict_equality = true
19 | warn_redundant_casts = true
20 | warn_return_any = true
21 | warn_unreachable = true
22 | warn_unused_configs = true
23 | no_implicit_reexport = true
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | flake8==6.0.0
2 | tox==4.4.12
3 | pytest==7.3.0
4 | pytest-cov==4.0.0
5 | mypy==1.2.0
6 | setuptools
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [options.extras_require]
2 | testing =
3 | pytest>=6.0
4 | pytest-cov>=2.0
5 | mypy>=0.910
6 | flake8>=3.9
7 | tox>=3.24
8 |
9 | [options.package_data]
10 | tpot = py.typed
11 |
12 | [flake8]
13 | max-line-length = 120
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #TODO update this
4 | from setuptools import setup, find_packages
5 |
def calculate_version():
    """Return the version string declared in ``tpot/_version.py``.

    The file is read relative to the current working directory. The first
    line containing ``__version__`` is used, and the text between the first
    pair of single quotes on that line is returned.

    Returns
    -------
    str
        The version string, e.g. ``'1.0.0'``.

    Raises
    ------
    IndexError
        If no line contains ``__version__`` or the matching line has no
        single-quoted value.
    """
    # The context manager guarantees the file handle is closed.
    with open('tpot/_version.py', encoding='utf-8') as version_file:
        for line in version_file:
            if '__version__' in line:
                return line.split('\'')[1]
    # Same exception type as before, but with a message that says what is wrong.
    raise IndexError("no '__version__' assignment found in tpot/_version.py")
10 |
11 |
12 | package_version = calculate_version()
13 |
14 | setup(
15 | name='TPOT',
16 | python_requires='>=3.10, <3.14',
17 | version=package_version,
18 | author='Pedro Ribeiro',
19 | packages=find_packages(),
20 | url='https://github.com/EpistasisLab/tpot',
21 | license='GNU/LGPLv3', #TODO
22 | entry_points={'console_scripts': ['tpot=tpot:main', ]},
23 | description=('Tree-based Pipeline Optimization Tool'),
24 | long_description='''
25 | A Python tool that automatically creates and optimizes machine learning pipelines using genetic programming.
26 |
27 |
28 | ''',
29 | zip_safe=True,
30 | install_requires=['numpy==1.26.4',
31 | 'scipy>=1.3.1',
32 | 'scikit-learn>=1.4.2,<1.6',
33 | 'update_checker>=0.16',
34 | 'tqdm>=4.36.1',
35 | 'stopit>=1.1.1',
36 | 'pandas>=2.2.0',
37 | 'joblib>=1.1.1',
38 | 'xgboost>=1.7.0',
39 | 'matplotlib>=3.6.2',
40 | 'traitlets>=5.8.0',
41 | 'lightgbm>=3.3.3',
42 | 'optuna>=3.0.5',
43 | 'networkx>=3.0',
44 | 'dask>=2024.4.2',
45 | 'distributed>=2024.4.2',
46 | 'dask-expr>=1.0.12',
47 | 'dask-jobqueue>=0.8.5',
48 | 'func_timeout>=4.3.5',
49 | 'configspace>=1.1.1',
50 | 'dill>=0.3.9',
51 | 'seaborn>=0.13.2',
52 | ],
53 | extras_require={
54 | 'skrebate': ['skrebate>=0.3.4'],
55 | 'mdr': ['scikit-mdr>=0.4.4'],
56 | 'sklearnex' : ['scikit-learn-intelex>=2023.2.1'],
57 | 'amltk' : ['amltk>=1.12.1'],
58 | },
59 | classifiers=[
60 | 'Intended Audience :: Science/Research',
61 | 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)',
62 | 'Programming Language :: Python :: 3.10',
63 | 'Programming Language :: Python :: 3.11',
64 | 'Programming Language :: Python :: 3.12',
65 | 'Programming Language :: Python :: 3.13',
66 | 'Topic :: Scientific/Engineering :: Artificial Intelligence'
67 | ],
68 | keywords=['pipeline optimization', 'hyperparameter optimization', 'data science', 'machine learning', 'genetic programming', 'evolutionary computation'],
69 | )
70 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | minversion = 3.28.0
3 | # flake8 and mypy output several errors, so we disable them for now
4 | # envlist = py310, flake8, mypy
5 | envlist = py310, py311, py312, py313
6 | isolated_build = true
7 |
8 | [gh-actions]
9 | python =
10 | 3.10: py310
11 | 3.11: py311
12 | 3.12: py312
13 | 3.13: py313
14 | # 3.10: py310, flake8, mypy
15 |
16 | [testenv]
17 | setenv =
18 | PYTHONPATH = {toxinidir}
19 | deps =
20 | -r{toxinidir}/requirements_dev.txt
21 | commands =
22 | pytest --basetemp={envtmpdir}
23 |
24 | [testenv:flake8]
25 | basepython = python3.10
26 | deps = flake8
27 | commands = flake8 tpot
28 |
29 | [testenv:mypy]
30 | basepython = python3.10
31 | deps =
32 | -r{toxinidir}/requirements_dev.txt
33 | commands = mypy tpot
34 |
--------------------------------------------------------------------------------
/tpot/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 |
37 | #TODO: are all the imports in the init files done correctly?
38 | #TODO clean up import organization
39 |
40 | from .individual import BaseIndividual
41 |
42 | from .graphsklearn import GraphPipeline
43 | from .population import Population
44 |
45 | from . import builtin_modules
46 | from . import config
47 | from . import search_spaces
48 | from . import utils
49 | from . import evolvers
50 | from . import objectives
51 | from . import selectors
52 | from . import tpot_estimator
53 | from . import old_config_utils
54 |
55 | from .tpot_estimator import TPOTClassifier, TPOTRegressor, TPOTEstimator, TPOTEstimatorSteadyState
56 |
57 | from update_checker import update_check
58 | from ._version import __version__
59 | update_check("tpot",__version__)
--------------------------------------------------------------------------------
/tpot/_version.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | __version__ = '1.0.0'
37 |
--------------------------------------------------------------------------------
/tpot/builtin_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_set_selector import FeatureSetSelector
2 | from .zero_count import ZeroCount
3 | from .column_one_hot_encoder import ColumnOneHotEncoder, ColumnOrdinalEncoder
4 | from .arithmetictransformer import ArithmeticTransformer
5 | from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
6 | from .passthrough import Passthrough, SkipTransformer
7 | from .imputer import ColumnSimpleImputer
8 | from .estimatortransformer import EstimatorTransformer
9 | from .passkbinsdiscretizer import PassKBinsDiscretizer
10 |
11 | try:
12 | from .nn import PytorchLRClassifier, PytorchMLPClassifier
13 | except (ModuleNotFoundError, ImportError):
14 | pass
15 | # import warnings
16 | # warnings.warn("Warning: optional dependency `torch` is not available. - skipping import of NN models.")
--------------------------------------------------------------------------------
/tpot/builtin_modules/feature_encoding_frequency_selector.py:
--------------------------------------------------------------------------------
1 | """
2 | From https://github.com/EpistasisLab/autoqtl
3 | """
4 |
5 | import numpy as np
6 | from sklearn.base import BaseEstimator
7 | from sklearn.feature_selection._base import SelectorMixin
8 |
class FeatureEncodingFrequencySelector(BaseEstimator, SelectorMixin):
    """Feature selector based on encoding frequency.

    The encoding frequency of a feature is the relative frequency of each
    unique value (e.g. 0/1/2/3) found in that feature's column. A feature is
    removed when the frequency of *any* of its unique values is less than or
    equal to ``threshold``. If no feature passes, all features are kept.
    """

    @property
    def __name__(self):
        """Instance name is the same as the class name."""
        return self.__class__.__name__

    def __init__(self, threshold):
        """Create a FeatureEncodingFrequencySelector object.

        Parameters
        ----------
        threshold : float, required
            Encoding-frequency threshold. A feature is dropped if the
            relative frequency of any unique value in its column is less
            than or equal to this value.

        Returns
        -------
        None

        """
        self.threshold = threshold

    def fit(self, X, y=None):
        """Determine which feature columns pass the frequency threshold.

        Parameters
        ----------
        X : numpy ndarray, {n_samples, n_features}
            The training input samples. Must support ``X[:, i]`` indexing.
        y : ignored
            Present for API compatibility.

        Returns
        -------
        self : object
            This estimator, with ``selected_feature_indexes`` (list of
            retained column indices) and ``no_of_original_features`` set.
        """
        n_features = X.shape[1]
        self.selected_feature_indexes = []
        self.no_of_original_features = n_features

        for i in range(n_features):
            # Relative frequency of every unique value in column i.
            _, counts = np.unique(X[:, i], return_counts=True)
            frequencies = counts / counts.sum()

            # Keep the column only if no value is at or below the threshold.
            if not np.any(frequencies <= self.threshold):
                self.selected_feature_indexes.append(i)

        # Deliberate fallback: rather than failing when nothing passes the
        # threshold, retain every feature.
        if not self.selected_feature_indexes:
            self.selected_feature_indexes = list(range(n_features))

        return self

    def transform(self, X):
        """Return the columns of X that were selected during ``fit``.

        Parameters
        ----------
        X : numpy ndarray, {n_samples, n_features}
            Data to reduce; must support ``X[:, list_of_indices]`` indexing.

        Returns
        -------
        X_transformed : numpy ndarray, {n_samples, n_selected_features}
            The subset of columns listed in ``selected_feature_indexes``.
        """
        return X[:, self.selected_feature_indexes]

    def _get_support_mask(self):
        """Get the boolean mask indicating which features are selected.

        Implements the abstract method required by the selector mixin.

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for retention.
        """
        mask = np.zeros(self.no_of_original_features, dtype=bool)
        # dtype=int keeps an empty selection a valid (integer) index array.
        mask[np.asarray(self.selected_feature_indexes, dtype=int)] = True
        return mask
140 |
--------------------------------------------------------------------------------
/tpot/builtin_modules/feature_set_selector.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 |
37 | #TODO handle sparse input?
38 |
39 | import numpy as np
40 | import pandas as pd
41 | import os, os.path
42 | from sklearn.base import BaseEstimator
43 | from sklearn.feature_selection._base import SelectorMixin
44 |
45 |
46 |
47 | #TODO clean this up and make sure it works
class FeatureSetSelector(BaseEstimator, SelectorMixin):
    """
    Select a predefined subset of feature columns.

    The subset is given up front through ``sel_subset``; ``fit`` only resolves
    it to positional column indexes and a boolean support mask.

    Attributes set by ``fit``
    -------------------------
    feature_names_in_: list or None
        Column names when X is a DataFrame, otherwise None.
    feat_list_idx: list
        Sorted positional indexes of the selected columns.
    mask: boolean array of shape (n_features,)
        True for every selected column.
    """

    def __init__(self, sel_subset=None, name=None):
        """Create a FeatureSetSelector object.

        Parameters
        ----------
        sel_subset: list, int or str
            If X is a dataframe, items in sel_subset list must correspond to column names
            If X is a numpy array, items in sel_subset list must correspond to column indexes
            int or str: a single column index / column name
        name: optional
            Stored as-is on the instance; not used by fit.

        Returns
        -------
        None

        """
        self.name = name
        self.sel_subset = sel_subset


    def fit(self, X, y=None):
        """Fit FeatureSetSelector for feature selection

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            The training input samples. A DataFrame is matched by column name;
            anything else is treated positionally.
        y: array-like, shape (n_samples,)
            The target values (integers that correspond to classes in classification, real numbers in regression).
            Unused.

        Returns
        -------
        self: object
            Returns the fitted estimator itself.

        Raises
        ------
        ValueError
            If X is a DataFrame and a requested name is not one of its columns.
        """
        # Work on a local copy: constructor parameters must stay exactly as the
        # user passed them so that get_params()/clone() keep working after fit.
        subset = self.sel_subset
        if isinstance(subset, (int, str, np.integer)):
            subset = [subset]

        #generate self.feat_list_idx
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
            # list.index raises ValueError for a name that is not a column
            self.feat_list_idx = sorted(self.feature_names_in_.index(feat) for feat in subset)
        else:
            # Any non-DataFrame input is addressed by position.
            self.feature_names_in_ = None#list(range(X.shape[1]))
            self.feat_list_idx = sorted(subset)

        # np.shape also copes with inputs that lack a .shape attribute (e.g. nested lists)
        n_features = np.shape(X)[1]
        self.mask = np.zeros(n_features, dtype=bool)
        if self.feat_list_idx:
            # Guarded because np.asarray([]) is a float array and cannot be used as an index
            self.mask[np.asarray(self.feat_list_idx)] = True

        return self

    #TODO keep returned as dataframe if input is dataframe? may not be consistent with sklearn

    # def transform(self, X):

    def _get_tags(self):
        # NOTE(review): this replaces the whole tag dict rather than extending the
        # inherited one — confirm that is intended for the supported sklearn versions.
        tags = {"allow_nan": True, "requires_y": False}
        return tags

    def _get_support_mask(self):
        """
        Get the boolean mask indicating which features are selected
        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for
            retention.
        """
        return self.mask
126 |
127 |
--------------------------------------------------------------------------------
/tpot/builtin_modules/imputer.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 |
37 | #TODO support np arrays
38 |
39 | import numpy as np
40 | from scipy import sparse
41 |
42 | from sklearn.base import BaseEstimator, TransformerMixin
43 | from sklearn.utils import check_array
44 | from sklearn.preprocessing import OneHotEncoder
45 | import sklearn
46 | import sklearn.impute
47 |
48 | import pandas as pd
49 | from pandas.api.types import is_numeric_dtype
50 | import sklearn.compose
51 |
52 |
class ColumnSimpleImputer(BaseEstimator, TransformerMixin):
    def __init__(self,  columns="all",
                        missing_values=np.nan,
                        strategy="mean",
                        fill_value=None,
                        copy=True,
                        add_indicator=False,
                        keep_empty_features=False,):
        """
        A wrapper for SimpleImputer that allows for imputation of specific columns in a DataFrame or np array.
        Passes through columns that are not imputed.

        Parameters
        ----------
        columns : str, list, default='all'
            Determines which columns to impute with sklearn.impute.SimpleImputer.
            - 'categorical' : Automatically select categorical features (DataFrame input only)
            - 'numeric' : Automatically select numeric features (DataFrame input only)
            - 'all' : Select all features
            - list : A list of columns to select

        # See documentation from sklearn.impute.SimpleImputer for the following parameters
        missing_values, strategy, fill_value, copy, add_indicator, keep_empty_features

        """

        self.columns = columns
        self.missing_values = missing_values
        self.strategy = strategy
        self.fill_value = fill_value
        self.copy = copy
        self.add_indicator = add_indicator
        self.keep_empty_features = keep_empty_features


    def fit(self, X, y=None):
        """Resolve the columns to impute and fit a SimpleImputer on them.

        Sets ``self.columns_`` and, when at least one column was selected,
        ``self.imputer``.

        Raises
        ------
        ValueError
            If dtype-based selection is requested for a non-DataFrame input,
            or if ``columns`` is not one of the supported values.
        """
        if (self.columns == "categorical" or self.columns == "numeric") and not isinstance(X, pd.DataFrame):
            # dtype-based selection needs per-column dtypes, which only a DataFrame carries
            raise ValueError(f"Invalid value for columns: {self.columns}. "
                             "Only 'all' or a list of column indices is supported for np arrays")

        if self.columns == "categorical":
            self.columns_ = list(X.select_dtypes(exclude='number').columns)
        elif self.columns == "numeric":
            self.columns_ =  [col for col in X.columns if is_numeric_dtype(X[col])]
        elif self.columns == "all":
            if isinstance(X, pd.DataFrame):
                self.columns_ = X.columns
            else:
                self.columns_ = list(range(X.shape[1]))
        elif isinstance(self.columns, list):
            self.columns_ = self.columns
        else:
            raise ValueError(f"Invalid value for columns: {self.columns}")

        # Nothing to impute: transform() will hand X back unchanged.
        if len(self.columns_) == 0:
            return self

        self.imputer = sklearn.impute.SimpleImputer(missing_values=self.missing_values,
                                                    strategy=self.strategy,
                                                    fill_value=self.fill_value,
                                                    copy=self.copy,
                                                    add_indicator=self.add_indicator,
                                                    keep_empty_features=self.keep_empty_features)

        if isinstance(X, pd.DataFrame):
            # Keep DataFrame output so the result can be assigned back by column label
            self.imputer.set_output(transform="pandas")
            self.imputer.fit(X[self.columns_], y)
        else:
            self.imputer.fit(X[:, self.columns_], y)

        return self

    def transform(self, X):
        """Return a copy of X with the selected columns imputed.

        When no columns were selected during fit, X itself is returned.
        """
        if len(self.columns_) == 0:
            return X

        # NOTE(review): the imputed block is written back into the same columns,
        # so settings that change the imputer's output width (presumably
        # add_indicator=True, or dropped empty features) would not fit — confirm.
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            X[self.columns_] = self.imputer.transform(X[self.columns_])
            return X
        else:
            X = np.copy(X)
            X[:, self.columns_] = self.imputer.transform(X[:, self.columns_])
            return X
139 |
140 |
141 |
--------------------------------------------------------------------------------
/tpot/builtin_modules/passkbinsdiscretizer.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import pandas as pd
37 | from sklearn.base import BaseEstimator, TransformerMixin
38 | from sklearn.compose import ColumnTransformer
39 | from sklearn.preprocessing import KBinsDiscretizer
40 | import numpy as np
41 |
def select_features(X, min_unique=10,):
    """
    Pick the columns of X that hold strictly more than ``min_unique`` distinct values.

    Parameters
    ----------
    X: DataFrame or numpy array
        Data to inspect, one feature per column.
    min_unique: int, default=10
        A column is selected only when its number of unique values exceeds this.

    Returns
    -------
    list
        Column labels when X is a DataFrame, positional column indices otherwise.

    """
    if not isinstance(X, pd.DataFrame):
        chosen = []
        for position in range(X.shape[1]):
            distinct = np.unique(X[:, position])
            if len(distinct) > min_unique:
                chosen.append(position)
        return chosen

    chosen = []
    for label in X.columns:
        distinct = X[label].unique()
        if len(distinct) > min_unique:
            chosen.append(label)
    return chosen
64 |
class PassKBinsDiscretizer(BaseEstimator, TransformerMixin):
    """
    Same as sklearn.preprocessing.KBinsDiscretizer, but passes through columns that are
    not discretized instead of ignoring them.

    Only columns with more than 10 unique values (see ``select_features``) are
    discretized; every other column is forwarded unchanged.
    See sklearn.preprocessing.KBinsDiscretizer for the meaning of the parameters.

    Attributes set by ``fit``
    -------------------------
    selected_columns_: list
        Columns that are discretized.
    not_selected_columns_: list
        Columns that are passed through.
    transformer: ColumnTransformer
        The fitted transformer that applies both steps.
    """

    def __init__(self, n_bins=5, encode='onehot-dense', strategy='quantile', subsample=None, random_state=None):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.subsample = subsample
        self.random_state = random_state

    def fit(self, X, y=None):
        """Choose the columns to discretize and fit the underlying ColumnTransformer."""
        # Identify columns with more than 10 unique values
        # NOTE(review): the threshold is fixed at 10 and independent of n_bins — confirm this is intended.
        self.selected_columns_ = select_features(X, min_unique=10)
        if isinstance(X, pd.DataFrame):
            self.not_selected_columns_ = [col for col in X.columns if col not in self.selected_columns_]
        else:
            self.not_selected_columns_ = [i for i in range(X.shape[1]) if i not in self.selected_columns_]

        # Create a ColumnTransformer to select and discretize the chosen columns
        enc = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy, subsample=self.subsample, random_state=self.random_state)
        self.transformer = ColumnTransformer([
            ('discretizer', enc, self.selected_columns_),
            ('passthrough', 'passthrough', self.not_selected_columns_)
        ])
        self.transformer.fit(X)
        return self

    def transform(self, X):
        """Apply the fitted ColumnTransformer to X and return its output."""
        return self.transformer.transform(X)
--------------------------------------------------------------------------------
/tpot/builtin_modules/passthrough.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from sklearn.base import BaseEstimator, TransformerMixin
37 | import numpy as np
38 |
class Passthrough(TransformerMixin,BaseEstimator):
    """
    Identity transformer: ``transform`` hands its input back untouched.
    """

    def fit(self, X=None, y=None):
        """
        No state is learned; the instance itself is returned.
        """
        return self

    def transform(self, X):
        """
        Return ``X`` as received (the same object, no copy is made).
        """
        return X
55 |
56 |
class SkipTransformer(TransformerMixin,BaseEstimator):
    """
    Transformer whose output has one row per input sample and zero columns.
    When combined with FeatureUnion, it can be used to skip a branch.
    """
    def fit(self, X=None, y=None):
        """
        No state is learned; the instance itself is returned.
        """
        return self

    def transform(self, X):
        """
        Return an empty array of shape ``(X.shape[0], 0)``.
        """
        n_rows = X.shape[0]
        no_columns = np.array([])
        return no_columns.reshape(n_rows, 0)
72 |
73 |
--------------------------------------------------------------------------------
/tpot/builtin_modules/tests/feature_set_selector_tests.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 |
37 |
38 | import numpy as np
39 | import pandas as pd
40 | from tpot.config.custom_modules import FeatureSetSelector
41 | from nose.tools import assert_raises
42 |
# Shared fixtures, loaded once at import time. The CSV path is relative to the
# current working directory; the "class" column is dropped to leave only features.
test_data = pd.read_csv("tests/tests.csv")
test_X = test_data.drop("class", axis=1)
45 |
46 |
def test_FeatureSetSelector_1():
    """Assert that FeatureSetSelector returns transformed X based on test feature list 1."""
    # NOTE(review): `subset_list` and `feat_list` look like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1")
    ds.fit(test_X, y=None)
    transformed_X = ds.transform(test_X)

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 5
    assert np.array_equal(transformed_X, test_X[ds.feat_list].values)
57 |
def test_FeatureSetSelector_2():
    """Assert that FeatureSetSelector returns transformed X based on test feature list 2."""
    # NOTE(review): `subset_list` and `feat_list` look like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_2")
    ds.fit(test_X, y=None)
    transformed_X = ds.transform(test_X)

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 6
    assert np.array_equal(transformed_X, test_X[ds.feat_list].values)
68 |
def test_FeatureSetSelector_3():
    """Assert that FeatureSetSelector returns transformed X based on the names of 2 subsets."""
    # NOTE(review): `subset_list` and `feat_list` look like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=["test_subset_1", "test_subset_2"])
    ds.fit(test_X, y=None)
    transformed_X = ds.transform(test_X)

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 7
    assert np.array_equal(transformed_X, test_X[ds.feat_list].values)
79 |
def test_FeatureSetSelector_4():
    """Assert that FeatureSetSelector returns transformed X based on the indexes of 2 subsets."""
    # NOTE(review): `subset_list` and `feat_list` look like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=[0, 1])
    ds.fit(test_X, y=None)
    transformed_X = ds.transform(test_X)

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 7
    assert np.array_equal(transformed_X, test_X[ds.feat_list].values)
90 |
def test_FeatureSetSelector_5():
    """Assert that FeatureSetSelector returns transformed X selected by the index of test feature list 1."""
    # NOTE(review): `subset_list` and `feat_list` look like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=0)
    ds.fit(test_X, y=None)
    transformed_X = ds.transform(test_X)

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 5
    assert np.array_equal(transformed_X, test_X[ds.feat_list].values)
101 |
def test_FeatureSetSelector_6():
    """Assert that the _get_support_mask function returns the correct mask."""
    # NOTE(review): `subset_list` looks like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1")
    ds.fit(test_X, y=None)
    mask = ds._get_support_mask()
    get_mask = ds.get_support()

    # The mask must cover all 30 input features, with exactly 5 of them selected,
    # and the public get_support() must agree with the private hook.
    assert mask.shape[0] == 30
    assert np.count_nonzero(mask) == 5
    assert np.array_equal(get_mask, mask)
112 |
def test_FeatureSetSelector_7():
    """Assert that FeatureSetSelector works as expected when input X is np.array."""
    # NOTE(review): `subset_list` looks like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1")
    ds.fit(test_X.values, y=None)
    transformed_X = ds.transform(test_X.values)
    # Map positional indexes back to column labels; presumably the CSV columns are
    # named by number with an offset of 2 — verify against tests/tests.csv.
    str_feat_list = [str(i+2) for i in ds.feat_list_idx]


    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 5
    assert np.array_equal(transformed_X, test_X.values[:, ds.feat_list_idx])
    assert np.array_equal(transformed_X, test_X[str_feat_list].values)
126 |
127 |
def test_FeatureSetSelector_8():
    """Assert that FeatureSetSelector raises ValueError when features are not available."""
    # NOTE(review): `subset_list` looks like an older FeatureSetSelector API — confirm against the imported class.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_4")
    assert_raises(ValueError, ds.fit, test_X)
132 |
133 |
def test_FeatureSetSelector_9():
    """Assert that FeatureSetSelector's __name__ returns the correct class name."""
    # NOTE(review): `subset_list` looks like an older FeatureSetSelector API, and
    # `__name__` is read from the instance rather than the class — confirm both.
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_4")
    assert ds.__name__ == 'FeatureSetSelector'
138 |
--------------------------------------------------------------------------------
/tpot/builtin_modules/zero_count.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 |
37 |
38 | import numpy as np
39 | from sklearn.base import BaseEstimator, TransformerMixin
40 | from sklearn.utils import check_array
41 |
42 |
class ZeroCount(BaseEstimator, TransformerMixin):
    """Prepends two per-sample features: the zero count and the non-zero count."""

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the sklearn API."""
        return self

    def transform(self, X, y=None):
        """Return X with two extra leading columns.

        Parameters
        ----------
        X: numpy ndarray, {n_samples, n_components}
            Input data, validated with ``check_array``.
        y: None
            Unused

        Returns
        -------
        X_transformed: array-like, shape (n_samples, n_components + 2)
            Column 0 is the number of zeros in each row, column 1 the number of
            non-zeros, followed by the original columns.
        """
        X = check_array(X)

        nonzero_per_row = np.count_nonzero(X, axis=1)
        zero_per_row = X.shape[1] - nonzero_per_row

        leading_columns = (
            np.reshape(zero_per_row, (-1, 1)),
            np.reshape(nonzero_per_row, (-1, 1)),
        )
        return np.hstack(leading_columns + (X,))
79 |
--------------------------------------------------------------------------------
/tpot/config/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from .get_configspace import get_search_space
--------------------------------------------------------------------------------
/tpot/config/autoqtl_builtins.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from tpot.builtin_modules import genetic_encoders
37 | from tpot.builtin_modules import feature_encoding_frequency_selector
38 | import sklearn
39 | import numpy as np
40 |
41 | from ConfigSpace import ConfigurationSpace
42 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
43 |
# Search space with a single float hyperparameter, `threshold`, bounded to the
# range 0 to 0.35. Presumably consumed by FeatureEncodingFrequencySelector — verify against callers.
FeatureEncodingFrequencySelector_ConfigurationSpace = ConfigurationSpace(
    space = {
        'threshold': Float("threshold", bounds=(0, .35))
    }
)
49 |
50 |
51 | # genetic_encoders.DominantEncoder : {},
52 | # genetic_encoders.RecessiveEncoder : {},
53 | # genetic_encoders.HeterosisEncoder : {},
54 | # genetic_encoders.UnderDominanceEncoder : {},
55 | # genetic_encoders.OverDominanceEncoder : {},
56 |
57 |
--------------------------------------------------------------------------------
/tpot/config/classifiers_sklearnex.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from ConfigSpace import ConfigurationSpace
37 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
38 |
39 |
def get_RandomForestClassifier_ConfigurationSpace(random_state, n_jobs=1):
    """Build the hyperparameter search space for RandomForestClassifier.

    Parameters
    ----------
    random_state: int or None
        Added to the space as a fixed value only when it is not None.
    n_jobs: int, default=1
        Stored in the space as a fixed value.

    Returns
    -------
    ConfigurationSpace
    """
    tunable = {
        'n_estimators': 100, #TODO make this a higher number? learned?
        'bootstrap': Categorical("bootstrap", [True, False]),
        'min_samples_split': Integer("min_samples_split", bounds=(2, 20)),
        'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 20)),
        'n_jobs': n_jobs,
    }

    # configspace doesn't allow None as a value, so the seed is only included when one is given
    seed = {} if random_state is None else {'random_state': random_state}

    return ConfigurationSpace(space={**tunable, **seed})
56 |
def get_KNeighborsClassifier_ConfigurationSpace(n_samples):
    """Build the hyperparameter search space for a k-nearest-neighbors classifier.

    Parameters
    ----------
    n_samples : int
        Number of samples in the data set; used to cap ``n_neighbors``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_neighbors`` (log-scaled integer) and ``weights``.
    """
    # The upper bound must never exceed the number of available samples, and
    # is additionally capped at 100. The previous ``max(n_samples, 100)``
    # allowed more neighbors than samples on small data sets and left the
    # range uncapped on large ones.
    max_neighbors = min(n_samples, 100)

    return ConfigurationSpace(
        space={
            'n_neighbors': Integer("n_neighbors", bounds=(1, max_neighbors), log=True),
            'weights': Categorical("weights", ['uniform', 'distance']),
        }
    )
64 |
65 |
66 | #TODO add conditionals
def get_LogisticRegression_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for logistic regression.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``solver``, ``penalty``, ``dual`` and ``C`` (log scale);
        ``max_iter`` is fixed at 1000. No conditions between the
        hyperparameters are declared here.
    """
    # ConfigSpace doesn't allow None as a value, so the seed entry is
    # only merged in when one was supplied.
    seed_entry = {} if random_state is None else {'random_state': random_state}

    return ConfigurationSpace(
        space={
            'solver': Categorical("solver", ['liblinear', 'sag', 'saga']),
            'penalty': Categorical("penalty", ['l1', 'l2']),
            'dual': Categorical("dual", [True, False]),
            'C': Float("C", bounds=(1e-4, 1e4), log=True),
            'max_iter': 1000,
            **seed_entry,
        }
    )
82 |
def get_SVC_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for a support-vector classifier.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``kernel``, ``C`` (log scale) and ``degree``; ``max_iter``
        and ``tol`` are fixed, and ``probability`` can only take True.
    """
    kernel = Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid'])
    regularization = Float("C", bounds=(1e-4, 25), log=True)
    degree = Integer("degree", bounds=(1, 4))
    # configspace doesn't allow bools as a default value? but does allow
    # them as a value inside a Categorical
    probability = Categorical("probability", [True])

    hyperparameters = {
        'kernel': kernel,
        'C': regularization,
        'degree': degree,
        'max_iter': 3000,
        'tol': 0.001,
        'probability': probability,
    }

    if random_state is None:
        # ConfigSpace doesn't allow None as a value; leave the seed out.
        return ConfigurationSpace(space=hyperparameters)

    hyperparameters['random_state'] = random_state
    return ConfigurationSpace(space=hyperparameters)
99 |
def get_NuSVC_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for a Nu support-vector classifier.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``nu``, ``kernel``, ``degree`` and ``class_weight``;
        ``max_iter`` and ``tol`` are fixed, and ``probability`` can only
        take True.
    """
    hyperparameters = {
        'nu': Float("nu", bounds=(0.05, 1.0)),
        'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']),
        # 'C' is not part of this space:
        #'C': Float("C", bounds=(1e-4, 25), log=True),
        'degree': Integer("degree", bounds=(1, 4)),
        'class_weight': Categorical("class_weight", [None, 'balanced']),
        'max_iter': 3000,
        'tol': 0.005,
        # configspace doesn't allow bools as a default value? but does allow
        # them as a value inside a Categorical
        'probability': Categorical("probability", [True]),
    }

    # ConfigSpace doesn't allow None as a value, so only add a real seed.
    if random_state is not None:
        hyperparameters.update(random_state=random_state)

    return ConfigurationSpace(space=hyperparameters)
--------------------------------------------------------------------------------
/tpot/config/imputers.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import sklearn
37 | import sklearn.ensemble
38 | import sklearn.linear_model
39 | import sklearn.neighbors
40 | from ConfigSpace import ConfigurationSpace
41 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
42 | from ConfigSpace import EqualsCondition
43 |
44 |
# Search space for a simple imputer: only the imputation strategy is tuned.
simple_imputer_cs = ConfigurationSpace(
    space={
        'strategy': Categorical(
            'strategy',
            ['mean', 'median', 'most_frequent', 'constant'],
        ),
        # Removed add_indicator, it appends a mask next to the rest of the
        # data and can cause errors. gk
        #'add_indicator' : Categorical('add_indicator', [True, False]),
    }
)
55 | #test
def get_IterativeImputer_config_space(n_features, random_state):
    """Build the search space for an iterative imputer, estimator included.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_nearest_features``.
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Contains ``initial_strategy``, ``n_nearest_features``,
        ``imputation_order``, ``estimator`` and ``sample_posterior``, with
        an equals-condition tying ``sample_posterior`` to
        ``estimator == 'Bayesian'``.
    """
    base_space = {
        'initial_strategy': Categorical(
            'initial_strategy',
            ['mean', 'median', 'most_frequent', 'constant'],
        ),
        'n_nearest_features': Integer('n_nearest_features', bounds=(1, n_features)),
        'imputation_order': Categorical(
            'imputation_order',
            ['ascending', 'descending', 'roman', 'arabic', 'random'],
        ),
    }
    if random_state is not None:
        # This is required because configspace doesn't allow None as a value
        base_space['random_state'] = random_state

    config_space = ConfigurationSpace(space=base_space)

    # The estimator label and the posterior-sampling switch are added after
    # construction so that a condition can be attached between them.
    estimator_choice = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN'])
    posterior_flag = Categorical('sample_posterior', [True, False])
    config_space.add([estimator_choice, posterior_flag])
    config_space.add([EqualsCondition(posterior_flag, estimator_choice, 'Bayesian')])

    return config_space
79 |
def get_IterativeImputer_config_space_no_estimator(n_features, random_state):
    """Build the search space for an iterative imputer without an estimator choice.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_nearest_features``.
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Contains ``initial_strategy``, ``n_nearest_features`` and
        ``imputation_order`` (plus ``random_state`` when supplied).
    """
    strategies = ['mean', 'median', 'most_frequent', 'constant']
    orders = ['ascending', 'descending', 'roman', 'arabic', 'random']

    # This is required because configspace doesn't allow None as a value
    seed_entry = {} if random_state is None else {'random_state': random_state}

    return ConfigurationSpace(
        space={
            'initial_strategy': Categorical('initial_strategy', strategies),
            'n_nearest_features': Integer('n_nearest_features', bounds=(1, n_features)),
            'imputation_order': Categorical('imputation_order', orders),
            **seed_entry,
        }
    )
98 |
def get_KNNImputer_config_space(n_samples):
    """Build the search space for a k-nearest-neighbors imputer.

    Parameters
    ----------
    n_samples : int
        Number of samples; feeds into the upper bound of ``n_neighbors``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_neighbors`` and ``weights``.
    """
    # NOTE(review): the bound is the larger of n_samples and 100, so it can
    # exceed n_samples on small data sets - confirm this is intended.
    neighbor_limit = max(n_samples, 100)

    neighbors = Integer('n_neighbors', bounds=(1, neighbor_limit))
    weighting = Categorical('weights', ['uniform', 'distance'])

    return ConfigurationSpace(space={'n_neighbors': neighbors, 'weights': weighting})
108 |
109 | def IterativeImputer_hyperparameter_parser(params):
110 | est = params['estimator']
111 | match est:
112 | case 'Bayesian':
113 | estimator = sklearn.linear_model.BayesianRidge()
114 | case 'RFR':
115 | estimator = sklearn.ensemble.RandomForestRegressor()
116 | case 'Ridge':
117 | estimator = sklearn.linear_model.Ridge()
118 | case 'KNN':
119 | estimator = sklearn.neighbors.KNeighborsRegressor()
120 |
121 | final_params = {
122 | 'estimator' : estimator,
123 | 'initial_strategy' : params['initial_strategy'],
124 | 'n_nearest_features' : params['n_nearest_features'],
125 | 'imputation_order' : params['imputation_order'],
126 | }
127 |
128 | if 'sample_posterior' in params:
129 | final_params['sample_posterior'] = params['sample_posterior']
130 |
131 | if 'random_state' in params:
132 | final_params['random_state'] = params['random_state']
133 |
134 | return final_params
--------------------------------------------------------------------------------
/tpot/config/mdr_configs.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from ConfigSpace import ConfigurationSpace
37 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
38 |
39 |
40 |
41 | #MDR
# Search space for MDR: both hyperparameters choose between 0 and 1.
MDR_configspace = ConfigurationSpace(space={
    'tie_break': Categorical('tie_break', [0, 1]),
    'default_label': Categorical('default_label', [0, 1]),
})
48 |
49 |
50 |
51 |
def get_skrebate_ReliefF_config_space(n_features):
    """Build the search space for the skrebate ReliefF selector.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_features_to_select`` and ``n_neighbors``, both as
        log-scaled integers.
    """
    selected = Integer('n_features_to_select', bounds=(1, n_features), log=True)
    neighbors = Integer('n_neighbors', bounds=(2, 500), log=True)

    return ConfigurationSpace(
        space={'n_features_to_select': selected, 'n_neighbors': neighbors}
    )
59 |
60 |
def get_skrebate_SURF_config_space(n_features):
    """Build the search space for the skrebate SURF selector.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_features_to_select`` as a log-scaled integer.
    """
    selected = Integer('n_features_to_select', bounds=(1, n_features), log=True)
    return ConfigurationSpace(space={'n_features_to_select': selected})
67 |
68 |
def get_skrebate_SURFstar_config_space(n_features):
    """Build the search space for the skrebate SURFstar selector.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_features_to_select`` as a log-scaled integer.
    """
    selected = Integer('n_features_to_select', bounds=(1, n_features), log=True)
    return ConfigurationSpace(space={'n_features_to_select': selected})
def get_skrebate_MultiSURF_config_space(n_features):
    """Build the search space for the skrebate MultiSURF selector.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_features_to_select`` as a log-scaled integer.
    """
    selected = Integer('n_features_to_select', bounds=(1, n_features), log=True)
    return ConfigurationSpace(space={'n_features_to_select': selected})
81 |
--------------------------------------------------------------------------------
/tpot/config/regressors_sklearnex.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from ConfigSpace import ConfigurationSpace
37 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
38 |
39 |
40 |
def get_RandomForestRegressor_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for a random-forest regressor.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``max_features``, ``bootstrap``, ``min_samples_split`` and
        ``min_samples_leaf``; ``n_estimators`` is fixed at 100.
    """
    feature_fraction = Float("max_features", bounds=(0.05, 1.0))
    bootstrap = Categorical("bootstrap", [True, False])
    split_size = Integer("min_samples_split", bounds=(2, 21))
    leaf_size = Integer("min_samples_leaf", bounds=(1, 21))

    hyperparameters = {
        'n_estimators': 100,
        'max_features': feature_fraction,
        'bootstrap': bootstrap,
        'min_samples_split': split_size,
        'min_samples_leaf': leaf_size,
    }

    # ConfigSpace doesn't allow None as a value, so only add a real seed.
    if random_state is not None:
        hyperparameters['random_state'] = random_state

    return ConfigurationSpace(space=hyperparameters)
56 |
57 |
def get_KNeighborsRegressor_ConfigurationSpace(n_samples):
    """Build the hyperparameter search space for a k-nearest-neighbors regressor.

    Parameters
    ----------
    n_samples : int
        Number of samples in the data set; used to cap ``n_neighbors``.

    Returns
    -------
    ConfigurationSpace
        Searches ``n_neighbors`` and ``weights``.
    """
    # The upper bound must never exceed the number of available samples, and
    # is additionally capped at 100. The previous ``max(n_samples, 100)``
    # allowed more neighbors than samples on small data sets and left the
    # range uncapped on large ones.
    max_neighbors = min(n_samples, 100)

    return ConfigurationSpace(
        space={
            'n_neighbors': Integer("n_neighbors", bounds=(1, max_neighbors)),
            'weights': Categorical("weights", ['uniform', 'distance']),
        }
    )
65 |
66 |
def get_Ridge_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for ridge regression.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``alpha`` and ``tol``; ``fit_intercept`` can only take True.
    """
    # ConfigSpace doesn't allow None as a value, so the seed entry is
    # only merged in when one was supplied.
    seed_entry = {} if random_state is None else {'random_state': random_state}

    return ConfigurationSpace(
        space={
            'alpha': Float("alpha", bounds=(0.0, 1.0)),
            'fit_intercept': Categorical("fit_intercept", [True]),
            'tol': Float("tol", bounds=(1e-5, 1e-1)),
            **seed_entry,
        }
    )
80 |
def get_Lasso_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for lasso regression.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``alpha``, ``precompute``, ``positive`` and ``selection``;
        ``tol`` is fixed at 0.001 and ``fit_intercept`` can only take True.
    """
    hyperparameters = {
        'alpha': Float("alpha", bounds=(0.0, 1.0)),
        'fit_intercept': Categorical("fit_intercept", [True]),
        'precompute': Categorical("precompute", [True, False, 'auto']),
        'tol': 0.001,
        'positive': Categorical("positive", [True, False]),
        'selection': Categorical("selection", ['cyclic', 'random']),
    }

    if random_state is None:
        # ConfigSpace doesn't allow None as a value; leave the seed out.
        return ConfigurationSpace(space=hyperparameters)

    hyperparameters['random_state'] = random_state
    return ConfigurationSpace(space=hyperparameters)
97 |
def get_ElasticNet_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for elastic-net regression.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``alpha`` and ``l1_ratio``, each within [0.0, 1.0].
    """
    unit_interval = (0.0, 1.0)
    hyperparameters = {
        'alpha': Float("alpha", bounds=unit_interval),
        'l1_ratio': Float("l1_ratio", bounds=unit_interval),
    }

    # ConfigSpace doesn't allow None as a value, so only add a real seed.
    if random_state is not None:
        hyperparameters.update(random_state=random_state)

    return ConfigurationSpace(space=hyperparameters)
110 |
111 |
def get_SVR_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for a support-vector regressor.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``kernel``, ``C`` (log scale) and ``degree``; ``max_iter``
        is fixed at 3000 and ``tol`` at 0.001.
    """
    kernel = Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid'])
    regularization = Float("C", bounds=(1e-4, 25), log=True)
    degree = Integer("degree", bounds=(1, 4))

    hyperparameters = {
        'kernel': kernel,
        'C': regularization,
        'degree': degree,
        'max_iter': 3000,
        'tol': 0.001,
    }

    # ConfigSpace doesn't allow None as a value, so only add a real seed.
    if random_state is not None:
        hyperparameters['random_state'] = random_state

    return ConfigurationSpace(space=hyperparameters)
127 |
def get_NuSVR_ConfigurationSpace(random_state):
    """Build the hyperparameter search space for a Nu support-vector regressor.

    Parameters
    ----------
    random_state : int or None
        Added as a fixed ``random_state`` entry unless it is None.

    Returns
    -------
    ConfigurationSpace
        Searches ``nu``, ``kernel``, ``C`` (log scale) and ``degree``;
        ``max_iter`` is fixed at 3000 and ``tol`` at 0.005.
    """
    # ConfigSpace doesn't allow None as a value, so the seed entry is
    # only merged in when one was supplied.
    seed_entry = {} if random_state is None else {'random_state': random_state}

    return ConfigurationSpace(
        space={
            'nu': Float("nu", bounds=(0.05, 1.0)),
            'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']),
            'C': Float("C", bounds=(1e-4, 25), log=True),
            'degree': Integer("degree", bounds=(1, 4)),
            'max_iter': 3000,
            'tol': 0.005,
            **seed_entry,
        }
    )
--------------------------------------------------------------------------------
/tpot/config/selectors.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | #TODO: how to best support transformers/selectors that take other transformers with their own hyperparameters?
37 | import numpy as np
38 | import sklearn
39 |
40 | from ConfigSpace import ConfigurationSpace
41 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
42 |
# Search space for SelectFwe: alpha, log scale, in [1e-4, 0.05].
SelectFwe_configspace = ConfigurationSpace(space={
    'alpha': Float('alpha', bounds=(1e-4, 0.05), log=True),
})

# Search space for SelectPercentile: percentile in [1, 100].
SelectPercentile_configspace = ConfigurationSpace(space={
    'percentile': Float('percentile', bounds=(1, 100.0)),
})

# Search space for VarianceThreshold: threshold, log scale, in [1e-4, 0.2].
VarianceThreshold_configspace = ConfigurationSpace(space={
    'threshold': Float('threshold', bounds=(1e-4, .2), log=True),
})
61 |
62 |
63 |
64 | # Note that RFE_configspace_part and SelectFromModel_configspace_part are not complete; they both require the estimator to be set.
65 | # These are intended to be used with the Wrapped search space.
# Partial search space for RFE: only the step size, in [1e-4, 1.0].
RFE_configspace_part = ConfigurationSpace(space={
    'step': Float('step', bounds=(1e-4, 1.0)),
})

# Partial search space for SelectFromModel: only the threshold, log scale,
# in [1e-4, 1.0].
SelectFromModel_configspace_part = ConfigurationSpace(space={
    'threshold': Float('threshold', bounds=(1e-4, 1.0), log=True),
})
77 |
--------------------------------------------------------------------------------
/tpot/config/special_configs.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from tpot.builtin_modules import ArithmeticTransformer, FeatureSetSelector
37 | from functools import partial
38 | import pandas as pd
39 | import numpy as np
40 | from tpot.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
41 |
42 | from ConfigSpace import ConfigurationSpace
43 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
44 |
def get_ArithmeticTransformer_ConfigurationSpace():
    """Build the search space for the arithmetic transformer.

    Returns
    -------
    ConfigurationSpace
        A single categorical hyperparameter ``function`` naming the
        operation to apply.
    """
    operations = [
        "add", "mul_neg_1", "mul", "safe_reciprocal",
        "eq", "ne", "ge", "gt", "le", "lt",
        "min", "max",
        "0", "1",
    ]
    return ConfigurationSpace(space={'function': Categorical("function", operations)})
51 |
52 |
53 |
54 |
55 | # AddTransformer: {}
56 | # mul_neg_1_Transformer: {}
57 | # MulTransformer: {}
58 | # SafeReciprocalTransformer: {}
59 | # EQTransformer: {}
60 | # NETransformer: {}
61 | # GETransformer: {}
62 | # GTTransformer: {}
63 | # LETransformer: {}
64 | # LTTransformer: {}
65 | # MinTransformer: {}
66 | # MaxTransformer: {}
67 |
68 |
--------------------------------------------------------------------------------
/tpot/config/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/tpot/config/tests/__init__.py
--------------------------------------------------------------------------------
/tpot/config/tests/test_get_configspace.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tpot
3 | import sys
4 | from sklearn.datasets import load_iris
5 | import random
6 | import sklearn
7 |
8 | import tpot.config
9 |
10 | from ..get_configspace import STRING_TO_CLASS, GROUPNAMES
11 |
def test_loop_through_all_hyperparameters():
    """Sample and export pipelines for every name in STRING_TO_CLASS.

    The test passes when no call raises; nothing is asserted about the
    exported estimators themselves.
    """
    search_space_kwargs = dict(
        n_classes=3,
        n_samples=100,
        n_features=100,
        random_state=None,
    )

    for class_name in STRING_TO_CLASS:
        print(class_name)
        generator = tpot.config.get_search_space(class_name, **search_space_kwargs)

        # generate 25 random hyperparameter sets and make sure they all export
        for _ in range(25):
            generator.generate().export_pipeline()
27 |
@pytest.mark.skipif(sys.platform == 'darwin', reason="sklearnex dependency not available on macOS")
def test_loop_through_groupnames():
    """Sample and export pipelines for every class name in every GROUPNAMES group.

    The test passes when no call raises; nothing is asserted about the
    exported estimators themselves.
    """
    search_space_kwargs = dict(
        n_classes=3,
        n_samples=100,
        n_features=100,
        random_state=None,
    )

    for group in GROUPNAMES.values():
        for class_name in group:
            print(class_name)
            generator = tpot.config.get_search_space(class_name, **search_space_kwargs)

            # generate 25 random hyperparameter sets and make sure they all export
            for _ in range(25):
                generator.generate().export_pipeline()
--------------------------------------------------------------------------------
/tpot/evolvers/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_evolver import *
2 | from .steady_state_evolver import *
--------------------------------------------------------------------------------
/tpot/individual.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from abc import abstractmethod
37 | import types
38 | import numpy as np
39 | import copy
40 | import copy
41 | import typing
42 |
43 |
class BaseIndividual:
    """Base individual holding lists of mutation and crossover operators.

    ``mutation_list`` holds zero-argument callables and ``crossover_list``
    holds callables that take the other individual. An operator signals
    success by returning a truthy value. Both lists start out empty.
    """

    def __init__(self) -> None:
        self.mutation_list = []
        self.crossover_list = []

    def mutate(self, rng=None):
        """Try the mutation operators in random order until one succeeds.

        Parameters
        ----------
        rng : seed or numpy Generator, optional
            Passed to ``np.random.default_rng`` to shuffle the operators.

        Returns
        -------
        bool
            True as soon as an operator returns a truthy value, False if
            none does (including when the list is empty).
        """
        generator = np.random.default_rng(rng)
        # Shuffle a copy so the stored operator order is left untouched.
        candidates = self.mutation_list.copy()
        generator.shuffle(candidates)
        return any(operator() for operator in candidates)

    def crossover(self, ind2, rng=None):
        """Try the crossover operators with ``ind2`` in random order.

        Parameters
        ----------
        ind2 : object
            The other individual, handed to each operator.
        rng : seed or numpy Generator, optional
            Passed to ``np.random.default_rng`` to shuffle the operators.

        Returns
        -------
        bool
            True as soon as an operator returns a truthy value, False if
            none does (including when the list is empty).
        """
        generator = np.random.default_rng(rng)
        # Shuffle a copy so the stored operator order is left untouched.
        candidates = self.crossover_list.copy()
        generator.shuffle(candidates)
        return any(operator(ind2) for operator in candidates)

    # a guided change of an individual when given an objective function
    def optimize(self, objective_function, rng=None , steps=5):
        """Apply ``mutate`` ``steps`` times, sharing one random generator.

        ``objective_function`` is accepted but not used by this base
        implementation. Returns None.
        """
        generator = np.random.default_rng(rng)
        remaining = steps
        while remaining > 0:
            self.mutate(rng=generator)
            remaining -= 1

    # Return a hashable unique to this individual setup.
    # For use when evaluating whether or not an individual is 'the same' as
    # another individual.
    def unique_id(self):
        """Return the identifier of this individual; here, the object itself."""
        return self
79 |
80 |
81 | #TODO https://www.pythontutorial.net/python-oop/python-__hash__/
82 | #python hashing and __eq__ functions look into
83 | #whether or not this would be a better way of doing things
84 |
85 | # #TODO: use this instead of unique_id()?
86 | # #unique_id() and __repr__ could have different levels of specificity.
87 | # def __repr__(self) -> str:
88 | # pass
89 |
90 | # def __hash__(self) -> int:
91 | # pass
92 |
93 | # def __eq__(self, other):
94 | # self.unique_id() == other.unique_id()
95 |
--------------------------------------------------------------------------------
/tpot/logbook.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
class CallBackInterface():
    """Collection of callback hooks that all do nothing in this class.

    Every method accepts its arguments, performs no work and returns None.
    """

    def __init__(self) -> None:
        """Create the callback object; no state is set up."""
        return None

    def step_callback(self, population):
        """Hook receiving ``population``; a no-op here."""
        return None

    def population_mutate_callback(self, offspring, parent=None):
        """Hook receiving ``offspring`` and an optional ``parent``; a no-op here."""
        return None

    def population_crossover_callback(self, offspring, parent=None):
        """Hook receiving ``offspring`` and an optional ``parent``; a no-op here."""
        return None

    def evolutionary_algorithm_step_callback(self, population):
        """Hook receiving ``population``; a no-op here."""
        return None
51 |
class Logbook():
    """Empty placeholder class; it defines no attributes or methods."""
55 |
56 |
--------------------------------------------------------------------------------
/tpot/objectives/__init__.py:
--------------------------------------------------------------------------------
1 | from .average_path_length import average_path_length_objective
2 | from .number_of_nodes import number_of_nodes_objective
3 | from .number_of_leaves import number_of_leaves_scorer, number_of_leaves_objective
4 | from .complexity import complexity_scorer
5 |
6 |
# Scorers, looked up by name. These are calculated per fold of CV on the
# fitted pipeline for that fold.
SCORERS = {"complexity_scorer": complexity_scorer}
11 |
# Objectives, looked up by name. These are calculated once on unfitted
# models as secondary objectives.
OBJECTIVES = {
    "average_path_length_objective": average_path_length_objective,
    "number_of_nodes_objective": number_of_nodes_objective,
    "number_of_leaves_objective": number_of_leaves_objective,
}
--------------------------------------------------------------------------------
/tpot/objectives/average_path_length.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import networkx as nx
37 | import numpy as np
38 |
def average_path_length_objective(graph_pipeline):
    """
    Return one plus the mean shortest-path length measured from the root/final
    estimator of a GraphPipeline (only supported for GraphPipeline).

    The distances come from ``nx.shortest_path_length`` called with
    ``graph_pipeline.root`` as the source on ``graph_pipeline.graph``.

    Parameters
    ----------
    graph_pipeline: GraphPipeline
        The pipeline whose ``graph`` and ``root`` attributes are read.

    Returns
    -------
    The mean of the collected path lengths, plus 1.
    """
    distance_by_node = nx.shortest_path_length(
        graph_pipeline.graph,
        source=graph_pipeline.root,
    )
    distances = np.array(list(distance_by_node.values()))
    return np.mean(distances) + 1
--------------------------------------------------------------------------------
/tpot/objectives/number_of_leaves.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
def number_of_leaves_scorer(est, X=None, y=None):
    """
    Count the nodes of ``est.graph`` whose out-degree is zero.

    ``X`` and ``y`` are accepted but never used by this function.
    """
    leaf_count = 0
    for _node, degree in est.graph.out_degree():
        if degree == 0:
            leaf_count += 1
    return leaf_count
38 |
def number_of_leaves_objective(est):
    """
    Count the leaves (input nodes) of a GraphPipeline, i.e. the nodes of
    ``est.graph`` whose out-degree is zero.

    Parameters
    ----------
    est: GraphPipeline
        The pipeline whose ``graph`` attribute is inspected.

    Returns
    -------
    int
        Number of nodes with out-degree 0.
    """
    return sum(1 for _node, degree in est.graph.out_degree() if degree == 0)
--------------------------------------------------------------------------------
/tpot/objectives/number_of_nodes.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from ..graphsklearn import GraphPipeline
37 | from sklearn.pipeline import Pipeline
38 | import sklearn
39 |
def number_of_nodes_objective(est):
    """
    Count the nodes in a (possibly nested) pipeline.

    GraphPipeline nodes, Pipeline steps and FeatureUnion transformers are
    visited recursively and their counts summed; any other object counts as
    a single node.

    Parameters
    ----------
    est: GraphPipeline | Pipeline | FeatureUnion | BaseEstimator
        The pipeline to compute the number of nodes from.

    Returns
    -------
    int
        Total number of non-composite objects found.
    """
    if isinstance(est, GraphPipeline):
        children = (est.graph.nodes[name]["instance"] for name in est.graph.nodes)
    elif isinstance(est, Pipeline):
        children = (step for _, step in est.steps)
    elif isinstance(est, sklearn.pipeline.FeatureUnion):
        children = (transformer for _, transformer in est.transformer_list)
    else:
        # Anything that is not one of the composite types is a single node.
        return 1

    return sum(number_of_nodes_objective(child) for child in children)
--------------------------------------------------------------------------------
/tpot/objectives/tests/test_complexity_objective.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/tpot/objectives/tests/test_complexity_objective.py
--------------------------------------------------------------------------------
/tpot/objectives/tests/test_number_of_nodes.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tpot
3 | from sklearn.datasets import load_iris
4 | import random
5 | import sklearn
6 |
7 | from sklearn.svm import SVC
8 | from sklearn.preprocessing import StandardScaler
9 | from sklearn.linear_model import LogisticRegression
10 | from sklearn.datasets import make_classification
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.pipeline import Pipeline
13 | import networkx as nx
14 | import tpot
15 | from tpot import GraphPipeline
16 | import sklearn.metrics
17 |
def test_number_of_nodes_objective_Graphpipeline():
    """A GraphPipeline with four plain estimator nodes is counted as four nodes."""
    graph = nx.DiGraph()

    estimators = (
        ("scaler", StandardScaler()),
        ("svc", SVC()),
        ("LogisticRegression", LogisticRegression()),
        ("LogisticRegression2", LogisticRegression()),
    )
    for name, instance in estimators:
        graph.add_node(name, instance=instance)

    edges = (
        ("svc", "scaler"),
        ("LogisticRegression", "scaler"),
        ("LogisticRegression2", "LogisticRegression"),
        ("LogisticRegression2", "svc"),
    )
    for parent, child in edges:
        graph.add_edge(parent, child)

    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective
    assert count_nodes(GraphPipeline(graph)) == 4
34 |
def test_number_of_nodes_objective_Pipeline():
    """A two-step sklearn Pipeline is counted as two nodes."""
    steps = [("scaler", StandardScaler()), ("svc", SVC())]
    two_step_pipeline = Pipeline(steps)

    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective
    assert count_nodes(two_step_pipeline) == 2
39 |
def test_number_of_nodes_objective_not_pipeline_or_graphpipeline():
    """A bare estimator (no pipeline wrapper) is counted as exactly one node."""
    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective

    svc_count = count_nodes(SVC())
    assert svc_count == 1

    scaler_count = count_nodes(StandardScaler())
    assert scaler_count == 1

    logistic_count = count_nodes(LogisticRegression())
    assert logistic_count == 1
44 |
def test_number_of_nodes_objective_pipeline_in_graphpipeline():
    """A two-step Pipeline stored in a graph node adds two nodes to the total."""
    nested_pipeline = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])

    graph = nx.DiGraph()
    graph.add_node("scaler", instance=StandardScaler())
    graph.add_node("pipe", instance=nested_pipeline)
    graph.add_edge("pipe", "scaler")

    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective
    assert count_nodes(GraphPipeline(graph)) == 3
56 |
def test_number_of_nodes_objective_graphpipeline_in_pipeline():
    """A four-node GraphPipeline appended to a two-step Pipeline gives six nodes."""
    outer_pipeline = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])

    graph = nx.DiGraph()

    estimators = (
        ("scaler", StandardScaler()),
        ("svc", SVC()),
        ("LogisticRegression", LogisticRegression()),
        ("LogisticRegression2", LogisticRegression()),
    )
    for name, instance in estimators:
        graph.add_node(name, instance=instance)

    edges = (
        ("svc", "scaler"),
        ("LogisticRegression", "scaler"),
        ("LogisticRegression2", "LogisticRegression"),
        ("LogisticRegression2", "svc"),
    )
    for parent, child in edges:
        graph.add_edge(parent, child)

    graph_pipeline = GraphPipeline(graph)

    # The graph pipeline is added as a third step after construction.
    outer_pipeline.steps.append(("graphpipe", graph_pipeline))

    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective
    assert count_nodes(outer_pipeline) == 6
77 |
78 |
def test_number_of_nodes_objective_graphpipeline_in_graphpipeline():
    """A four-node GraphPipeline nested as one node among three others gives seven nodes."""
    inner_graph = nx.DiGraph()

    inner_estimators = (
        ("scaler", StandardScaler()),
        ("svc", SVC()),
        ("LogisticRegression", LogisticRegression()),
        ("LogisticRegression2", LogisticRegression()),
    )
    for name, instance in inner_estimators:
        inner_graph.add_node(name, instance=instance)

    inner_edges = (
        ("svc", "scaler"),
        ("LogisticRegression", "scaler"),
        ("LogisticRegression2", "LogisticRegression"),
        ("LogisticRegression2", "svc"),
    )
    for parent, child in inner_edges:
        inner_graph.add_edge(parent, child)

    inner_pipeline = GraphPipeline(inner_graph)

    outer_graph = nx.DiGraph()

    outer_estimators = (
        ("g1", inner_pipeline),
        ("svc", SVC()),
        ("LogisticRegression", LogisticRegression()),
        ("LogisticRegression2", LogisticRegression()),
    )
    for name, instance in outer_estimators:
        outer_graph.add_node(name, instance=instance)

    outer_edges = (
        ("svc", "g1"),
        ("LogisticRegression", "g1"),
        ("LogisticRegression2", "LogisticRegression"),
        ("LogisticRegression2", "svc"),
    )
    for parent, child in outer_edges:
        outer_graph.add_edge(parent, child)

    outer_pipeline = GraphPipeline(outer_graph)

    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective
    assert count_nodes(outer_pipeline) == 7
109 |
def test_number_of_nodes_objective_pipeline_in_pipeline():
    """A two-step Pipeline nested as the first step of another Pipeline gives three nodes."""
    inner_pipeline = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])
    outer_steps = [("pipe", inner_pipeline), ("svc", SVC())]
    outer_pipeline = Pipeline(outer_steps)

    count_nodes = tpot.objectives.number_of_nodes.number_of_nodes_objective
    assert count_nodes(outer_pipeline) == 3
116 |
--------------------------------------------------------------------------------
/tpot/old_config_utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from .old_config_utils import convert_config_dict_to_list, convert_config_dict_to_choicepipeline, convert_config_dict_to_graphpipeline, convert_config_dict_to_linearpipeline
--------------------------------------------------------------------------------
/tpot/search_spaces/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | from . import nodes
3 | from . import pipelines
--------------------------------------------------------------------------------
/tpot/search_spaces/graph_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import networkx as nx
37 | import numpy as np
38 |
39 |
def remove_and_stitch(graph, node):
    """
    Remove ``node`` from ``graph`` and connect every one of its predecessors
    to every one of its successors.

    Parameters
    ----------
    graph:
        Directed graph exposing ``successors``, ``predecessors``,
        ``remove_node`` and ``add_edge`` (e.g. a ``networkx.DiGraph``).
        It is modified in place.
    node:
        The node to remove.

    Returns
    -------
    None
    """
    # ``successors``/``predecessors`` may hand back one-shot iterators. They
    # are materialised up front so that (a) they are captured before the node
    # is removed and (b) the predecessors can be walked once per successor --
    # a bare iterator would be exhausted after the first successor, leaving
    # the remaining successors unstitched.
    successors = list(graph.successors(node))
    predecessors = list(graph.predecessors(node))

    graph.remove_node(node)

    for s in successors:
        for p in predecessors:
            graph.add_edge(p, s)
49 |
50 |
def remove_nodes_disconnected_from_node(graph, node):
    """
    Remove, in place, every node of ``graph`` that is neither ``node`` itself
    (compared by identity) nor one of its descendants as reported by
    ``nx.descendants``.
    """
    reachable = nx.descendants(graph, node)
    # Iterate over a snapshot because nodes are removed during the loop.
    for candidate in list(graph.nodes):
        if candidate in reachable or candidate is node:
            continue
        graph.remove_node(candidate)
56 | #graph.remove_nodes_from([n for n in graph.nodes if n not in nx.descendants(graph, node) and n is not node])
57 |
58 |
def get_roots(graph):
    """Return the list of nodes of ``graph`` whose in-degree is zero."""
    roots = []
    for vertex, degree in graph.in_degree():
        if degree == 0:
            roots.append(vertex)
    return roots
61 |
def get_leaves(graph):
    """Return the list of nodes of ``graph`` whose out-degree is zero."""
    leaves = []
    for vertex, degree in graph.out_degree():
        if degree == 0:
            leaves.append(vertex)
    return leaves
64 |
def get_max_path_through_node(graph, root, node):
    """
    Return the largest ``get_max_path_size`` value associated with ``node``.

    If ``node`` has no successors, this is the value for ``root`` -> ``node``.
    Otherwise it is the maximum of the values for ``root`` -> leaf over every
    descendant of ``node`` that itself has no successors.
    """
    if not list(graph.successors(node)):
        return get_max_path_size(graph, root, node)

    leaf_descendants = [
        descendant
        for descendant in nx.descendants(graph, node)
        if not list(graph.successors(descendant))
    ]
    sizes = [get_max_path_size(graph, root, leaf) for leaf in leaf_descendants]
    return max(sizes)
72 |
73 |
def get_max_path_size(graph, fromnode1, tonode2, return_path=False):
    """
    Find the longest simple path from ``fromnode1`` to ``tonode2``.

    When both arguments are the same object the path is just ``[fromnode1]``;
    otherwise the longest of ``nx.all_simple_paths`` is used.

    Returns
    -------
    The path itself (a list of nodes) when ``return_path`` is true, otherwise
    the number of nodes on that path.
    """
    if fromnode1 is tonode2:
        longest = [fromnode1]
    else:
        # Longest path by node count among all simple paths.
        longest = max(nx.all_simple_paths(graph, fromnode1, tonode2), key=len)

    return longest if return_path else len(longest)
84 |
85 |
def invert_dictionary(d):
    """
    Group the keys of ``d`` by their value.

    Returns a dict mapping each distinct value of ``d`` to the set of keys
    that carried it.
    """
    keys_by_value = {}
    for key, value in d.items():
        if value not in keys_by_value:
            keys_by_value[value] = set()
        keys_by_value[value].add(key)
    return keys_by_value
92 |
def select_nodes_same_depth(g1, node1, g2, node2, rng=None):
    """
    Lazily yield, in shuffled order, every pair ``(n1, n2)`` where ``n1`` is a
    node of ``g1`` and ``n2`` a node of ``g2`` that share the same
    shortest-path distance from ``node1`` and ``node2`` respectively.

    Parameters
    ----------
    g1, g2:
        Graphs passed to ``nx.shortest_path_length``.
    node1, node2:
        Source nodes the distances are measured from.
    rng:
        Seed or generator forwarded to ``np.random.default_rng``.
    """
    rng = np.random.default_rng(rng)

    depth_in_g1 = nx.shortest_path_length(g1, source=node1)
    depth_in_g2 = nx.shortest_path_length(g2, source=node2)
    deepest = max(list(depth_in_g1.values()) + list(depth_in_g2.values()))

    # depth -> set of nodes found at that depth
    g1_by_depth = invert_dictionary(depth_in_g1)
    g2_by_depth = invert_dictionary(depth_in_g2)

    possible_pairs = [
        (n1, n2)
        for depth in range(deepest + 1)
        if depth in g1_by_depth and depth in g2_by_depth
        for n1 in g1_by_depth[depth]
        for n2 in g2_by_depth[depth]
    ]

    rng.shuffle(possible_pairs)

    yield from possible_pairs
123 |
def select_nodes_randomly(g1, g2, rng=None):
    """
    Lazily yield pairs ``(node1, node2)`` with ``node1`` from ``g1.nodes`` and
    ``node2`` from ``g2.nodes``, each list having been shuffled first.

    Pairs whose two members are the very same object are skipped. ``rng`` is
    forwarded to ``np.random.default_rng``.
    """
    rng = np.random.default_rng(rng)

    # g1's nodes are shuffled before g2's.
    first_order = list(g1.nodes)
    rng.shuffle(first_order)

    second_order = list(g2.nodes)
    rng.shuffle(second_order)

    for node1 in first_order:
        yield from (
            (node1, node2) for node2 in second_order if node1 is not node2
        )
--------------------------------------------------------------------------------
/tpot/search_spaces/nodes/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | from .estimator_node import *
37 | from .genetic_feature_selection import *
38 | from .fss_node import *
--------------------------------------------------------------------------------
/tpot/search_spaces/nodes/estimator_node.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | # try https://automl.github.io/ConfigSpace/main/api/hyperparameters.html
37 |
38 | import numpy as np
39 | from ..base import SklearnIndividual, SearchSpace
40 | from ConfigSpace import ConfigurationSpace
41 | from typing import final
42 |
43 |
def default_hyperparameter_parser(params: dict) -> dict:
    """Identity parser: hand back the very same ``params`` object, unmodified."""
    parsed = params
    return parsed
46 |
47 |
class EstimatorNodeIndividual(SklearnIndividual):
    """
    Individual holding one estimator class plus one concrete set of hyperparameters.

    Note that ConfigurationSpace does not support None as a parameter. Instead, use TPOT's
    special placeholder string; TPOT will automatically replace instances of this string with
    the Python None.
    NOTE(review): the placeholder's spelling is blank in this copy of the docstring
    (presumably "<NONE>") -- confirm against the hyperparameter parsers.

    Parameters
    ----------
    method : type
        The class of the estimator to be used

    space : ConfigurationSpace|dict
        The hyperparameter space to be used. If a dict is passed, hyperparameters are fixed and not learned.

    hyperparameter_parser : callable
        Applied to the hyperparameter dict before it is passed to ``method`` in
        ``export_pipeline``. Defaults to ``default_hyperparameter_parser``.

    rng :
        Seed or generator forwarded to ``np.random.default_rng``; only used when
        ``space`` is not a dict.
    """
    # TODO If a dict is passed, hyperparameters are fixed and not learned. Is this confusing? Should we make a second node type?
    def __init__(self, method: type,
                        space: ConfigurationSpace|dict,
                        hyperparameter_parser: callable = None,
                        rng=None) -> None:
        super().__init__()
        self.method = method
        self.space = space
        self.hyperparameter_parser = (
            default_hyperparameter_parser
            if hyperparameter_parser is None
            else hyperparameter_parser
        )

        if isinstance(space, dict):
            # Fixed hyperparameters: the dict is stored as-is (not copied).
            self.hyperparameters = space
            return

        rng = np.random.default_rng(rng)
        self.space.seed(rng.integers(0, 2**32))
        self.hyperparameters = dict(self.space.sample_configuration())

    def mutate(self, rng=None):
        """
        Resample all hyperparameters from ``self.space``.

        Returns False (and changes nothing) when the space is a fixed dict,
        True otherwise.
        """
        if not isinstance(self.space, dict):
            generator = np.random.default_rng(rng)
            self.space.seed(generator.integers(0, 2**32))
            self.hyperparameters = dict(self.space.sample_configuration())
            return True

        return False

    def crossover(self, other, rng=None):
        """
        Copy a random subset of ``other``'s hyperparameter values into this individual.

        Returns False when the space is a fixed dict or when the two individuals
        wrap different ``method`` objects; True otherwise.
        """
        if isinstance(self.space, dict):
            return False

        rng = np.random.default_rng(rng)
        if self.method != other.method:
            return False

        # One coin flip per hyperparameter of the space; on heads the value is
        # taken from ``other`` if it has one under that name.
        for name in self.space:
            take_from_other = rng.choice([True, False])
            if take_from_other and name in other.hyperparameters:
                self.hyperparameters[name] = other.hyperparameters[name]

        return True

    @final #this method should not be overridden, instead override hyperparameter_parser
    def export_pipeline(self, **kwargs):
        """Instantiate ``self.method`` with the parsed hyperparameters; ``kwargs`` are ignored."""
        parsed = self.hyperparameter_parser(self.hyperparameters)
        return self.method(**parsed)

    def unique_id(self):
        """Return a string of the form ``Method(a=1, b=2)`` with parameters sorted by name."""
        rendered = ', '.join(
            f'{param}={self.hyperparameters[param]}'
            for param in sorted(self.hyperparameters)
        )
        return f"{self.method.__name__}({rendered})"
121 |
class EstimatorNode(SearchSpace):
    """
    Search space that produces ``EstimatorNodeIndividual`` objects for one
    estimator class.

    Parameters
    ----------
    method :
        Stored and forwarded as the individual's ``method``.
    space :
        Stored and forwarded as the individual's ``space``.
    hyperparameter_parser :
        Stored and forwarded to the individual; defaults to
        ``default_hyperparameter_parser``.
    """
    def __init__(self, method, space, hyperparameter_parser=default_hyperparameter_parser):
        self.method = method
        self.space = space
        self.hyperparameter_parser = hyperparameter_parser

    def generate(self, rng=None):
        """Create a new individual from the stored settings, passing ``rng`` through."""
        return EstimatorNodeIndividual(
            method=self.method,
            space=self.space,
            hyperparameter_parser=self.hyperparameter_parser,
            rng=rng,
        )
--------------------------------------------------------------------------------
/tpot/search_spaces/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | from .choice import *
2 | from .dynamic_linear import *
3 | from .sequential import *
4 | from .graph import *
5 | from .tree import *
6 | from .wrapper import *
7 | from .dynamicunion import *
8 | from .union import *
--------------------------------------------------------------------------------
/tpot/search_spaces/pipelines/choice.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import tpot
37 | import numpy as np
38 | import pandas as pd
39 | import sklearn
40 | from tpot import config
41 | from typing import Generator, List, Tuple, Union
42 | import random
43 | from ..base import SklearnIndividual, SearchSpace
44 |
class ChoicePipelineIndividual(SklearnIndividual):
    """
    Individual that holds a single node generated from one randomly chosen
    search space out of ``search_spaces``.
    """
    def __init__(self, search_spaces : List[SearchSpace], rng=None) -> None:
        super().__init__()
        rng = np.random.default_rng(rng)
        self.search_spaces = search_spaces
        chosen_space = rng.choice(self.search_spaces)
        self.node = chosen_space.generate(rng=rng)

    def mutate(self, rng=None):
        """With equal probability, either replace the node or mutate it in place."""
        rng = np.random.default_rng(rng)
        replace_node = rng.choice([True, False])
        operation = self._mutate_select_new_node if replace_node else self._mutate_node
        return operation(rng)

    def _mutate_select_new_node(self, rng=None):
        """Replace the node with a fresh one from a randomly chosen search space; always returns True."""
        rng = np.random.default_rng(rng)
        chosen_space = rng.choice(self.search_spaces)
        self.node = chosen_space.generate(rng=rng)
        return True

    def _mutate_node(self, rng=None):
        """Delegate to the current node's ``mutate`` and return its result."""
        return self.node.mutate(rng)

    def crossover(self, other, rng=None):
        """Delegate to the current node's ``crossover`` with ``other.node``."""
        return self.node.crossover(other.node, rng)

    def export_pipeline(self, **kwargs):
        """Delegate to the current node's ``export_pipeline``, forwarding ``kwargs``."""
        return self.node.export_pipeline(**kwargs)

    def unique_id(self):
        """Return the current node's ``unique_id``."""
        return self.node.unique_id()
76 |
77 |
class ChoicePipeline(SearchSpace):
    """
    Takes in a list of search spaces. Will select one node from the search space.

    """
    def __init__(self, search_spaces : List[SearchSpace] ) -> None:
        self.search_spaces = search_spaces

    def generate(self, rng=None):
        """Build a ``ChoicePipelineIndividual`` over the stored search spaces."""
        generator = np.random.default_rng(rng)
        return ChoicePipelineIndividual(self.search_spaces, rng=generator)
--------------------------------------------------------------------------------
/tpot/search_spaces/pipelines/tests/test_graphspace.py:
--------------------------------------------------------------------------------
1 | # Test all nodes have all dictionaries
2 | import pytest
3 | import tpot
4 |
5 | import tpot
6 | from ConfigSpace import ConfigurationSpace
7 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
8 | from sklearn.neighbors import KNeighborsClassifier
9 | from sklearn.linear_model import LogisticRegression
10 | from sklearn.tree import DecisionTreeClassifier
11 | from sklearn.preprocessing import StandardScaler
12 |
13 |
def test_merge_duplicate_nodes():
    """Four identical inserted leaves collapse so that only two nodes remain after merging."""
    # Empty dicts: both nodes have fixed (default) hyperparameters.
    knn_configspace = {}
    standard_scaler_configspace = {}

    nodes = tpot.search_spaces.nodes
    knn_node = nodes.EstimatorNode(method=KNeighborsClassifier, space=knn_configspace)
    scaler_node = nodes.EstimatorNode(method=StandardScaler, space=standard_scaler_configspace)

    graph_search_space = tpot.search_spaces.pipelines.GraphSearchPipeline(
        root_search_space=knn_node,
        leaf_search_space=scaler_node,
        inner_search_space=None,
        max_size=10,
    )

    ind = graph_search_space.generate()

    # all of these leaves should be identical
    for _ in range(4):
        ind._mutate_insert_leaf()

    ind._merge_duplicated_nodes()

    remaining_nodes = len(ind.graph.nodes)
    assert remaining_nodes == 2
--------------------------------------------------------------------------------
/tpot/search_spaces/pipelines/tree.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import tpot
37 | import numpy as np
38 | import pandas as pd
39 | import sklearn
40 | from tpot import config
41 | from typing import Generator, List, Tuple, Union
42 | import random
43 | from ..base import SklearnIndividual, SearchSpace
44 | import networkx as nx
45 | import copy
46 | import matplotlib.pyplot as plt
47 |
48 | from .graph import GraphPipelineIndividual
49 |
50 |
51 | from ..graph_utils import *
52 |
class TreePipelineIndividual(GraphPipelineIndividual):
    """
    Graph pipeline individual configured with a fixed set of crossover and
    mutation operators.

    Every keyword argument is forwarded unchanged to the parent constructor.
    Afterwards the operator lists are overridden and the merging of
    duplicated nodes is switched off.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

        # Crossover operators this individual may draw from.
        self.crossover_methods_list = [
            self._crossover_swap_branch,
            self._crossover_swap_node,
            self._crossover_nodes,
        ]

        # Mutation operators this individual may draw from.
        self.mutate_methods_list = [
            self._mutate_insert_leaf,
            self._mutate_insert_inner_node,
            self._mutate_remove_node,
            self._mutate_node,
        ]

        # Duplicated nodes are never merged for this individual type.
        self.merge_duplicated_nodes_toggle = False
61 |
62 |
63 |
class TreePipeline(SearchSpace):
    def __init__(self, root_search_space: SearchSpace,
                 leaf_search_space: SearchSpace = None,
                 inner_search_space: SearchSpace = None,
                 min_size: int = 2,
                 max_size: int = 10,
                 crossover_same_depth=False) -> None:
        """
        Generates a pipeline of variable length. Pipeline will have a tree structure similar to TPOT1.

        Parameters
        ----------
        root_search_space : SearchSpace
            Search space stored as ``self.search_space`` and used for the root.
        leaf_search_space : SearchSpace, optional
            Search space for leaf nodes. The default is None.
        inner_search_space : SearchSpace, optional
            Search space for inner nodes. The default is None.
        min_size : int, optional
            Stored on the search space. The default is 2.
        max_size : int, optional
            Stored on the search space and forwarded to generated individuals. The default is 10.
        crossover_same_depth : bool, optional
            Forwarded to generated individuals. The default is False.
        """

        self.search_space = root_search_space
        self.leaf_search_space = leaf_search_space
        self.inner_search_space = inner_search_space
        self.min_size = min_size
        self.max_size = max_size
        self.crossover_same_depth = crossover_same_depth

    def generate(self, rng=None):
        """
        Create a new TreePipelineIndividual from this search space.

        Parameters
        ----------
        rng : int, np.random.Generator, optional
            Seed or generator; normalised with ``np.random.default_rng``.

        Returns
        -------
        TreePipelineIndividual
        """
        rng = np.random.default_rng(rng)

        # TreePipelineIndividual.__init__ is declared as ``(self, **kwargs)``,
        # so it accepts keyword arguments only; the previous positional call
        # always raised TypeError.
        # NOTE(review): the keyword names below are assumed to match the
        # parent GraphPipelineIndividual constructor -- confirm. ``min_size``
        # is stored on this search space but not forwarded; confirm whether
        # the individual supports a minimum size before passing it on.
        return TreePipelineIndividual(
            root_search_space=self.search_space,
            leaf_search_space=self.leaf_search_space,
            inner_search_space=self.inner_search_space,
            max_size=self.max_size,
            crossover_same_depth=self.crossover_same_depth,
            rng=rng,
        )
--------------------------------------------------------------------------------
/tpot/search_spaces/pipelines/union.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import tpot
37 | import numpy as np
38 | import pandas as pd
39 | import sklearn
40 | from tpot import config
41 | from typing import Generator, List, Tuple, Union
42 | import random
43 | from ..base import SklearnIndividual, SearchSpace
44 | from ..tuple_index import TupleIndex
45 |
class UnionPipelineIndividual(SklearnIndividual):
    """
    Takes in a list of search spaces and generates one sub-individual from each.
    The exported estimator is a scikit-learn FeatureUnion (built with
    ``sklearn.pipeline.make_union``) whose steps correspond, by index, to the
    provided search spaces.
    """

    def __init__(self, search_spaces : List[SearchSpace], rng=None) -> None:
        """
        Parameters
        ----------
        search_spaces : List[SearchSpace]
            One search space per step of the union.
        rng : int, np.random.Generator, optional
            Seed or generator used to generate every step.
        """
        super().__init__()
        self.search_spaces = search_spaces

        # Normalise once so that an integer seed yields one shared generator
        # instead of re-seeding every sub-space with the same value.
        rng = np.random.default_rng(rng)

        # One sub-individual per search space, in the same order.
        self.pipeline = [space.generate(rng) for space in self.search_spaces]

    def mutate(self, rng=None):
        """Mutate one randomly chosen step; return that step's mutate() result (False if there are no steps)."""
        rng = np.random.default_rng(rng)
        if not self.pipeline:
            return False
        # Draw an index rather than calling rng.choice on the list itself:
        # numpy would first coerce the list of individuals into an array.
        step = self.pipeline[rng.integers(len(self.pipeline))]
        return step.mutate(rng)

    def crossover(self, other, rng=None):
        """
        Try the available crossover operators in random order and stop at the
        first one that reports success.

        Returns
        -------
        bool
            True if any operator succeeded, False otherwise.
        """
        rng = np.random.default_rng(rng)

        cx_funcs = [self._crossover_node, self._crossover_swap_node]
        rng.shuffle(cx_funcs)
        for cx_func in cx_funcs:
            if cx_func(other, rng):
                return True

        return False

    def _crossover_swap_node(self, other, rng):
        """Swap a random step with the step at the same index in ``other``."""
        rng = np.random.default_rng(rng)

        # Only indices present in both pipelines can be swapped.
        n_shared = min(len(self.pipeline), len(other.pipeline))
        if n_shared == 0:
            return False

        # Every step is eligible. The previous lower bound of 1 raised
        # ValueError for single-step unions and never swapped the first step.
        idx = rng.integers(0, n_shared)

        self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx]
        return True

    def _crossover_node(self, other, rng):
        """Cross each pair of corresponding steps with probability 0.5; True if any crossover succeeded."""
        rng = np.random.default_rng(rng)

        crossover_success = False
        # zip stops at the shorter pipeline, so mismatched lengths cannot
        # raise IndexError.
        for own_step, other_step in zip(self.pipeline, other.pipeline):
            if rng.random() < 0.5:
                if own_step.crossover(other_step, rng):
                    crossover_success = True

        return crossover_success

    def export_pipeline(self, **kwargs):
        """Export every step (forwarding ``kwargs``) and combine them with ``sklearn.pipeline.make_union``."""
        return sklearn.pipeline.make_union(*[step.export_pipeline(**kwargs) for step in self.pipeline])

    def unique_id(self):
        """Return a TupleIndex made of the tag "FeatureUnion" followed by each step's unique id."""
        step_ids = [step.unique_id() for step in self.pipeline]
        return TupleIndex(tuple(["FeatureUnion"] + step_ids))
105 |
106 |
class UnionPipeline(SearchSpace):
    """
    Search space built from a list of search spaces.

    ``generate`` returns a UnionPipelineIndividual constructed from that list.
    """

    def __init__(self, search_spaces : List[SearchSpace] ) -> None:
        # Kept as given; handed to every generated individual.
        self.search_spaces = search_spaces

    def generate(self, rng=None):
        """Create a UnionPipelineIndividual, passing it a generator derived from ``rng``."""
        generator = np.random.default_rng(rng)
        individual = UnionPipelineIndividual(self.search_spaces, rng=generator)
        return individual
--------------------------------------------------------------------------------
/tpot/search_spaces/tests/test_search_spaces.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | # Test all nodes have all dictionaries
37 | import pytest
38 | import tpot
39 |
40 | import tpot
41 | from ConfigSpace import ConfigurationSpace
42 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
43 | from sklearn.neighbors import KNeighborsClassifier
44 | from sklearn.linear_model import LogisticRegression
45 | from sklearn.tree import DecisionTreeClassifier
46 | from sklearn.preprocessing import StandardScaler
47 |
48 |
def test_EstimatorNodeCrossover():
    """Smoke test: repeated mutate/crossover on two KNN estimator individuals must not raise."""
    knn_configspace = {}

    knn_node = tpot.search_spaces.nodes.EstimatorNode(
        method=KNeighborsClassifier,
        space=knn_configspace,
    )

    knnind1 = knn_node.generate()
    knnind2 = knn_node.generate()

    # The loop index is not needed; only the repetition matters.
    for _ in range(10):
        knnind1.mutate()
        knnind2.mutate()
        knnind1.crossover(knnind2)
65 |
66 |
def test_ValueError_different_types():
    """Crossover between individuals from two different search spaces must return a falsy result, in both directions."""
    knn_space = tpot.config.get_search_space(["KNeighborsClassifier"])
    sfm_space = tpot.config.get_search_space(["SelectFromModel_classification"])

    for _ in range(10):
        knn_ind = knn_space.generate()
        sfm_ind = sfm_space.generate()

        # Neither direction may report a successful crossover.
        assert not knn_ind.crossover(sfm_ind)
        assert not sfm_ind.crossover(knn_ind)
76 |
if __name__ == "__main__":
    # Run the tests in definition order when the module is executed directly.
    for _test in (test_EstimatorNodeCrossover, test_ValueError_different_types):
        _test()
--------------------------------------------------------------------------------
/tpot/search_spaces/tuple_index.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import numpy as np
37 |
class TupleIndex():
    """
    Thin wrapper around a tuple used as a unique id for some pipeline search spaces.

    Plain tuples sometimes do not interact correctly with pandas indexes. This
    wrapper hashes, compares and prints exactly like the wrapped tuple, so it
    can be used as a dictionary key, but it is not itself iterable.

    Returning a string from unique_id would be an alternative, but that would
    not work with graph pipelines, which require a special object. With this
    class, linear pipelines can contain graph pipelines and still be usable as
    dictionary keys.
    """

    def __init__(self, tup):
        # The wrapped value; every other method delegates to it.
        self.tup = tup

    def __eq__(self, other) -> bool:
        # Compare the wrapped tuple against ``other`` directly.
        return self.tup == other

    def __hash__(self) -> int:
        # Same hash as the wrapped tuple.
        return hash(self.tup)

    def __str__(self) -> str:
        return str(self.tup)

    def __repr__(self) -> str:
        return repr(self.tup)
--------------------------------------------------------------------------------
/tpot/selectors/__init__.py:
--------------------------------------------------------------------------------
1 | from .lexicase_selection import lexicase_selection
2 | from .max_weighted_average_selector import max_weighted_average_selector
3 | from .random_selector import random_selector
4 | from .tournament_selection import tournament_selection
5 | from .tournament_selection_dominated import tournament_selection_dominated
6 | from .nsgaii import nondominated_sorting, crowding_distance, dominates, survival_select_NSGA2
7 | from .map_elites_selection import map_elites_survival_selector, map_elites_parent_selector
8 |
9 |
# Lookup table from selector name to the selector function imported above.
SELECTORS = {
    "lexicase": lexicase_selection,
    "max_weighted_average": max_weighted_average_selector,
    "random": random_selector,
    "tournament": tournament_selection,
    "tournament_dominated": tournament_selection_dominated,
    "nsgaii": survival_select_NSGA2,
    "map_elites_survival": map_elites_survival_selector,
    "map_elites_parent": map_elites_parent_selector,
}
--------------------------------------------------------------------------------
/tpot/selectors/lexicase_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import numpy as np
37 |
def lexicase_selection(scores, k, n_parents=1, rng=None):
    """
    Select the best individual according to Lexicase Selection, *k* times.
    The returned array contains the indices of the chosen *individuals*.

    For every selection the objectives (cases) are shuffled; candidates are
    then filtered case by case, keeping only those with the best score on the
    current case, until one candidate or no case remains. Remaining ties are
    broken uniformly at random.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix, where rows are the individuals and columns are the scores on different objectives.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.

    Returns
    -------
    An array of indices of selected individuals of shape (k, n_parents).

    Raises
    ------
    ValueError
        If ``scores`` is not two-dimensional, or if a selection is requested
        from an empty population.
    """
    rng = np.random.default_rng(rng)

    scores = np.asarray(scores, dtype=float)
    if scores.ndim != 2:
        raise ValueError("scores must be a 2D matrix of shape (n_individuals, n_objectives)")
    n_individuals, n_cases = scores.shape

    n_select = k * n_parents
    if n_select > 0 and n_individuals == 0:
        raise ValueError("cannot select from an empty population")

    chosen = []
    for _ in range(n_select):
        candidates = list(range(n_individuals))
        cases = list(range(n_cases))
        rng.shuffle(cases)

        for case in cases:
            if len(candidates) <= 1:
                break

            case_scores = scores[candidates, case]
            valid = ~np.isnan(case_scores)
            if not valid.any():
                # Every remaining candidate is NaN on this case, so it cannot
                # discriminate between them.
                continue

            # Ignore NaNs when looking for the best value. The builtin max()
            # could return NaN, and "== NaN" then dropped every candidate,
            # making rng.choice fail on an empty list.
            best_val_for_case = case_scores[valid].max()
            candidates = [
                cand for cand, val in zip(candidates, case_scores)
                if val == best_val_for_case
            ]

        chosen.append(rng.choice(candidates))

    return np.reshape(chosen, (k, n_parents))
--------------------------------------------------------------------------------
/tpot/selectors/max_weighted_average_selector.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import numpy as np
37 |
def max_weighted_average_selector(scores,k, n_parents=1, rng=None):
    """
    Select the individuals with the highest average score across objectives.

    NaN entries are ignored when averaging. An individual whose scores are
    all NaN is ranked last.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix, where rows are the individuals and columns are the scores on different objectives.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        Unused; selection is deterministic. Present so that all selectors share one signature.

    Returns
    -------
    An array of indices of selected individuals of shape (k, n_parents), ordered from best to worst.

    Raises
    ------
    ValueError
        If more selections are requested (k * n_parents) than there are individuals.
    """
    scores = np.asarray(scores, dtype=float)

    n_select = k * n_parents
    if n_select > len(scores):
        raise ValueError(
            f"cannot select {n_select} individuals from a population of {len(scores)}"
        )

    # Mean over every non-individual axis, ignoring NaNs. This is done by hand
    # so that all-NaN rows neither emit warnings nor produce NaN.
    objective_axes = tuple(range(1, scores.ndim))
    valid = ~np.isnan(scores)
    n_valid = valid.sum(axis=objective_axes)
    totals = np.where(valid, scores, 0.0).sum(axis=objective_axes)

    # Rows without any valid score keep -inf, so they sort last once the
    # order is reversed. NaN averages used to sort to the very front.
    ave_scores = np.full(len(scores), -np.inf)
    np.divide(totals, n_valid, out=ave_scores, where=n_valid > 0)
    ave_scores[np.isnan(ave_scores)] = -np.inf

    # Take k * n_parents indices: taking only k made the reshape below fail
    # whenever n_parents > 1.
    chosen = np.argsort(ave_scores)[::-1][:n_select]
    return np.reshape(chosen, (k, n_parents))
--------------------------------------------------------------------------------
/tpot/selectors/random_selector.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import numpy as np
37 |
def random_selector(scores, k, n_parents=1, rng=None, ):
    """
    Draw indices of individuals uniformly at random, with replacement.

    Only the number of rows of ``scores`` is used; the score values themselves
    play no role in the draw.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix, where rows are the individuals and columns are the scores on different objectives.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.

    Returns
    -------
    An array of indices of randomly selected individuals (with replacement) of shape (k, n_parents).

    """
    generator = np.random.default_rng(rng)

    population_indices = list(range(0, len(scores)))
    n_draws = k * n_parents

    picks = generator.choice(population_indices, size=n_draws)
    return np.reshape(picks, (k, n_parents))
--------------------------------------------------------------------------------
/tpot/selectors/tournament_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import numpy as np
37 |
38 | def tournament_selection(scores, k, n_parents=1, rng=None, tournament_size=2, score_index=0):
39 | """
40 | Select the best individual among *tournsize* randomly chosen
41 | individuals, *k* times. The returned list contains the indices of the chosen *individuals*.
42 |
43 | Parameters
44 | ----------
45 | scores : np.ndarray
46 | The score matrix, where rows the individuals and the columns are the corresponds to scores on different objectives.
47 | k : int
48 | The number of individuals to select.
49 | n_parents : int, optional
50 | The number of parents to select per individual. The default is 1.
51 | rng : int, np.random.Generator, optional
52 | The random number generator. The default is None.
53 | tournament_size : int, optional
54 | The number of individuals participating in each tournament.
55 | score_index : int, str, optional
56 | The index of the score to use for selection. If "average" is passed, the average score is used. The default is 0 (only the first score is used).
57 |
58 | Returns
59 | -------
60 | A array of indices of selected individuals of shape (k, n_parents).
61 | """
62 |
63 | rng = np.random.default_rng(rng)
64 |
65 | if isinstance(score_index,int):
66 | key=lambda x:x[1][score_index]
67 | elif score_index == "average":
68 | key=lambda x:np.mean(x[1])
69 |
70 | chosen = []
71 | for i in range(k*n_parents):
72 | aspirants_idx =[rng.choice(len(scores)) for i in range(tournament_size)]
73 | aspirants = list(zip(aspirants_idx, scores[aspirants_idx])) # Zip indices and elements together
74 | chosen.append(max(aspirants, key=key)[0]) # Retrun the index of the maximum element
75 |
76 | return np.reshape(chosen, (k, n_parents))
--------------------------------------------------------------------------------
/tpot/selectors/tournament_selection_dominated.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import numpy as np
37 |
38 | from.nsgaii import nondominated_sorting, crowding_distance, dominates
39 |
40 | #based on deap
def tournament_selection_dominated(scores, k, n_parents=2, rng=None):
    """
    Binary tournament selection driven by dominance and crowding distance.

    For each of the k * n_parents selections, two individuals are drawn at
    random (with replacement). If one dominates the other, it wins. Otherwise
    the one with the larger crowding distance wins. If the distances are
    equal, the winner is picked at random.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix, where rows are the individuals and columns are the scores on different objectives.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 2.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.

    Returns
    -------
    An array of indices of selected individuals of shape (k, n_parents).

    """

    rng = np.random.default_rng(rng)

    # Crowding distance of every individual, computed one pareto front at a time.
    crowding_dict = {}
    for front in nondominated_sorting(scores):
        members = np.array(list(front))
        distances = crowding_distance([scores[member] for member in members])
        crowding_dict.update(zip(members, distances))

    n_individuals = len(scores)
    chosen = []
    for _ in range(k * n_parents):
        first = rng.choice(n_individuals)
        second = rng.choice(n_individuals)

        if dominates(scores[first], scores[second]):
            winner = first
        elif dominates(scores[second], scores[first]):
            winner = second
        elif crowding_dict[first] > crowding_dict[second]:
            winner = first
        elif crowding_dict[first] < crowding_dict[second]:
            winner = second
        else:
            # Neither dominance nor crowding separates the pair.
            winner = rng.choice([first, second])

        chosen.append(winner)

    return np.reshape(chosen, (k, n_parents))
104 |
--------------------------------------------------------------------------------
/tpot/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/tpot/tests/__init__.py
--------------------------------------------------------------------------------
/tpot/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
34 |
35 | """
36 | import pytest
37 | import sys
38 |
39 |
@pytest.fixture
def capture_stdout(monkeypatch):
    """
    Patch ``sys.stdout.write`` and record everything passed to it.

    Returns a dict with the concatenated text under "stdout" and the number of
    write calls under "write_calls".
    """
    captured = {"stdout": "", "write_calls": 0}

    def _record_write(text):
        # Accumulate the text first, then count the call.
        captured["stdout"] += text
        captured["write_calls"] += 1

    monkeypatch.setattr(sys.stdout, "write", _record_write)
    return captured
50 |
--------------------------------------------------------------------------------
/tpot/tests/test_estimators.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 | import pytest
37 | import tpot
38 | from sklearn.datasets import load_iris
39 | import random
40 | import sklearn
41 |
@pytest.fixture
def sample_dataset():
    """Provide the iris dataset as a ``(features, labels)`` tuple."""
    iris = load_iris()
    return iris.data, iris.target
46 |
47 | #standard test
@pytest.fixture
def tpot_estimator():
    """Build a small, time-boxed ``TPOTEstimator`` over a graph search space.

    The root of every graph is a classifier; inner nodes are drawn from the
    selector and transformer search spaces.
    """
    # Dataset dimensions handed to the config-space builders.
    dims = dict(n_samples=100, n_features=100, n_classes=3)

    root_space = tpot.config.get_search_space("classifiers", **dims)
    inner_space = tpot.config.get_search_space(["selectors", "transformers"], **dims)

    graph_space = tpot.search_spaces.pipelines.GraphSearchPipeline(
        root_search_space=root_space,
        leaf_search_space=None,
        inner_search_space=inner_space,
        max_size=10,
    )

    estimator_kwargs = dict(
        search_space=graph_space,
        population_size=10,
        generations=2,
        scorers=['roc_auc_ovr'],
        scorers_weights=[1],
        classification=True,
        n_jobs=4,
        early_stop=5,
        other_objective_functions=[],
        other_objective_functions_weights=[],
        max_time_mins=20 / 60,
        verbose=3,
    )
    return tpot.TPOTEstimator(**estimator_kwargs)
74 |
@pytest.fixture
def tpot_classifier():
    """Provide a quiet ``TPOTClassifier`` with ``max_time_mins`` set to ``60/60``."""
    classifier_cls = tpot.tpot_estimator.templates.TPOTClassifier
    return classifier_cls(max_time_mins=60 / 60, verbose=0)
78 |
@pytest.fixture
def tpot_regressor():
    """Provide a quiet ``TPOTRegressor`` with ``max_time_mins`` set to ``10/60``."""
    regressor_cls = tpot.tpot_estimator.templates.TPOTRegressor
    return regressor_cls(max_time_mins=10 / 60, verbose=0)
82 |
83 |
@pytest.fixture
def tpot_estimator_with_pipeline(tpot_estimator, sample_dataset):
    """Return the ``tpot_estimator`` fixture after fitting it on the sample dataset."""
    features, labels = sample_dataset
    tpot_estimator.fit(features, labels)
    return tpot_estimator
88 |
def test_tpot_estimator_predict(tpot_estimator_with_pipeline, sample_dataset):
    """A fitted estimator predicts one value per input row and exposes its pipeline."""
    fitted = tpot_estimator_with_pipeline
    features = sample_dataset[0]

    predictions = fitted.predict(features)

    assert len(predictions) == len(features)
    assert fitted.fitted_pipeline_ is not None
95 |
def test_tpot_estimator_generations_type():
    """Constructing ``TPOTEstimator`` with a string ``generations`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file constructs the class with
    ``verbose=`` and ``search_space=``; confirm ``verbosity`` is an accepted keyword,
    otherwise the ``TypeError`` may come from the call signature rather than validation.
    """
    invalid_kwargs = dict(generations="two", population_size=10, verbosity=2)
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
99 |
def test_tpot_estimator_population_size_type():
    """Constructing ``TPOTEstimator`` with a string ``population_size`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file uses ``verbose=`` rather than
    ``verbosity=``; confirm the ``TypeError`` is not merely an unexpected-keyword error.
    """
    invalid_kwargs = dict(generations=2, population_size='ten', verbosity=2)
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
103 |
def test_tpot_estimator_verbosity_type():
    """Constructing ``TPOTEstimator`` with a string ``verbosity`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file uses ``verbose=`` rather than
    ``verbosity=``; confirm the ``TypeError`` is not merely an unexpected-keyword error.
    """
    invalid_kwargs = dict(generations=2, population_size=10, verbosity='high')
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
107 |
def test_tpot_estimator_scoring_type():
    """Constructing ``TPOTEstimator`` with a float ``scoring`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file uses ``scorers=`` and
    ``verbose=``; confirm ``scoring``/``verbosity`` are accepted keywords, otherwise the
    ``TypeError`` may come from the call signature rather than validation.
    """
    invalid_kwargs = dict(generations=2, population_size=10, verbosity=2, scoring=0.5)
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
111 |
def test_tpot_estimator_cv_type():
    """Constructing ``TPOTEstimator`` with a string ``cv`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file uses ``verbose=`` rather than
    ``verbosity=``; confirm the ``TypeError`` is not merely an unexpected-keyword error.
    """
    invalid_kwargs = dict(generations=2, population_size=10, verbosity=2, cv='kfold')
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
115 |
def test_tpot_estimator_n_jobs_type():
    """Constructing ``TPOTEstimator`` with a string ``n_jobs`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file uses ``verbose=`` rather than
    ``verbosity=``; confirm the ``TypeError`` is not merely an unexpected-keyword error.
    """
    invalid_kwargs = dict(generations=2, population_size=10, verbosity=2, n_jobs='all')
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
119 |
def test_tpot_estimator_config_dict_type():
    """Constructing ``TPOTEstimator`` with a string ``config_dict`` must raise ``TypeError``.

    NOTE(review): the ``tpot_estimator`` fixture in this file uses ``search_space=`` and
    ``verbose=``; confirm ``config_dict``/``verbosity`` are accepted keywords, otherwise the
    ``TypeError`` may come from the call signature rather than validation.
    """
    invalid_kwargs = dict(generations=2, population_size=10, verbosity=2, config_dict='config')
    with pytest.raises(TypeError):
        tpot.TPOTEstimator(**invalid_kwargs)
123 |
124 |
125 |
126 |
127 |
def test_tpot_classifier_fit(tpot_classifier, sample_dataset):
    """Fitting the classifier template on the sample dataset yields a fitted pipeline."""
    features, labels = sample_dataset[0], sample_dataset[1]
    tpot_classifier.fit(features, labels)
    assert tpot_classifier.fitted_pipeline_ is not None
134 |
def test_tpot_regressor_fit(tpot_regressor):
    """Fitting the regressor template on a small diabetes subsample yields a fitted pipeline.

    Only 5% of the dataset is used for training to keep the test fast; the
    held-out part is not needed and is discarded.
    """
    X, y = sklearn.datasets.load_diabetes(return_X_y=True)
    # A fixed random_state makes the training subsample identical on every run,
    # so a failure can be reproduced.
    X_train, _, y_train, _ = sklearn.model_selection.train_test_split(
        X, y, train_size=0.05, test_size=0.95, random_state=42
    )
    tpot_regressor.fit(X_train, y_train)
    assert tpot_regressor.fitted_pipeline_ is not None
142 |
143 |
--------------------------------------------------------------------------------
/tpot/tests/test_hello_world.py:
--------------------------------------------------------------------------------
1 | """
2 | Test hello world.
3 | Notes:
4 | parameterizing the test_input and expected values allows tests to continue running even if one fails.
5 | xfail marks a test as expected to fail. This is useful for tests that are not yet implemented.
6 | fixtures are used to setup and teardown tests. They are useful for tests that require a lot of setup.
7 | We can implement fixtures if we need them.
8 | """
9 |
10 | import pytest
11 |
12 |
@pytest.mark.parametrize("test_input,expected", [
    ("Hello World", "Hello World"),
])
def test_hello_world(test_input, expected):
    """Check that each parametrized input equals its expected value."""
    # Compare by value: ``is`` tests object identity and only passed here
    # because identical string literals happen to share one object.
    assert test_input == expected
18 |
19 |
20 |
def test_print(capture_stdout):
    """``print`` output, including the trailing newline, lands in the capture buffer."""
    message = "Hello World"
    print(message)
    expected_output = "Hello World\n"
    assert capture_stdout["stdout"] == expected_output
24 |
--------------------------------------------------------------------------------
/tpot/tpot_estimator/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 | from .estimator import TPOTEstimator
37 | from .steady_state_estimator import TPOTEstimatorSteadyState
38 | from .templates import TPOTClassifier, TPOTRegressor
--------------------------------------------------------------------------------
/tpot/tpot_estimator/cross_val_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 | import time
37 | import sklearn.metrics
38 | from collections.abc import Iterable
39 | import pandas as pd
40 | import sklearn
41 | import numpy as np
42 |
def _select_rows(data, indices):
    """Return the rows of ``data`` at positional ``indices`` (``.iloc`` for pandas objects)."""
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[indices]
    return data[indices]


def _fit_and_score_fold(estimator, X, y, scorers, train_index, test_index):
    """Fit a fresh clone of ``estimator`` on one split and return one score per scorer."""
    fold_estimator = sklearn.base.clone(estimator)
    fold_estimator.fit(_select_rows(X, train_index), _select_rows(y, train_index))
    X_test = _select_rows(X, test_index)
    y_test = _select_rows(y, test_index)
    return [scorer(fold_estimator, X_test, y_test) for scorer in scorers]


def cross_val_score_objective(estimator, X, y, scorers, cv, fold=None):
    """
    Compute the cross validated scores for a estimator. Only fits the estimator once per fold, and loops over the scorers to evaluate the estimator.

    Parameters
    ----------
    estimator: sklearn.base.BaseEstimator
        The estimator to fit and score. It is cloned for every fold, so the passed object is not fitted.
    X: np.ndarray or pd.DataFrame
        The feature matrix.
    y: np.ndarray or pd.Series
        The target vector.
    scorers: list or scorer
        The scorers to use. Each entry is passed through sklearn.metrics.get_scorer.
        A single scorer (including a scorer name given as a string) is treated as a list with one entry.
    cv: sklearn cross-validator
        The cross-validator to use. For example, sklearn.model_selection.KFold or sklearn.model_selection.StratifiedKFold.
    fold: int, optional
        The fold to return the scores for. If None, will return the mean of all the scores (per scorer). Default is None.

    Returns
    -------
    scores: np.ndarray or list
        One score per scorer. If fold is None, the result of np.mean over the folds (per scorer).
        If fold is given, a list with the scores of that single fold.

    """

    # A string is itself iterable; without this check a scorer name such as
    # "accuracy" would be iterated character by character.
    if isinstance(scorers, str) or not isinstance(scorers, Iterable):
        scorers = [scorers]
    # Resolve the scorers once, into a list, so that one-shot iterables are not
    # exhausted after the first fold and the lookup is not repeated per fold.
    scorers = [sklearn.metrics.get_scorer(scorer) for scorer in scorers]

    if fold is not None:
        # Materialize the splits so ``fold`` can be used as an index.
        train_index, test_index = list(cv.split(X, y))[fold]
        return _fit_and_score_fold(estimator, X, y, scorers, train_index, test_index)

    scores = [
        _fit_and_score_fold(estimator, X, y, scorers, train_index, test_index)
        for train_index, test_index in cv.split(X, y)
    ]
    return np.mean(scores, 0)
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/tpot/tpot_estimator/templates/__init__.py:
--------------------------------------------------------------------------------
1 | from .tpottemplates import *
--------------------------------------------------------------------------------
/tpot/tpot_estimator/templates/tpot_autoimputer.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/tpot/tpot_estimator/templates/tpot_autoimputer.py
--------------------------------------------------------------------------------
/tpot/tpot_estimator/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 |
--------------------------------------------------------------------------------
/tpot/tpot_estimator/tests/test_estimator_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 | import pytest
37 | import numpy as np
38 | import pandas as pd
39 | from ..estimator_utils import *
40 |
def test_remove_underrepresented_classes():
    """Rows whose class occurs fewer than ``min_count`` times are dropped for numpy and pandas inputs."""
    threshold = 2
    rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
    named_rows = {'a': [1, 2], 'b': [3, 4], 'c': [5, 6], 'd': [7, 8]}

    # numpy input: classes 1 and 2 each occur once and are removed
    kept_x, kept_y = remove_underrepresented_classes(np.array(rows), np.array([0, 1, 0, 2]), threshold)
    np.testing.assert_array_equal(kept_x, np.array([[1, 2], [5, 6]]))
    np.testing.assert_array_equal(kept_y, np.array([0, 0]))

    # pandas input: same labels, the surviving rows keep their original index
    frame = pd.DataFrame(named_rows).T
    labels = pd.Series([0, 1, 0, 2])
    kept_x, kept_y = remove_underrepresented_classes(frame, labels, threshold)
    pd.testing.assert_frame_equal(kept_x, pd.DataFrame({'a': [1, 2], 'c': [5, 6]}).T)
    pd.testing.assert_series_equal(kept_y, pd.Series([0, 1, 0, 2])[[0, 2]])

    # numpy input: every class occurs twice, nothing is removed
    kept_x, kept_y = remove_underrepresented_classes(np.array(rows), np.array([0, 1, 0, 1]), threshold)
    np.testing.assert_array_equal(kept_x, np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    np.testing.assert_array_equal(kept_y, np.array([0, 1, 0, 1]))

    # pandas input: every class occurs twice, nothing is removed
    frame = pd.DataFrame(named_rows).T
    labels = pd.Series([0, 1, 0, 1])
    kept_x, kept_y = remove_underrepresented_classes(frame, labels, threshold)
    pd.testing.assert_frame_equal(kept_x, pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6], 'd': [7, 8]}).T)
    pd.testing.assert_series_equal(kept_y, pd.Series([0, 1, 0, 1]))
69 |
70 |
def test_check_if_y_is_encoded():
    """``check_if_y_is_encoded`` gives the expected answer for each label list below."""
    cases = [
        ([0, 1, 2, 3], True),
        ([0, 1, 3, 4], False),
        ([0, 2, 3], False),
        ([0], True),
        ([0, 0, 0, 0, 1, 1, 1, 1], True),
        ([0, 0, 0, 0, 1, 1, 1, 1, 3], False),
        ([1, 1, 1, 1, 2, 2, 2, 2], False),
    ]
    for labels, expected in cases:
        assert check_if_y_is_encoded(labels) == expected
79 |
--------------------------------------------------------------------------------
/tpot/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 | from . import eval_utils
37 | from .utils import *
38 |
39 | # If amltk is installed, import the parser
40 | try:
41 | from .amltk_parser import tpot_parser
42 | except ImportError:
43 | # Handle the case when amltk is not installed
44 | pass
45 | # print("amltk is not installed. Please install it to use tpot_parser.")
46 | # Optional: raise an exception or provide alternative functionality
--------------------------------------------------------------------------------
/tpot/utils/amltk_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is part of the TPOT library.
3 |
4 | The current version of TPOT was developed at Cedars-Sinai by:
5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/)
6 | - Anil Saini (anil.saini@cshs.org)
7 | - Jose Hernandez (jgh9094@gmail.com)
8 | - Jay Moran (jay.moran@cshs.org)
9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org)
10 | - Hyunjun Choi (hyunjun.choi@cshs.org)
11 | - Gabriel Ketron (gabriel.ketron@cshs.org)
12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org)
13 | - Jason Moore (moorejh28@gmail.com)
14 |
15 | The original version of TPOT was primarily developed at the University of Pennsylvania by:
16 | - Randal S. Olson (rso@randalolson.com)
17 | - Weixuan Fu (weixuanf@upenn.edu)
18 | - Daniel Angell (dpa34@drexel.edu)
19 | - Jason Moore (moorejh28@gmail.com)
20 | - and many more generous open-source contributors
21 |
22 | TPOT is free software: you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License as
24 | published by the Free Software Foundation, either version 3 of
25 | the License, or (at your option) any later version.
26 |
27 | TPOT is distributed in the hope that it will be useful,
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | GNU Lesser General Public License for more details.
31 |
32 | You should have received a copy of the GNU Lesser General Public
33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
34 |
35 | """
36 | from amltk.pipeline import Choice, Component, Sequential, Node, Fixed, Split, Join, Searchable
37 | from tpot.search_spaces.pipelines import SequentialPipeline, ChoicePipeline, UnionPipeline
38 | from tpot.search_spaces.nodes import EstimatorNode
39 | from ConfigSpace import ConfigurationSpace
40 |
def component_to_estimatornode(component: Component) -> EstimatorNode:
    """
    Convert an amltk Component into a TPOT EstimatorNode.

    Parameters
    ----------
    component: amltk.pipeline.Component
        The component to convert. Its ``item`` becomes the node's method, and its
        ``space`` (optionally overridden by ``config``) becomes the node's search space.

    Returns
    -------
    EstimatorNode
        A node whose space is a ConfigurationSpace built from the component.
    """
    if component.config:
        # Entries from ``config`` take precedence over same-named entries in ``space``.
        # Previously this merged dict was built but never used, so ``config`` was ignored.
        # NOTE(review): assumes config values are types ConfigurationSpace accepts
        # in a dict (e.g. int/float/str constants) -- confirm against callers.
        space_dict = {}
        if component.space is not None:
            space_dict.update(component.space)
        space_dict.update(component.config)
        space = ConfigurationSpace(space_dict)
    else:
        # No overrides: hand the original space over untouched.
        space = ConfigurationSpace(component.space)

    return EstimatorNode(method=component.item, space=space)
52 |
def fixed_to_estimatornode(node: Fixed) -> EstimatorNode:
    """
    Convert an amltk Fixed node into a TPOT EstimatorNode.

    Parameters
    ----------
    node: amltk.pipeline.Fixed
        The node to convert. ``node.item`` may be a class or an instance.

    Returns
    -------
    EstimatorNode
        A node whose method is the item's class and whose space is a plain dict:
        the item's ``get_params(deep=False)`` (when available), updated with
        ``node.space`` and then ``node.config``.
    """
    item = node.item
    # Use the class itself when given one, otherwise the instance's class.
    estimator_cls = item if isinstance(item, type) else type(item)

    params = item.get_params(deep=False) if hasattr(item, 'get_params') else {}
    # Later sources win: space first, then config.
    for overrides in (node.space, node.config):
        if overrides is not None:
            params.update(overrides)

    return EstimatorNode(method=estimator_cls, space=params)
71 |
def sequential_to_sequentialpipeline(sequential: Sequential) -> SequentialPipeline:
    """Convert an amltk Sequential into a SequentialPipeline of its converted child nodes."""
    converted_steps = []
    for child in sequential.nodes:
        converted_steps.append(tpot_parser(child))
    return SequentialPipeline(search_spaces=converted_steps)
76 |
def choice_to_choicepipeline(choice: Choice) -> ChoicePipeline:
    """Convert an amltk Choice into a ChoicePipeline of its converted child nodes."""
    converted_options = []
    for child in choice.nodes:
        converted_options.append(tpot_parser(child))
    return ChoicePipeline(search_spaces=converted_options)
81 |
82 |
def split_to_unionpipeline(split: Split) -> UnionPipeline:
    """Convert an amltk Split into a UnionPipeline of its converted child nodes."""
    converted_branches = []
    for child in split.nodes:
        converted_branches.append(tpot_parser(child))
    return UnionPipeline(search_spaces=converted_branches)
87 |
def tpot_parser(
    node: Node,
):
    """
    Convert amltk pipeline search space into a tpot pipeline search space.

    Parameters
    ----------
    node: amltk.pipeline.Node
        The node to convert.

    Returns
    -------
    tpot.search_spaces.base.SearchSpace
        The equivalent TPOT search space which can be optimized by TPOT.

    Raises
    ------
    ValueError
        If the node is not a Component, Sequential, Choice, Fixed or Split.
    """
    # Checked in order; the first matching type decides the converter.
    converters = (
        (Component, component_to_estimatornode),
        (Sequential, sequential_to_sequentialpipeline),
        (Choice, choice_to_choicepipeline),
        (Fixed, fixed_to_estimatornode),
        (Split, split_to_unionpipeline),
    )
    for node_type, convert in converters:
        if isinstance(node, node_type):
            return convert(node)

    raise ValueError(f"Node type {type(node)} not supported")
117 |
--------------------------------------------------------------------------------