├── .github └── workflows │ ├── docs.yml │ ├── publish_package.yml │ └── tests.yml ├── .gitignore ├── ISSUE_TEMPLATE.md ├── LICENSE ├── PULL_REQUEST_TEMPLATE.md ├── README.md ├── Tutorial ├── 1_Using_TPOT.ipynb ├── 2_Search_Spaces.ipynb ├── 3_Feature_Set_Selector.ipynb ├── 4_Genetic_Feature_Selection.ipynb ├── 5_GraphPipeline.ipynb ├── 6_Symbolic_Regression_and_Classification.ipynb ├── 7_dask_parallelization.ipynb ├── 8_SH_and_cv_early_pruning.ipynb ├── 9_Genetic_Algorithm_Overview.ipynb ├── amltk_search_space_parser_example.ipynb └── simple_fss.csv ├── docs ├── Tutorial ├── archived │ ├── api.md │ ├── assets │ │ ├── favicon.ico │ │ └── tpot-logo.jpg │ ├── citing.md │ ├── contributing.md │ ├── css │ │ └── archived.css │ ├── examples.md │ ├── index.md │ ├── installing.md │ ├── related.md │ ├── releases.md │ ├── support.md │ └── using.md ├── assets │ ├── favicon.ico │ └── tpot-logo.jpg ├── cite.md ├── contribute.md ├── css │ └── extra.css ├── index.md ├── installation.md ├── related.md ├── requirements_docs.txt ├── scripts │ ├── build_docs_sources.sh │ ├── build_mkdocs.sh │ └── build_tutorial_toc_not_used.sh ├── support.md ├── tpot_api │ ├── classifier.md │ ├── estimator.md │ └── regressor.md └── using.md ├── mkdocs_archived.yml ├── pyproject.toml ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tox.ini └── tpot ├── __init__.py ├── _version.py ├── builtin_modules ├── __init__.py ├── arithmetictransformer.py ├── column_one_hot_encoder.py ├── estimatortransformer.py ├── feature_encoding_frequency_selector.py ├── feature_set_selector.py ├── feature_transformers.py ├── genetic_encoders.py ├── imputer.py ├── nn.py ├── passkbinsdiscretizer.py ├── passthrough.py ├── tests │ └── feature_set_selector_tests.py └── zero_count.py ├── config ├── __init__.py ├── autoqtl_builtins.py ├── classifiers.py ├── classifiers_sklearnex.py ├── get_configspace.py ├── imputers.py ├── mdr_configs.py ├── regressors.py ├── regressors_sklearnex.py ├── selectors.py ├── special_configs.py ├── 
template_search_spaces.py ├── tests │ ├── __init__.py │ └── test_get_configspace.py └── transformers.py ├── evolvers ├── __init__.py ├── base_evolver.py └── steady_state_evolver.py ├── graphsklearn.py ├── individual.py ├── logbook.py ├── objectives ├── __init__.py ├── average_path_length.py ├── complexity.py ├── number_of_leaves.py ├── number_of_nodes.py └── tests │ ├── test_complexity_objective.py │ └── test_number_of_nodes.py ├── old_config_utils ├── __init__.py └── old_config_utils.py ├── population.py ├── search_spaces ├── __init__.py ├── base.py ├── graph_utils.py ├── nodes │ ├── __init__.py │ ├── estimator_node.py │ ├── estimator_node_gradual.py │ ├── fss_node.py │ └── genetic_feature_selection.py ├── pipelines │ ├── __init__.py │ ├── choice.py │ ├── dynamic_linear.py │ ├── dynamicunion.py │ ├── graph.py │ ├── sequential.py │ ├── tests │ │ └── test_graphspace.py │ ├── tree.py │ ├── union.py │ └── wrapper.py ├── tests │ └── test_search_spaces.py └── tuple_index.py ├── selectors ├── __init__.py ├── lexicase_selection.py ├── map_elites_selection.py ├── max_weighted_average_selector.py ├── nsgaii.py ├── random_selector.py ├── tournament_selection.py └── tournament_selection_dominated.py ├── tests ├── __init__.py ├── conftest.py ├── test_estimators.py └── test_hello_world.py ├── tpot_estimator ├── __init__.py ├── cross_val_utils.py ├── estimator.py ├── estimator_utils.py ├── steady_state_estimator.py ├── templates │ ├── __init__.py │ ├── tpot_autoimputer.py │ └── tpottemplates.py └── tests │ ├── __init__.py │ └── test_estimator_utils.py └── utils ├── __init__.py ├── amltk_parser.py ├── eval_utils.py └── utils.py /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | env: 12 | GIT_COMMITTER_NAME: "Doc Build Bot" 13 | GIT_COMMITTER_EMAIL: "jay-m-dev@users.noreply.github.com" 14 | 
steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v2 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Cache dependencies 24 | uses: actions/cache@v3 25 | with: 26 | path: ~/.cache/pip 27 | key: ${{ runner.os }}-pip-${{ hashFiles('docs/requirements_docs.txt') }} 28 | restore-keys: | 29 | ${{ runner.os }}-pip- 30 | 31 | - name: Install dependencies 32 | run: | 33 | pip install --upgrade pip 34 | pip install . 35 | pip install -r docs/requirements_docs.txt 36 | 37 | # - name: Convert notebooks to HTML 38 | # # if: ${{ github.event_name == 'push' && contains(github.event.head_commit.modified, 'Tutorial/') && contains(github.event.head_commit.modified, '.ipynb') }} 39 | # run: | 40 | # # jupyter nbconvert --to html --allow-errors --no-input --show-input --template classic --output-dir docs/tutorial Tutorial/*.ipynb 41 | # jupyter nbconvert --to html --allow-errors --template classic --output-dir docs/tutorial Tutorial/*.ipynb 42 | 43 | # - name: Build Tutorial Table of Contents 44 | # run: | 45 | # bash docs/scripts/build_tutorial_toc.sh 46 | 47 | - name: Build Documentation sources 48 | run: | 49 | bash docs/scripts/build_docs_sources.sh 50 | 51 | - name: Build mkdocs.yml 52 | run: | 53 | bash docs/scripts/build_mkdocs.sh 54 | 55 | - name: Checkout gh-pages 56 | run: | 57 | git fetch origin gh-pages 58 | git checkout gh-pages || git checkout --orphan gh-pages 59 | git pull origin gh-pages || echo "No remote changes to pull" 60 | git checkout main # Switch back before continuing 61 | 62 | - name: Build and Deploy Latest Docs 63 | run: | 64 | mike deploy --push --branch gh-pages latest 65 | 66 | - name: Build and Deploy Archived Docs 67 | run: | 68 | mike deploy --config-file mkdocs_archived.yml --push --branch gh-pages archived 69 | 70 | - name: Set Default Version 71 | run: | 72 | mike set-default latest --push --branch gh-pages 73 | 74 | - name: Create alias for Latest Docs 75 | 
run: | 76 | mike alias latest stable --push --branch gh-pages 77 | -------------------------------------------------------------------------------- /.github/workflows/publish_package.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build-and-publish-pypi: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v2 14 | 15 | - name: Setup Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.10' 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install setuptools wheel twine 24 | 25 | - name: Build package 26 | run: python setup.py sdist bdist_wheel 27 | 28 | - name: Upload to PyPI 29 | env: 30 | TWINE_USERNAME: __token__ 31 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 32 | run: twine upload dist/* 33 | 34 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | test: 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | os: [ubuntu-latest] 13 | python-version: ['3.10', '3.11', '3.12', '3.13'] 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install tox tox-gh-actions 25 | - name: Test with tox 26 | run: tox -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .pytest_cache/ 3 | TPOT.egg-info 4 | 
TPOT.egg-info 5 | *.tar.gz 6 | *.pkl 7 | *.json 8 | joblib/ 9 | cache_folder/ 10 | dask-worker-space/ 11 | .tox/ 12 | *.egg-info/ 13 | .coverage 14 | target/ 15 | .venv/ 16 | build/* 17 | *.egg 18 | *.coverage* 19 | docs/documentation/ 20 | mkdocs.yml -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | [provide a general introduction to the issue and why it is relevant to this repository] 2 | 3 | ## Context of the issue 4 | 5 | [provide a more detailed introduction to the issue itself and why it is relevant] 6 | 7 | [the remaining entries are only necessary if you are reporting a bug] 8 | 9 | ## Process to reproduce the issue 10 | 11 | [provide an ordered list of the steps needed to find and recreate the issue; see the example below. A minimal reproducible example would be ideal. This refers to the minimum amount of code necessary to reproduce the issue.] 12 | 13 | 1. User creates TPOT instance 14 | 2. User calls TPOT `fit()` function with training data 15 | 3. TPOT crashes with a `KeyError` after 5 generations 16 | 17 | ## Expected result 18 | 19 | [describe what you would expect to have resulted from this process] 20 | 21 | ## Current result 22 | 23 | [describe what you currently experience from this process, and thereby explain the bug] 24 | 25 | ## Possible fix 26 | 27 | [not necessary, but suggest fixes or reasons for the bug] 28 | 29 | ## `name of issue` screenshot 30 | 31 | [if relevant, include a screenshot] 32 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | [please review the [Contribution Guidelines](http://epistasislab.github.io/tpot/contributing/) prior to submitting your pull request. go ahead and delete this line if you've already reviewed said guidelines.] 2 | 3 | ## What does this PR do? 
4 | 5 | 6 | 7 | ## Where should the reviewer start? 8 | 9 | 10 | 11 | ## How should this PR be tested? 12 | 13 | 14 | 15 | ## Any background context you want to provide? 16 | 17 | 18 | 19 | ## What are the relevant issues? 20 | 21 | [you can link directly to issues by entering # then the number of the issue] 22 | 23 | ## Screenshots (if appropriate) 24 | 25 | 26 | 27 | ## Questions: 28 | 29 | - Do the docs need to be updated? 30 | - Does this PR add new (Python) dependencies? 31 | -------------------------------------------------------------------------------- /Tutorial/simple_fss.csv: -------------------------------------------------------------------------------- 1 | one,a,b,c 2 | two,d,e,f 3 | three,g,h,i -------------------------------------------------------------------------------- /docs/Tutorial: -------------------------------------------------------------------------------- 1 | ../Tutorial -------------------------------------------------------------------------------- /docs/archived/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/archived/assets/favicon.ico -------------------------------------------------------------------------------- /docs/archived/assets/tpot-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/archived/assets/tpot-logo.jpg -------------------------------------------------------------------------------- /docs/archived/citing.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ⚠️ Warning 4 |

This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.

5 | 6 |
7 | 8 | # Citing TPOT 9 | 10 | If you use TPOT in a scientific publication, please consider citing at least one of the following papers: 11 | 12 | 13 | Trang T. Le, Weixuan Fu and Jason H. Moore (2020). [Scaling tree-based automated machine learning to biomedical big data with a feature set selector](https://academic.oup.com/bioinformatics/article/36/1/250/5511404). *Bioinformatics*. 36(1): 250-256. 14 | 15 | BibTeX entry: 16 | 17 | ```bibtex 18 | @article{le2020scaling, 19 | title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector}, 20 | author={Le, Trang T and Fu, Weixuan and Moore, Jason H}, 21 | journal={Bioinformatics}, 22 | volume={36}, 23 | number={1}, 24 | pages={250--256}, 25 | year={2020}, 26 | publisher={Oxford University Press} 27 | } 28 | ``` 29 | 30 | 31 | 32 | Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). [Automating biomedical data science through tree-based pipeline optimization](http://link.springer.com/chapter/10.1007/978-3-319-31204-0_9). *Applications of Evolutionary Computation*, pages 123-137. 33 | 34 | BibTeX entry: 35 | 36 | ```bibtex 37 | @inbook{Olson2016EvoBio, 38 | author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. 
and Kidd, La Creis and Moore, Jason H.}, 39 | editor={Squillero, Giovanni and Burelli, Paolo}, 40 | chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, 41 | title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, 42 | year={2016}, 43 | publisher={Springer International Publishing}, 44 | pages={123--137}, 45 | isbn={978-3-319-31204-0}, 46 | doi={10.1007/978-3-319-31204-0_9}, 47 | url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} 48 | } 49 | ``` 50 | 51 | Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science 52 | 53 | Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). [Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science](http://dl.acm.org/citation.cfm?id=2908918). *Proceedings of GECCO 2016*, pages 485-492. 54 | 55 | BibTeX entry: 56 | 57 | ```bibtex 58 | @inproceedings{OlsonGECCO2016, 59 | author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. 
and Moore, Jason H.}, 60 | title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, 61 | booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, 62 | series = {GECCO '16}, 63 | year = {2016}, 64 | isbn = {978-1-4503-4206-3}, 65 | location = {Denver, Colorado, USA}, 66 | pages = {485--492}, 67 | numpages = {8}, 68 | url = {http://doi.acm.org/10.1145/2908812.2908918}, 69 | doi = {10.1145/2908812.2908918}, 70 | acmid = {2908918}, 71 | publisher = {ACM}, 72 | address = {New York, NY, USA}, 73 | } 74 | ``` 75 | 76 | Alternatively, you can cite the repository directly with the following DOI: 77 | 78 | [DOI](https://zenodo.org/badge/latestdoi/20747/rhiever/tpot) 79 | -------------------------------------------------------------------------------- /docs/archived/css/archived.css: -------------------------------------------------------------------------------- 1 | .md-grid { 2 | max-width: 100%; 3 | } -------------------------------------------------------------------------------- /docs/archived/index.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ⚠️ Warning 4 |

This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.

5 | 6 |
7 | 8 |
9 | 10 |
11 | 12 | Consider TPOT your **Data Science Assistant**. TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. 13 | 14 |
15 | 16 |
17 | TPOT Demo 18 |
19 | 20 |
21 | 22 | TPOT will automate the most tedious part of machine learning by intelligently exploring thousands of possible pipelines to find the best one for your data. 23 | 24 |
25 | 26 |
27 | An example machine learning pipeline 28 | 29 | An example machine learning pipeline 30 |
31 | 32 |
33 | 34 | Once TPOT is finished searching (or you get tired of waiting), it provides you with the Python code for the best pipeline it found so you can tinker with the pipeline from there. 35 | 36 |
37 | 38 |
39 | An example TPOT pipeline 40 | 41 | An example TPOT pipeline 42 |
43 | 44 |
45 | 46 | TPOT is built on top of scikit-learn, so all of the code it generates should look familiar... if you're familiar with scikit-learn, anyway. 47 | 48 | **TPOT is still under active development** and we encourage you to check back on this repository regularly for updates. 49 | -------------------------------------------------------------------------------- /docs/archived/installing.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ⚠️ Warning 4 |

This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.

5 | 6 |
7 | 8 | # Installation 9 | 10 | TPOT is built on top of several existing Python libraries, including: 11 | 12 | * [NumPy](http://www.numpy.org/) 13 | 14 | * [SciPy](https://www.scipy.org/) 15 | 16 | * [scikit-learn](http://www.scikit-learn.org/) 17 | 18 | * [DEAP](https://github.com/DEAP/deap) 19 | 20 | * [update_checker](https://github.com/bboe/update_checker) 21 | 22 | * [tqdm](https://github.com/tqdm/tqdm) 23 | 24 | * [stopit](https://github.com/glenfant/stopit) 25 | 26 | * [pandas](http://pandas.pydata.org) 27 | 28 | * [joblib](https://joblib.readthedocs.io/en/latest/) 29 | 30 | * [xgboost](https://xgboost.readthedocs.io/en/latest/) 31 | 32 | Most of the necessary Python packages can be installed via the [Anaconda Python distribution](https://www.anaconda.com/products/individual), which we strongly recommend that you use. **Support for Python 3.4 and below has been officially dropped since version 0.11.0.** 33 | 34 | 35 | You can install TPOT using `pip` or `conda-forge`. 36 | 37 | ## pip 38 | 39 | NumPy, SciPy, scikit-learn, pandas, joblib, and PyTorch can be installed in Anaconda via the command: 40 | 41 | ```Shell 42 | conda install numpy scipy scikit-learn pandas joblib pytorch 43 | ``` 44 | 45 | DEAP, update_checker, tqdm, stopit and xgboost can be installed with `pip` via the command: 46 | 47 | ```Shell 48 | pip install deap update_checker tqdm stopit xgboost 49 | ``` 50 | 51 | **Windows users: pip installation may not work on some Windows environments, and it may cause unexpected errors.** If you have issues installing XGBoost, check the [XGBoost installation documentation](http://xgboost.readthedocs.io/en/latest/build.html). 52 | 53 | If you plan to use [Dask](http://dask.pydata.org/en/latest/) for parallel training, make sure to install [dask[delayed] and dask[dataframe]](https://docs.dask.org/en/latest/install.html) and [dask-ml](https://dask-ml.readthedocs.io/en/latest/install.html). 
**Note that dask-ml>=1.7 requires distributed>=2.4.0 and scikit-learn>=0.23.0.** 54 | 55 | ```Shell 56 | pip install "dask[delayed]" "dask[dataframe]" dask-ml "fsspec>=0.3.3" "distributed>=2.10.0" 57 | ``` 58 | 59 | If you plan to use the [TPOT-MDR configuration](https://arxiv.org/abs/1702.01780), make sure to install [scikit-mdr](https://github.com/EpistasisLab/scikit-mdr) and [scikit-rebate](https://github.com/EpistasisLab/scikit-rebate): 60 | 61 | ```Shell 62 | pip install scikit-mdr skrebate 63 | ``` 64 | 65 | To enable support for [PyTorch](https://pytorch.org/)-based neural networks (TPOT-NN), you will need to install PyTorch. TPOT-NN will work with either CPU or GPU PyTorch, but we strongly recommend using a GPU version, if possible, as CPU PyTorch models tend to train very slowly. 66 | 67 | We recommend following [PyTorch's installation instructions](https://pytorch.org/get-started/locally/) customized for your operating system and Python distribution. 68 | 69 | Finally, to install TPOT itself, run the following command: 70 | 71 | ```Shell 72 | pip install tpot 73 | ``` 74 | 75 | ## conda-forge 76 | 77 | To install tpot and its core dependencies you can use: 78 | 79 | ```Shell 80 | conda install -c conda-forge tpot 81 | ``` 82 | 83 | To install additional dependencies you can use: 84 | 85 | ```Shell 86 | conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate 87 | ``` 88 | 89 | As mentioned above, we recommend following [PyTorch's installation instructions](https://pytorch.org/get-started/locally/) for installing it to enable support for [PyTorch](https://pytorch.org/)-based neural networks (TPOT-NN). 90 | 91 | ## Installation for using TPOT-cuML configuration 92 | 93 | With "TPOT cuML" configuration (see built-in configurations), TPOT will search over a restricted configuration using the GPU-accelerated estimators in [RAPIDS cuML](https://github.com/rapidsai/cuml) and [DMLC XGBoost](https://github.com/dmlc/xgboost). 
**This configuration requires an NVIDIA Pascal architecture or better GPU with [compute capability 6.0+](https://developer.nvidia.com/cuda-gpus), and that the library cuML is installed.** With this configuration, all model training and predicting will be GPU-accelerated. This configuration is particularly useful for medium-sized and larger datasets on which CPU-based estimators are a common bottleneck, and works for both the `TPOTClassifier` and `TPOTRegressor`. 94 | 95 | Please download this conda environment yml file to install TPOT for using TPOT-cuML configuration. 96 | 97 | ``` 98 | conda env create -f tpot-cuml.yml -n tpot-cuml 99 | conda activate tpot-cuml 100 | ``` 101 | 102 | 103 | ## Installation problems 104 | 105 | Please [file a new issue](https://github.com/EpistasisLab/tpot/issues/new) if you run into installation problems. 106 | -------------------------------------------------------------------------------- /docs/archived/related.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ⚠️ Warning 4 |

This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.

5 | 6 |
7 | 8 | Other Automated Machine Learning (AutoML) tools and related projects: 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
NameLanguageLicenseDescription
Auto-WEKAJavaGPL-v3Automated model selection and hyper-parameter tuning for Weka models.
auto-sklearnPythonBSD-3-ClauseAn automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.
auto_mlPythonMITAutomated machine learning for analytics & production. Supports manual feature type declarations.
H2O AutoMLJava with Python, Scala & R APIs and web GUIApache 2.0Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform.
devolPythonMITAutomated deep neural network design via genetic programming.
MLBoxPythonBSD-3-ClauseAccurate hyper-parameter optimization in high-dimensional space with support for distributed computing.
RecipeCGPL-v3Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure.
XcessivPythonApache 2.0A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python.
GAMAPythonApache 2.0Machine-learning pipeline optimization through asynchronous evaluation based genetic programming.
72 | -------------------------------------------------------------------------------- /docs/archived/support.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ⚠️ Warning 4 |

This documentation is for the archived version of TPOT, which is no longer maintained. For the latest version, click here.

5 | 6 |
7 | 8 | TPOT was developed in the [Computational Genetics Lab](http://epistasis.org/) at the [University of Pennsylvania](https://www.upenn.edu/) with funding from the [NIH](http://www.nih.gov/) under grant R01 AI117694. We are incredibly grateful for the support of the NIH and the University of Pennsylvania during the development of this project. 9 | 10 | The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project. 11 | -------------------------------------------------------------------------------- /docs/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/assets/favicon.ico -------------------------------------------------------------------------------- /docs/assets/tpot-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EpistasisLab/tpot/59e05db47d2c8667f5284d6c5c680d11cbc40c4e/docs/assets/tpot-logo.jpg -------------------------------------------------------------------------------- /docs/cite.md: -------------------------------------------------------------------------------- 1 | # Citing TPOT 2 | If you use TPOT in a scientific publication, please consider citing at least one of the following papers: 3 | 4 | Trang T. Le, Weixuan Fu and Jason H. Moore (2020). [Scaling tree-based automated machine learning to biomedical big data with a feature set selector](https://academic.oup.com/bioinformatics/article/36/1/250/5511404). *Bioinformatics*. 36(1): 250-256. 
5 | 6 | BibTeX entry: 7 | 8 | ```bibtex 9 | @article{le2020scaling, 10 | title={Scaling tree-based automated machine learning to biomedical big data with a feature set selector}, 11 | author={Le, Trang T and Fu, Weixuan and Moore, Jason H}, 12 | journal={Bioinformatics}, 13 | volume={36}, 14 | number={1}, 15 | pages={250--256}, 16 | year={2020}, 17 | publisher={Oxford University Press} 18 | } 19 | ``` 20 | 21 | 22 | Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). [Automating biomedical data science through tree-based pipeline optimization](http://link.springer.com/chapter/10.1007/978-3-319-31204-0_9). *Applications of Evolutionary Computation*, pages 123-137. 23 | 24 | BibTeX entry: 25 | 26 | ```bibtex 27 | @inbook{Olson2016EvoBio, 28 | author={Olson, Randal S. and Urbanowicz, Ryan J. and Andrews, Peter C. and Lavender, Nicole A. and Kidd, La Creis and Moore, Jason H.}, 29 | editor={Squillero, Giovanni and Burelli, Paolo}, 30 | chapter={Automating Biomedical Data Science Through Tree-Based Pipeline Optimization}, 31 | title={Applications of Evolutionary Computation: 19th European Conference, EvoApplications 2016, Porto, Portugal, March 30 -- April 1, 2016, Proceedings, Part I}, 32 | year={2016}, 33 | publisher={Springer International Publishing}, 34 | pages={123--137}, 35 | isbn={978-3-319-31204-0}, 36 | doi={10.1007/978-3-319-31204-0_9}, 37 | url={http://dx.doi.org/10.1007/978-3-319-31204-0_9} 38 | } 39 | ``` 40 | 41 | Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). [Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science](http://dl.acm.org/citation.cfm?id=2908918). *Proceedings of GECCO 2016*, pages 485-492. 42 | 43 | BibTeX entry: 44 | 45 | ```bibtex 46 | @inproceedings{OlsonGECCO2016, 47 | author = {Olson, Randal S. and Bartley, Nathan and Urbanowicz, Ryan J. 
and Moore, Jason H.}, 48 | title = {Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science}, 49 | booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference 2016}, 50 | series = {GECCO '16}, 51 | year = {2016}, 52 | isbn = {978-1-4503-4206-3}, 53 | location = {Denver, Colorado, USA}, 54 | pages = {485--492}, 55 | numpages = {8}, 56 | url = {http://doi.acm.org/10.1145/2908812.2908918}, 57 | doi = {10.1145/2908812.2908918}, 58 | acmid = {2908918}, 59 | publisher = {ACM}, 60 | address = {New York, NY, USA}, 61 | } 62 | ``` -------------------------------------------------------------------------------- /docs/contribute.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We welcome you to check the existing issues for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please file a new issue so we can discuss it. 4 | 5 | # Contribution Guide 6 | 7 | We welcome you to [check the existing issues](https://github.com/EpistasisLab/tpot/issues/) for bugs or enhancements to work on. If you have an idea for an extension to TPOT, please [file a new issue](https://github.com/EpistasisLab/tpot/issues/new) so we can discuss it. 8 | 9 | ## Project layout 10 | 11 | The latest stable release of TPOT is on the [main branch](https://github.com/EpistasisLab/tpot/tree/main), whereas the latest version of TPOT in development is on the [development branch](https://github.com/EpistasisLab/tpot/tree/dev). Make sure you are looking at and working on the correct branch if you're looking to contribute code. 
12 | 13 | In terms of directory structure: 14 | 15 | * All of TPOT's code sources are in the `tpot` directory 16 | * The documentation sources are in the `docs` directory 17 | * Images in the documentation are in the `docs/assets` directory 18 | * Tutorials for TPOT are in the `Tutorial` directory 19 | * Unit tests for TPOT are in the `tests` directories inside the `tpot` package 20 | 21 | Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the `dev` branch. 22 | 23 | ## How to contribute 24 | 25 | The preferred way to contribute to TPOT is to fork the 26 | [main repository](https://github.com/EpistasisLab/tpot/) on 27 | GitHub: 28 | 29 | 1. Fork the [project repository](https://github.com/EpistasisLab/tpot): 30 | click on the 'Fork' button near the top of the page. This creates 31 | a copy of the code under your account on the GitHub server. 32 | 33 | 2. Clone this copy to your local disk: 34 | 35 | $ git clone git@github.com:YourUsername/tpot.git 36 | $ cd tpot 37 | 38 | 3. Create a branch to hold your changes: 39 | 40 | $ git checkout -b my-contribution 41 | 42 | 4. Make sure your local environment is set up correctly for development. Installation instructions are almost identical to [the user instructions](installation.md) except that TPOT should *not* be installed. If you have TPOT installed on your computer then make sure you are using a virtual environment that does not have TPOT installed. Furthermore, you should make sure you have installed the `pytest` package into your development environment so that you can test changes locally. 43 | 44 | $ conda install pytest 45 | 46 | 5. Start making changes on your newly created branch, remembering to never work on the ``main`` branch! Work on this copy on your computer using Git to do the version control. 47 | 48 | 49 | 6. Check your changes haven't broken any existing tests and pass all your new tests. 
Navigate the terminal into the `tpot/tpot/` folder and run the command `pytest` to start all tests. (note, you must have the `pytest` package installed within your dev environment for this to work): 50 | 51 | $ pytest 52 | 53 | 7. When you're done editing and local testing, run: 54 | 55 | $ git add modified_files 56 | $ git commit 57 | 58 | to record your changes in Git, then push them to GitHub with: 59 | 60 | $ git push -u origin my-contribution 61 | 62 | Finally, go to the web page of your fork of the TPOT repo, and click 'Pull Request' (PR) to send your changes to the maintainers for review. Make sure that you send your PR to the `dev` branch, as the `main` branch is reserved for the latest stable release. This will start the CI server to check all the project's unit tests run and send an email to the maintainers. 63 | 64 | (If any of the above seems like magic to you, then look up the 65 | [Git documentation](http://git-scm.com/documentation) on the web.) 66 | 67 | ## Before submitting your pull request 68 | 69 | Before you submit a pull request for your contribution, please work through this checklist to make sure that you have done everything necessary so we can efficiently review and accept your changes. 70 | 71 | If your contribution changes TPOT in any way: 72 | 73 | * Update the [documentation](https://github.com/EpistasisLab/tpot/tree/main/docs) so all of your changes are reflected there. 74 | 75 | * Update the [README](https://github.com/EpistasisLab/tpot/blob/main/README.md) if anything there has changed. 76 | 77 | If your contribution involves any code changes: 78 | 79 | * Update the [project unit tests](https://github.com/EpistasisLab/tpot/tree/main/tpot/tests) to test your code changes. 80 | 81 | * Make sure that your code is properly commented with [docstrings](https://www.python.org/dev/peps/pep-0257/) and comments explaining your rationale behind non-obvious coding practices. 
82 | 83 | 84 | If your contribution requires a new library dependency: 85 | 86 | * Double-check that the new dependency is easy to install via `pip` or Anaconda. If the dependency requires a complicated installation, then we most likely won't merge your changes because we want to keep TPOT easy to install. 87 | 88 | 89 | ## After submitting your pull request 90 | 91 | After submitting your pull request, GitHub will automatically run unit tests on your changes and make sure that your updated code builds and runs. We also use services that automatically check code quality and test coverage. 92 | 93 | Check back shortly after submitting your pull request to make sure that your code passes these checks. If any of the checks come back with a red X, then do your best to address the errors. 94 | -------------------------------------------------------------------------------- /docs/css/extra.css: -------------------------------------------------------------------------------- 1 | .md-grid { 2 | max-width: 100%; 3 | } -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../README.md" 3 | %} -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | TPOT requires a working installation of Python. 4 | 5 | ### Creating a conda environment (optional) 6 | 7 | We recommend using conda environments for installing TPOT, though it would work equally well if manually installed without it. 
8 | 9 | [More information on making anaconda environments found here.](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) 10 | 11 | ``` 12 | conda create --name tpotenv python=3.13 13 | conda activate tpotenv 14 | ``` 15 | 16 | ### Note for M1 Mac or other Arm-based CPU users 17 | 18 | You need to install the lightgbm package directly from conda using the following command before installing TPOT. 19 | 20 | This is to ensure that you get the version that is compatible with your system. 21 | 22 | ``` 23 | conda install --yes -c conda-forge 'lightgbm>=3.3.3' 24 | ``` 25 | 26 | ### Developer/Latest Branch Installation 27 | 28 | 29 | ``` 30 | pip install -e /path/to/tpotrepo 31 | ``` 32 | 33 | If you downloaded with git pull, then the repository folder will be named TPOT. (Note: this folder is the one that includes setup.py inside of it and not the folder of the same name inside it). 34 | If you downloaded as a zip, the folder may be called tpot-main. 35 | -------------------------------------------------------------------------------- /docs/related.md: -------------------------------------------------------------------------------- 1 | Other Automated Machine Learning (AutoML) tools and related projects: 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 |
| Name | Language | License | Description |
| --- | --- | --- | --- |
| Auto-WEKA | Java | GPL-v3 | Automated model selection and hyper-parameter tuning for Weka models. |
| auto-sklearn | Python | BSD-3-Clause | An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. |
| auto_ml | Python | MIT | Automated machine learning for analytics & production. Supports manual feature type declarations. |
| H2O AutoML | Java with Python, Scala & R APIs and web GUI | Apache 2.0 | Automated: data prep, hyperparameter tuning, random grid search and stacked ensembles in a distributed ML platform. |
| devol | Python | MIT | Automated deep neural network design via genetic programming. |
| MLBox | Python | BSD-3-Clause | Accurate hyper-parameter optimization in high-dimensional space with support for distributed computing. |
| Recipe | C | GPL-v3 | Machine-learning pipeline optimization through genetic programming. Uses grammars to define pipeline structure. |
| Xcessiv | Python | Apache 2.0 | A web-based application for quick, scalable, and automated hyper-parameter tuning and stacked ensembling in Python. |
| GAMA | Python | Apache 2.0 | Machine-learning pipeline optimization through asynchronous evaluation based genetic programming. |
| PyMoo | Python | Apache 2.0 | Multi-objective optimization in Python. |
| Karoo GP | Python | MIT | A Python based genetic programming application suite with support for symbolic regression and classification. |
| MABE | C++ | See here | Modular Agent-Based Evolver: a research framework for evolving populations of digital organisms. |
| SBB | Python | BSD-2-Clause | Python implementation of Symbiotic Bid-Based (SBB) framework for problem decomposition using Genetic Programming (GP). |
| Tiny GP | Python | GPL-v3 | A minimalistic program implementing Koza-style (tree-based) genetic programming to solve a symbolic regression problem. |
| Baikal | Python | BSD-3-Clause | A graph-based functional API for building complex scikit-learn pipelines. |
| skdag | Python | MIT | A more flexible alternative to scikit-learn Pipelines. |
| d6tflow | Python | MIT | A python library which makes building complex data science workflows easy, fast and intuitive. |
# Recursively walk a source tree and, for every eligible Python module,
# write a mkdocstrings stub file containing "::: dotted.module.path"
# under docs/documentation, mirroring the source directory layout.
#
# Arguments:
#   $1 - directory to scan (relative path, e.g. "tpot")
#
# Skipped: non-.py files, __init__.py, graph_utils.py, and any path that
# contains the substring "test".
function iterate_files() {
    local directory="$1"
    # Scratch variables are local so that a recursive call can never
    # clobber the values the calling frame is still working with.
    local file directories file_name md_file include_line
    base_dir="docs/documentation"

    for file in "$directory"/*; do
        if [ -f "$file" ] && [[ "$file" == *.py ]] && [[ "$file" != *test* ]] \
            && [ "$(basename "$file")" != "__init__.py" ] \
            && [ "$(basename "$file")" != "graph_utils.py" ]; then
            directories="$base_dir/$(dirname "$file")"
            file_name="$(basename "$file")"
            md_file="$directories/${file_name%.*}.md"

            # Quoted so paths with spaces or glob characters stay one word.
            mkdir -p "$directories" && touch "$md_file"

            # Turn the directory path into a dotted module path,
            # e.g. tpot/utils + eval_utils.py -> tpot.utils.eval_utils
            include_line="$(dirname "$file")"
            include_line="${include_line//\//.}.${file_name%.*}"
            echo "::: $include_line" > "$md_file"

        elif [ -d "$file" ]; then
            iterate_files "$file"
        fi
    done
}
# Append a nested mkdocs "nav" listing for a documentation tree to
# mkdocs.yml: first the .md files of a directory, then each subdirectory
# as a "- name:" heading followed by its own contents (recursively).
#
# Arguments:
#   $1 - directory to list (e.g. "docs/documentation")
#
# Each entry is indented by two spaces per "/" in its path, so deeper
# paths nest further in the YAML.
function iterate_source_files() {
    local directory="$1"
    # Locals keep recursive calls from overwriting the caller's loop state.
    local file slashes spaces last_dir

    # Files first, so a directory's own pages precede its sub-sections.
    for file in "$directory"/*; do
        if [ -f "$file" ] && [[ "$file" == *.md ]]; then
            # Strip everything except "/" to count path separators.
            slashes="${file//[^\/]/}"
            spaces="$(printf '%*s' "$(( ${#slashes} * 2 ))" '')"
            # ${file#*/} drops the leading "docs/" so paths are relative
            # to the mkdocs docs directory.
            echo "$spaces- ${file#*/}" >> mkdocs.yml
        fi
    done

    for file in "$directory"/*; do
        if [ -d "$file" ]; then
            slashes="${file//[^\/]/}"
            spaces="$(printf '%*s' "$(( ${#slashes} * 2 ))" '')"
            last_dir="$(basename "$file")"
            echo "$spaces- $last_dir:" >> mkdocs.yml
            iterate_source_files "$file"
        fi
    done
}
" > "docs/tutorial/$base.md" 6 | done 7 | -------------------------------------------------------------------------------- /docs/support.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | TPOT was developed in the [Artificial Intelligence Innovation (A2I) Lab](http://epistasis.org/) at Cedars-Sinai with funding from the [NIH](http://www.nih.gov/) under grants U01 AG066833 and R01 LM010098. We are incredibly grateful for the support of the NIH and Cedars-Sinai during the development of this project. 4 | 5 | The TPOT logo was designed by Todd Newmuis, who generously donated his time to the project. -------------------------------------------------------------------------------- /docs/tpot_api/classifier.md: -------------------------------------------------------------------------------- 1 | ::: tpot.tpot_estimator.templates.tpottemplates.TPOTClassifier -------------------------------------------------------------------------------- /docs/tpot_api/estimator.md: -------------------------------------------------------------------------------- 1 | ::: tpot.tpot_estimator.estimator -------------------------------------------------------------------------------- /docs/tpot_api/regressor.md: -------------------------------------------------------------------------------- 1 | ::: tpot.tpot_estimator.templates.tpottemplates.TPOTRegressor -------------------------------------------------------------------------------- /docs/using.md: -------------------------------------------------------------------------------- 1 | # Using TPOT 2 | See the Tutorials Folder for more instructions and examples. 3 | 4 | ## Best Practices 5 | 6 | ### 1 7 | TPOT uses dask for parallel processing. When Python is parallelized, each module is imported within each process. Therefore it is important to protect all code within an `if __name__ == "__main__":` block when running TPOT from a script.
This is not required when running TPOT from a notebook. 8 | 9 | For example: 10 | 11 | ``` 12 | #my_analysis.py 13 | 14 | import tpot 15 | if __name__ == "__main__": 16 | X, y = load_my_data() 17 | est = tpot.TPOTClassifier() 18 | est.fit(X,y) 19 | #rest of analysis 20 | ``` 21 | 22 | ### 2 23 | 24 | When designing custom objective functions, avoid the use of global variables. 25 | 26 | Don't Do: 27 | ``` 28 | global_X = [[1,2],[4,5]] 29 | global_y = [0,1] 30 | def foo(est): 31 | return my_scorer(est, X=global_X, y=global_y) 32 | 33 | ``` 34 | 35 | Instead use a partial: 36 | 37 | ``` 38 | from functools import partial 39 | 40 | def foo_scorer(est, X, y): 41 | return my_scorer(est, X, y) 42 | 43 | if __name__=='__main__': 44 | X = [[1,2],[4,5]] 45 | y = [0,1] 46 | final_scorer = partial(foo_scorer, X=X, y=y) 47 | ``` 48 | 49 | Similarly when using lambda functions. 50 | 51 | Don't Do: 52 | 53 | ``` 54 | def new_objective(est, a, b): 55 | #definition 56 | 57 | a = 100 58 | b = 20 59 | bad_function = lambda est : new_objective(est=est, a=a, b=b) 60 | ``` 61 | 62 | Do: 63 | ``` 64 | def new_objective(est, a, b): 65 | #definition 66 | 67 | a = 100 68 | b = 20 69 | good_function = lambda est, a=a, b=b : new_objective(est=est, a=a, b=b) 70 | ``` 71 | 72 | ## Tips 73 | 74 | TPOT will not check if your data is correctly formatted. It will assume that you have passed in operators that can handle the type of data that was passed in. For instance, if you pass in a pandas dataframe with categorical features and missing data, then you should also include in your configuration operators that can handle those features of the data. Alternatively, if you pass in `preprocessing = True`, TPOT will impute missing values, one hot encode categorical features, then standardize the data. (Note that this is currently fitted and transformed on the entire training set before splitting for CV. Later there will be an option to apply per fold, and have the parameters be learnable.)
75 | 76 | Setting `verbose` to 5 can be helpful during debugging as it will print out the error generated by failing pipelines. -------------------------------------------------------------------------------- /mkdocs_archived.yml: -------------------------------------------------------------------------------- 1 | site_name: TPOT 2 | site_url: http://epistasislab.github.io/tpot 3 | site_author: Randal S. Olson 4 | site_description: Documentation for TPOT, a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. 5 | 6 | repo_url: https://github.com/epistasislab/tpot 7 | edit_uri: edit/master/docs/archived/ 8 | docs_dir: docs/archived/ 9 | site_dir: target/archived_site 10 | #theme: readthedocs 11 | theme: 12 | name: material 13 | logo: assets/tpot-logo.jpg 14 | favicon: assets/favicon.ico 15 | features: 16 | - toc.integrate 17 | - navigation.top 18 | palette: 19 | # light mode 20 | - scheme: default 21 | primary: grey 22 | toggle: 23 | icon: material/brightness-7 24 | name: Switch to dark mode 25 | 26 | # dark mode 27 | - scheme: slate 28 | primary: grey 29 | toggle: 30 | icon: material/brightness-4 31 | name: Switch to light mode 32 | 33 | extra: 34 | version: 35 | provider: mike 36 | 37 | extra_css: 38 | - css/archived.css 39 | 40 | markdown_extensions: 41 | - tables 42 | - fenced_code 43 | - pymdownx.highlight: 44 | anchor_linenums: true 45 | - pymdownx.inlinehilite 46 | - pymdownx.snippets 47 | - pymdownx.superfences 48 | 49 | plugins: 50 | - include-markdown 51 | 52 | copyright: Developed by Randal S. 
Olson and others at the University of Pennsylvania 53 | 54 | nav: 55 | - Home: index.md 56 | - Installation: installing.md 57 | - Using TPOT: using.md 58 | - TPOT API: api.md 59 | - Examples: examples.md 60 | - Contributing: contributing.md 61 | - Release Notes: releases.md 62 | - Citing TPOT: citing.md 63 | - Support: support.md 64 | - Related: related.md 65 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.pytest.ini_options] 6 | addopts = "--cov=tpot" 7 | testpaths = [ 8 | "tpot/tests", 9 | ] 10 | 11 | [tool.mypy] 12 | mypy_path = "tpot" 13 | check_untyped_defs = true 14 | disallow_any_generics = true 15 | ignore_missing_imports = true 16 | no_implicit_optional = true 17 | show_error_codes = true 18 | strict_equality = true 19 | warn_redundant_casts = true 20 | warn_return_any = true 21 | warn_unreachable = true 22 | warn_unused_configs = true 23 | no_implicit_reexport = true -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | flake8==6.0.0 2 | tox==4.4.12 3 | pytest==7.3.0 4 | pytest-cov==4.0.0 5 | mypy==1.2.0 6 | setuptools -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [options.extras_require] 2 | testing = 3 | pytest>=6.0 4 | pytest-cov>=2.0 5 | mypy>=0.910 6 | flake8>=3.9 7 | tox>=3.24 8 | 9 | [options.package_data] 10 | tpot = py.typed 11 | 12 | [flake8] 13 | max-line-length = 120 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 
def calculate_version():
    """Return the package version declared in ``tpot/_version.py``.

    The file is read as text (relative to the current working directory)
    rather than imported, so the version can be obtained without importing
    the package.  The first line containing ``__version__`` is used and the
    value between its first pair of single quotes is returned.

    Returns
    -------
    str
        The version string, e.g. ``'1.0.0'``.

    Raises
    ------
    FileNotFoundError
        If ``tpot/_version.py`` does not exist.
    IndexError
        If no ``__version__`` line is found, or the line has no
        single-quoted value.
    """
    # The context manager closes the file handle deterministically instead
    # of leaving it to the garbage collector.
    with open('tpot/_version.py', encoding='utf-8') as version_file:
        for line in version_file:
            if '__version__' in line:
                return line.split('\'')[1]
    raise IndexError("no '__version__' assignment found in tpot/_version.py")
:: 3.13', 66 | 'Topic :: Scientific/Engineering :: Artificial Intelligence' 67 | ], 68 | keywords=['pipeline optimization', 'hyperparameter optimization', 'data science', 'machine learning', 'genetic programming', 'evolutionary computation'], 69 | ) 70 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.28.0 3 | # flake8 and mypy outputs severla errors, so we disable them for now 4 | # envlist = py310, flake8, mypy 5 | envlist = py310, py311, py312, py313 6 | isolated_build = true 7 | 8 | [gh-actions] 9 | python = 10 | 3.10: py310 11 | 3.11: py311 12 | 3.12: py312 13 | 3.13: py313 14 | # 3.10: py310, flake8, mypy 15 | 16 | [testenv] 17 | setenv = 18 | PYTHONPATH = {toxinidir} 19 | deps = 20 | -r{toxinidir}/requirements_dev.txt 21 | commands = 22 | pytest --basetemp={envtmpdir} 23 | 24 | [testenv:flake8] 25 | basepython = python3.10 26 | deps = flake8 27 | commands = flake8 tpot 28 | 29 | [testenv:mypy] 30 | basepython = python3.10 31 | deps = 32 | -r{toxinidir}/requirements_dev.txt 33 | commands = mypy tpot 34 | -------------------------------------------------------------------------------- /tpot/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. 
Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | 37 | #TODO: are all the imports in the init files done correctly? 38 | #TODO clean up import organization 39 | 40 | from .individual import BaseIndividual 41 | 42 | from .graphsklearn import GraphPipeline 43 | from .population import Population 44 | 45 | from . import builtin_modules 46 | from . import config 47 | from . import search_spaces 48 | from . import utils 49 | from . import evolvers 50 | from . import objectives 51 | from . import selectors 52 | from . import tpot_estimator 53 | from . 
import old_config_utils 54 | 55 | from .tpot_estimator import TPOTClassifier, TPOTRegressor, TPOTEstimator, TPOTEstimatorSteadyState 56 | 57 | from update_checker import update_check 58 | from ._version import __version__ 59 | update_check("tpot",__version__) -------------------------------------------------------------------------------- /tpot/_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
class FeatureEncodingFrequencySelector(BaseEstimator, SelectorMixin):
    """Feature selector based on encoding frequency.

    The encoding frequency of a value is the fraction of samples in a
    feature column that carry that value (e.g. 0/1/2/3).  A feature is
    removed when the frequency of any of its unique values is less than or
    equal to ``threshold``.  If no feature passes, all features are kept.
    """

    @property
    def __name__(self):
        """Instance name is the same as the class name."""
        return self.__class__.__name__

    def __init__(self, threshold):
        """Create a FeatureEncodingFrequencySelector object.

        Parameters
        ----------
        threshold : float, required
            Threshold for encoding frequency.  A feature is dropped when
            any of its unique values occurs with a frequency less than or
            equal to this value.

        Returns
        -------
        None

        """
        self.threshold = threshold

    def fit(self, X, y=None):
        """Determine which feature columns pass the frequency threshold.

        Parameters
        ----------
        X : numpy ndarray, {n_samples, n_features}
            The training input samples; indexed as ``X[:, i]``.
        y : ignored
            Accepted for API compatibility; not used.

        Returns
        -------
        self : object
            The fitted selector, with ``selected_feature_indexes`` and
            ``no_of_original_features`` set.
        """
        n_features = X.shape[1]
        self.no_of_original_features = n_features
        self.selected_feature_indexes = []

        for i in range(n_features):
            # Frequencies come straight from the counts returned by
            # np.unique.  A value-keyed dict lookup would raise KeyError for
            # NaN (NaN never compares equal to itself), so the counts are
            # used positionally instead.
            _, counts = np.unique(X[:, i], return_counts=True)
            frequencies = counts / counts.sum()

            # Keep the column only if no value is at or below the threshold.
            if not np.any(frequencies <= self.threshold):
                self.selected_feature_indexes.append(i)

        # Deliberate fallback: when nothing meets the threshold, keep every
        # feature rather than raising.
        if not self.selected_feature_indexes:
            self.selected_feature_indexes = list(range(n_features))

        return self

    def transform(self, X):
        """Return the columns of X that were selected during ``fit``.

        Parameters
        ----------
        X : numpy ndarray, {n_samples, n_features}
            Data to reduce; indexed as ``X[:, selected_feature_indexes]``.

        Returns
        -------
        X_transformed : numpy ndarray, {n_samples, n_selected_features}
        """
        X_transformed = X[:, self.selected_feature_indexes]

        return X_transformed

    def _get_support_mask(self):
        """
        Get the boolean mask indicating which features are selected.
        Implements the abstract method required by SelectorMixin.

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for retention.
        """
        n_features = self.no_of_original_features
        mask = np.zeros(n_features, dtype=bool)
        # dtype=int keeps the index array valid even if the list is empty.
        mask[np.asarray(self.selected_feature_indexes, dtype=int)] = True

        return mask
class FeatureSetSelector(BaseEstimator, SelectorMixin):
    """
    Select predefined feature subsets.

    The subset is given up front (``sel_subset``); ``fit`` only resolves it
    to positional column indexes and builds the boolean support mask that
    SelectorMixin uses to perform the actual selection.
    """

    def __init__(self, sel_subset=None, name=None):
        """Create a FeatureSetSelector object.

        Parameters
        ----------
        sel_subset: list or int or str
            If X is a dataframe, items in sel_subset list must correspond to column names
            If X is an array, items in sel_subset list must correspond to column indexes
            int or str: a single column (index or name)
        name: optional
            Stored on the instance as ``self.name``; not otherwise used by
            this class.

        Returns
        -------
        None

        """
        self.name = name
        self.sel_subset = sel_subset

    def fit(self, X, y=None):
        """Fit FeatureSetSelector for feature selection

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            The training input samples. Must expose ``shape``.
        y: array-like, shape (n_samples,)
            The target values (integers that correspond to classes in classification, real numbers in regression).
            Not used.

        Returns
        -------
        self: object
            Returns the fitted estimator itself.
        """
        # A single column given as a bare value is wrapped in a list.  The
        # normalized list is written back to ``sel_subset``, as before, so
        # callers that read the attribute after fitting see the same value.
        if isinstance(self.sel_subset, (int, np.integer, str)):
            self.sel_subset = [self.sel_subset]

        # Resolve the subset to sorted positional indexes (feat_list_idx).
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
            self.feat_list_idx = sorted(
                self.feature_names_in_.index(feat) for feat in self.sel_subset
            )
        else:
            # Any non-DataFrame input is addressed by position.  Handling
            # every such input here (not only np.ndarray) guarantees that
            # feat_list_idx is always refreshed and never left over from a
            # previous fit.
            self.feature_names_in_ = None
            self.feat_list_idx = sorted(self.sel_subset)

        n_features = X.shape[1]
        self.mask = np.zeros(n_features, dtype=bool)
        self.mask[np.asarray(self.feat_list_idx)] = True

        return self

    #TODO keep returned as dataframe if input is dataframe? may not be consistent with sklearn

    def _get_tags(self):
        """Return the estimator tags: NaN input allowed, ``y`` not required."""
        tags = {"allow_nan": True, "requires_y": False}
        return tags

    def _get_support_mask(self):
        """
        Get the boolean mask indicating which features are selected
        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for
            retention.
        """
        return self.mask
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
class ColumnSimpleImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns="all",
                 missing_values=np.nan,
                 strategy="mean",
                 fill_value=None,
                 copy=True,
                 add_indicator=False,
                 keep_empty_features=False,):
        """
        A wrapper for SimpleImputer that allows for imputation of specific columns in a DataFrame or np array.
        Passes through columns that are not imputed.

        Parameters
        ----------
        columns : str, list, default='all'
            Determines which columns to impute with sklearn.impute.SimpleImputer.
            - 'categorical' : Automatically select categorical features (DataFrame input only)
            - 'numeric' : Automatically select numeric features (DataFrame input only)
            - 'all' : Select all features
            - list : A list of columns to select (names for a DataFrame, indices for an array)

        # See documentation from sklearn.impute.SimpleImputer for the following parameters
        missing_values, strategy, fill_value, copy, add_indicator, keep_empty_features

        Notes
        -----
        transform writes the imputed values back into the selected columns, so
        the imputer output must have as many columns as were selected.
        """

        self.columns = columns
        self.missing_values = missing_values
        self.strategy = strategy
        self.fill_value = fill_value
        self.copy = copy
        self.add_indicator = add_indicator
        self.keep_empty_features = keep_empty_features


    def fit(self, X, y=None):
        """
        Resolve the columns to impute and fit a SimpleImputer on them.

        Sets ``columns_`` (the resolved selection) and, when that selection is
        not empty, ``imputer`` (the fitted SimpleImputer).

        Raises
        ------
        ValueError
            If ``columns`` is not one of the supported values, or if
            'categorical'/'numeric' is requested for non-DataFrame input.
        """
        is_frame = isinstance(X, pd.DataFrame)
        selection = self.columns

        # Only compare against the keywords when `columns` really is a string;
        # comparing an array-like to a str is not a reliable test.
        if isinstance(selection, str):
            if selection in ("categorical", "numeric") and not is_frame:
                raise ValueError(f"Invalid value for columns: {self.columns}. "
                                 "Only 'all' or a list of column indices is supported for np arrays")

            if selection == "categorical":
                self.columns_ = list(X.select_dtypes(exclude='number').columns)
            elif selection == "numeric":
                self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
            elif selection == "all":
                if is_frame:
                    self.columns_ = list(X.columns)
                else:
                    self.columns_ = list(range(X.shape[1]))
            else:
                raise ValueError(f"Invalid value for columns: {self.columns}")
        elif isinstance(selection, (list, tuple)):
            # A tuple would be read by numpy as a multi-axis index, so store a list.
            self.columns_ = list(selection)
        else:
            raise ValueError(f"Invalid value for columns: {self.columns}")

        # Nothing to impute: transform() will hand X back unchanged.
        if len(self.columns_) == 0:
            return self

        self.imputer = sklearn.impute.SimpleImputer(missing_values=self.missing_values,
                                                    strategy=self.strategy,
                                                    fill_value=self.fill_value,
                                                    copy=self.copy,
                                                    add_indicator=self.add_indicator,
                                                    keep_empty_features=self.keep_empty_features)

        if is_frame:
            # Keep DataFrame output so it can be assigned back by column name.
            self.imputer.set_output(transform="pandas")
            self.imputer.fit(X[self.columns_], y)
        else:
            self.imputer.fit(X[:, self.columns_], y)

        return self

    def transform(self, X):
        """
        Return a copy of X with the selected columns replaced by imputed values.

        When no columns were selected during fit, X itself is returned.
        """
        if len(self.columns_) == 0:
            return X

        if isinstance(X, pd.DataFrame):
            X = X.copy()
            X[self.columns_] = self.imputer.transform(X[self.columns_])
            return X
        else:
            X = np.copy(X)
            X[:, self.columns_] = self.imputer.transform(X[:, self.columns_])
            return X
def select_features(X, min_unique=10,):
    """
    Given a DataFrame or numpy array, return the columns that have more than min_unique unique values.

    Parameters
    ----------
    X: DataFrame or numpy array
        Data to select features from
    min_unique: int, default=10
        A column is selected only if it has strictly more than this many unique values

    Returns
    -------
    list
        Column labels (DataFrame input) or column indices (array input) of the
        columns that have more than min_unique unique values

    """

    if isinstance(X, pd.DataFrame):
        return [col for col in X.columns if len(X[col].unique()) > min_unique]
    else:
        return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) > min_unique]


class PassKBinsDiscretizer(BaseEstimator, TransformerMixin):
    """
    Same as sklearn.preprocessing.KBinsDiscretizer, but passes through columns that are not discretized instead of ignoring them.

    A column is discretized only when it has more than 10 unique values (the
    threshold is fixed in ``fit``); every other column is passed through unchanged.
    See sklearn.preprocessing.KBinsDiscretizer for more information on the parameters.
    """

    def __init__(self, n_bins=5, encode='onehot-dense', strategy='quantile', subsample=None, random_state=None):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.subsample = subsample
        self.random_state = random_state

    def fit(self, X, y=None):
        """
        Choose the columns to discretize and fit the underlying ColumnTransformer.

        Sets ``selected_columns_``, ``not_selected_columns_`` and ``transformer``.
        """
        # Identify columns with more than n unique values
        # Create a ColumnTransformer to select and discretize the chosen columns
        self.selected_columns_ = select_features(X, min_unique=10)
        if isinstance(X, pd.DataFrame):
            self.not_selected_columns_ = [col for col in X.columns if col not in self.selected_columns_]
        else:
            self.not_selected_columns_ = [i for i in range(X.shape[1]) if i not in self.selected_columns_]

        enc = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy, subsample=self.subsample, random_state=self.random_state)
        # Discretized columns are listed first, then the passed-through ones.
        self.transformer = ColumnTransformer([
            ('discretizer', enc, self.selected_columns_),
            ('passthrough', 'passthrough', self.not_selected_columns_)
        ])
        self.transformer.fit(X)
        return self

    def transform(self, X):
        """Apply the fitted ColumnTransformer to X and return its output."""
        return self.transformer.transform(X)
class Passthrough(TransformerMixin,BaseEstimator):
    """
    Identity transformer: whatever is passed to transform comes back unchanged.
    """

    def fit(self, X=None, y=None):
        """
        No state is learned; the instance itself is returned.
        """
        return self

    def transform(self, X):
        """
        Hand the input back as is (the same object, not a copy).
        """
        return X


class SkipTransformer(TransformerMixin,BaseEstimator):
    """
    Transformer whose output has no columns. When combined with FeatureUnion, it can be used to skip a branch.
    """
    def fit(self, X=None, y=None):
        """
        No state is learned; the instance itself is returned.
        """
        return self

    def transform(self, X):
        """
        Return an empty array with one row per row of X and zero columns.
        """
        row_count = X.shape[0]
        return np.empty((row_count, 0))
# Fixture data shared by every test below, loaded once at import time.
test_data = pd.read_csv("tests/tests.csv")
test_X = test_data.drop("class", axis=1)

# NOTE(review): these tests pass a ``subset_list`` argument and read
# ``ds.feat_list`` — presumably an older FeatureSetSelector API; confirm
# against the current class before relying on them.


def _assert_subset_selected(sel_subset, expected_width):
    """Fit/transform on the DataFrame fixture and check the selected columns."""
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=sel_subset)
    ds.fit(test_X, y=None)
    transformed_X = ds.transform(test_X)

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == expected_width
    assert np.array_equal(transformed_X, test_X[ds.feat_list].values)


def test_FeatureSetSelector_1():
    """Assert that FeatureSetSelector returns transformed X based on test feature list 1."""
    _assert_subset_selected("test_subset_1", 5)


def test_FeatureSetSelector_2():
    """Assert that FeatureSetSelector returns transformed X based on test feature list 2."""
    _assert_subset_selected("test_subset_2", 6)


def test_FeatureSetSelector_3():
    """Assert that FeatureSetSelector returns transformed X based on 2 subsets' names"""
    _assert_subset_selected(["test_subset_1", "test_subset_2"], 7)


def test_FeatureSetSelector_4():
    """Assert that FeatureSetSelector returns transformed X based on 2 subsets' indexes"""
    _assert_subset_selected([0, 1], 7)


def test_FeatureSetSelector_5():
    """Assert that FeatureSetSelector returns transformed X selected based on test feature list 1's index."""
    _assert_subset_selected(0, 5)


def test_FeatureSetSelector_6():
    """Assert that the _get_support_mask function returns correct mask."""
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1")
    ds.fit(test_X, y=None)
    private_mask = ds._get_support_mask()
    public_mask = ds.get_support()

    assert private_mask.shape[0] == 30
    assert np.count_nonzero(private_mask) == 5
    assert np.array_equal(public_mask, private_mask)


def test_FeatureSetSelector_7():
    """Assert that FeatureSetSelector works as expected when input X is np.array."""
    raw_values = test_X.values
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1")
    ds.fit(raw_values, y=None)
    transformed_X = ds.transform(raw_values)
    str_feat_list = [str(i+2) for i in ds.feat_list_idx]

    assert transformed_X.shape[0] == test_X.shape[0]
    assert transformed_X.shape[1] != test_X.shape[1]
    assert transformed_X.shape[1] == 5
    assert np.array_equal(transformed_X, test_X.values[:, ds.feat_list_idx])
    assert np.array_equal(transformed_X, test_X[str_feat_list].values)


def test_FeatureSetSelector_8():
    """Assert that FeatureSetSelector raises ValueError when features are not available."""
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_4")
    assert_raises(ValueError, ds.fit, test_X)


def test_FeatureSetSelector_9():
    """Assert that FeatureSetSelector __name__ returns correct class name."""
    ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_4")
    assert ds.__name__ == 'FeatureSetSelector'
class ZeroCount(BaseEstimator, TransformerMixin):
    """Prepends two per-sample features: the count of zeros and the count of non-zeros."""

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the sklearn API."""
        return self

    def transform(self, X, y=None):
        """Transform data by adding two virtual features.

        Parameters
        ----------
        X: numpy ndarray, {n_samples, n_components}
            New data, where n_samples is the number of samples and n_components
            is the number of components.
        y: None
            Unused

        Returns
        -------
        X_transformed: array-like, shape (n_samples, n_features + 2)
            Column 0 is the zero count, column 1 the non-zero count, followed
            by the original features.
        """
        data = check_array(X)

        nonzero_per_row = np.count_nonzero(data, axis=1)
        zero_per_row = data.shape[1] - nonzero_per_row

        # hstack builds a new array, so the validated input is not modified.
        return np.hstack((
            zero_per_row.reshape(-1, 1),
            nonzero_per_row.reshape(-1, 1),
            data,
        ))
26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | from .get_configspace import get_search_space -------------------------------------------------------------------------------- /tpot/config/autoqtl_builtins.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# Search space with a single float hyperparameter, "threshold", bounded to
# [0, 0.35]. Presumably consumed by FeatureEncodingFrequencySelector
# (imported above) — confirm against the code that uses this constant.
FeatureEncodingFrequencySelector_ConfigurationSpace = ConfigurationSpace(
    space = {
        'threshold': Float("threshold", bounds=(0, .35))
    }
)


# Commented-out entries for the genetic encoders, each with an empty config:
# genetic_encoders.DominantEncoder : {},
# genetic_encoders.RecessiveEncoder : {},
# genetic_encoders.HeterosisEncoder : {},
# genetic_encoders.UnderDominanceEncoder : {},
# genetic_encoders.OverDominanceEncoder : {},
def get_RandomForestClassifier_ConfigurationSpace(random_state, n_jobs=1):
    """
    Build the search space for a random forest classifier.

    Parameters
    ----------
    random_state : int or None
        Added to the space as a fixed value only when it is not None.
    n_jobs : int, default=1
        Passed through as a fixed value.

    Returns
    -------
    ConfigurationSpace
    """
    space = {
        'n_estimators': 100, #TODO make this a higher number? learned?
        'bootstrap': Categorical("bootstrap", [True, False]),
        'min_samples_split': Integer("min_samples_split", bounds=(2, 20)),
        'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 20)),
        'n_jobs': n_jobs,

    }

    if random_state is not None: #This is required because configspace doesn't allow None as a value
        space['random_state'] = random_state

    return ConfigurationSpace(
        space = space
    )

def get_KNeighborsClassifier_ConfigurationSpace(n_samples):
    """
    Build the search space for a k-nearest-neighbours classifier.

    Parameters
    ----------
    n_samples : int
        Number of samples available; used to cap ``n_neighbors``.

    Returns
    -------
    ConfigurationSpace
    """
    # n_neighbors may not exceed the number of fitted samples, so the upper
    # bound is the smaller of n_samples and 100 (not the larger).
    max_neighbors = min(n_samples, 100)

    return ConfigurationSpace(
        space = {
            'n_neighbors': Integer("n_neighbors", bounds=(1, max_neighbors), log=True),
            'weights': Categorical("weights", ['uniform', 'distance']),
        }
    )


#TODO add conditionals
def get_LogisticRegression_ConfigurationSpace(random_state):
    """
    Build the search space for logistic regression.

    No conditions tie solver, penalty and dual together, so every combination
    of the listed values can be sampled.

    Parameters
    ----------
    random_state : int or None
        Added to the space as a fixed value only when it is not None.

    Returns
    -------
    ConfigurationSpace
    """
    space = {
        'solver': Categorical("solver", ['liblinear', 'sag', 'saga']),
        'penalty': Categorical("penalty", ['l1', 'l2']),
        'dual': Categorical("dual", [True, False]),
        'C': Float("C", bounds=(1e-4, 1e4), log=True),
        'max_iter': 1000,
    }

    if random_state is not None: #This is required because configspace doesn't allow None as a value
        space['random_state'] = random_state

    return ConfigurationSpace(
        space = space
    )

def get_SVC_ConfigurationSpace(random_state):
    """
    Build the search space for SVC.

    Parameters
    ----------
    random_state : int or None
        Added to the space as a fixed value only when it is not None.

    Returns
    -------
    ConfigurationSpace
    """
    space = {
        'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']),
        'C': Float("C", bounds=(1e-4, 25), log=True),
        'degree': Integer("degree", bounds=(1, 4)),
        'max_iter': 3000,
        'tol': 0.001,
        'probability': Categorical("probability", [True]), # configspace doesn't allow bools as a default value? but does allow them as a value inside a Categorical
    }

    if random_state is not None: #This is required because configspace doesn't allow None as a value
        space['random_state'] = random_state

    return ConfigurationSpace(
        space = space
    )

def get_NuSVC_ConfigurationSpace(random_state):
    """
    Build the search space for NuSVC.

    Parameters
    ----------
    random_state : int or None
        Added to the space as a fixed value only when it is not None.

    Returns
    -------
    ConfigurationSpace
    """
    space = {
        'nu': Float("nu", bounds=(0.05, 1.0)),
        'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']),
        #'C': Float("C", bounds=(1e-4, 25), log=True),
        'degree': Integer("degree", bounds=(1, 4)),
        'class_weight': Categorical("class_weight", [None, 'balanced']),
        'max_iter': 3000,
        'tol': 0.005,
        'probability': Categorical("probability", [True]), # configspace doesn't allow bools as a default value? but does allow them as a value inside a Categorical
    }

    if random_state is not None: #This is required because configspace doesn't allow None as a value
        space['random_state'] = random_state

    return ConfigurationSpace(
        space = space
    )
Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | import sklearn 37 | import sklearn.ensemble 38 | import sklearn.linear_model 39 | import sklearn.neighbors 40 | from ConfigSpace import ConfigurationSpace 41 | from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal 42 | from ConfigSpace import EqualsCondition 43 | 44 | 45 | simple_imputer_cs = ConfigurationSpace( 46 | space = { 47 | 'strategy' : Categorical('strategy', 48 | ['mean','median', 'most_frequent', 'constant'] 49 | ), 50 | #'add_indicator' : Categorical('add_indicator', [True, False]), 51 | #Removed add_indicator, it appends a mask next to the rest of the data 52 | # and can cause errors. 
gk 53 | } 54 | ) 55 | #test 56 | def get_IterativeImputer_config_space(n_features, random_state): 57 | space = { 'initial_strategy' : Categorical('initial_strategy', 58 | ['mean', 'median', 59 | 'most_frequent', 'constant']), 60 | 'n_nearest_features' : Integer('n_nearest_features', 61 | bounds=(1, n_features)), 62 | 'imputation_order' : Categorical('imputation_order', 63 | ['ascending', 'descending', 64 | 'roman', 'arabic', 'random']), 65 | } 66 | 67 | estimator = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN']) 68 | sample_posterior = Categorical('sample_posterior', [True, False]) 69 | sampling_condition = EqualsCondition(sample_posterior, estimator, 'Bayesian') 70 | 71 | if random_state is not None: 72 | #This is required because configspace doesn't allow None as a value 73 | space['random_state'] = random_state 74 | 75 | cs = ConfigurationSpace(space=space) 76 | cs.add([estimator, sample_posterior]) 77 | cs.add([sampling_condition]) 78 | return cs 79 | 80 | def get_IterativeImputer_config_space_no_estimator(n_features, random_state): 81 | space = { 'initial_strategy' : Categorical('initial_strategy', 82 | ['mean', 'median', 83 | 'most_frequent', 'constant']), 84 | 'n_nearest_features' : Integer('n_nearest_features', 85 | bounds=(1, n_features)), 86 | 'imputation_order' : Categorical('imputation_order', 87 | ['ascending', 'descending', 88 | 'roman', 'arabic', 'random']), 89 | } 90 | 91 | if random_state is not None: 92 | #This is required because configspace doesn't allow None as a value 93 | space['random_state'] = random_state 94 | 95 | cs = ConfigurationSpace(space=space) 96 | 97 | return cs 98 | 99 | def get_KNNImputer_config_space(n_samples): 100 | space = { 101 | 'n_neighbors': Integer('n_neighbors', bounds=(1, max(n_samples,100))), 102 | 'weights': Categorical('weights', ['uniform', 'distance']) 103 | } 104 | 105 | return ConfigurationSpace( 106 | space=space 107 | ) 108 | 109 | def IterativeImputer_hyperparameter_parser(params): 110 | est = 
params['estimator'] 111 | match est: 112 | case 'Bayesian': 113 | estimator = sklearn.linear_model.BayesianRidge() 114 | case 'RFR': 115 | estimator = sklearn.ensemble.RandomForestRegressor() 116 | case 'Ridge': 117 | estimator = sklearn.linear_model.Ridge() 118 | case 'KNN': 119 | estimator = sklearn.neighbors.KNeighborsRegressor() 120 | 121 | final_params = { 122 | 'estimator' : estimator, 123 | 'initial_strategy' : params['initial_strategy'], 124 | 'n_nearest_features' : params['n_nearest_features'], 125 | 'imputation_order' : params['imputation_order'], 126 | } 127 | 128 | if 'sample_posterior' in params: 129 | final_params['sample_posterior'] = params['sample_posterior'] 130 | 131 | if 'random_state' in params: 132 | final_params['random_state'] = params['random_state'] 133 | 134 | return final_params -------------------------------------------------------------------------------- /tpot/config/mdr_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. 
# MDR: both hyperparameters are binary choices.
MDR_configspace = ConfigurationSpace(
    space={
        'tie_break': Categorical('tie_break', [0,1]),
        'default_label': Categorical('default_label', [0,1]),
    }
)


def _n_features_to_select(n_features):
    """Return a fresh log-scaled integer hyperparameter in ``[1, n_features]``."""
    return Integer('n_features_to_select', bounds=(1, n_features), log=True)


def get_skrebate_ReliefF_config_space(n_features):
    """Build the ReliefF search space.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Space over ``n_features_to_select`` and ``n_neighbors`` (2..500,
        log-scaled).
    """
    hyperparameters = {
        'n_features_to_select': _n_features_to_select(n_features),
        'n_neighbors': Integer('n_neighbors', bounds=(2,500), log=True),
    }
    return ConfigurationSpace(space=hyperparameters)


def get_skrebate_SURF_config_space(n_features):
    """Build the SURF search space.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Space over ``n_features_to_select`` only.
    """
    hyperparameters = {'n_features_to_select': _n_features_to_select(n_features)}
    return ConfigurationSpace(space=hyperparameters)


def get_skrebate_SURFstar_config_space(n_features):
    """Build the SURFstar search space.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Space over ``n_features_to_select`` only.
    """
    hyperparameters = {'n_features_to_select': _n_features_to_select(n_features)}
    return ConfigurationSpace(space=hyperparameters)


def get_skrebate_MultiSURF_config_space(n_features):
    """Build the MultiSURF search space.

    Parameters
    ----------
    n_features : int
        Upper bound for ``n_features_to_select``.

    Returns
    -------
    ConfigurationSpace
        Space over ``n_features_to_select`` only.
    """
    hyperparameters = {'n_features_to_select': _n_features_to_select(n_features)}
    return ConfigurationSpace(space=hyperparameters)
def _with_random_state(space, random_state):
    """Add ``random_state`` to *space* when a seed was given, then return it.

    Parameters
    ----------
    space : dict
        Hyperparameter dictionary that is updated in place.
    random_state : int or None
        Seed to record as a constant; skipped when None.

    Returns
    -------
    dict
        The same dictionary that was passed in.
    """
    # This is required because configspace doesn't allow None as a value.
    if random_state is not None:
        space['random_state'] = random_state
    return space


def get_RandomForestRegressor_ConfigurationSpace(random_state):
    """Build the search space for a random forest regressor.

    Parameters
    ----------
    random_state : int or None
        Seed stored as a constant in the space; omitted when None.

    Returns
    -------
    ConfigurationSpace
        Space over ``max_features``, ``bootstrap``, ``min_samples_split`` and
        ``min_samples_leaf`` with ``n_estimators`` fixed to 100.
    """
    space = {
        'n_estimators': 100,
        'max_features': Float("max_features", bounds=(0.05, 1.0)),
        'bootstrap': Categorical("bootstrap", [True, False]),
        'min_samples_split': Integer("min_samples_split", bounds=(2, 21)),
        'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 21)),
    }

    return ConfigurationSpace(
        space = _with_random_state(space, random_state)
    )


def get_KNeighborsRegressor_ConfigurationSpace(n_samples):
    """Build the search space for a k-nearest-neighbours regressor.

    Parameters
    ----------
    n_samples : int
        Number of samples in the data the regressor will be fitted on.

    Returns
    -------
    ConfigurationSpace
        Space over ``n_neighbors`` and ``weights``.
    """
    # The neighbour count is capped at n_samples (and at 100 for large data);
    # the previous max(n_samples, 100) allowed values above the sample count.
    # max(2, ...) only keeps the range non-degenerate for one-sample inputs.
    upper = max(2, min(n_samples, 100))
    return ConfigurationSpace(
        space = {
            'n_neighbors': Integer("n_neighbors", bounds=(1, upper)),
            'weights': Categorical("weights", ['uniform', 'distance']),
        }
    )


def get_Ridge_ConfigurationSpace(random_state):
    """Build the search space for a ridge regressor.

    Parameters
    ----------
    random_state : int or None
        Seed stored as a constant in the space; omitted when None.

    Returns
    -------
    ConfigurationSpace
        Space over ``alpha`` and ``tol`` with ``fit_intercept`` fixed to True.
    """
    space = {
        'alpha': Float("alpha", bounds=(0.0, 1.0)),
        'fit_intercept': Categorical("fit_intercept", [True]),
        'tol': Float("tol", bounds=(1e-5, 1e-1)),
    }

    return ConfigurationSpace(
        space = _with_random_state(space, random_state)
    )


def get_Lasso_ConfigurationSpace(random_state):
    """Build the search space for a lasso regressor.

    Parameters
    ----------
    random_state : int or None
        Seed stored as a constant in the space; omitted when None.

    Returns
    -------
    ConfigurationSpace
        Space over ``alpha``, ``precompute``, ``positive`` and ``selection``
        with ``fit_intercept`` fixed to True and ``tol`` fixed to 0.001.
    """
    space = {
        'alpha': Float("alpha", bounds=(0.0, 1.0)),
        'fit_intercept': Categorical("fit_intercept", [True]),
        'precompute': Categorical("precompute", [True, False, 'auto']),
        'tol': 0.001,
        'positive': Categorical("positive", [True, False]),
        'selection': Categorical("selection", ['cyclic', 'random']),
    }

    return ConfigurationSpace(
        space = _with_random_state(space, random_state)
    )


def get_ElasticNet_ConfigurationSpace(random_state):
    """Build the search space for an elastic-net regressor.

    Parameters
    ----------
    random_state : int or None
        Seed stored as a constant in the space; omitted when None.

    Returns
    -------
    ConfigurationSpace
        Space over ``alpha`` and ``l1_ratio``, both in ``[0.0, 1.0]``.
    """
    space = {
        'alpha': Float("alpha", bounds=(0.0, 1.0)),
        'l1_ratio': Float("l1_ratio", bounds=(0.0, 1.0)),
    }

    return ConfigurationSpace(
        space = _with_random_state(space, random_state)
    )


def get_SVR_ConfigurationSpace(random_state):
    """Build the search space for a support vector regressor.

    Parameters
    ----------
    random_state : int or None
        Seed stored as a constant in the space; omitted when None.

    Returns
    -------
    ConfigurationSpace
        Space over ``kernel``, ``C`` and ``degree`` with ``max_iter`` and
        ``tol`` fixed.
    """
    # NOTE(review): scikit-learn's SVR constructor has no random_state
    # parameter -- confirm the estimator this space feeds accepts the key.
    space = {
        'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']),
        'C': Float("C", bounds=(1e-4, 25), log=True),
        'degree': Integer("degree", bounds=(1, 4)),
        'max_iter': 3000,
        'tol': 0.001,
    }

    return ConfigurationSpace(
        space = _with_random_state(space, random_state)
    )


def get_NuSVR_ConfigurationSpace(random_state):
    """Build the search space for a Nu support vector regressor.

    Parameters
    ----------
    random_state : int or None
        Seed stored as a constant in the space; omitted when None.

    Returns
    -------
    ConfigurationSpace
        Space over ``nu``, ``kernel``, ``C`` and ``degree`` with ``max_iter``
        and ``tol`` fixed.
    """
    # NOTE(review): scikit-learn's NuSVR constructor has no random_state
    # parameter -- confirm the estimator this space feeds accepts the key.
    space = {
        'nu': Float("nu", bounds=(0.05, 1.0)),
        'kernel': Categorical("kernel", ['poly', 'rbf', 'linear', 'sigmoid']),
        'C': Float("C", bounds=(1e-4, 25), log=True),
        'degree': Integer("degree", bounds=(1, 4)),
        'max_iter': 3000,
        'tol': 0.005,
    }

    return ConfigurationSpace(
        space = _with_random_state(space, random_state)
    )
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | #TODO: how to best support transformers/selectors that take other transformers with their own hyperparameters? 
# Search range for the SelectFwe ``alpha`` parameter (log-scaled).
SelectFwe_configspace = ConfigurationSpace(
    space={'alpha': Float('alpha', bounds=(1e-4, 0.05), log=True)},
)

# Search range for the SelectPercentile ``percentile`` parameter.
SelectPercentile_configspace = ConfigurationSpace(
    space={'percentile': Float('percentile', bounds=(1, 100.0))},
)

# Search range for the VarianceThreshold ``threshold`` parameter (log-scaled).
VarianceThreshold_configspace = ConfigurationSpace(
    space={'threshold': Float('threshold', bounds=(1e-4, .2), log=True)},
)


# Note the RFE_configspace_part and SelectFromModel_configspace_part are not
# complete; they both require the estimator to be set.
# These are intended to be used with the Wrapped search space.
RFE_configspace_part = ConfigurationSpace(
    space={'step': Float('step', bounds=(1e-4, 1.0))},
)

SelectFromModel_configspace_part = ConfigurationSpace(
    space={'threshold': Float('threshold', bounds=(1e-4, 1.0), log=True)},
)
Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
def get_ArithmeticTransformer_ConfigurationSpace():
    """Build the search space for the arithmetic transformer.

    Returns
    -------
    ConfigurationSpace
        Space with a single categorical ``function`` hyperparameter listing
        the selectable operation names.
    """
    function_names = [
        "add", "mul_neg_1", "mul", "safe_reciprocal",
        "eq", "ne", "ge", "gt", "le", "lt",
        "min", "max", "0", "1",
    ]
    function_choice = Categorical("function", function_names)
    return ConfigurationSpace(space={'function': function_choice})
def _exercise_search_space(class_name):
    """Sample configurations for *class_name* and export each as a pipeline."""
    print(class_name)
    search_space = tpot.config.get_search_space(
        class_name,
        n_classes=3,
        n_samples=100,
        n_features=100,
        random_state=None,
    )

    # generate 25 random hyperparameter sets and make sure they are all valid
    for _ in range(25):
        search_space.generate().export_pipeline()


def test_loop_through_all_hyperparameters():
    """Every name in STRING_TO_CLASS yields exportable random configurations."""
    for class_name, _ in STRING_TO_CLASS.items():
        _exercise_search_space(class_name)


@pytest.mark.skipif(sys.platform == 'darwin', reason="sklearnex dependency not available on macOS")
def test_loop_through_groupnames():
    """Every member of every GROUPNAMES group yields exportable configurations."""
    for _, members in GROUPNAMES.items():
        for class_name in members:
            _exercise_search_space(class_name)
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
class BaseIndividual:
    """Base class holding lists of mutation and crossover operators.

    ``mutation_list`` holds zero-argument callables and ``crossover_list``
    holds callables taking the other individual. Each operator reports
    success with a truthy return value.
    """

    def __init__(self) -> None:
        self.mutation_list = []
        self.crossover_list = []

    def mutate(self, rng=None):
        """Try the mutation operators in random order until one succeeds.

        Parameters
        ----------
        rng : None, int or numpy.random.Generator
            Passed to ``np.random.default_rng`` to order the operators.

        Returns
        -------
        bool
            True as soon as an operator returns a truthy value, else False.
        """
        generator = np.random.default_rng(rng)
        # Shuffle a copy so the registered order on the instance is untouched.
        operators = self.mutation_list.copy()
        generator.shuffle(operators)
        return any(operator() for operator in operators)

    def crossover(self, ind2, rng=None):
        """Try the crossover operators in random order until one succeeds.

        Parameters
        ----------
        ind2 : object
            The other individual, handed to each operator.
        rng : None, int or numpy.random.Generator
            Passed to ``np.random.default_rng`` to order the operators.

        Returns
        -------
        bool
            True as soon as an operator returns a truthy value, else False.
        """
        generator = np.random.default_rng(rng)
        operators = self.crossover_list.copy()
        generator.shuffle(operators)
        return any(operator(ind2) for operator in operators)

    # a guided change of an individual when given an objective function
    def optimize(self, objective_function, rng=None , steps=5):
        """Apply ``mutate`` *steps* times using one shared generator.

        ``objective_function`` is accepted but not used by this base
        implementation.
        """
        generator = np.random.default_rng(rng)
        for _step in range(steps):
            self.mutate(rng=generator)

    # Return a hashable unique to this individual setup.
    # For use when evaluating whether or not an individual is 'the same' as
    # another individual.
    def unique_id(self):
        """Return the identity key of this individual (the instance itself)."""
        return self

    #TODO https://www.pythontutorial.net/python-oop/python-__hash__/
    # Look into whether __hash__/__eq__ (and possibly __repr__) would be a
    # better way of doing things than unique_id(); unique_id() and __repr__
    # could have different levels of specificity.
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
class CallBackInterface:
    """Collection of no-op callback hooks.

    Every hook accepts its arguments and returns None; presumably subclasses
    override the hooks they need (no subclass is visible here).
    """

    def __init__(self) -> None:
        return None

    def step_callback(self, population):
        """Hook receiving a population; does nothing."""
        return None

    def population_mutate_callback(self, offspring, parent=None):
        """Hook receiving mutated offspring and optionally a parent; does nothing."""
        return None

    def population_crossover_callback(self, offspring, parent=None):
        """Hook receiving crossover offspring and optionally a parent; does nothing."""
        return None

    def evolutionary_algorithm_step_callback(self, population):
        """Hook receiving a population; does nothing."""
        return None


class Logbook:
    """Empty placeholder class with no attributes or methods of its own."""
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
def average_path_length_objective(graph_pipeline):
    """
    Computes the mean shortest-path length measured from the root/final
    estimator, plus one (only supported for GraphPipeline)

    Parameters
    ----------
    graph_pipeline: GraphPipeline
        The pipeline to compute the average path length for. Its ``graph``
        and ``root`` attributes are read.

    Returns
    -------
    float
        Mean of the shortest-path lengths returned for ``root``, plus 1.
    """
    distances = nx.shortest_path_length(
        graph_pipeline.graph,
        source=graph_pipeline.root,
    )
    hop_counts = np.array(list(distances.values()))
    return np.mean(hop_counts) + 1
def number_of_leaves_scorer(est,X=None, y=None):
    """
    Scorer-style variant of the leaf count; ``X`` and ``y`` are ignored.

    Parameters
    ----------
    est: GraphPipeline
        The pipeline whose ``graph`` is inspected
    X, y:
        Unused.
    """
    return sum(1 for _node, degree in est.graph.out_degree() if degree == 0)


def number_of_leaves_objective(est):
    """
    Calculates the number of leaves (nodes without outgoing edges) in a GraphPipeline

    Parameters
    ----------
    est: GraphPipeline
        The pipeline to compute the number of leaves for
    """
    return sum(1 for _node, degree in est.graph.out_degree() if degree == 0)
def number_of_nodes_objective(est):
    """
    Count the estimators contained in an sklearn-style pipeline, recursing into nested containers.

    Parameters
    ----------
    est: GraphPipeline | Pipeline | FeatureUnion | BaseEstimator
        The pipeline to compute the number of nodes from.

    Returns
    -------
    int
        1 for anything that is not a GraphPipeline, Pipeline or FeatureUnion. For one of those
        containers, the sum of the counts of its members (the container itself adds nothing).
    """
    if isinstance(est, GraphPipeline):
        # Each graph node stores its estimator under the "instance" attribute.
        members = [est.graph.nodes[name]["instance"] for name in est.graph.nodes]
    elif isinstance(est, Pipeline):
        members = [step for _, step in est.steps]
    elif isinstance(est, sklearn.pipeline.FeatureUnion):
        members = [transformer for _, transformer in est.transformer_list]
    else:
        # Not a known container: counts as a single node.
        return 1

    total = 0
    for member in members:
        total += number_of_nodes_objective(member)
    return total
def _four_node_graph():
    """Build the four-estimator DiGraph (scaler, svc, two logistic regressions) reused by the tests."""
    graph = nx.DiGraph()

    graph.add_node("scaler", instance=StandardScaler())
    graph.add_node("svc", instance=SVC())
    graph.add_node("LogisticRegression", instance=LogisticRegression())
    graph.add_node("LogisticRegression2", instance=LogisticRegression())

    graph.add_edge("svc","scaler")
    graph.add_edge("LogisticRegression", "scaler")
    graph.add_edge("LogisticRegression2", "LogisticRegression")
    graph.add_edge("LogisticRegression2", "svc")
    return graph


def _count_nodes(est):
    """Shorthand for the objective under test."""
    return tpot.objectives.number_of_nodes.number_of_nodes_objective(est)


def test_number_of_nodes_objective_Graphpipeline():
    """A GraphPipeline with four plain estimators counts four nodes."""
    est = GraphPipeline(_four_node_graph())

    assert _count_nodes(est) == 4


def test_number_of_nodes_objective_Pipeline():
    """A two-step Pipeline counts two nodes."""
    two_step = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])

    assert _count_nodes(two_step) == 2


def test_number_of_nodes_objective_not_pipeline_or_graphpipeline():
    """Bare estimators each count as exactly one node."""
    assert _count_nodes(SVC()) == 1
    assert _count_nodes(StandardScaler()) == 1
    assert _count_nodes(LogisticRegression()) == 1


def test_number_of_nodes_objective_pipeline_in_graphpipeline():
    """A Pipeline stored as a graph node contributes its own steps (1 + 2 = 3)."""
    graph = nx.DiGraph()

    graph.add_node("scaler", instance=StandardScaler())
    graph.add_node("pipe", instance=Pipeline([("scaler", StandardScaler()), ("svc", SVC())]))

    graph.add_edge("pipe","scaler")

    est = GraphPipeline(graph)

    assert _count_nodes(est) == 3


def test_number_of_nodes_objective_graphpipeline_in_pipeline():
    """A GraphPipeline appended as a Pipeline step contributes its graph nodes (2 + 4 = 6)."""
    outer = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])

    inner = GraphPipeline(_four_node_graph())

    # Appended after construction, exactly as a caller mutating ``steps`` would do.
    outer.steps.append(("graphpipe", inner))

    assert _count_nodes(outer) == 6


def test_number_of_nodes_objective_graphpipeline_in_graphpipeline():
    """A GraphPipeline nested as a node of another GraphPipeline is expanded (4 + 3 = 7)."""
    inner = GraphPipeline(_four_node_graph())

    outer_graph = nx.DiGraph()

    outer_graph.add_node("g1", instance=inner)
    outer_graph.add_node("svc", instance=SVC())
    outer_graph.add_node("LogisticRegression", instance=LogisticRegression())
    outer_graph.add_node("LogisticRegression2", instance=LogisticRegression())

    outer_graph.add_edge("svc","g1")
    outer_graph.add_edge("LogisticRegression", "g1")
    outer_graph.add_edge("LogisticRegression2", "LogisticRegression")
    outer_graph.add_edge("LogisticRegression2", "svc")

    outer = GraphPipeline(outer_graph)

    assert _count_nodes(outer) == 7


def test_number_of_nodes_objective_pipeline_in_pipeline():
    """A Pipeline nested as a step of another Pipeline is expanded (2 + 1 = 3)."""
    inner = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])

    outer = Pipeline([("pipe", inner), ("svc", SVC())])

    assert _count_nodes(outer) == 3
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | from .old_config_utils import convert_config_dict_to_list, convert_config_dict_to_choicepipeline, convert_config_dict_to_graphpipeline, convert_config_dict_to_linearpipeline -------------------------------------------------------------------------------- /tpot/search_spaces/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from . import nodes 3 | from . 
import pipelines -------------------------------------------------------------------------------- /tpot/search_spaces/graph_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 
def remove_and_stitch(graph, node):
    """
    Remove ``node`` from ``graph`` and connect every former predecessor to every former successor.

    Parameters
    ----------
    graph:
        Directed graph exposing ``successors``, ``predecessors``, ``remove_node`` and ``add_edge``
        (e.g. a networkx DiGraph). Modified in place.
    node:
        The node to remove.
    """
    # Snapshot both neighbour sets as lists before mutating the graph. The
    # accessors may return one-shot iterators, and the inner loop below has to
    # be replayed once per successor; a consumed iterator would silently leave
    # every successor after the first one unconnected.
    successors = list(graph.successors(node))
    predecessors = list(graph.predecessors(node))

    graph.remove_node(node)

    for s in successors:
        for p in predecessors:
            graph.add_edge(p, s)


def remove_nodes_disconnected_from_node(graph, node):
    """
    Remove every node that is neither ``node`` itself nor one of its descendants.

    Parameters
    ----------
    graph:
        networkx directed graph, modified in place.
    node:
        The node whose descendants are kept.
    """
    descendants = nx.descendants(graph, node)
    # Iterate over a copy of the node list because nodes are removed while looping.
    for n in list(graph.nodes):
        # Compare by equality, the same way the graph itself identifies node keys,
        # so an equal-but-not-identical key for ``node`` is never removed.
        if n != node and n not in descendants:
            graph.remove_node(n)


def get_roots(graph):
    """Return the nodes of ``graph`` that have no incoming edges."""
    return [v for v, d in graph.in_degree() if d == 0]


def get_leaves(graph):
    """Return the nodes of ``graph`` that have no outgoing edges."""
    return [v for v, d in graph.out_degree() if d == 0]


def get_max_path_through_node(graph, root, node):
    """
    Return the size of the longest simple path from ``root`` to a leaf reachable from ``node``.

    If ``node`` has no successors it is itself the leaf, and the longest ``root`` -> ``node``
    path size is returned. Sizes are counted as in ``get_max_path_size``.
    """
    if not list(graph.successors(node)):
        return get_max_path_size(graph, root, node)

    leaves = [n for n in nx.descendants(graph, node) if not list(graph.successors(n))]
    return max(get_max_path_size(graph, root, leaf) for leaf in leaves)


def get_max_path_size(graph, fromnode1,tonode2, return_path=False):
    """
    Find the longest simple path from ``fromnode1`` to ``tonode2``.

    Parameters
    ----------
    graph:
        networkx graph to search.
    fromnode1, tonode2:
        Start and end nodes of the path.
    return_path: bool
        If True return the path (list of nodes), otherwise the number of nodes on it.

    Returns
    -------
    list | int
        The longest path, or its node count. When both endpoints are the same node the
        path is ``[fromnode1]`` and its size is 1.

    Raises
    ------
    ValueError
        Raised by ``max`` when no simple path exists between the two nodes.
    """
    # Equality rather than identity: node keys are matched by equality inside the graph.
    if fromnode1 == tonode2:
        if return_path:
            return [fromnode1]
        return 1

    max_length_path = max(nx.all_simple_paths(graph, fromnode1, tonode2), key=len)
    if return_path:
        return max_length_path
    return len(max_length_path)  # size is the number of nodes on the longest path


def invert_dictionary(d):
    """Return a dict mapping each value of ``d`` to the set of keys that held that value."""
    inv_map = {}
    for k, v in d.items():
        inv_map.setdefault(v, set()).add(k)

    return inv_map


def select_nodes_same_depth(g1, node1, g2, node2, rng=None):
    """
    Yield, in random order, every pair ``(n1, n2)`` of nodes lying at the same depth in both graphs.

    Depth is the shortest path length from ``node1`` in ``g1`` and from ``node2`` in ``g2``.

    Parameters
    ----------
    g1, g2:
        networkx graphs.
    node1, node2:
        Source node of the depth computation in ``g1`` and ``g2`` respectively.
    rng:
        Seed or numpy Generator, passed through ``np.random.default_rng``.
    """
    rng = np.random.default_rng(rng)

    g1_nodes = nx.shortest_path_length(g1, source=node1)
    g2_nodes = nx.shortest_path_length(g2, source=node2)

    max_depth = max(list(g1_nodes.values()) + list(g2_nodes.values()))

    # depth -> set of nodes found at that depth
    g1_nodes = invert_dictionary(g1_nodes)
    g2_nodes = invert_dictionary(g2_nodes)

    possible_pairs = [
        (n1, n2)
        for i in range(max_depth + 1)
        if i in g1_nodes and i in g2_nodes
        for n1 in g1_nodes[i]
        for n2 in g2_nodes[i]
    ]

    rng.shuffle(possible_pairs)

    for p in possible_pairs:
        yield p[0], p[1]


def select_nodes_randomly(g1, g2, rng=None):
    """
    Yield pairs ``(node1, node2)`` with ``node1`` from ``g1`` and ``node2`` from ``g2``, both in shuffled order.

    Pairs made of the very same object are skipped.

    Parameters
    ----------
    g1, g2:
        Graphs exposing a ``nodes`` collection.
    rng:
        Seed or numpy Generator, passed through ``np.random.default_rng``.
    """
    rng = np.random.default_rng(rng)

    sorted_self_nodes_list = list(g1.nodes)
    rng.shuffle(sorted_self_nodes_list)

    sorted_other_nodes_list = list(g2.nodes)
    rng.shuffle(sorted_other_nodes_list)
    for node1 in sorted_self_nodes_list:
        for node2 in sorted_other_nodes_list:
            # Identity check kept on purpose: only the same object is skipped.
            if node1 is node2:
                continue
            yield node1, node2
Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | from .estimator_node import * 37 | from .genetic_feature_selection import * 38 | from .fss_node import * -------------------------------------------------------------------------------- /tpot/search_spaces/nodes/estimator_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. 
def default_hyperparameter_parser(params:dict) -> dict:
    """Identity parser: hand the hyperparameter dict back unchanged."""
    return params


class EstimatorNodeIndividual(SklearnIndividual):
    """
    Individual holding one estimator class together with one concrete hyperparameter assignment.

    Note that ConfigurationSpace does not support None as a parameter. Use TPOT's special
    placeholder string instead (presumably "<NONE>" -- TODO confirm); the conversion of that
    string to the Python None is not performed in this class.

    Parameters
    ----------
    method : type
        The class of the estimator to be used

    space : ConfigurationSpace|dict
        The hyperparameter space to be used. If a dict is passed, hyperparameters are fixed and not learned.

    hyperparameter_parser : callable
        Applied to the hyperparameter dict right before the estimator is instantiated.
        Defaults to ``default_hyperparameter_parser`` when None.

    rng :
        Seed or numpy Generator used to sample the initial hyperparameters.

    """
    def __init__(self, method: type,
                        space: ConfigurationSpace|dict, #TODO If a dict is passed, hyperparameters are fixed and not learned. Is this confusing? Should we make a second node type?
                        hyperparameter_parser: callable = None,
                        rng=None) -> None:
        super().__init__()
        self.method = method
        self.space = space
        self.hyperparameter_parser = (
            default_hyperparameter_parser if hyperparameter_parser is None else hyperparameter_parser
        )

        if isinstance(space, dict):
            # Fixed hyperparameters: the dict is used as given.
            self.hyperparameters = space
        else:
            self._draw_hyperparameters(rng)

    def _draw_hyperparameters(self, rng):
        """Re-seed the configuration space from ``rng`` and sample a fresh hyperparameter dict."""
        generator = np.random.default_rng(rng)
        self.space.seed(generator.integers(0, 2**32))
        self.hyperparameters = dict(self.space.sample_configuration())

    def mutate(self, rng=None):
        """Resample all hyperparameters. Returns False (no change) when the space is a fixed dict."""
        if isinstance(self.space, dict):
            return False

        self._draw_hyperparameters(rng)
        return True

    def crossover(self, other, rng=None):
        """
        Copy a random subset of hyperparameter values from ``other`` into this individual.

        Returns False without changes when the space is a fixed dict or when the two
        individuals wrap different estimator classes; True otherwise.
        """
        if isinstance(self.space, dict):
            return False

        if self.method != other.method:
            return False

        rng = np.random.default_rng(rng)

        # One coin flip per hyperparameter of the space; the flip happens before the
        # membership test so the number of random draws does not depend on ``other``.
        for name in self.space:
            if rng.choice([True, False]) and name in other.hyperparameters:
                self.hyperparameters[name] = other.hyperparameters[name]

        return True



    @final #this method should not be overridden, instead override hyperparameter_parser
    def export_pipeline(self, **kwargs):
        """Instantiate the estimator from the parsed hyperparameters."""
        parsed = self.hyperparameter_parser(self.hyperparameters)
        return self.method(**parsed)

    def unique_id(self):
        """Return a string of the form ``Method(param=value, ...)`` with parameters sorted by name."""
        rendered_params = ', '.join(
            [f'{name}={self.hyperparameters[name]}' for name in sorted(self.hyperparameters.keys())]
        )
        return f"{self.method.__name__}({rendered_params})"

class EstimatorNode(SearchSpace):
    """Search space that produces ``EstimatorNodeIndividual`` objects for one estimator class."""
    def __init__(self, method, space, hyperparameter_parser=default_hyperparameter_parser):
        self.method = method
        self.space = space
        self.hyperparameter_parser = hyperparameter_parser

    def generate(self, rng=None):
        """Create a new individual, forwarding the stored method, space and parser."""
        return EstimatorNodeIndividual(
            self.method,
            self.space,
            hyperparameter_parser=self.hyperparameter_parser,
            rng=rng,
        )
class ChoicePipelineIndividual(SklearnIndividual):
    """Individual that wraps a single node generated from one randomly chosen search space."""

    def __init__(self, search_spaces : List[SearchSpace], rng=None) -> None:
        super().__init__()
        generator = np.random.default_rng(rng)
        self.search_spaces = search_spaces
        chosen_space = generator.choice(self.search_spaces)
        self.node = chosen_space.generate(rng=generator)


    def mutate(self, rng=None):
        """With equal probability either replace the node with a freshly generated one or mutate it in place."""
        rng = np.random.default_rng(rng)
        operator = self._mutate_select_new_node if rng.choice([True, False]) else self._mutate_node
        return operator(rng)

    def _mutate_select_new_node(self, rng=None):
        """Pick a search space at random and replace the current node with a new individual from it."""
        rng = np.random.default_rng(rng)
        chosen_space = rng.choice(self.search_spaces)
        self.node = chosen_space.generate(rng=rng)
        return True

    def _mutate_node(self, rng=None):
        """Delegate the mutation to the wrapped node."""
        return self.node.mutate(rng)

    def crossover(self, other, rng=None):
        """Delegate the crossover to the wrapped nodes of both individuals."""
        return self.node.crossover(other.node, rng)

    def export_pipeline(self, **kwargs):
        """Export the wrapped node."""
        return self.node.export_pipeline(**kwargs)

    def unique_id(self):
        """Identifier of the wrapped node."""
        return self.node.unique_id()


class ChoicePipeline(SearchSpace):
    """
    Takes in a list of search spaces. Will select one node from the search space.

    """

    def __init__(self, search_spaces : List[SearchSpace] ) -> None:
        self.search_spaces = search_spaces

    def generate(self, rng=None):
        """Create a ``ChoicePipelineIndividual`` over the stored search spaces."""
        return ChoicePipelineIndividual(self.search_spaces, rng=np.random.default_rng(rng))
def test_merge_duplicate_nodes():
    """Four identical leaves inserted under the root must collapse into a single leaf after merging."""
    # Empty dict spaces mean fixed (default) hyperparameters for both estimators.
    knn_node = tpot.search_spaces.nodes.EstimatorNode(
        method = KNeighborsClassifier,
        space = {},
    )

    scaler_node = tpot.search_spaces.nodes.EstimatorNode(
        method = StandardScaler,
        space = {},
    )

    graph_search_space = tpot.search_spaces.pipelines.GraphSearchPipeline(
        root_search_space= knn_node,
        leaf_search_space = scaler_node,
        inner_search_space = None,
        max_size = 10,
    )

    ind = graph_search_space.generate()

    # all of these leaves should be identical
    for _ in range(4):
        ind._mutate_insert_leaf()

    ind._merge_duplicated_nodes()

    assert len(ind.graph.nodes) == 2
class TreePipelineIndividual(GraphPipelineIndividual):
    """
    GraphPipelineIndividual restricted to the tree-style crossover and mutation operators.

    All constructor arguments must be given by keyword; they are forwarded unchanged to
    ``GraphPipelineIndividual.__init__``.
    """
    def __init__(self,
                        **kwargs) -> None:
        super().__init__(**kwargs)

        # Operator lists and the merge toggle are overwritten after the parent
        # constructor has run, so duplicated nodes are never merged for trees.
        self.crossover_methods_list = [self._crossover_swap_branch, self._crossover_swap_node, self._crossover_nodes]
        self.mutate_methods_list = [self._mutate_insert_leaf, self._mutate_insert_inner_node, self._mutate_remove_node, self._mutate_node]
        self.merge_duplicated_nodes_toggle = False



class TreePipeline(SearchSpace):
    def __init__(self, root_search_space : SearchSpace,
                        leaf_search_space : SearchSpace = None,
                        inner_search_space : SearchSpace =None,
                        min_size: int = 2,
                        max_size: int = 10,
                        crossover_same_depth=False) -> None:

        """
        Generates a pipeline of variable length. Pipeline will have a tree structure similar to TPOT1.

        Parameters
        ----------
        root_search_space : SearchSpace
            Search space for the root node (stored as ``self.search_space``).
        leaf_search_space : SearchSpace
            Search space for leaf nodes.
        inner_search_space : SearchSpace
            Search space for inner nodes.
        min_size : int
            Stored on the search space; not forwarded to the individual (see ``generate``).
        max_size : int
            Maximum size forwarded to the individual.
        crossover_same_depth : bool
            Forwarded to the individual.

        """

        self.search_space = root_search_space
        self.leaf_search_space = leaf_search_space
        self.inner_search_space = inner_search_space
        self.min_size = min_size
        self.max_size = max_size
        self.crossover_same_depth = crossover_same_depth

    def generate(self, rng=None):
        """
        Create a new ``TreePipelineIndividual`` from the stored search spaces.

        ``TreePipelineIndividual.__init__`` accepts keyword arguments only, so every
        value is passed by keyword; positional arguments raise ``TypeError``.
        """
        rng = np.random.default_rng(rng)
        # NOTE(review): keyword names are assumed to match GraphPipelineIndividual.__init__,
        # which is not visible from this module -- confirm against graph.py.
        # NOTE(review): ``min_size`` is deliberately not forwarded; presumably the parent
        # constructor has no such parameter -- confirm, and forward it if it does.
        return TreePipelineIndividual(
            root_search_space=self.search_space,
            leaf_search_space=self.leaf_search_space,
            inner_search_space=self.inner_search_space,
            max_size=self.max_size,
            crossover_same_depth=self.crossover_same_depth,
            rng=rng,
        )
class UnionPipelineIndividual(SklearnIndividual):
    """
    Individual that is exported as a scikit-learn FeatureUnion.

    One child individual is generated from each search space in
    ``search_spaces``; the child stored at position ``i`` of ``self.pipeline``
    always comes from the search space at position ``i``.

    Parameters
    ----------
    search_spaces : List[SearchSpace]
        The search spaces to draw the branches of the union from.
    rng : int, np.random.Generator, optional
        Seed or generator forwarded unchanged to every ``generate`` call.
    """

    def __init__(self, search_spaces : List[SearchSpace], rng=None) -> None:
        super().__init__()
        self.search_spaces = search_spaces
        self.pipeline = [space.generate(rng) for space in self.search_spaces]

    def mutate(self, rng=None):
        """
        Mutate one randomly chosen branch of the union.

        Returns
        -------
        Whatever the chosen branch's ``mutate`` returns, or False when the
        union has no branches to mutate.
        """
        rng = np.random.default_rng(rng)
        if not self.pipeline:
            return False
        step = rng.choice(self.pipeline)
        return step.mutate(rng)

    def crossover(self, other, rng=None):
        """
        Cross this individual with ``other`` in place.

        The two strategies (per-branch crossover and swapping a whole branch)
        are tried in random order until one of them reports success.

        Returns
        -------
        bool
            True if one strategy succeeded. False if none did, or if ``other``
            is not a union individual with the same number of branches.
        """
        rng = np.random.default_rng(rng)

        # Branches are matched by position, so the partner must have the same layout.
        if not isinstance(other, UnionPipelineIndividual):
            return False
        if len(other.pipeline) != len(self.pipeline):
            return False

        cx_funcs = [self._crossover_node, self._crossover_swap_node]
        rng.shuffle(cx_funcs)
        for cx_func in cx_funcs:
            if cx_func(other, rng):
                return True

        return False

    def _crossover_swap_node(self, other, rng):
        """Swap one randomly chosen branch with the branch at the same position in ``other``."""
        rng = np.random.default_rng(rng)
        n_steps = min(len(self.pipeline), len(other.pipeline))
        if n_steps == 0:
            return False

        # Every branch is eligible. The previous lower bound of 1 never swapped
        # the first branch and made rng.integers raise for a one-branch union.
        idx = rng.integers(0, n_steps)

        self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx]
        return True

    def _crossover_node(self, other, rng):
        """Cross each pair of same-position branches with probability 0.5; True if any pair succeeded."""
        rng = np.random.default_rng(rng)

        crossover_success = False
        for own_step, other_step in zip(self.pipeline, other.pipeline):
            if rng.random() < 0.5 and own_step.crossover(other_step, rng):
                crossover_success = True

        return crossover_success

    def export_pipeline(self, **kwargs):
        """Export every branch and combine the results with ``sklearn.pipeline.make_union``."""
        return sklearn.pipeline.make_union(*[step.export_pipeline(**kwargs) for step in self.pipeline])

    def unique_id(self):
        """Return a TupleIndex made of the tag "FeatureUnion" followed by each branch's unique id."""
        ids = ["FeatureUnion"] + [step.unique_id() for step in self.pipeline]
        return TupleIndex(tuple(ids))


class UnionPipeline(SearchSpace):
    def __init__(self, search_spaces : List[SearchSpace] ) -> None:
        """
        Search space whose individuals are FeatureUnions.

        Takes in a list of search spaces. Each branch of the generated union
        corresponds to the search space provided at the same index.
        """

        self.search_spaces = search_spaces

    def generate(self, rng=None):
        """Create a new UnionPipelineIndividual with one branch per search space."""
        rng = np.random.default_rng(rng)
        return UnionPipelineIndividual(self.search_spaces, rng=rng)
def test_EstimatorNodeCrossover():
    """Smoke test: repeatedly mutating and crossing two KNN individuals must not raise."""
    node = tpot.search_spaces.nodes.EstimatorNode(
        method=KNeighborsClassifier,
        space={},
    )

    first = node.generate()
    second = node.generate()

    for _ in range(10):
        first.mutate()
        second.mutate()
        first.crossover(second)


def test_ValueError_different_types():
    """Crossover between individuals from different search spaces must report failure."""
    knn_space = tpot.config.get_search_space(["KNeighborsClassifier"])
    sfm_space = tpot.config.get_search_space(["SelectFromModel_classification"])

    for _ in range(10):
        knn_ind = knn_space.generate()
        sfm_ind = sfm_space.generate()
        # Failure is expected in both directions.
        assert not knn_ind.crossover(sfm_ind)
        assert not sfm_ind.crossover(knn_ind)


if __name__ == "__main__":
    test_EstimatorNodeCrossover()
    test_ValueError_different_types()
class TupleIndex():
    """
    Hashable, non-iterable wrapper around a tuple.

    TPOT uses tuples as unique ids for some pipeline search spaces, but tuples
    sometimes don't interact correctly with pandas indexes. Wrapping the tuple
    lets it be used as a dictionary key without the key itself being iterable.

    Returning a string from unique_id would be an alternative, but it would not
    work with graph pipelines, which require a special object. This wrapper lets
    linear pipelines contain graph pipelines and still be usable as keys.
    """

    def __init__(self, tup):
        # The wrapped tuple; every other method delegates to it.
        self.tup = tup

    def __repr__(self) -> str:
        return repr(self.tup)

    def __str__(self) -> str:
        return str(self.tup)

    def __hash__(self) -> int:
        # Same hash as the bare tuple, so wrapper and tuple share a dict slot.
        return hash(self.tup)

    def __eq__(self, other) -> bool:
        # Compare the wrapped tuple with ``other``. When ``other`` is a
        # TupleIndex, the tuple comparison defers to ``other.__eq__``, so two
        # wrappers around equal tuples compare equal.
        return self.tup == other
def lexicase_selection(scores, k, n_parents=1, rng=None):
    """
    Pick individuals by Lexicase Selection, ``k * n_parents`` times.

    For every pick the objectives (columns) are visited in a fresh random
    order. At each objective only the candidates with the best value on it
    survive. The pick is drawn at random from the survivors once a single
    candidate is left or all objectives have been used.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix: one row per individual, one column per objective.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.

    Returns
    -------
    An array of indices of the selected individuals of shape (k, n_parents).
    """
    rng = np.random.default_rng(rng)
    selected = []
    for _ in range(k * n_parents):
        pool = list(range(len(scores)))
        case_order = list(range(len(scores[0])))
        rng.shuffle(case_order)

        for case in case_order:
            # A single survivor cannot be filtered any further.
            if len(pool) <= 1:
                break
            top = max(scores[pool, case])
            pool = [ind for ind in pool if scores[ind, case] == top]

        selected.append(rng.choice(pool))

    return np.reshape(selected, (k, n_parents))
def max_weighted_average_selector(scores,k, n_parents=1, rng=None):
    """
    Select the individuals with the highest average score.

    Each individual's score is the mean of its non-NaN objective values.
    Individuals are ranked from best to worst and the top ``k * n_parents``
    are returned. An individual whose scores are all NaN is ranked last.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix: one row per individual, one column per objective.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        Accepted for interface compatibility with the other selectors; the
        selection is deterministic and does not use it.

    Returns
    -------
    An array of indices of the selected individuals of shape (k, n_parents).

    Raises
    ------
    ValueError
        If fewer than ``k * n_parents`` individuals are available.
    """
    n_selected = k * n_parents

    ave_scores = np.empty(len(scores), dtype=float)
    for i, s in enumerate(scores):
        row = np.atleast_1d(np.asarray(s, dtype=float))
        # A row without any valid score gets -inf so it can never outrank a
        # scored individual (a NaN average would be sorted to the front below).
        ave_scores[i] = -np.inf if np.isnan(row).all() else np.nanmean(row)
    # Guard against NaN produced by mixed infinities (e.g. inf and -inf).
    ave_scores[np.isnan(ave_scores)] = -np.inf

    if n_selected > len(ave_scores):
        raise ValueError(
            f"cannot select {n_selected} individuals (k={k}, n_parents={n_parents}) "
            f"from a population of {len(ave_scores)}"
        )

    ranked = np.argsort(ave_scores)[::-1]  # best average first
    return np.reshape(ranked[:n_selected], (k, n_parents))
def random_selector(scores, k, n_parents=1, rng=None, ):
    """
    Draw indices of individuals uniformly at random, with replacement.

    Only the number of rows in ``scores`` is used; the score values themselves
    do not influence the draw.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix: one row per individual, one column per objective.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.

    Returns
    -------
    An array of indices of randomly selected individuals (with replacement) of shape (k, n_parents).

    """
    generator = np.random.default_rng(rng)
    population = list(range(len(scores)))
    n_draws = k * n_parents
    picks = generator.choice(population, size=n_draws)
    return np.reshape(picks, (k, n_parents))
def tournament_selection(scores, k, n_parents=1, rng=None, tournament_size=2, score_index=0):
    """
    Select the best individual among *tournament_size* randomly chosen
    individuals, ``k * n_parents`` times. The returned array contains the
    indices of the chosen individuals.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix: one row per individual, one column per objective.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 1.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.
    tournament_size : int, optional
        The number of individuals participating in each tournament (drawn with
        replacement). The default is 2.
    score_index : int, str, optional
        The column of the score to use for selection. If "average" is passed,
        the mean of each individual's scores is used. The default is 0 (only
        the first score is used).

    Returns
    -------
    An array of indices of the selected individuals of shape (k, n_parents).

    Raises
    ------
    ValueError
        If ``score_index`` is neither an integer nor the string "average".
    """

    rng = np.random.default_rng(rng)

    # Resolve the fitness function up front so a bad score_index fails here with
    # a clear message instead of as an unbound-variable error inside the loop.
    if isinstance(score_index, str) and score_index == "average":
        def fitness(row):
            return np.mean(row)
    elif isinstance(score_index, (int, np.integer)):
        def fitness(row):
            return row[score_index]
    else:
        raise ValueError(
            f"score_index must be an integer or 'average', got {score_index!r}"
        )

    def aspirant_fitness(idx):
        return fitness(scores[idx])

    n_individuals = len(scores)
    chosen = []
    for _ in range(k * n_parents):
        aspirants_idx = [rng.choice(n_individuals) for _ in range(tournament_size)]
        # max() keeps the first aspirant on ties.
        chosen.append(max(aspirants_idx, key=aspirant_fitness))

    return np.reshape(chosen, (k, n_parents))
#based on deap
def tournament_selection_dominated(scores, k, n_parents=2, rng=None):
    """
    Run ``k * n_parents`` binary tournaments between randomly drawn individuals.

    An aspirant wins if it dominates the other. If neither dominates, the one
    with the larger crowding distance (computed within its own Pareto front)
    wins. If that is also tied, the winner is drawn at random.
    The returned array contains the indices of the chosen individuals.

    Parameters
    ----------
    scores : np.ndarray
        The score matrix: one row per individual, one column per objective.
    k : int
        The number of individuals to select.
    n_parents : int, optional
        The number of parents to select per individual. The default is 2.
    rng : int, np.random.Generator, optional
        The random number generator. The default is None.

    Returns
    -------
    An array of indices of the selected individuals of shape (k, n_parents).

    """

    rng = np.random.default_rng(rng)

    # Crowding distance of every individual, measured inside its own front.
    crowding_dict = {}
    for front in nondominated_sorting(scores):
        members = np.array(list(front))
        member_scores = [scores[m] for m in members]
        distances = crowding_distance(member_scores)
        for member, distance in zip(members, distances):
            crowding_dict[member] = distance

    def duel(first, second):
        # Dominance decides first, then crowding distance, then chance.
        if dominates(scores[first], scores[second]):
            return first
        if dominates(scores[second], scores[first]):
            return second
        if crowding_dict[first] > crowding_dict[second]:
            return first
        if crowding_dict[first] < crowding_dict[second]:
            return second
        return rng.choice([first, second])

    winners = []
    for _ in range(k * n_parents):
        asp1 = rng.choice(len(scores))
        asp2 = rng.choice(len(scores))
        winners.append(duel(asp1, asp2))

    return np.reshape(winners, (k, n_parents))
3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see https://www.gnu.org/licenses/.
@pytest.fixture
def capture_stdout(monkeypatch):
    """
    Replace ``sys.stdout.write`` for the duration of a test and record its use.

    Returns a dict with two entries: "stdout", the concatenation of everything
    written, and "write_calls", the number of times ``write`` was called.
    """
    captured = {"stdout": "", "write_calls": 0}

    def record(text):
        captured["stdout"] += text
        captured["write_calls"] += 1

    # monkeypatch restores the real write() when the test finishes.
    monkeypatch.setattr(sys.stdout, "write", record)
    return captured
31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | import pytest 37 | import tpot 38 | from sklearn.datasets import load_iris 39 | import random 40 | import sklearn 41 | 42 | @pytest.fixture 43 | def sample_dataset(): 44 | X_train, y_train = load_iris(return_X_y=True) 45 | return X_train, y_train 46 | 47 | #standard test 48 | @pytest.fixture 49 | def tpot_estimator(): 50 | 51 | n_classes=3 52 | n_samples=100 53 | n_features=100 54 | 55 | search_space = tpot.search_spaces.pipelines.GraphSearchPipeline( 56 | root_search_space= tpot.config.get_search_space("classifiers", n_samples=n_samples, n_features=n_features, n_classes=n_classes), 57 | leaf_search_space = None, 58 | inner_search_space = tpot.config.get_search_space(["selectors","transformers"],n_samples=n_samples, n_features=n_features, n_classes=n_classes), 59 | max_size = 10, 60 | ) 61 | return tpot.TPOTEstimator( 62 | search_space=search_space, 63 | population_size=10, 64 | generations=2, 65 | scorers=['roc_auc_ovr'], 66 | scorers_weights=[1], 67 | classification=True, 68 | n_jobs=4, 69 | early_stop=5, 70 | other_objective_functions= [], 71 | other_objective_functions_weights=[], 72 | max_time_mins=20/60, 73 | verbose=3) 74 | 75 | @pytest.fixture 76 | def tpot_classifier(): 77 | return tpot.tpot_estimator.templates.TPOTClassifier(max_time_mins=60/60,verbose=0) 78 | 79 | @pytest.fixture 80 | def tpot_regressor(): 81 | return tpot.tpot_estimator.templates.TPOTRegressor(max_time_mins=10/60,verbose=0) 82 | 83 | 84 | @pytest.fixture 85 | def tpot_estimator_with_pipeline(tpot_estimator,sample_dataset): 86 | tpot_estimator.fit(sample_dataset[0], sample_dataset[1]) 87 | return tpot_estimator 88 | 89 | def test_tpot_estimator_predict(tpot_estimator_with_pipeline,sample_dataset): 90 | #X_test = [[1, 2, 3], [4, 5, 6]] 91 | X_test = sample_dataset[0] 92 | y_pred = tpot_estimator_with_pipeline.predict(X_test) 93 | assert len(y_pred) == 
len(X_test) 94 | assert tpot_estimator_with_pipeline.fitted_pipeline_ is not None 95 | 96 | def test_tpot_estimator_generations_type(): 97 | with pytest.raises(TypeError): 98 | tpot.TPOTEstimator(generations="two", population_size=10, verbosity=2) 99 | 100 | def test_tpot_estimator_population_size_type(): 101 | with pytest.raises(TypeError): 102 | tpot.TPOTEstimator(generations=2, population_size='ten', verbosity=2) 103 | 104 | def test_tpot_estimator_verbosity_type(): 105 | with pytest.raises(TypeError): 106 | tpot.TPOTEstimator(generations=2, population_size=10, verbosity='high') 107 | 108 | def test_tpot_estimator_scoring_type(): 109 | with pytest.raises(TypeError): 110 | tpot.TPOTEstimator(generations=2, population_size=10, verbosity=2, scoring=0.5) 111 | 112 | def test_tpot_estimator_cv_type(): 113 | with pytest.raises(TypeError): 114 | tpot.TPOTEstimator(generations=2, population_size=10, verbosity=2, cv='kfold') 115 | 116 | def test_tpot_estimator_n_jobs_type(): 117 | with pytest.raises(TypeError): 118 | tpot.TPOTEstimator(generations=2, population_size=10, verbosity=2, n_jobs='all') 119 | 120 | def test_tpot_estimator_config_dict_type(): 121 | with pytest.raises(TypeError): 122 | tpot.TPOTEstimator(generations=2, population_size=10, verbosity=2, config_dict='config') 123 | 124 | 125 | 126 | 127 | 128 | def test_tpot_classifier_fit(tpot_classifier,sample_dataset): 129 | #load iris dataset 130 | X_train = sample_dataset[0] 131 | y_train = sample_dataset[1] 132 | tpot_classifier.fit(X_train, y_train) 133 | assert tpot_classifier.fitted_pipeline_ is not None 134 | 135 | def test_tpot_regressor_fit(tpot_regressor): 136 | 137 | scorer = sklearn.metrics.get_scorer('neg_mean_squared_error') 138 | X, y = sklearn.datasets.load_diabetes(return_X_y=True) 139 | X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.05, test_size=0.95) 140 | tpot_regressor.fit(X_train, y_train) 141 | assert tpot_regressor.fitted_pipeline_ is not 
@pytest.mark.parametrize("test_input,expected", [
    ("Hello World", "Hello World"),
])
def test_hello_world(test_input, expected):
    """Check that the parametrized input equals its expected value."""
    # Compare by value: `is` only passed because CPython happens to intern
    # identical string literals, which is an implementation detail.
    assert test_input == expected


def test_print(capture_stdout):
    """Check that ``print`` output is recorded by the ``capture_stdout`` fixture."""
    print("Hello World")
    assert capture_stdout["stdout"] == "Hello World\n"
Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | from .estimator import TPOTEstimator 37 | from .steady_state_estimator import TPOTEstimatorSteadyState 38 | from .templates import TPOTClassifier, TPOTRegressor -------------------------------------------------------------------------------- /tpot/tpot_estimator/cross_val_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. 
def _take_rows(data, index):
    """Select rows of `data` by position, for pandas objects and plain arrays alike."""
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[index]
    return data[index]


def _fit_and_score_split(estimator, X, y, scorers, train_index, test_index):
    """Fit a fresh clone of `estimator` on one split and return one score per scorer."""
    fold_estimator = sklearn.base.clone(estimator)
    fold_estimator.fit(_take_rows(X, train_index), _take_rows(y, train_index))

    X_test = _take_rows(X, test_index)
    y_test = _take_rows(y, test_index)
    return [
        sklearn.metrics.get_scorer(scorer)(fold_estimator, X_test, y_test)
        for scorer in scorers
    ]


def cross_val_score_objective(estimator, X, y, scorers, cv, fold=None):
    """
    Compute the cross validated scores for an estimator. Only fits the estimator once per fold, and loops over the scorers to evaluate the estimator.

    Parameters
    ----------
    estimator: sklearn.base.BaseEstimator
        The estimator to fit and score. It is cloned for every fold, so the
        object passed in is never fitted.
    X: np.ndarray or pd.DataFrame
        The feature matrix.
    y: np.ndarray or pd.Series
        The target vector.
    scorers: list or scorer
        The scorers to use. Each entry is resolved with sklearn.metrics.get_scorer.
        A single scorer (a scorer name string or a non-iterable scorer object)
        is treated as a one-element list.
    cv: sklearn cross-validator
        The cross-validator to use. For example, sklearn.model_selection.KFold or sklearn.model_selection.StratifiedKFold.
    fold: int, optional
        The fold to return the scores for. If None, will return the mean of all the scores (per scorer). Default is None.

    Returns
    -------
    scores: np.ndarray or list
        One entry per scorer, in the order given. If fold is None, an array with
        the mean of each scorer over all folds; otherwise a list with the scores
        of the requested fold. A single scorer still yields a length-1 sequence.

    """
    # A scorer name such as "accuracy" is itself iterable; without the explicit
    # str check it would be split into single characters.
    if isinstance(scorers, str) or not isinstance(scorers, Iterable):
        scorers = [scorers]
    else:
        # Materialize so one-shot iterables are not exhausted after the first fold.
        scorers = list(scorers)

    if fold is None:
        scores = [
            _fit_and_score_split(estimator, X, y, scorers, train_index, test_index)
            for train_index, test_index in cv.split(X, y)
        ]
        return np.mean(scores, 0)

    # The splits are materialized so `fold` behaves like a normal list index.
    train_index, test_index = list(cv.split(X, y))[fold]
    return _fit_and_score_split(estimator, X, y, scorers, train_index, test_index)
Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>. 34 | 35 | """ 36 | -------------------------------------------------------------------------------- /tpot/tpot_estimator/tests/test_estimator_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S.
def test_remove_underrepresented_classes():
    """Rows of classes with fewer than `min_count` samples are dropped; others are kept."""

    def make_array():
        # Fresh 4x2 feature matrix for every call.
        return np.array([[1, 2], [3, 4], [5, 6], [7, 8]])

    def make_frame():
        # Same data as a DataFrame whose rows are labelled 'a'..'d'.
        return pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6], 'd': [7, 8]}).T

    threshold = 2

    # numpy input: classes 1 and 2 occur once each, so only the class-0 rows remain.
    kept_x, kept_y = remove_underrepresented_classes(
        make_array(), np.array([0, 1, 0, 2]), threshold
    )
    np.testing.assert_array_equal(kept_x, np.array([[1, 2], [5, 6]]))
    np.testing.assert_array_equal(kept_y, np.array([0, 0]))

    # pandas input: same case, original row labels must be preserved.
    kept_x, kept_y = remove_underrepresented_classes(
        make_frame(), pd.Series([0, 1, 0, 2]), threshold
    )
    pd.testing.assert_frame_equal(kept_x, pd.DataFrame({'a': [1, 2], 'c': [5, 6]}).T)
    pd.testing.assert_series_equal(kept_y, pd.Series([0, 1, 0, 2]).loc[[0, 2]])

    # numpy input: every class meets the threshold, nothing is removed.
    kept_x, kept_y = remove_underrepresented_classes(
        make_array(), np.array([0, 1, 0, 1]), threshold
    )
    np.testing.assert_array_equal(kept_x, make_array())
    np.testing.assert_array_equal(kept_y, np.array([0, 1, 0, 1]))

    # pandas input: every class meets the threshold, nothing is removed.
    kept_x, kept_y = remove_underrepresented_classes(
        make_frame(), pd.Series([0, 1, 0, 1]), threshold
    )
    pd.testing.assert_frame_equal(kept_x, make_frame())
    pd.testing.assert_series_equal(kept_y, pd.Series([0, 1, 0, 1]))


def test_check_if_y_is_encoded():
    """Label vectors paired with the result `check_if_y_is_encoded` must report for them."""
    cases = [
        ([0, 1, 2, 3], True),
        ([0, 1, 3, 4], False),
        ([0, 2, 3], False),
        ([0], True),
        ([0, 0, 0, 0, 1, 1, 1, 1], True),
        ([0, 0, 0, 0, 1, 1, 1, 1, 3], False),
        ([1, 1, 1, 1, 2, 2, 2, 2], False),
    ]
    for labels, expected in cases:
        assert check_if_y_is_encoded(labels) == expected
Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see . 34 | 35 | """ 36 | from . import eval_utils 37 | from .utils import * 38 | 39 | # If amltk is installed, import the parser 40 | try: 41 | from .amltk_parser import tpot_parser 42 | except ImportError: 43 | # Handle the case when amltk is not installed 44 | pass 45 | # print("amltk is not installed. Please install it to use tpot_parser.") 46 | # Optional: raise an exception or provide alternative functionality -------------------------------------------------------------------------------- /tpot/utils/amltk_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is part of the TPOT library. 3 | 4 | The current version of TPOT was developed at Cedars-Sinai by: 5 | - Pedro Henrique Ribeiro (https://github.com/perib, https://www.linkedin.com/in/pedro-ribeiro/) 6 | - Anil Saini (anil.saini@cshs.org) 7 | - Jose Hernandez (jgh9094@gmail.com) 8 | - Jay Moran (jay.moran@cshs.org) 9 | - Nicholas Matsumoto (nicholas.matsumoto@cshs.org) 10 | - Hyunjun Choi (hyunjun.choi@cshs.org) 11 | - Gabriel Ketron (gabriel.ketron@cshs.org) 12 | - Miguel E. 
Hernandez (miguel.e.hernandez@cshs.org) 13 | - Jason Moore (moorejh28@gmail.com) 14 | 15 | The original version of TPOT was primarily developed at the University of Pennsylvania by: 16 | - Randal S. Olson (rso@randalolson.com) 17 | - Weixuan Fu (weixuanf@upenn.edu) 18 | - Daniel Angell (dpa34@drexel.edu) 19 | - Jason Moore (moorejh28@gmail.com) 20 | - and many more generous open-source contributors 21 | 22 | TPOT is free software: you can redistribute it and/or modify 23 | it under the terms of the GNU Lesser General Public License as 24 | published by the Free Software Foundation, either version 3 of 25 | the License, or (at your option) any later version. 26 | 27 | TPOT is distributed in the hope that it will be useful, 28 | but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | GNU Lesser General Public License for more details. 31 | 32 | You should have received a copy of the GNU Lesser General Public 33 | License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
def component_to_estimatornode(component: Component) -> EstimatorNode:
    """Convert an amltk ``Component`` into a TPOT ``EstimatorNode``.

    The node's search space is built from ``component.space`` with the entries
    of ``component.config`` layered on top, so a configured value replaces the
    searched entry of the same name.
    """
    space_dict = {}
    if component.space is not None:
        space_dict.update(component.space)
    if component.config is not None:
        space_dict.update(component.config)
    # Build from the merged mapping; constructing from `component.space` alone
    # silently discarded everything in `component.config`.
    space = ConfigurationSpace(space_dict)

    return EstimatorNode(method=component.item, space=space)


def fixed_to_estimatornode(node: Fixed) -> EstimatorNode:
    """Convert an amltk ``Fixed`` node into a TPOT ``EstimatorNode``.

    The node's item may be an instance or a class. For an instance exposing
    ``get_params`` its current (shallow) parameters seed the space; entries of
    ``node.space`` and then ``node.config`` override them.
    """
    item = node.item
    item_is_class = isinstance(item, type)
    method = item if item_is_class else type(item)

    # `get_params` needs a bound instance; calling it through a class would
    # raise TypeError for the missing `self`.
    if not item_is_class and hasattr(item, 'get_params'):
        space_dict = item.get_params(deep=False)
    else:
        space_dict = {}
    if node.space is not None:
        space_dict.update(node.space)
    if node.config is not None:
        space_dict.update(node.config)

    return EstimatorNode(method=method, space=space_dict)


def sequential_to_sequentialpipeline(sequential: Sequential) -> SequentialPipeline:
    """Convert an amltk ``Sequential`` into a ``SequentialPipeline`` of its converted child nodes."""
    nodes = [tpot_parser(node) for node in sequential.nodes]
    return SequentialPipeline(search_spaces=nodes)


def choice_to_choicepipeline(choice: Choice) -> ChoicePipeline:
    """Convert an amltk ``Choice`` into a ``ChoicePipeline`` of its converted child nodes."""
    nodes = [tpot_parser(node) for node in choice.nodes]
    return ChoicePipeline(search_spaces=nodes)


def split_to_unionpipeline(split: Split) -> UnionPipeline:
    """Convert an amltk ``Split`` into a ``UnionPipeline`` of its converted child nodes."""
    nodes = [tpot_parser(node) for node in split.nodes]
    return UnionPipeline(search_spaces=nodes)


def tpot_parser(
    node: Node,
):
    """
    Convert amltk pipeline search space into a tpot pipeline search space.

    Parameters
    ----------
    node: amltk.pipeline.Node
        The node to convert. Container nodes (Sequential, Choice, Split) are
        converted recursively through their child nodes.

    Returns
    -------
    tpot.search_spaces.base.SearchSpace
        The equivalent TPOT search space which can be optimized by TPOT.

    Raises
    ------
    ValueError
        If the node is not a Component, Sequential, Choice, Fixed or Split.
    """
    if isinstance(node, Component):
        return component_to_estimatornode(node)
    if isinstance(node, Sequential):
        return sequential_to_sequentialpipeline(node)
    if isinstance(node, Choice):
        return choice_to_choicepipeline(node)
    if isinstance(node, Fixed):
        return fixed_to_estimatornode(node)
    if isinstance(node, Split):
        return split_to_unionpipeline(node)
    raise ValueError(f"Node type {type(node)} not supported")