├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── apt.txt ├── docs ├── Makefile ├── advanced_usage │ ├── adding_primitives.rst │ ├── hyperparameters.rst │ ├── pipelines.rst │ └── primitives.rst ├── api │ └── mlblocks.rst ├── authors.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── getting_started │ ├── install.rst │ └── quickstart.rst ├── images │ ├── favicon.ico │ ├── mlblocks-icon.png │ ├── mlblocks-logo-small.png │ └── mlblocks-logo.png ├── index.rst ├── make.bat └── pipeline_examples │ ├── graph.rst │ ├── image.rst │ ├── multi_table.rst │ ├── single_table.rst │ └── text.rst ├── examples ├── README.md ├── pipelines │ └── single_table.classification.categorical_encoder.xgboost.json ├── primitives │ ├── mlblocks.examples.ClassPrimitive.json │ └── mlblocks.examples.function_primitive.json └── tutorials │ ├── 1. Using and MLPipeline.ipynb │ ├── 2. Finding and Loading a Pipeline.ipynb │ ├── 3. Setting MLPipeline Hyperparameters.ipynb │ ├── 4. Saving and Loading a Pipeline.ipynb │ ├── 5. Partial execution and pipeline debugging.ipynb │ ├── 6. Flexible outputs specification.ipynb │ ├── 7. Tuning a Pipeline.ipynb │ ├── 8. Searching for the best pipeline with BTBSession.ipynb │ └── utils.py ├── mlblocks ├── __init__.py ├── discovery.py ├── mlblock.py └── mlpipeline.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── data │ └── diagrams │ │ ├── diagram_fit.txt │ │ ├── diagram_multiple_blocks.txt │ │ └── diagram_simple.txt ├── features │ ├── test_fit_predicr_args.py │ ├── test_partial_outputs.py │ └── test_pipeline_loading.py ├── test_discovery.py ├── test_mlblock.py └── test_mlpipeline.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * MLBlocks version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Docs 2 | 3 | on: 4 | push: 5 | branches: [ stable ] 6 | 7 | jobs: 8 | 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Python 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: 3.8 18 | 19 | - name: Build 20 | run: | 21 | sudo apt-get install graphviz pandoc 22 | python -m pip install --upgrade pip 23 | pip install -e .[dev] 24 | make docs 25 | - name: Deploy 26 | uses: peaceiris/actions-gh-pages@v3 27 | with: 28 | github_token: ${{secrets.GITHUB_TOKEN}} 29 | publish_dir: docs/_build/html 30 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: [ '*' ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | devel: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: ['3.10'] 15 | os: [ubuntu-latest] 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Upgrade pip 23 | run: pip install -U "pip<=24.1" setuptools wheel 24 | - name: Install lightfm 25 | run: python -m pip install --no-use-pep517 'lightfm<2' 26 | - name: Install package 27 | run: pip install .[dev] 28 | - name: make test-devel 29 | run: make test-devel 30 | 31 | readme: 32 | runs-on: ${{ matrix.os }} 33 | strategy: 34 | matrix: 35 | python-version: ['3.8', '3.9', '3.10', '3.11'] 36 | os: [ubuntu-20.04, macos-latest] 37 | steps: 38 | - uses: actions/checkout@v1 39 | - name: Set up Python ${{ matrix.python-version }} 40 | uses: actions/setup-python@v2 41 | with: 42 | python-version: ${{ matrix.python-version }} 43 | - name: Upgrade pip 44 | run: pip install -U pip setuptools wheel 45 | - name: Install lightfm 46 | run: python -m pip install --no-use-pep517 'lightfm<2' 47 | - name: Install package and dependencies 48 | run: pip install rundoc .[mlprimitives] 49 | - name: make test-readme 50 | run: make test-readme 51 | 52 | unit: 53 | runs-on: ${{ matrix.os }} 54 | strategy: 55 | matrix: 56 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 57 | os: [ubuntu-20.04, macos-latest] 58 | steps: 59 | - uses: actions/checkout@v1 60 | - name: Set up Python ${{ matrix.python-version }} 61 | uses: actions/setup-python@v2 62 | with: 63 | python-version: ${{ matrix.python-version }} 64 | - name: Install package and dependencies 65 | run: pip install .[unit] 66 | - name: make test-unit 67 | run: make test-unit 68 | 69 | unit-mlprimitives: 70 | runs-on: ${{ matrix.os }} 71 | strategy: 72 | matrix: 73 | python-version: ['3.8', '3.9', '3.10', '3.11'] 74 | os: [ubuntu-20.04, macos-latest] 75 | steps: 76 | - uses: actions/checkout@v1 77 | - name: Set up Python ${{ matrix.python-version }} 78 | uses: actions/setup-python@v2 79 | with: 80 | python-version: ${{ matrix.python-version }} 81 | - name: Upgrade pip 82 | run: pip install -U pip setuptools wheel 83 | - name: Install lightfm 84 | run: python -m pip install --no-use-pep517 'lightfm<2' 85 | - name: Install package and dependencies 86 | run: pip install .[test] 87 | - name: make test-mlprimitives 88 | run: make 
test-mlprimitives 89 | 90 | tutorials: 91 | runs-on: ${{ matrix.os }} 92 | strategy: 93 | matrix: 94 | python-version: ['3.8', '3.9', '3.10', '3.11'] 95 | os: [ubuntu-20.04] 96 | steps: 97 | - uses: actions/checkout@v1 98 | - name: Set up Python ${{ matrix.python-version }} 99 | uses: actions/setup-python@v2 100 | with: 101 | python-version: ${{ matrix.python-version }} 102 | - if: matrix.os == 'ubuntu-20.04' 103 | name: Install dependencies - Ubuntu 104 | run: sudo apt-get install graphviz 105 | - name: Upgrade pip 106 | run: pip install -U pip setuptools wheel 107 | - name: Install lightfm 108 | run: python -m pip install --no-use-pep517 'lightfm<2' 109 | - name: Install package and dependencies 110 | run: pip install .[examples] 111 | - name: make test-tutorials 112 | run: make test-tutorials 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | docs/pipeline.json 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # other 106 | .DS_Store 107 | 108 | # vim 109 | .*.swp 110 | 111 | mlblocks/data 112 | examples/tutorials/pipeline.pkl 113 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Contributors 6 | ------------ 7 | 8 | * Carles Sala 9 | * Kalyan Veeramachaneni 10 | * William Xue 11 | * Akshay Ravikumar 12 | * Laura Gustafson 13 | * Erica Chiu 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! 
Every little bit
8 | helps, and credit will always be given.
9 | 
10 | You can contribute in many ways:
11 | 
12 | Types of Contributions
13 | ----------------------
14 | 
15 | Report Bugs
16 | ~~~~~~~~~~~
17 | 
18 | Report bugs at https://github.com/MLBazaar/MLBlocks/issues.
19 | 
20 | If you are reporting a bug, please include:
21 | 
22 | * Your operating system name and version.
23 | * Any details about your local setup that might be helpful in troubleshooting.
24 | * Detailed steps to reproduce the bug.
25 | 
26 | Fix Bugs
27 | ~~~~~~~~
28 | 
29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30 | wanted" is open to whoever wants to implement it.
31 | 
32 | Implement Features
33 | ~~~~~~~~~~~~~~~~~~
34 | 
35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
36 | and "help wanted" is open to whoever wants to implement it.
37 | 
38 | Write Documentation
39 | ~~~~~~~~~~~~~~~~~~~
40 | 
41 | MLBlocks could always use more documentation, whether as part of the
42 | official MLBlocks docs, in docstrings, or even on the web in blog posts,
43 | articles, and such.
44 | 
45 | Submit Feedback
46 | ~~~~~~~~~~~~~~~
47 | 
48 | The best way to send feedback is to file an issue at https://github.com/MLBazaar/MLBlocks/issues.
49 | 
50 | If you are proposing a feature:
51 | 
52 | * Explain in detail how it would work.
53 | * Keep the scope as narrow as possible, to make it easier to implement.
54 | * Remember that this is a volunteer-driven project, and that contributions
55 | are welcome :)
56 | 
57 | Get Started!
58 | ------------
59 | 
60 | Ready to contribute? Here's how to set up `MLBlocks` for local development.
61 | 
62 | 1. Fork the `MLBlocks` repo on GitHub.
63 | 2. Clone your fork locally::
64 | 
65 | $ git clone git@github.com:your_name_here/MLBlocks.git
66 | 
67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed,
68 | this is how you set up your fork for local development::
69 | 
70 | $ mkvirtualenv MLBlocks
71 | $ cd MLBlocks/
72 | $ make install-develop
73 | 
74 | 4. Create a branch for local development::
75 | 
76 | $ git checkout -b name-of-your-bugfix-or-feature
77 | 
78 | Now you can make your changes locally.
79 | 
80 | 5. While hacking on your changes, make sure to cover all your developments with the required
81 | unit tests, and that none of the old tests fail as a consequence of your changes.
82 | For this, make sure to run the test suite and check the code coverage::
83 | 
84 | $ make test # Run the tests
85 | $ make coverage # Get the coverage report
86 | 
87 | 6. When you're done making changes, check that your changes pass flake8 and the
88 | tests, including testing other Python versions with tox::
89 | 
90 | $ make lint # Check code styling
91 | $ make test-all # Execute tests on all python versions
92 | 
93 | 7. Also make sure to include the necessary documentation in the code as docstrings following
94 | the `google docstring`_ style.
95 | If you want to see how your documentation will look when it is published, you can
96 | generate and view the docs with this command::
97 | 
98 | $ make view-docs
99 | 
100 | 8. Commit your changes and push your branch to GitHub::
101 | 
102 | $ git add .
103 | $ git commit -m "Your detailed description of your changes."
104 | $ git push origin name-of-your-bugfix-or-feature
105 | 
106 | 9. Submit a pull request through the GitHub website.
107 | 
108 | ..
_google docstring: https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
109 | 
110 | Pull Request Guidelines
111 | -----------------------
112 | 
113 | Before you submit a pull request, check that it meets these guidelines:
114 | 
115 | 1. It resolves an open GitHub Issue and contains its reference in the title or
116 | the comment. If there is no associated issue, feel free to create one.
117 | 2. Whenever possible, it resolves only **one** issue. If your PR resolves more than
118 | one issue, try to split it into more than one pull request.
119 | 3. The pull request should include unit tests that cover all the changed code.
120 | 4. If the pull request adds functionality, the docs should be updated. Put
121 | your new functionality into a function with a docstring, and add the
122 | feature to the list in README.md.
123 | 5. The pull request should work for all the supported Python versions. Check
124 | https://github.com/MLBazaar/MLBlocks/actions
125 | and make sure that all the checks pass.
126 | 
127 | Unit Testing Guidelines
128 | -----------------------
129 | 
130 | All the Unit Tests should comply with the following requirements:
131 | 
132 | 1. Unit Tests should be based only on the unittest and pytest modules.
133 | 
134 | 2. The tests that cover a module called ``mlblocks/path/to/a_module.py`` should be
135 | implemented in a separate module called ``tests/mlblocks/path/to/test_a_module.py``.
136 | Note that the module name has the ``test_`` prefix and is located in a path similar
137 | to the one of the tested module, just inside the ``tests`` folder.
138 | 
139 | 3. Each method of the tested module should have at least one associated test method, and
140 | each test method should cover only **one** use case or scenario.
141 | 
142 | 4. Test case methods should start with the ``test_`` prefix and have descriptive names
143 | that indicate which scenario they cover.
144 | Names such as ``test_some_method_input_none``, ``test_some_method_value_error`` or
145 | ``test_some_method_timeout`` are good, but names like ``test_some_method_1``,
146 | ``some_method`` or ``test_error`` are not.
147 | 
148 | 5. Each test should validate only what the code of the method being tested does, and not
149 | cover the behavior of any third party package or tool being used, which is assumed to
150 | work properly as long as it is being passed the right values.
151 | 
152 | 6. Any third party tool that may have any kind of random behavior, such as some Machine
153 | Learning models, databases or Web APIs, will be mocked using the ``mock`` library, and
154 | the only thing that will be tested is that our code passes the right values to them.
155 | 
156 | 7. Unit tests should not use anything from outside the test and the code being tested. This
157 | includes not reading from or writing to any filesystem or database, which will be properly
158 | mocked.
159 | 
160 | Tips
161 | ----
162 | 
163 | To run a subset of tests::
164 | 
165 | $ pytest tests/test_mlblock.py
166 | 
167 | Release Workflow
168 | ----------------
169 | 
170 | The process of releasing a new version involves several steps, combining both ``git`` and
171 | ``bumpversion``, which, briefly, are:
172 | 
173 | 1. Merge what is in ``master`` branch into ``stable`` branch.
174 | 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files.
175 | 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
176 | 4. Merge the new commit from ``stable`` into ``master``.
177 | 5. 
Update the version in ``setup.cfg`` and ``mlblocks/__init__.py``
178 | to open the next development iteration.
179 | 
180 | .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
181 | entry that explains the changes that will be included in the new version.
182 | Normally this is just a list of the Pull Requests that have been merged to master
183 | since the last release.
184 | 
185 | Once this is done, run one of the following commands:
186 | 
187 | 1. If you are releasing a patch version::
188 | 
189 | make release
190 | 
191 | 2. If you are releasing a minor version::
192 | 
193 | make release-minor
194 | 
195 | 3. If you are releasing a major version::
196 | 
197 | make release-major
198 | 
199 | Release Candidates
200 | ~~~~~~~~~~~~~~~~~~
201 | 
202 | Sometimes it is necessary or convenient to upload a release candidate to PyPI as a pre-release,
203 | in order to make some of the new features available for testing on other projects before they
204 | are included in an actual full-blown release.
205 | 
206 | In order to perform such an action, you can execute::
207 | 
208 | make release-candidate
209 | 
210 | This will perform the following actions:
211 | 
212 | 1. Build and upload the current version to PyPI as a pre-release, with the format ``X.Y.Z.devN``
213 | 
214 | 2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)``
215 | 
216 | After this is done, the new pre-release can be installed by including the ``dev`` section in the
217 | dependency specification, either in ``setup.py``::
218 | 
219 | install_requires = [
220 | ...
221 | 'mlblocks>=X.Y.Z.dev',
222 | ...
223 | ]
224 | 
225 | or on the command line::
226 | 
227 | pip install 'mlblocks>=X.Y.Z.dev'
228 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Changelog
2 | =========
3 | 
4 | 0.6.2 - 2024-11-18
5 | ------------------
6 | 
7 | * Upgrade python version to include 3.12 and 3.13 - [Issue #144](https://github.com/MLBazaar/MLBlocks/issues/144) by @sarahmish
8 | 
9 | 0.6.1 - 2023-09-26
10 | ------------------
11 | 
12 | * Add python 3.11 to MLBlocks - [Issue #143](https://github.com/MLBazaar/MLBlocks/issues/143) by @sarahmish
13 | 
14 | 0.6.0 - 2023-04-14
15 | ------------------
16 | 
17 | * Support python 3.9 and 3.10 - [Issue #141](https://github.com/MLBazaar/MLBlocks/issues/141) by @sarahmish
18 | 
19 | 0.5.0 - 2023-01-22
20 | ------------------
21 | 
22 | * Update `numpy` dependency and isolate tests - [Issue #139](https://github.com/MLBazaar/MLBlocks/issues/139) by @sarahmish
23 | 
24 | 0.4.1 - 2021-10-08
25 | ------------------
26 | 
27 | * Update NumPy dependency - [Issue #136](https://github.com/MLBazaar/MLBlocks/issues/136) by @sarahmish
28 | * Support dynamic inputs and outputs - [Issue #134](https://github.com/MLBazaar/MLBlocks/issues/134) by @pvk-developer
29 | 
30 | 0.4.0 - 2021-01-09
31 | ------------------
32 | 
33 | * Stop pipeline fitting after the last block - [Issue #131](https://github.com/MLBazaar/MLBlocks/issues/131) by @sarahmish
34 | * Add memory debug and profiling - [Issue #130](https://github.com/MLBazaar/MLBlocks/issues/130) by @pvk-developer
35 | * Update Python support - [Issue #129](https://github.com/MLBazaar/MLBlocks/issues/129) by @csala
36 | * Get execution time for each block - [Issue #127](https://github.com/MLBazaar/MLBlocks/issues/127) by @sarahmish
37 | * Allow loading a primitive or pipeline directly from the JSON path - 
[Issue #114](https://github.com/MLBazaar/MLBlocks/issues/114) by @csala
38 | * Pipeline Diagrams - [Issue #113](https://github.com/MLBazaar/MLBlocks/issues/113) by @erica-chiu
39 | * Get Pipeline Inputs - [Issue #112](https://github.com/MLBazaar/MLBlocks/issues/112) by @erica-chiu
40 | 
41 | 0.3.4 - 2019-11-01
42 | ------------------
43 | 
44 | * Ability to return intermediate context - [Issue #110](https://github.com/MLBazaar/MLBlocks/issues/110) by @csala
45 | * Support for static or class methods - [Issue #107](https://github.com/MLBazaar/MLBlocks/issues/107) by @csala
46 | 
47 | 0.3.3 - 2019-09-09
48 | ------------------
49 | 
50 | * Improved intermediate outputs management - [Issue #105](https://github.com/MLBazaar/MLBlocks/issues/105) by @csala
51 | 
52 | 0.3.2 - 2019-08-12
53 | ------------------
54 | 
55 | * Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/MLBazaar/MLBlocks/issues/96) by @csala
56 | * Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/MLBazaar/MLBlocks/issues/95) by @csala
57 | * Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/MLBazaar/MLBlocks/issues/94) by @csala
58 | * Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/MLBazaar/MLBlocks/issues/90) by @csala
59 | * Add primitive caching - [Issue #22](https://github.com/MLBazaar/MLBlocks/issues/22) by @csala
60 | 
61 | 0.3.1 - Pipelines Discovery
62 | ---------------------------
63 | 
64 | * Support flat hyperparameter dictionaries - [Issue #92](https://github.com/MLBazaar/MLBlocks/issues/92) by @csala
65 | * Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/MLBazaar/MLBlocks/issues/88) by @csala
66 | * Implement partial re-fit - [Issue #61](https://github.com/MLBazaar/MLBlocks/issues/61) by @csala
67 | * Move argument parsing to MLBlock - [Issue #86](https://github.com/MLBazaar/MLBlocks/issues/86) by @csala
68 | * Allow getting intermediate outputs - [Issue #58](https://github.com/MLBazaar/MLBlocks/issues/58) by @csala
69 | 
70 | 0.3.0 - New Primitives Discovery
71 | --------------------------------
72 | 
73 | * New primitives discovery system based on `entry_points`.
74 | * Conditional Hyperparameters filtering in MLBlock initialization.
75 | * Improved logging and exception reporting.
76 | 
77 | 0.2.4 - New Datasets and Unit Tests
78 | -----------------------------------
79 | 
80 | * Add a new multi-table dataset.
81 | * Add Unit Tests up to 50% coverage.
82 | * Improve documentation.
83 | * Fix minor bug in newsgroups dataset.
84 | 
85 | 0.2.3 - Demo Datasets
86 | ---------------------
87 | 
88 | * Add new methods to Dataset class.
89 | * Add documentation for the datasets module.
90 | 
91 | 0.2.2 - MLPipeline Load/Save
92 | ----------------------------
93 | 
94 | * Implement save and load methods for MLPipelines.
95 | * Add more datasets.
96 | 
97 | 0.2.1 - New Documentation
98 | -------------------------
99 | 
100 | * Add mlblocks.datasets module with demo data download functions.
101 | * Extensive documentation, including multiple pipeline examples.
102 | 
103 | 0.2.0 - New MLBlocks API
104 | ------------------------
105 | 
106 | A new MLBlocks API and Primitive format.
107 | 
108 | This is a summary of the changes:
109 | 
110 | * Primitives JSONs and Python code have been moved to a different repository, called MLPrimitives.
111 | * Optional usage of multiple JSON primitive folders. 
112 | * JSON format has been changed to allow more flexibility and features:
113 | * input and output arguments, as well as argument types, can be specified for each method
114 | * both classes and functions are supported as primitives
115 | * multitype and conditional hyperparameters fully supported
116 | * data modalities and primitive classifiers introduced
117 | * metadata such as documentation, description and author fields added
118 | * Parsers are removed, and now the MLBlock class is responsible for loading and reading the
119 | JSON primitive.
120 | * Multiple blocks of the same primitive are supported within the same pipeline.
121 | * Arbitrary inputs and outputs for both pipelines and blocks are allowed.
122 | * Shared variables during pipeline execution, usable by multiple blocks.
123 | 
124 | 0.1.9 - Bugfix Release
125 | ----------------------
126 | 
127 | * Disable some NetworkX functions for incompatibilities with some types of graphs.
128 | 
129 | 0.1.8 - New primitives and some improvements
130 | --------------------------------------------
131 | 
132 | * Improve the NetworkX primitives.
133 | * Add String Vectorization and Datetime Featurization primitives.
134 | * Refactor some Keras primitives to work with single dimension `y` arrays and be compatible with `pickle`.
135 | * Add XGBClassifier and XGBRegressor primitives.
136 | * Add some `keras.applications` pretrained networks as preprocessing primitives.
137 | * Add helper class to allow function primitives.
138 | 
139 | 0.1.7 - Nested hyperparams dicts
140 | --------------------------------
141 | 
142 | * Support passing hyperparams as nested dicts.
143 | 
144 | 0.1.6 - Text and Graph Pipelines
145 | --------------------------------
146 | 
147 | * Add LSTM classifier and regressor primitives.
148 | * Add OneHotEncoder and MultiLabelEncoder primitives.
149 | * Add several NetworkX graph featurization primitives.
150 | * Add `community.best_partition` primitive.
151 | 
152 | 0.1.5 - Collaborative Filtering Pipelines
153 | -----------------------------------------
154 | 
155 | * Add LightFM primitive.
156 | 
157 | 0.1.4 - Image pipelines improved
158 | --------------------------------
159 | 
160 | * Allow passing `init_params` on `MLPipeline` creation.
161 | * Fix bug with MLHyperparam types and Keras.
162 | * Rename `produce_params` as `predict_params`.
163 | * Add SingleCNN Classifier and Regressor primitives.
164 | * Simplify and improve Trivial Predictor.
165 | 
166 | 0.1.3 - Multi Table pipelines improved
167 | --------------------------------------
168 | 
169 | * Improve RandomForest primitive ranges
170 | * Improve DFS primitive
171 | * Add Tree Based Feature Selection primitives
172 | * Fix bugs in TrivialPredictor
173 | * Improved documentation
174 | 
175 | 0.1.2 - Bugfix release
176 | ----------------------
177 | 
178 | * Fix bug in TrivialMedianPredictor
179 | * Fix bug in OneHotLabelEncoder
180 | 
181 | 0.1.1 - Single Table pipelines improved
182 | ---------------------------------------
183 | 
184 | * New project structure and primitives for integration into MIT-TA2.
185 | * MIT-TA2 default pipelines and single table pipelines fully working.
186 | 
187 | 0.1.0
188 | -----
189 | 
190 | * First release on PyPI. 
191 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, MIT Data To AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.md 4 | include LICENSE 5 | include README.md 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.md *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | recursive-include mlblocks *.json 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | define BROWSER_PYSCRIPT 4 | import os, webbrowser, sys 5 | 6 | try: 7 | from urllib import pathname2url 8 | except: 9 | from urllib.request import pathname2url 10 | 11 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 12 | endef 13 | export BROWSER_PYSCRIPT 14 | 15 | define PRINT_HELP_PYSCRIPT 16 | import re, sys 17 | 18 | for line in sys.stdin: 19 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 20 | if match: 21 | target, help = match.groups() 22 | print("%-20s %s" % (target, help)) 23 | endef 24 | export PRINT_HELP_PYSCRIPT 25 | 26 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 27 | 28 | .PHONY: help 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | 33 | # CLEAN TARGETS 34 | 35 | .PHONY: clean-build 36 | clean-build: ## remove build artifacts 37 | rm -fr build/ 38 | rm -fr dist/ 39 | rm -fr .eggs/ 40 | find . -name '*.egg-info' -exec rm -fr {} + 41 | find . -name '*.egg' -exec rm -f {} + 42 | 43 | .PHONY: clean-pyc 44 | clean-pyc: ## remove Python file artifacts 45 | find . -name '*.pyc' -exec rm -f {} + 46 | find . -name '*.pyo' -exec rm -f {} + 47 | find . -name '*~' -exec rm -f {} + 48 | find . 
-name '__pycache__' -exec rm -fr {} + 49 | 50 | .PHONY: clean-docs 51 | clean-docs: ## remove previously built docs 52 | -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed 53 | 54 | .PHONY: clean-coverage 55 | clean-coverage: ## remove coverage artifacts 56 | rm -f .coverage 57 | rm -f .coverage.* 58 | rm -fr htmlcov/ 59 | 60 | .PHONY: clean-test 61 | clean-test: ## remove test artifacts 62 | rm -fr .tox/ 63 | rm -fr .pytest_cache 64 | 65 | .PHONY: clean 66 | clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts 67 | 68 | 69 | # INSTALL TARGETS 70 | 71 | .PHONY: install 72 | install: clean-build clean-pyc ## install the package to the active Python's site-packages 73 | pip install . 74 | 75 | .PHONY: install-examples 76 | install-examples: clean-build clean-pyc ## install the package and the examples dependencies 77 | pip install .[examples] 78 | 79 | .PHONY: install-unit 80 | install-unit: clean-build clean-pyc ## install the package and dependencies for unit tests 81 | pip install .[unit] 82 | 83 | .PHONY: install-test 84 | install-test: clean-build clean-pyc ## install the package and test dependencies 85 | pip install .[test] 86 | 87 | .PHONY: install-develop 88 | install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development 89 | pip install -e .[dev] 90 | 91 | MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') 92 | 93 | .PHONY: install-minimum 94 | install-minimum: ## install the minimum supported versions of the package dependencies 95 | pip install $(MINIMUM) 96 | 97 | 98 | # LINT TARGETS 99 | 100 | .PHONY: lint 101 | lint: ## check style with flake8 and isort 102 | flake8 mlblocks tests 103 | isort -c --recursive mlblocks tests 104 | 105 | .PHONY: fix-lint 106 | fix-lint: ## fix lint issues using autoflake, autopep8, and isort 107 | find mlblocks -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables 108 | autopep8 --in-place --recursive --aggressive mlblocks 109 | isort --apply --atomic --recursive mlblocks 110 | 111 | find tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables 112 | autopep8 --in-place --recursive --aggressive tests 113 | isort --apply --atomic --recursive tests 114 | 115 | .PHONY: lint-docs 116 | lint-docs: ## check docs formatting with doc8 and pydocstyle 117 | doc8 mlblocks/ 118 | pydocstyle mlblocks/ 119 | 120 | 121 | # TEST TARGETS 122 | 123 | .PHONY: test-unit 124 | test-unit: ## run tests quickly with the default Python 125 | python -m pytest --cov=mlblocks --ignore=tests/features/ 126 | 127 | .PHONY: test-mlprimitives 128 | test-mlprimitives: ## run tests quickly with the default Python 129 | python -m pytest --cov=mlblocks 130 | 131 | .PHONY: test-readme 132 | test-readme: ## run the readme snippets 133 | rm -rf tests/readme_test && mkdir tests/readme_test 134 | cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md 135 | rm -rf tests/readme_test 136 | 137 | .PHONY: test-tutorials 138 | test-tutorials: ## run the tutorial notebooks 139 | find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ 140 | jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + 141 | 142 | .PHONY: test 143 | test: test-unit test-mlprimitives test-readme 
## test everything that needs test dependencies
144 | 
145 | .PHONY: check-dependencies
146 | check-dependencies: ## test if there are any broken dependencies
147 | pip check
148 | 
149 | .PHONY: test-devel
150 | test-devel: check-dependencies lint docs ## test everything that needs development dependencies
151 | 
152 | .PHONY: test-all
153 | test-all: ## run tests on every Python version with tox
154 | tox -r
155 | 
156 | .PHONY: coverage
157 | coverage: ## check code coverage quickly with the default Python
158 | coverage run --source mlblocks -m pytest
159 | coverage report -m
160 | coverage html
161 | $(BROWSER) htmlcov/index.html
162 | 
163 | 
164 | # DOCS TARGETS
165 | 
166 | .PHONY: docs
167 | docs: clean-docs ## generate Sphinx HTML documentation, including API docs
168 | $(MAKE) -C docs html
169 | 
170 | .PHONY: view-docs
171 | view-docs: ## view the docs in a browser
172 | $(BROWSER) docs/_build/html/index.html
173 | 
174 | .PHONY: serve-docs
175 | serve-docs: ## compile the docs watching for changes
176 | watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs
177 | 
178 | 
179 | # RELEASE TARGETS
180 | 
181 | .PHONY: dist
182 | dist: clean ## builds source and wheel package
183 | python setup.py sdist
184 | python setup.py bdist_wheel
185 | ls -l dist
186 | 
187 | .PHONY: publish-confirm
188 | publish-confirm:
189 | @echo "WARNING: This will irreversibly upload a new version to PyPI!"
190 | @echo -n "Please type 'confirm' to proceed: " \
191 | && read answer \
192 | && [ "$${answer}" = "confirm" ]
193 | 
194 | .PHONY: publish-test
195 | publish-test: dist publish-confirm ## package and upload a release on TestPyPI
196 | twine upload --repository-url https://test.pypi.org/legacy/ dist/*
197 | 
198 | .PHONY: publish
199 | publish: dist publish-confirm ## package and upload a release
200 | twine upload dist/*
201 | 
202 | .PHONY: bumpversion-release
203 | bumpversion-release: ## Merge master to stable and bumpversion release
204 | git checkout stable || git checkout -b stable
205 | git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
206 | bumpversion release
207 | git push --tags origin stable
208 | 
209 | .PHONY: bumpversion-patch
210 | bumpversion-patch: ## Merge stable to master and bumpversion patch
211 | git checkout master
212 | git merge stable
213 | bumpversion --no-tag patch
214 | git push
215 | 
216 | .PHONY: bumpversion-candidate
217 | bumpversion-candidate: ## Bump the version to the next candidate
218 | bumpversion candidate --no-tag
219 | 
220 | .PHONY: bumpversion-minor
221 | bumpversion-minor: ## Bump the version to the next minor, skipping the release
222 | bumpversion --no-tag minor
223 | 
224 | .PHONY: bumpversion-major
225 | bumpversion-major: ## Bump the version to the next major, skipping the release
226 | bumpversion --no-tag major
227 | 
228 | .PHONY: bumpversion-revert
229 | bumpversion-revert: ## Undo a previous bumpversion-release
230 | git checkout master
231 | git branch -D stable
232 | 
233 | CLEAN_DIR := $(shell git status --short | grep -v ??)
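# CLEAN_DIR is non-empty when tracked files have uncommitted changes (untracked '??' entries are filtered out);
# together with CURRENT_BRANCH and CHANGELOG_LINES below, it feeds the check-clean, check-master and check-history guards.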
234 | CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) 235 | CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) 236 | 237 | .PHONY: check-clean 238 | check-clean: ## Check if the directory has uncommitted changes 239 | ifneq ($(CLEAN_DIR),) 240 | $(error There are uncommitted changes) 241 | endif 242 | 243 | .PHONY: check-master 244 | check-master: ## Check if we are in master branch 245 | ifneq ($(CURRENT_BRANCH),master) 246 | $(error Please make the release from master branch\n) 247 | endif 248 | 249 | .PHONY: check-history 250 | check-history: ## Check if HISTORY.md has been modified 251 | ifeq ($(CHANGELOG_LINES),0) 252 | $(error Please insert the release notes in HISTORY.md before releasing) 253 | endif 254 | 255 | .PHONY: check-release 256 | check-release: check-clean check-master check-history ## Check if the release can be made 257 | @echo "A new release can be made" 258 | 259 | .PHONY: release 260 | release: check-release bumpversion-release publish bumpversion-patch 261 | 262 | .PHONY: release-test 263 | release-test: check-release bumpversion-release-test publish-test bumpversion-revert 264 | 265 | .PHONY: release-candidate 266 | release-candidate: check-master publish bumpversion-candidate 267 | 268 | .PHONY: release-candidate-test 269 | release-candidate-test: check-clean check-master publish-test 270 | 271 | .PHONY: release-minor 272 | release-minor: check-release bumpversion-minor release 273 | 274 | .PHONY: release-major 275 | release-major: check-release bumpversion-major release 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [header images: DAI-Lab logo ("An Open Source Project from the Data to AI Lab, at MIT") and the "MLBlocks" logo]
13 | Pipelines and Primitives for Machine Learning and Data Science.
14 | 
15 | 16 | [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) 17 | [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) 18 | [![Tests](https://github.com/MLBazaar/MLBlocks/workflows/Run%20Tests/badge.svg)](https://github.com/MLBazaar/MLBlocks/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) 19 | [![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) 20 | [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) 21 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/MLBazaar/MLBlocks/master?filepath=examples/tutorials) 22 | 23 |
23 | 
24 | # MLBlocks
25 | 
26 | * Documentation: https://mlbazaar.github.io/MLBlocks
27 | * Github: https://github.com/MLBazaar/MLBlocks
28 | * License: [MIT](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE)
29 | * Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
30 | 
31 | ## Overview
32 | 
33 | MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by
34 | seamlessly combining tools from any Python library with a simple, common and uniform interface.
35 | 
36 | Features include:
37 | 
38 | * Build Machine Learning Pipelines combining **any Machine Learning Library in Python**.
39 | * Access a repository with hundreds of primitives and pipelines ready to be used with little to
40 | no Python code to write, carefully curated by Machine Learning and Domain experts.
41 | * Extract machine-readable information about which hyperparameters can be tuned and within
42 | which ranges, allowing automated integration with Hyperparameter Optimization tools like
43 | [BTB](https://github.com/MLBazaar/BTB).
44 | * Complex multi-branch pipelines and DAG configurations, with an unlimited number of inputs and
45 | outputs per primitive.
46 | * Easily save and load Pipelines using JSON Annotations.
47 | 
48 | # Install
49 | 
50 | ## Requirements
51 | 
52 | **MLBlocks** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12 and 3.13](https://www.python.org/downloads/).
53 | 
54 | ## Install with `pip`
55 | 
56 | The easiest and recommended way to install **MLBlocks** is using [pip](
57 | https://pip.pypa.io/en/stable/):
58 | 
59 | ```bash
60 | pip install mlblocks
61 | ```
62 | 
63 | This will pull and install the latest stable release from [PyPI](https://pypi.org/).
64 | 
65 | If you want to install from source or contribute to the project, please read the
66 | [Contributing Guide](https://mlbazaar.github.io/MLBlocks/contributing.html#get-started).
67 | 
68 | ## MLPrimitives
69 | 
70 | In order to be usable, MLBlocks requires a compatible primitives library.
71 | 
72 | The official library, which is required in order to follow the rest of this tutorial,
73 | is [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), which you can install
74 | with this command:
75 | 
76 | ```bash
77 | pip install mlprimitives
78 | ```
79 | 
80 | # Quickstart
81 | 
82 | Below is a short example of how to use **MLBlocks** to solve the [Adult Census
83 | Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a
84 | pipeline which combines primitives from [MLPrimitives](https://github.com/MLBazaar/MLPrimitives),
85 | [scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). 
87 | 
88 | ```python3
89 | import pandas as pd
90 | from mlblocks import MLPipeline
91 | from sklearn.model_selection import train_test_split
92 | from sklearn.metrics import accuracy_score
93 | 
94 | dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv')
95 | label = dataset.pop('label')
96 | 
97 | X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label)
98 | 
99 | primitives = [
100 | 'mlprimitives.custom.preprocessing.ClassEncoder',
101 | 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
102 | 'sklearn.impute.SimpleImputer',
103 | 'xgboost.XGBClassifier',
104 | 'mlprimitives.custom.preprocessing.ClassDecoder'
105 | ]
106 | pipeline = MLPipeline(primitives)
107 | 
108 | pipeline.fit(X_train, y_train)
109 | predictions = pipeline.predict(X_test)
110 | 
111 | accuracy_score(y_test, predictions)
112 | ```
113 | 
114 | # What's Next?
115 | 
116 | If you want to learn more about how to tune the pipeline hyperparameters, save and load
117 | the pipelines using JSON annotations or build complex multi-branched pipelines, please
118 | check our [documentation site](https://mlbazaar.github.io/MLBlocks).
119 | 
120 | Also do not forget to have a look at the [notebook tutorials](
121 | https://github.com/MLBazaar/MLBlocks/tree/master/examples/tutorials)!
122 | 
123 | # Citing MLBlocks
124 | 
125 | If you use MLBlocks for your research, please consider citing our related papers.
126 | 
127 | For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at
128 | the MIT Data To AI Lab, please see:
129 | 
130 | Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Machine Learning Bazaar:
131 | Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv
132 | Preprint 1905.08942. 2019.
133 | 
134 | ```bibtex
135 | @article{smith2019mlbazaar,
136 | author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan},
137 | title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development},
138 | journal = {arXiv e-prints},
139 | year = {2019},
140 | eid = {arXiv:1905.08942},
141 | pages = {arXiv:1905.08942},
142 | archivePrefix = {arXiv},
143 | eprint = {1905.08942},
144 | }
145 | ```
146 | 
147 | For the first MLBlocks version from 2015, designed only for multi-table, multi-entity temporal data, please
148 | refer to Bryan Collazo’s thesis:
149 | 
150 | * [Machine learning blocks](https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf).
151 | Bryan Collazo. Master's thesis, MIT EECS, 2015.
152 | 
153 | With the recent availability of a multitude of libraries and tools, we decided it was time to integrate
154 | them and expand the library to address other data types (images, text, graphs, time series) and to
155 | integrate with deep learning libraries.
156 | -------------------------------------------------------------------------------- /apt.txt: -------------------------------------------------------------------------------- 1 | # apt-get requirements for development and mybinder environment
2 | graphviz
3 | pandoc
4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line. 
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = mlblocks
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | -------------------------------------------------------------------------------- /docs/advanced_usage/adding_primitives.rst: -------------------------------------------------------------------------------- 1 | Adding Primitives
2 | =================
3 | 
4 | The **MLBlocks** library is only the engine, and it is of no use without primitives, so here we
5 | explain how to add new primitives to **MLBlocks**.
6 | 
7 | MLPrimitives
8 | ------------
9 | 
10 | **MLBlocks** has a related project, `MLPrimitives`_, which already includes a huge list of
11 | integrated primitives, so the easiest and recommended way to add primitives for **MLBlocks**
12 | is to install **MLPrimitives**.
13 | 
14 | This can be achieved by running the command::
15 | 
16 | pip install mlprimitives
17 | 
18 | For further details, please refer to the `MLPrimitives Documentation`_.
19 | 
20 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives
21 | .. _MLPrimitives Documentation: https://mlbazaar.github.io/MLPrimitives/
22 | 
23 | Writing Primitives
24 | ------------------
25 | 
26 | Sometimes you will find that you want to use a primitive that is not in the list of
27 | `MLPrimitives integrated primitives`_, so you will have to integrate the primitive yourself
28 | by writing the corresponding `JSON annotation <primitives.html#json-annotations>`_.
29 | 
30 | .. _MLPrimitives integrated primitives: https://github.com/MLBazaar/MLPrimitives/tree/master/mlblocks_primitives
31 | 
32 | .. note:: If you create new primitives for MLBlocks, please consider contributing them to the
33 | **MLPrimitives** project!
34 | 
35 | The first thing to do when adding a new primitive is making sure that it complies with the
36 | necessary requirements, which depend on whether the primitive is a function or a class.
37 | 
38 | For `Function Primitives`_, the only requirement is that they have to be a single function.
39 | Calling multiple functions sequentially as part of a single primitive is not supported, and
40 | in order to achieve this you are expected to write a separate primitive for each function.
41 | 
42 | For `Class Primitives`_, just like the function primitives, the `fit` and `produce` phases
43 | must consist of a single method each. Calling multiple methods sequentially within a single
44 | primitive is not supported either.
45 | 
46 | `Class Primitives`_ also need to be fully initialized with a single instantiation call. Running setup or compilation
47 | calls after the instance creation is not possible.
48 | 
49 | .. _Function Primitives: primitives.html#function-primitives
50 | .. _Class Primitives: primitives.html#class-primitives
51 | 
52 | Primitives Lookup
53 | -----------------
54 | 
55 | Once you have written the JSON annotation for your primitive, you will need to put it in a
56 | place known to **MLBlocks**.
57 | 
58 | **MLBlocks** looks for primitives in the following folders, in this order:
59 | 
60 | 1. Any folder specified by the user, starting with the most recently added one.
61 | 2. 
A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the current working directory.
62 | 3. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the `system prefix`_.
63 | 
64 | .. _system prefix: https://docs.python.org/3/library/sys.html#sys.prefix
65 | 
66 | The list of folders where **MLBlocks** will search for primitives can be seen at any time
67 | by calling the method `mlblocks.get_primitives_paths`_.
68 | 
69 | .. _mlblocks.get_primitives_paths: ../api_reference.html#mlblocks.get_primitives_paths
70 | 
71 | Adding a Primitives Folder
72 | --------------------------
73 | 
74 | The simplest way to quickly add new primitives is to put their JSON annotations
75 | in a folder called `mlblocks_primitives` in the root of your project, or in your current
76 | working directory.
77 | 
78 | However, sometimes you will want to add a custom directory.
79 | 
80 | This can be easily done by using the `mlblocks.add_primitives_path`_ method.
81 | 
82 | .. _mlblocks.add_primitives_path: ../api_reference.html#mlblocks.add_primitives_path
83 | 
84 | Developing a Primitives Library
85 | -------------------------------
86 | 
87 | Another option to add multiple primitives is to create a primitives library, such as
88 | `MLPrimitives`_.
89 | 
90 | In order to make **MLBlocks** able to find the primitives defined in such a library,
91 | all you need to do is set up an `Entry Point`_ in your `setup.py` script with the
92 | following specification:
93 | 
94 | 1. It has to be published under the group ``mlblocks``.
95 | 2. It has to be named exactly ``primitives``.
96 | 3. It has to point at a variable that contains a path or a list of paths to the JSON folder(s).
97 | 
98 | An example of such an entry point would be::
99 | 
100 | entry_points = {
101 | 'mlblocks': [
102 | 'primitives=some_module:SOME_VARIABLE'
103 | ]
104 | }
105 | 
106 | where the module `some_module` contains a variable such as::
107 | 
108 | SOME_VARIABLE = 'path/to/primitives'
109 | 
110 | or::
111 | 
112 | SOME_VARIABLE = [
113 | 'path/to/primitives',
114 | 'path/to/more/primitives'
115 | ]
116 | 
117 | .. _Entry Point: https://packaging.python.org/specifications/entry-points/
118 | -------------------------------------------------------------------------------- /docs/advanced_usage/hyperparameters.rst: -------------------------------------------------------------------------------- 1 | Hyperparameters
2 | ===============
3 | 
4 | A very important element of both Function and Class primitives is their hyperparameters.
5 | 
6 | The hyperparameters are arguments that modify the behavior of the primitive and its learning
7 | process, which are set before the learning process starts and are not deduced from the data.
8 | These hyperparameters are usually passed as arguments to the primitive constructor or to the
9 | methods or functions that will be called during the fitting or predicting phase.
10 | 
11 | In **MLBlocks**, each primitive has all its hyperparameters and their valid values specified
12 | in their `JSON Annotations`_. 
13 | 
14 | Here, for example, we are looking at the ``hyperparameters`` section of the
15 | ``keras.preprocessing.text.Tokenizer`` primitive from `MLPrimitives`_::
16 | 
17 | "hyperparameters": {
18 | "fixed": {
19 | "filters": {
20 | "type": "str",
21 | "default": "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n"
22 | },
23 | "split": {
24 | "type": "str",
25 | "default": " "
26 | },
27 | "oov_token": {
28 | "type": "str",
29 | "default": null
30 | }
31 | },
32 | "tunable": {
33 | "num_words": {
34 | "type": "int",
35 | "default": null,
36 | "range": [1, 10000]
37 | },
38 | "lower": {
39 | "type": "bool",
40 | "default": true
41 | },
42 | "char_level": {
43 | "type": "bool",
44 | "default": false
45 | }
46 | }
47 | }
48 | 
49 | As can be seen, two types of hyperparameters exist: **fixed** and **tunable**.
50 | 
51 | Fixed Hyperparameters
52 | ---------------------
53 | 
54 | These hyperparameters do not alter the learning process, and their values modify
55 | the behavior of the primitive but not its prediction performance. In some cases these
56 | hyperparameters have a default value, but most of the time their values have to be explicitly
57 | set by the user.
58 | 
59 | In the `JSON Annotations`_, these hyperparameters are specified as a JSON that has the argument
60 | name as the keyword and a nested JSON that specifies its details::
61 | 
62 | "fixed": {
63 | "filters": {
64 | "type": "str",
65 | "default": "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n"
66 | },
67 | "split": {
68 | "type": "str",
69 | "default": " "
70 | },
71 | "oov_token": {
72 | "type": "str",
73 | "default": null
74 | }
75 | }
76 | 
77 | Each entry in the ``fixed`` hyperparameters contains:
78 | 
79 | * **default**: This indicates the default value that the argument will take if the user does
80 | not specify another value when the `MLPipeline`_ is created. This keyword is optional, and
81 | if it is not specified, the user is expected to always provide a value.
82 | * **type**: The type of the argument. This is only informative and is not used by MLBlocks, but
83 | it is always included in all the `MLPrimitives`_ annotations.
84 | 
85 | Tunable Hyperparameters
86 | -----------------------
87 | 
88 | These hyperparameters do not modify the primitive behavior, but they have a direct
89 | impact on the learning process and on how well the primitive learns from the data.
90 | For this reason, their values can be tuned to improve the prediction performance.
91 | 
92 | In the `JSON Annotations`_, these hyperparameters are specified as a JSON that has the argument
93 | name as the keyword and a nested JSON that specifies its details::
94 | 
95 | "tunable": {
96 | "num_words": {
97 | "type": "int",
98 | "default": null,
99 | "range": [1, 10000]
100 | },
101 | "lower": {
102 | "type": "bool",
103 | "default": true
104 | },
105 | "char_level": {
106 | "type": "bool",
107 | "default": false
108 | }
109 | }
110 | 
111 | Each entry in the ``tunable`` hyperparameters contains:
112 | 
113 | * **type**: The type of the argument. This can be one of the primitive variable types, ``int``,
114 | ``float``, ``str`` or ``bool``, or one of the special types, `multitype`_ or `conditional`_.
115 | * **default**: This indicates the default value that the argument will take if the user does
116 | not specify another value when the `MLPipeline`_ is created.
117 | * **range**: Optional - This is expected to be found in numeric hyperparameters, and specifies
118 | the minimum and maximum values that this primitive will work well with. 
119 | * **values**: Optional - This is expected to be found in categorical hyperparameters, and
120 | indicates the list of possible values that it can work with.
121 | 
122 | Special Hyperparameter Types
123 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
124 | 
125 | Sometimes, hyperparameters do not accept only one type of value, or their possible values may
126 | depend on the value of other hyperparameters.
127 | 
128 | Multitype Hyperparameters
129 | *************************
130 | 
131 | Some hyperparameters accept more than one type of value.
132 | 
133 | For example, suppose a primitive expects a hyperparameter called `max_features` that can take
134 | one of three types:
135 | 
136 | * An integer indicating the absolute number of features to use.
137 | * A float between 0 and 1 indicating the proportion of the maximum possible number of features.
138 | * The strings ``"min"``, ``"max"`` or ``"mean"``, indicating that the number needs to be computed
139 | by the primitive itself in some way.
140 | 
141 | In this case, the ``type`` of this hyperparameter is ``multitype``, and its specification could
142 | be as follows::
143 | 
144 | "max_features": {
145 | "type": "multitype",
146 | "default": "mean",
147 | "types": {
148 | "int": {
149 | "range": [1, 100]
150 | },
151 | "float": {
152 | "range": [0.1, 0.9]
153 | },
154 | "string": {
155 | "values": ["mean", "min", "max"]
156 | }
157 | }
158 | }
159 | 
160 | Note how a new keyword ``types`` exists, which holds the possible values for each one of the
161 | possible types that this hyperparameter can have.
162 | 
163 | Conditional Hyperparameters
164 | ***************************
165 | 
166 | In some other cases, the values that a hyperparameter can take depend on the value of another
167 | one.
168 | For example, sometimes a primitive has a hyperparameter that specifies a kernel, and depending
169 | on the kernel used some other hyperparameters may or may not be used, or they might be able
170 | to take only some specific values.
171 | 
172 | In this case, the ``type`` of the hyperparameter whose values depend on the other is specified
173 | as ``conditional``.
174 | When this happens, two additional entries are required:
175 | 
176 | * an entry called ``condition``, which specifies the name of the other hyperparameter, the value
177 | of which is evaluated to decide which values this hyperparameter can take.
178 | * an additional subdictionary called ``values``, which relates the possible values that the
179 | `condition` hyperparameter can have with the full specifications of the type and values that
180 | this hyperparameter can take in each case.
181 | 
182 | Suppose, for example, that the primitive explained in the previous point does not expect
183 | the ``mean``, ``min`` or ``max`` strings as values for the ``max_features`` hyperparameter,
184 | but as a separate one called ``max_features_aggregation``, which is only used when the
185 | ``max_features`` hyperparameter has been given the value ``auto``. 
186 | 187 | In this case, the hyperparameters would be annotated like this:: 188 | 189 | "max_features": { 190 | "type": "multitype", 191 | "default": "auto", 192 | "types": { 193 | "int": { 194 | "range": [1, 100] 195 | }, 196 | "float": { 197 | "range": [0.1, 0.9] 198 | }, 199 | "string": { 200 | "values": ["auto"] 201 | } 202 | } 203 | } 204 | "max_features_aggregation": { 205 | "type": "conditional", 206 | "condition": "max_features", 207 | "default": null, 208 | "values": { 209 | "auto": { 210 | "description": "this will be used only if the value of max_features is `auto`", 211 | "type": "str", 212 | "default": "mean", 213 | "values": ["mean", "max", "min"] 214 | } 215 | } 216 | } 217 | 218 | .. note:: Just like a regular hyperparameter, if there is no match the ``default`` entry is used. 219 | In this example, the ``null`` value indicates that the hyperparameter needs to be 220 | disabled if there is no match, but we could instead provide a full specification 221 | of type, range and default value as a nested dictionary to be used by default. 222 | 223 | .. _JSON Annotations: primitives.html#json-annotations 224 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives 225 | .. _BTB: https://github.com/MLBazaar/BTB 226 | .. _MLPipeline: ../api_reference.html#mlblocks.MLPipeline 227 | .. _multitype: #multitype-hyperparameters 228 | .. _conditional: #conditional-hyperparameters 229 | -------------------------------------------------------------------------------- /docs/advanced_usage/primitives.rst: -------------------------------------------------------------------------------- 1 | Primitives 2 | ========== 3 | 4 | The goal of MLBlocks is to seamlessly combine any possible set of Machine Learning tools developed 5 | in Python, whether they are custom developments or belong to third party libraries, and to 6 | build `pipelines`_ out of them that can be fitted and then used to make predictions. 7 | 8 | We call each one of these Machine Learning tools a **primitive**. 9 | 10 | What is a Primitive? 11 | -------------------- 12 | 13 | A valid MLBlocks primitive is an importable Python object that: 14 | 15 | * Must be either a function or a class. 16 | * If it is a class, it **might** have a `fitting` stage, where the primitive is passed some 17 | training data and it `learns` from it, and which can be executed with a single method call. 18 | Function primitives have no `fitting` stage. 19 | * **Must** have a `producing` stage, where the primitive is passed some data and it returns some 20 | other data, whether it is a transformation of the input data or some new data derived from it, 21 | such as a set of predictions. This `producing` stage must be executed with a single function or 22 | method call. 23 | * Might have `hyperparameters`_, additional arguments to be passed to either the function call or 24 | the class constructor in order to alter or control the way the fitting and producing stages work.
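To make these requirements concrete, below is a minimal sketch of what a hypothetical class
primitive could look like in plain Python, assuming a ``pandas.DataFrame`` input. The class and
method names are purely illustrative; any importable object with this shape can be annotated
and used by MLBlocks:

.. code-block:: python

    class MeanImputer:
        """Hypothetical primitive that fills in missing values with column means."""

        def fit(self, X):
            # fitting stage: learn the per-column means from the training data
            self._means = X.mean()

        def transform(self, X):
            # producing stage: apply the learned means to new data
            return X.fillna(self._means)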
25 | 26 | Here are some examples of primitives: 27 | 28 | +-----------------------------------------------+-----------+--------------+--------------------+ 29 | | primitive | type | fit | produce | 30 | +===============================================+===========+==============+====================+ 31 | | sklearn.preprocessing.StandardScaler | class | fit | transform | 32 | +-----------------------------------------------+-----------+--------------+--------------------+ 33 | | sklearn.ensemble.RandomForestClassifier | class | fit | predict | 34 | +-----------------------------------------------+-----------+--------------+--------------------+ 35 | | skimage.feature.hog | function | -- | -- | 36 | +-----------------------------------------------+-----------+--------------+--------------------+ 37 | | xgboost.XGBRegressor | class | fit | predict | 38 | +-----------------------------------------------+-----------+--------------+--------------------+ 39 | | keras.applications.resnet50.preprocess_input | function | -- | -- | 40 | +-----------------------------------------------+-----------+--------------+--------------------+ 41 | | keras.applications.resnet50.ResNet50 | class | -- | predict | 42 | +-----------------------------------------------+-----------+--------------+--------------------+ 43 | | keras.preprocessing.sequence.pad_sequences | function | -- | -- | 44 | +-----------------------------------------------+-----------+--------------+--------------------+ 45 | | keras.preprocessing.text.Tokenizer | class | fit_on_texts | texts_to_sequences | 46 | +-----------------------------------------------+-----------+--------------+--------------------+ 47 | | lightfm.LightFM | class | fit | predict | 48 | +-----------------------------------------------+-----------+--------------+--------------------+ 49 | 50 | JSON Annotations 51 | ---------------- 52 | 53 | Each integrated primitive has an associated JSON file that specifies its methods, their arguments, 54 | their types and, most importantly, any possible `hyperparameters`_ that the primitive has, as well 55 | as their types, ranges and conditions, if any. 56 | 57 | These JSON annotations can be: 58 | 59 | * **Installed** using the `MLPrimitives`_ related project, which is the recommended approach. 60 | * **Created by the user** and configured for MLBlocks to use them. 61 | 62 | And the primitives can be of two types: 63 | 64 | * Function Primitives: Simple functions that can be called directly. 65 | * Class Primitives: Class objects that need to be instantiated before they can be used. 66 | 67 | Here are some simplified examples of these JSONs, but for more detailed examples, please refer to 68 | the `examples folder`_ of the project. 69 | 70 | Function Primitives 71 | ~~~~~~~~~~~~~~~~~~~ 72 | 73 | The simplest type of primitives are plain functions that can be called directly, without 74 | the need to create any class instance first. 75 | 76 | In most cases, if not all, these functions do not have any associated learning process, 77 | and their behavior is always the same both during the fitting and the predicting phases 78 | of the pipeline. 79 | 80 | A simple example of such a primitive would be the ``numpy.argmax`` function, which expects a 2 81 | dimensional array as input, and returns a 1 dimensional array that indicates the indexes of the 82 | maximum values along an axis.
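For instance, with ``axis=1`` its behavior would be the following (a quick illustration, not
part of the annotation)::

    >>> import numpy as np
    >>> np.argmax(np.array([[0.1, 0.9], [0.8, 0.2]]), axis=1)
    array([1, 0])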
83 | 84 | The simplest JSON annotation for this primitive would look like this:: 85 | 86 | { 87 | "primitive": "numpy.argmax", 88 | "produce": { 89 | "args": [ 90 | { 91 | "name": "y", 92 | "type": "ndarray" 93 | } 94 | ], 95 | "output": [ 96 | { 97 | "name": "y", 98 | "type": "ndarray" 99 | } 100 | ] 101 | }, 102 | "hyperparameters": { 103 | "fixed": { 104 | "axis": { 105 | "type": "int", 106 | "default": 1 107 | } 108 | } 109 | } 110 | } 111 | 112 | The main elements of this JSON are: 113 | 114 | * **primitive**: The fully qualified, directly importable name of the function to be used:: 115 | 116 | "primitive": "numpy.argmax", 117 | 118 | * **produce**: A nested JSON that specifies the names and types of arguments and the output values 119 | of the primitive:: 120 | 121 | "produce": { 122 | "args": [ 123 | { 124 | "name": "y", 125 | "type": "ndarray" 126 | } 127 | ], 128 | "output": [ 129 | { 130 | "name": "y", 131 | "type": "ndarray" 132 | } 133 | ] 134 | } 135 | 136 | * **hyperparameters**: A nested JSON that specifies the `hyperparameters`_ of this primitive. 137 | Note that multiple types of hyperparameters exist, but that this primitive has only one ``fixed`` 138 | hyperparameter, which means that it is not tunable and that, even though the user can specify 139 | a value different from the default, changes are not expected during the MLBlock instance life 140 | cycle:: 141 | 142 | "hyperparameters": { 143 | "fixed": { 144 | "axis": { 145 | "type": "int", 146 | "default": 1 147 | } 148 | } 149 | } 150 | 151 | Class Primitives 152 | ~~~~~~~~~~~~~~~~ 153 | 154 | A more complex type of primitives are classes which need to be instantiated before they can 155 | be used. 156 | 157 | In most cases, these classes will have an associated learning process, and they will have some 158 | fit method or equivalent that will be called during the fitting phase but not during the 159 | predicting one. 160 | 161 | A simple example of such a primitive would be the ``sklearn.preprocessing.StandardScaler`` class, 162 | which is used to standardize a set of values by calculating their z-score, which means centering 163 | them around 0 and scaling them to unit variance. 164 | 165 | This primitive has an associated learning process, where it calculates the mean and standard 166 | deviation of the training data, to later use them to transform the prediction data to the 167 | same center and scale. 168 | 169 | The simplest JSON annotation for this primitive would look like this:: 170 | 171 | { 172 | "primitive": "sklearn.preprocessing.StandardScaler", 173 | "fit": { 174 | "method": "fit", 175 | "args": [ 176 | { 177 | "name": "X", 178 | "type": "ndarray" 179 | } 180 | ] 181 | }, 182 | "produce": { 183 | "method": "transform", 184 | "args": [ 185 | { 186 | "name": "X", 187 | "type": "ndarray" 188 | } 189 | ], 190 | "output": [ 191 | { 192 | "name": "X", 193 | "type": "ndarray" 194 | } 195 | ] 196 | }, 197 | "hyperparameters": { 198 | "tunable": { 199 | "with_mean": { 200 | "type": "bool", 201 | "default": true 202 | }, 203 | "with_std": { 204 | "type": "bool", 205 | "default": true 206 | } 207 | } 208 | } 209 | } 210 | 211 | Note that there are some details of this JSON annotation that make it different from the 212 | Function Primitive one explained above: 213 | 214 | * **primitive**: The fully qualified, directly importable name of the class to be used.
This 215 | class is the one that will be used to create the actual primitive instance:: 216 | 217 | "primitive": "sklearn.preprocessing.StandardScaler", 218 | 219 | * **fit**: A nested JSON that specifies the name of the method to call during the fitting phase, 220 | which in this case happens to also be ``fit``, as well as the names and types of 221 | arguments that this method expects:: 222 | 223 | "fit": { 224 | "method": "fit", 225 | "args": [ 226 | { 227 | "name": "X", 228 | "type": "ndarray" 229 | } 230 | ] 231 | } 232 | 233 | * **produce**: A nested JSON that specifies the name of the method to call during the predicting 234 | phase, in this case called ``transform``, as well as the names and types of 235 | arguments that this method expects and its outputs:: 236 | 237 | "produce": { 238 | "method": "transform", 239 | "args": [ 240 | { 241 | "name": "X", 242 | "type": "ndarray" 243 | } 244 | ], 245 | "output": [ 246 | { 247 | "name": "X", 248 | "type": "ndarray" 249 | } 250 | ] 251 | } 252 | 253 | * **hyperparameters**: A nested JSON that specifies the hyperparameters of this primitive. 254 | In this case, only ``tunable`` hyperparameters are specified, with their 255 | names and types. If the type was something other than ``bool``, a list or 256 | range of valid values would also be specified:: 257 | 258 | "hyperparameters": { 259 | "tunable": { 260 | "with_mean": { 261 | "type": "bool", 262 | "default": true 263 | }, 264 | "with_std": { 265 | "type": "bool", 266 | "default": true 267 | } 268 | } 269 | } 270 | 271 | The MLBlock Class 272 | ----------------- 273 | 274 | Within the **MLBlocks** library, a primitive is represented through the `mlblocks.MLBlock`_ class. 275 | 276 | This is used to wrap around the annotated primitives, offering a common and uniform interface to 277 | all of them. 278 | 279 | More specifically, the `mlblocks.MLBlock`_ class offers two public methods, `fit`_ and `produce`_, 280 | which are directly linked to the methods specified in the JSON Annotation. 281 | 282 | For example, we can look at the `keras.preprocessing.text.Tokenizer`_ primitive from 283 | `MLPrimitives`_, which calls the method ``fit_on_texts`` when ``fit`` is called, and 284 | ``texts_to_sequences`` when ``produce`` is called: 285 | 286 | .. graphviz:: 287 | 288 | digraph { 289 | { 290 | node [shape=box] 291 | fit_on_texts; 292 | texts_to_sequences; 293 | fit; 294 | produce; 295 | } 296 | subgraph cluster_1 { 297 | {rank=same; fit produce}; 298 | fit -> produce [style=invis]; 299 | fit -> fit_on_texts; 300 | produce -> texts_to_sequences; 301 | label = "mlblocks.MLBlock"; 302 | subgraph cluster_2 { 303 | fit_on_texts; 304 | texts_to_sequences; 305 | label = "keras.preprocessing.text.Tokenizer"; 306 | } 307 | } 308 | } 309 | 310 | For a more detailed description of this class, please check the corresponding 311 | section in the `API Reference`_ documentation. 312 | 313 | .. _API Reference: ../api_reference.html 314 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives 315 | .. _keras.preprocessing.text.Tokenizer: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json 316 | .. _hyperparameters: hyperparameters.html 317 | .. _mlblocks.MLBlock: ../api_reference.html#mlblocks.MLBlock 318 | .. _pipelines: pipelines.html 319 | .. _examples folder: https://github.com/MLBazaar/MLBlocks/tree/master/examples 320 | .. _fit: ../api_reference.html#mlblocks.MLBlock.fit 321 | ..
_produce: ../api_reference.html#mlblocks.MLBlock.produce 322 | -------------------------------------------------------------------------------- /docs/api/mlblocks.rst: -------------------------------------------------------------------------------- 1 | mlblocks 2 | ======== 3 | 4 | .. automodule:: mlblocks 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../HISTORY.md 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # MLBlocks documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | 21 | import sphinx_rtd_theme # For read the docs theme 22 | 23 | import mlblocks 24 | 25 | # -- General configuration --------------------------------------------- 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 32 | extensions = [ 33 | 'm2r', 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.githubpages', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.napoleon', 38 | 'sphinx.ext.graphviz', 39 | 'IPython.sphinxext.ipython_console_highlighting', 40 | 'IPython.sphinxext.ipython_directive', 41 | 'autodocsumm', 42 | ] 43 | 44 | autodoc_default_options = { 45 | 'autosummary': True, 46 | } 47 | 48 | ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | source_suffix = ['.rst', '.md', '.ipynb'] 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # General information about the project. 60 | project = 'MLBlocks' 61 | slug = 'mlblocks' 62 | title = project + ' Documentation', 63 | copyright = '2018, MIT Data To AI Lab' 64 | author = 'MIT Data To AI Lab' 65 | description = 'Pipelines and Primitives for Machine Learning and Data Science.' 66 | user = 'MLBazaar' 67 | 68 | # The version info for the project you're documenting, acts as replacement 69 | # for |version| and |release|, also used in various other places throughout 70 | # the built documents. 71 | # 72 | # The short X.Y version. 
73 | version = mlblocks.__version__ 74 | # The full version, including alpha/beta/rc tags. 75 | release = mlblocks.__version__ 76 | 77 | # The language for content autogenerated by Sphinx. Refer to documentation 78 | # for a list of supported languages. 79 | # 80 | # This is also used if you do content translation via gettext catalogs. 81 | # Usually you set "language" from the command line for these cases. 82 | language = None 83 | 84 | # List of patterns, relative to source directory, that match files and 85 | # directories to ignore when looking for source files. 86 | # This patterns also effect to html_static_path and html_extra_path 87 | exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 88 | 89 | # The name of the Pygments (syntax highlighting) style to use. 90 | pygments_style = 'sphinx' 91 | 92 | # If true, `todo` and `todoList` produce output, else they produce nothing. 93 | todo_include_todos = False 94 | 95 | # -- Options for HTML output ------------------------------------------- 96 | 97 | # The theme to use for HTML and HTML Help pages. See the documentation for 98 | # a list of builtin themes. 99 | # 100 | html_theme = 'sphinx_rtd_theme' 101 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 102 | 103 | # Readthedocs additions 104 | html_context = { 105 | 'display_github': True, 106 | 'github_user': user, 107 | 'github_repo': project, 108 | 'github_version': 'master', 109 | 'conf_py_path': '/docs/', 110 | } 111 | 112 | # Theme options are theme-specific and customize the look and feel of a 113 | # theme further. For a list of options available for each theme, see the 114 | # documentation. 115 | html_theme_options = { 116 | 'collapse_navigation': False, 117 | 'display_version': True, 118 | } 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | # html_static_path = ['_static'] 124 | 125 | # The name of an image file (relative to this directory) to use as a favicon of 126 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 127 | # pixels large. 128 | html_favicon = 'images/favicon.ico' 129 | 130 | # If given, this must be the name of an image file (path relative to the 131 | # configuration directory) that is the logo of the docs. It is placed at 132 | # the top of the sidebar; its width should therefore not exceed 200 pixels. 133 | # html_logo = 'images/mlblocks-logo-small.png' 134 | 135 | # -- Options for HTMLHelp output --------------------------------------- 136 | 137 | # Output file base name for HTML help builder. 138 | htmlhelp_basename = slug + 'doc' 139 | 140 | 141 | # -- Options for LaTeX output ------------------------------------------ 142 | 143 | latex_elements = { 144 | # The paper size ('letterpaper' or 'a4paper'). 145 | # 146 | # 'papersize': 'letterpaper', 147 | 148 | # The font size ('10pt', '11pt' or '12pt'). 149 | # 150 | # 'pointsize': '10pt', 151 | 152 | # Additional stuff for the LaTeX preamble. 153 | # 154 | # 'preamble': '', 155 | 156 | # Latex figure (float) alignment 157 | # 158 | # 'figure_align': 'htbp', 159 | } 160 | 161 | # Grouping the document tree into LaTeX files. List of tuples 162 | # (source start file, target name, title, author, documentclass 163 | # [howto, manual, or own class]). 
164 | latex_documents = [( 165 | master_doc, 166 | slug + '.tex', 167 | title, 168 | author, 169 | 'manual' 170 | )] 171 | 172 | 173 | # -- Options for manual page output ------------------------------------ 174 | 175 | # One entry per manual page. List of tuples 176 | # (source start file, name, description, authors, manual section). 177 | man_pages = [( 178 | master_doc, 179 | slug, 180 | title, 181 | [author], 182 | 1 183 | )] 184 | 185 | 186 | # -- Options for Texinfo output ---------------------------------------- 187 | 188 | # Grouping the document tree into Texinfo files. List of tuples 189 | # (source start file, target name, title, author, 190 | # dir menu entry, description, category) 191 | texinfo_documents = [( 192 | master_doc, 193 | slug, 194 | title, 195 | author, 196 | slug, 197 | description, 198 | 'Miscellaneous' 199 | )] 200 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/getting_started/install.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | Installation 4 | ============ 5 | 6 | From PyPi 7 | --------- 8 | 9 | The simplest and recommended way to install MLBlocks is using `pip`: 10 | 11 | .. code-block:: console 12 | 13 | pip install mlblocks 14 | 15 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 16 | you through the process. 17 | 18 | .. _pip: https://pip.pypa.io 19 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 20 | 21 | Additional dependencies 22 | ----------------------- 23 | 24 | In order to be usable, MLBlocks requires a compatible primitives library. 25 | 26 | The official library, required in order to follow the MLBlocks tutorials and documentation examples, 27 | is `MLPrimitives`_, which you can install with this command: 28 | 29 | .. code-block:: console 30 | 31 | pip install mlprimitives 32 | 33 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives 34 | 35 | Install for development 36 | ----------------------- 37 | 38 | If you are installing **MLBlocks** in order to modify its code, the installation must be done 39 | from its sources, in the editable mode, and also including some additional dependencies in 40 | order to be able to run the tests and build the documentation. Instructions about this process 41 | can be found in the `Contributing guide`_. 42 | 43 | .. _Contributing guide: ../contributing.html#get-started 44 | -------------------------------------------------------------------------------- /docs/getting_started/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Below is a short tutorial that will show you how to get started using **MLBlocks**. 5 | 6 | In this tutorial we will learn how to: 7 | 8 | * Create a pipeline using multiple primitives 9 | * Obtain the list of tunable hyperparameters from the pipeline 10 | * Specify hyperparameters for each primitive in the pipeline 11 | * Fit the pipeline using training data 12 | * Use the pipeline to make predictions from new data 13 | 14 | .. note:: Some additional dependencies are required in order to run this Quickstart. 15 | Make sure that `you have already installed them`_. 
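Both libraries can be installed in one go with ``pip`` (equivalent to the separate commands
shown in the installation instructions):

.. code-block:: console

    pip install mlblocks mlprimitives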
16 | 17 | Creating a pipeline 18 | ------------------- 19 | 20 | With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing 21 | them to the `MLPipeline class`_: 22 | 23 | .. ipython:: python 24 | 25 | from mlblocks import MLPipeline 26 | primitives = [ 27 | 'mlprimitives.custom.preprocessing.ClassEncoder', 28 | 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 29 | 'sklearn.impute.SimpleImputer', 30 | 'xgboost.XGBClassifier', 31 | 'mlprimitives.custom.preprocessing.ClassDecoder' 32 | ] 33 | pipeline = MLPipeline(primitives) 34 | 35 | Optionally, specific `hyperparameters`_ can also be set by specifying them in a dictionary and 36 | passing them as the ``init_params`` argument: 37 | 38 | .. ipython:: python 39 | 40 | init_params = { 41 | 'sklearn.impute.SimpleImputer': { 42 | 'strategy': 'median' 43 | } 44 | } 45 | pipeline = MLPipeline(primitives, init_params=init_params) 46 | 47 | Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set 48 | for each block, by calling the `get_hyperparameters method`_. 49 | 50 | The output of this method is a dictionary which has the name of each block as keys and 51 | a dictionary with the `hyperparameters`_ of the corresponding block as values. 52 | 53 | .. ipython:: python 54 | 55 | pipeline.get_hyperparameters() 56 | 57 | Tunable Hyperparameters 58 | ----------------------- 59 | 60 | One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate 61 | the type and possible values that each primitive hyperparameter accepts. 62 | 63 | The list of possible hyperparameters and their details can easily be obtained from the pipeline 64 | instance by calling its `get_tunable_hyperparameters method`_. 65 | 66 | The output of this method is a dictionary that contains the list of tunable hyperparameters 67 | for each block in the pipeline, ready to be passed to any hyperparameter tuning library such 68 | as `BTB`_. 69 | 70 | .. ipython:: python 71 | 72 | pipeline.get_tunable_hyperparameters() 73 | 74 | Setting Hyperparameters 75 | ----------------------- 76 | 77 | Modifying the hyperparameters of an already instantiated pipeline can be done using the 78 | `set_hyperparameters method`_, which expects a dictionary with the same format as the one returned 79 | by the `get_hyperparameters method`_. 80 | 81 | Note that if a subset of the hyperparameters is passed, only these will be modified, and the 82 | other ones will remain unmodified. 83 | 84 | .. ipython:: python 85 | 86 | new_hyperparameters = { 87 | 'xgboost.XGBClassifier#1': { 88 | 'max_depth': 15 89 | } 90 | } 91 | pipeline.set_hyperparameters(new_hyperparameters) 92 | hyperparameters = pipeline.get_hyperparameters() 93 | hyperparameters['xgboost.XGBClassifier#1']['max_depth'] 94 | 95 | Making predictions 96 | ------------------ 97 | 98 | Once we have created the pipeline with the desired hyperparameters we can fit it 99 | and then use it to make predictions on new data. 100 | 101 | To do this, we first call the ``fit`` method passing the training data and the corresponding 102 | labels. 103 | 104 | ..
ipython:: python 105 | :okwarning: 106 | 107 | import pandas as pd 108 | from sklearn.model_selection import train_test_split 109 | 110 | dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') 111 | label = dataset.pop('label') 112 | 113 | X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) 114 | pipeline.fit(X_train, y_train) 115 | 116 | Once we have fitted our model to our data, we can call the ``predict`` method passing new data 117 | to obtain predictions from the pipeline. 118 | 119 | .. ipython:: python 120 | :okwarning: 121 | 122 | from sklearn.metrics import accuracy_score 123 | 124 | predictions = pipeline.predict(X_test) 125 | predictions 126 | accuracy_score(y_test, predictions) 127 | 128 | .. _you have already installed them: install.html#additional-dependencies 129 | .. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline 130 | .. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters 131 | .. _hyperparameters: ../advanced_usage/hyperparameters.html 132 | .. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations 133 | .. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters 134 | .. _BTB: https://github.com/MLBazaar/BTB 135 | .. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters 136 | -------------------------------------------------------------------------------- /docs/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/favicon.ico -------------------------------------------------------------------------------- /docs/images/mlblocks-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/mlblocks-icon.png -------------------------------------------------------------------------------- /docs/images/mlblocks-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/mlblocks-logo-small.png -------------------------------------------------------------------------------- /docs/images/mlblocks-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/mlblocks-logo.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | What is MLBlocks? 2 | ================= 3 | 4 | .. image:: images/mlblocks-logo.png 5 | :width: 300 px 6 | :alt: MLBlocks 7 | :align: center 8 | 9 | * Documentation: https://mlbazaar.github.io/MLBlocks 10 | * Github: https://github.com/MLBazaar/MLBlocks 11 | * License: `MIT `_ 12 | 13 | MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning 14 | tools developed in Python, whether they are custom developments or belong to third party 15 | libraries, and build Pipelines out of them that can be fitted and then used to make predictions. 
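In a few lines, this is what working with MLBlocks looks like. This is a condensed sketch of the
Quickstart, assuming that scikit-learn, xgboost and the MLPrimitives annotations are installed:

.. code-block:: python

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    from mlblocks import MLPipeline

    # Load a demo dataset and split it into train and test partitions
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Build a pipeline out of two annotated primitives, fit it and predict
    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'xgboost.XGBClassifier'
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)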
16 | 17 | This is achieved by providing a simple and intuitive annotation language that allows the 18 | user to specify how to integrate with each tool, here called primitives, in order to provide 19 | a common uniform interface to each one of them. 20 | 21 | At a high level: 22 | 23 | * Each available primitive has been annotated using a standardized JSON file that specifies its 24 | native interface, as well as which hyperparameters can be used to tune its behavior. 25 | * A list of primitives that will be combined into a pipeline is provided by the user, optionally 26 | passing along the hyperparameters to use for each primitive. 27 | * An MLBlock instance is built for each primitive, offering a common interface for all of them. 28 | * The MLBlock instances are then combined into an MLPipeline instance, able to run them all in 29 | the right order, passing the output from each one as input to the next one. 30 | * The training data is passed to the `MLPipeline.fit` method, which sequentially fits each 31 | MLBlock instance following the JSON annotation specification. 32 | * The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each 33 | MLBlock sequentially to obtain the desired predictions. 34 | 35 | History 36 | ------- 37 | 38 | In its first iteration, in 2015, MLBlocks was designed only for multi-table, multi-entity temporal 39 | data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis, written 40 | under the supervision of Kalyan Veeramachaneni: 41 | 42 | * `Machine learning blocks`_. 43 | Bryan Collazo. Masters thesis, MIT EECS, 2015. 44 | 45 | In 2018, with the recent availability of a multitude of libraries and tools, we decided it was time to 46 | integrate them and expand the library to address other data types, like images, text, graphs or 47 | time series, as well as introduce the usage of deep learning libraries. A second iteration of our 48 | work was then started by William Xue: 49 | 50 | * `A Flexible Framework for Composing End to End Machine Learning Pipelines`_. 51 | William Xue. Masters thesis, MIT EECS, 2018. 52 | 53 | Later in 2018, Carles Sala joined the project to make it grow as a reliable open-source library 54 | that would become part of a bigger software ecosystem designed to facilitate the development of 55 | robust end-to-end solutions based on Machine Learning tools. This third iteration of our work 56 | was presented in 2019 as part of the Machine Learning Bazaar: 57 | 58 | * `The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development`_. 59 | Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. SIGMOD 2020. 60 | 61 | .. toctree:: 62 | :caption: Getting Started 63 | :titlesonly: 64 | 65 | self 66 | getting_started/install 67 | getting_started/quickstart 68 | 69 | .. toctree:: 70 | :caption: Advanced Usage 71 | :maxdepth: 1 72 | 73 | advanced_usage/primitives 74 | advanced_usage/hyperparameters 75 | advanced_usage/pipelines 76 | advanced_usage/adding_primitives 77 | 78 | .. toctree:: 79 | :caption: Pipeline Examples 80 | :maxdepth: 1 81 | 82 | pipeline_examples/single_table 83 | pipeline_examples/multi_table 84 | pipeline_examples/text 85 | pipeline_examples/image 86 | pipeline_examples/graph 87 | 88 | .. toctree:: 89 | :caption: API Reference 90 | :titlesonly: 91 | 92 | api/mlblocks 93 | api/mlblocks.datasets 94 | api/mlblocks.discovery 95 | 96 | ..
toctree:: 97 | :caption: Resources 98 | :titlesonly: 99 | 100 | contributing 101 | authors 102 | changelog 103 | 104 | Indices and tables 105 | ================== 106 | * :ref:`genindex` 107 | * :ref:`modindex` 108 | * :ref:`search` 109 | 110 | .. _Machine learning blocks: https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf 111 | 112 | .. _A Flexible Framework for Composing End to End Machine Learning Pipelines: https://dai.lids.mit.edu/wp-content/uploads/2018/12/William_MEng.pdf 113 | .. _The Machine Learning Bazaar\: Harnessing the ML Ecosystem for Effective System Development: https://arxiv.org/abs/1905.08942 114 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=mlblocks 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/pipeline_examples/graph.rst: -------------------------------------------------------------------------------- 1 | Graph Pipelines 2 | =============== 3 | 4 | Here we will be showing some examples using **MLBlocks** to resolve graph problems. 5 | 6 | Link Prediction 7 | --------------- 8 | 9 | For the Graph Link Prediction examples we will be using the UMLS biomedical ontology dataset, 10 | which we will load using the ``mlblocks.dataset.load_umls`` function. 11 | 12 | The data consists of information about a graph of 135 nodes and the relations between those 13 | nodes, given as a DataFrame with three columns, `source`, `target` and `type`, indicating which 14 | nodes are related and with which type of link. 15 | The target is a 1d numpy binary integer array indicating whether the indicated link exists or not. 16 | 17 | 18 | NetworkX + MLPrimitives + Scikit-learn + XGBoost 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | In this example, we will use some `NetworkX Link Prediction`_ functions to extract attributes 22 | from the Graph, then encode the categorical features with the `CategoricalEncoder from 23 | MLPrimitives`_, scale the data using the `StandardScaler from scikit-learn`_ and finally feed 24 | the result into an `XGBClassifier`_. 25 | 26 | Note how in this example, the Graph objects and the names of the node columns are passed as 27 | additional variables to be added to the context, as the NetworkX primitive will need some 28 | additional information not found inside `X`. 29 | 30 | ..
code-block:: python 31 | 32 | from mlblocks import MLPipeline 33 | from mlprimitives.datasets import load_umls 34 | 35 | dataset = load_umls() 36 | dataset.describe() 37 | 38 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 39 | 40 | primitives = [ 41 | 'networkx.link_prediction_feature_extraction', 42 | 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 43 | 'sklearn.preprocessing.StandardScaler', 44 | 'xgboost.XGBClassifier' 45 | ] 46 | init_params = { 47 | 'xgboost.XGBClassifier': { 48 | 'n_estimators': 300, 49 | 'learning_rate': 0.1 50 | } 51 | } 52 | pipeline = MLPipeline(primitives, init_params) 53 | 54 | node_columns = ['source', 'target'] 55 | pipeline.fit( 56 | X_train, 57 | y_train, 58 | graph=dataset.graph, # These will be set in the pipeline Context 59 | node_columns=node_columns # and made available for the networkx primitive 60 | ) 61 | 62 | predictions = pipeline.predict( 63 | X_test, 64 | graph=dataset.graph, # These will be set in the pipeline Context 65 | node_columns=node_columns # and made available for the networkx primitive 66 | ) 67 | 68 | dataset.score(y_test, predictions) 69 | 70 | 71 | .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html 72 | .. _CategoricalEncoder from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json 73 | .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 74 | .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 75 | -------------------------------------------------------------------------------- /docs/pipeline_examples/image.rst: -------------------------------------------------------------------------------- 1 | Image Pipelines 2 | =============== 3 | 4 | Here we will be showing some examples using **MLBlocks** to resolve image problems. 5 | 6 | Image Classification 7 | -------------------- 8 | 9 | For the image classification examples we will be using the `USPS Dataset`_, which we will 10 | load using the ``mlblocks.dataset.load_usps`` function. 11 | 12 | The data of this dataset is a collection of 9298 images, each one a 3d numpy array with shape 13 | ``(224, 224, 3)`` representing a 224x224 RGB photo of a handwritten digit, and the target is a 1d 14 | numpy integer array containing the label of the digit represented in the image. 15 | 16 | OpenCV GaussianBlur + Scikit-image HOG + Scikit-Learn RandomForestClassifier 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | In this first example, we will attempt to resolve the problem using some basic preprocessing 20 | with the `OpenCV GaussianBlur function`_, then calculate the Histogram of Oriented 21 | Gradients using the corresponding `scikit-image function`_, and finally use a simple 22 | `RandomForestClassifier from scikit-learn`_ on the generated features. 23 | 24 | ..
code-block:: python 25 | 26 | from mlblocks import MLPipeline 27 | from mlprimitives.datasets import load_usps 28 | 29 | dataset = load_usps() 30 | dataset.describe() 31 | 32 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 33 | 34 | primitives = [ 35 | 'cv2.GaussianBlur', 36 | 'skimage.feature.hog', 37 | 'sklearn.ensemble.RandomForestClassifier' 38 | ] 39 | init_params = { 40 | 'skimage.feature.hog': { 41 | 'multichannel': True, 42 | 'visualize': False 43 | } 44 | } 45 | pipeline = MLPipeline(primitives, init_params) 46 | 47 | pipeline.fit(X_train, y_train) 48 | 49 | predictions = pipeline.predict(X_test) 50 | 51 | dataset.score(y_test, predictions) 52 | 53 | 54 | OpenCV GaussianBlur + Keras Single Layer CNN 55 | -------------------------------------------- 56 | 57 | In this example, we will preprocess the images using the `OpenCV GaussianBlur function`_ 58 | and directly after go into a Single Layer CNN Classifier built on Keras using the corresponding 59 | `MLPrimitives primitive`_. 60 | 61 | .. code-block:: python 62 | 63 | from mlblocks import MLPipeline 64 | from mlprimitives.datasets import load_usps 65 | 66 | dataset = load_usps() 67 | dataset.describe() 68 | 69 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 70 | 71 | primitives = [ 72 | 'cv2.GaussianBlur', 73 | 'keras.Sequential.SingleLayerCNNImageClassifier' 74 | ] 75 | init_params = { 76 | 'keras.Sequential.SingleLayerCNNImageClassifier': { 77 | 'dense_units': 11, 78 | 'epochs': 5 79 | } 80 | } 81 | pipeline = MLPipeline(primitives, init_params) 82 | 83 | pipeline.fit(X_train, y_train) 84 | 85 | predictions = pipeline.predict(X_test) 86 | 87 | dataset.score(y_test, predictions) 88 | 89 | 90 | Image Regression 91 | ---------------- 92 | 93 | For the image regression examples we will be using the Handgeometry Dataset, which we will 94 | load using the ``mlblocks.dataset.load_handgeometry`` function. 95 | 96 | The data of this dataset is a 3d numpy array vector with shape ``(224, 224, 3)`` containing 112 97 | 224x224 RGB photos of hands, and the target is a 1d numpy float array containing the width of 98 | the wrist in centimeters. 99 | 100 | Keras MobileNet + XGBRegressor 101 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 102 | 103 | Here we will introduce the usage of the `Pretrained Networks from Keras`_. 104 | In particular, we will be using the `MobileNet`_ for feature extraction, and pass its features 105 | to an `XGBRegressor`_ primitive. 106 | 107 | .. code-block:: python 108 | 109 | from mlblocks import MLPipeline 110 | from mlprimitives.datasets import load_handgeometry 111 | 112 | dataset = load_handgeometry() 113 | dataset.describe() 114 | 115 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 116 | 117 | primitives = [ 118 | 'keras.applications.mobilenet.preprocess_input', 119 | 'keras.applications.mobilenet.MobileNet', 120 | 'xgboost.XGBRegressor' 121 | ] 122 | init_params = { 123 | 'xgboost.XGBRegressor': { 124 | 'n_estimators': 300, 125 | 'learning_rate': 0.1 126 | } 127 | } 128 | pipeline = MLPipeline(primitives, init_params) 129 | 130 | pipeline.fit(X_train, y_train) 131 | 132 | predictions = pipeline.predict(X_test) 133 | 134 | dataset.score(y_test, predictions) 135 | 136 | 137 | .. _USPS Dataset: https://ieeexplore.ieee.org/document/291440/ 138 | .. _OpenCV GaussianBlur function: https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur 139 | .. 
_MLPrimitives primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json 140 | .. _scikit-image function: http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog 141 | .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 142 | .. _Pretrained Networks from Keras: https://keras.io/applications/ 143 | .. _MobileNet: https://keras.io/applications/#mobilenet 144 | .. _XGBRegressor: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 145 | -------------------------------------------------------------------------------- /docs/pipeline_examples/multi_table.rst: -------------------------------------------------------------------------------- 1 | Multi Table Pipelines 2 | ===================== 3 | 4 | In the previous section we explored the simplest use cases, where the datasets 5 | consisted of a single table. 6 | 7 | In this section we will cover cases where the dataset consists of multiple tables 8 | related by foreign keys. 9 | 10 | Multi Table Classification Pipeline 11 | ----------------------------------- 12 | 13 | In this example, we will be using the `WikiQA dataset`_, which contains 4 different tables 14 | with simple parent/child relationships, and which we will load using the 15 | ``mlblocks.dataset.load_wikiqa`` function. 16 | 17 | In our pipeline, we will be using the `DeepFeatureSynthesis`_ primitive from `featuretools`_ 18 | for feature extraction over the various tables that we have, and later apply an 19 | `XGBClassifier`_ on the resulting feature matrix. 20 | 21 | Note how in this example we need to pass some additional information to the pipeline 22 | so that the DFS primitive knows what the relationships between the multiple 23 | tables are. 24 | 25 | .. code-block:: python 26 | 27 | from mlblocks import MLPipeline 28 | from mlprimitives.datasets import load_wikiqa 29 | 30 | dataset = load_wikiqa() 31 | dataset.describe() 32 | 33 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 34 | 35 | primitives = [ 36 | 'featuretools.dfs', 37 | 'xgboost.XGBClassifier' 38 | ] 39 | pipeline = MLPipeline(primitives) 40 | 41 | pipeline.fit(X_train, y_train, entities=dataset.entities, 42 | relationships=dataset.relationships, target_entity='data') 43 | 44 | predictions = pipeline.predict(X_test, entities=dataset.entities, 45 | relationships=dataset.relationships, target_entity='data') 46 | 47 | dataset.score(y_test, predictions) 48 | 49 | 50 | .. _WikiQA dataset: https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/ 51 | .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 52 | .. _DeepFeatureSynthesis: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json 53 | .. _featuretools: https://www.featuretools.com/ 54 | -------------------------------------------------------------------------------- /docs/pipeline_examples/single_table.rst: -------------------------------------------------------------------------------- 1 | Single Table Pipelines 2 | ====================== 3 | 4 | In this section we will go over a few pipeline examples to show **MLBlocks** working 5 | in different scenarios and with different types of data.
6 | 7 | For each example, we will be using example datasets which can be downloaded using the 8 | various functions found in the ``mlprimitives.datasets`` module. 9 | 10 | .. note:: Even though the datasets are not especially big, some of the examples might 11 | use a considerable amount of resources, especially memory, and might take 12 | several minutes to run. 13 | 14 | Regression Pipeline 15 | ------------------- 16 | 17 | In the most simple example, we will be using a single `RandomForestRegressor`_ primitive over 18 | the numeric data from `The Boston Dataset`_, which we will load using the 19 | ``mlblocks.dataset.load_boston`` function. 20 | 21 | .. code-block:: python 22 | 23 | from mlblocks import MLPipeline 24 | from mlprimitives.datasets import load_boston 25 | 26 | dataset = load_boston() 27 | dataset.describe() 28 | 29 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 30 | 31 | primitives = [ 32 | 'sklearn.ensemble.RandomForestRegressor' 33 | ] 34 | pipeline = MLPipeline(primitives) 35 | 36 | pipeline.fit(X_train, y_train) 37 | 38 | predictions = pipeline.predict(X_test) 39 | 40 | dataset.score(y_test, predictions) 41 | 42 | Classification Pipeline 43 | ----------------------- 44 | 45 | As a Classification example, we will be using `The Iris Dataset`_, which we will load using the 46 | ``mlblocks.dataset.load_iris`` function. 47 | 48 | Here we will combine the `StandardScaler from scikit-learn`_ with an `XGBClassifier primitive`_. 49 | 50 | In this case, we will also be passing some initialization parameters for the XGBClassifier. 51 | 52 | .. code-block:: python 53 | 54 | from mlblocks import MLPipeline 55 | from mlprimitives.datasets import load_iris 56 | 57 | dataset = load_iris() 58 | dataset.describe() 59 | 60 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 61 | 62 | primitives = [ 63 | 'sklearn.preprocessing.StandardScaler', 64 | 'xgboost.XGBClassifier' 65 | ] 66 | init_params = { 67 | 'xgboost.XGBClassifier': { 68 | 'learning_rate': 0.1 69 | } 70 | } 71 | pipeline = MLPipeline(primitives, init_params) 72 | 73 | pipeline.fit(X_train, y_train) 74 | 75 | predictions = pipeline.predict(X_test) 76 | 77 | dataset.score(y_test, predictions) 78 | 79 | 80 | .. _The Boston Dataset: http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html 81 | .. _RandomForestRegressor: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html 82 | .. _XGBRegressor: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 83 | .. _The Iris Dataset: https://en.wikipedia.org/wiki/Iris_flower_data_set 84 | .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 85 | .. _XGBClassifier primitive: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 86 | -------------------------------------------------------------------------------- /docs/pipeline_examples/text.rst: -------------------------------------------------------------------------------- 1 | Text Pipelines 2 | ============== 3 | 4 | Here we will be showing some examples using **MLBlocks** to resolve text problems. 5 | 6 | Text Classification 7 | ------------------- 8 | 9 | For the text classification examples we will be using the `Twenty Newsgroups Dataset`_, 10 | which we will load using the ``mlblocks.dataset.load_newsgroups`` function. 
11 | 12 | The data of this dataset is a 1d numpy array containing the texts from 11314 newsgroups 13 | posts, and the target is a 1d numpy integer array containing the label of one of the 20 topics 14 | that they are about. 15 | 16 | MLPrimitives + Keras Preprocessing + Keras LSTM 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | In this example we will start by applying some text cleanup using the `TextCleaner primitive`_ 20 | from MLPrimitives, then go through some `keras text preprocessing`_ primitives, and end by 21 | using a `Keras LSTM Classifier from MLPrimitives`_. 22 | 23 | Note how in this case we are using the ``input_names`` and ``output_names`` to properly 24 | set up the pipeline and allow using the outputs from some primitives as additional inputs 25 | for later ones. 26 | 27 | .. code-block:: python 28 | 29 | import nltk 30 | from mlblocks import MLPipeline 31 | from mlprimitives.datasets import load_newsgroups 32 | 33 | dataset = load_newsgroups() 34 | dataset.describe() 35 | 36 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 37 | 38 | # Make sure that we have the necessary data 39 | nltk.download('stopwords') 40 | 41 | # set up the pipeline 42 | primitives = [ 43 | "mlprimitives.custom.counters.UniqueCounter", 44 | "mlprimitives.custom.text.TextCleaner", 45 | "mlprimitives.custom.counters.VocabularyCounter", 46 | "keras.preprocessing.text.Tokenizer", 47 | "keras.preprocessing.sequence.pad_sequences", 48 | "keras.Sequential.LSTMTextClassifier" 49 | ] 50 | input_names = { 51 | "mlprimitives.custom.counters.UniqueCounter#1": { 52 | "X": "y" 53 | } 54 | } 55 | output_names = { 56 | "mlprimitives.custom.counters.UniqueCounter#1": { 57 | "counts": "classes" 58 | }, 59 | "mlprimitives.custom.counters.VocabularyCounter#1": { 60 | "counts": "vocabulary_size" 61 | } 62 | } 63 | init_params = { 64 | "mlprimitives.custom.counters.VocabularyCounter#1": { 65 | "add": 1 66 | }, 67 | "mlprimitives.custom.text.TextCleaner#1": { 68 | "language": "en" 69 | }, 70 | "keras.preprocessing.sequence.pad_sequences#1": { 71 | "maxlen": 100 72 | }, 73 | "keras.Sequential.LSTMTextClassifier#1": { 74 | "input_length": 100 75 | } 76 | } 77 | pipeline = MLPipeline(primitives, init_params, input_names, output_names) 78 | 79 | pipeline.fit(X_train, y_train) 80 | 81 | predictions = pipeline.predict(X_test) 82 | 83 | dataset.score(y_test, predictions) 84 | 85 | 86 | Tabular Data with Text 87 | ---------------------- 88 | 89 | For these examples we will be using the `Personae Dataset`_, which we will load 90 | using the ``mlblocks.dataset.load_personae`` function. 91 | 92 | The data of this dataset is a 2d numpy array containing 145 entries that include 93 | texts written by Dutch users on Twitter, with some additional information about the author, 94 | and the target is a 1d numpy binary integer array indicating whether the author was an extrovert 95 | or not. 96 | 97 | MLPrimitives + Scikit-learn RandomForestClassifier 98 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 99 | 100 | In this example we again use the `TextCleaner primitive`_, then use a `StringVectorizer primitive`_ 101 | to encode all the string features, and go directly into the 102 | `RandomForestClassifier from scikit-learn`_. 103 | 104 | ..
code-block:: python 105 | 106 | import nltk 107 | from mlblocks import MLPipeline 108 | from mlprimitives.datasets import load_personae 109 | 110 | dataset = load_personae() 111 | dataset.describe() 112 | 113 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 114 | 115 | # Make sure that we have the necessary data 116 | nltk.download('stopwords') 117 | 118 | primitives = [ 119 | 'mlprimitives.custom.text.TextCleaner', 120 | 'mlprimitives.custom.feature_extraction.StringVectorizer', 121 | 'sklearn.ensemble.RandomForestClassifier', 122 | ] 123 | init_params = { 124 | 'mlprimitives.custom.text.TextCleaner': { 125 | 'column': 'text', 126 | 'language': 'nl' 127 | }, 128 | 'sklearn.ensemble.RandomForestClassifier': { 129 | 'n_jobs': -1, 130 | 'n_estimators': 100 131 | } 132 | } 133 | pipeline = MLPipeline(primitives, init_params) 134 | 135 | pipeline.fit(X_train, y_train) 136 | 137 | predictions = pipeline.predict(X_test) 138 | 139 | dataset.score(y_test, predictions) 140 | 141 | 142 | .. _Twenty Newsgroups Dataset: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html 143 | .. _TextCleaner primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/text.py 144 | .. _StringVectorizer primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/feature_extraction.py 145 | .. _keras text preprocessing: https://keras.io/preprocessing/text/ 146 | .. _Keras LSTM Classifier from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json 147 | .. _Personae Dataset: https://www.clips.uantwerpen.be/datasets/personae-corpus 148 | .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 149 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # MLBlocks Examples 2 | 3 | This folder contains Python code, Jupyter Notebooks and JSON examples to demonstrate MLBlocks 4 | functionality. 5 | 6 | Within this folder you will find: 7 | 8 | 9 | * `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities. 10 | * `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities. 11 | * `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities. 12 | 13 | 14 | # Requirements 15 | 16 | In order to run the examples contained in this folder you should have [pip installed on your system 17 | ](https://pip.pypa.io/en/stable/installing/). 18 | 19 | Optionally, also install and activate a [virtualenv](https://virtualenv.pypa.io/en/latest/) to 20 | run them in an isolated environment. 21 | 22 | # Usage 23 | 24 | In order to run these tutorials on your computer, please follow these steps: 25 | 26 | 1. Clone this github repository: 27 | 28 | ```bash 29 | git clone git@github.com:MLBazaar/MLBlocks.git 30 | ``` 31 | 32 | 2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the 33 | rest of your computer: 34 | 35 | ```bash 36 | pip install virtualenv 37 | virtualenv -p $(which python3.6) mlblocks-venv 38 | source mlblocks-venv/bin/activate 39 | ``` 40 | 41 | 3. 
Enter the repository and install the dependencies: 42 | 43 | ```bash 44 | cd MLBlocks 45 | make install-examples 46 | ``` 47 | 48 | This will install [MLBlocks](https://github.com/MLBazaar/MLBlocks.git) as well as [MLPrimitives]( 49 | https://github.com/MLBazaar/MLPrimitives.git) and [Jupyter](https://jupyter.org/). 50 | 51 | 4. Enter the `examples` folder and start a Jupyter Notebook: 52 | 53 | ```bash 54 | jupyter notebook 55 | ``` 56 | 57 | 5. Point your browser at the link shown in your console and run the examples from the `examples/tutorials` folder. 58 | -------------------------------------------------------------------------------- /examples/pipelines/single_table.classification.categorical_encoder.xgboost.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "data_modality": "single_table", 4 | "task_type": "classification" 5 | }, 6 | "validation": { 7 | "dataset": "census" 8 | }, 9 | "primitives": [ 10 | "mlprimitives.custom.preprocessing.ClassEncoder", 11 | "mlprimitives.custom.feature_extraction.CategoricalEncoder", 12 | "sklearn.impute.SimpleImputer", 13 | "xgboost.XGBClassifier", 14 | "mlprimitives.custom.preprocessing.ClassDecoder" 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /examples/primitives/mlblocks.examples.ClassPrimitive.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "the_primitive_name", 3 | "primitive": "full.python.path.to.AClass", 4 | "fit": { 5 | "method": "fit", 6 | "args": [ 7 | { 8 | "name": "X", 9 | "keyword": "optional_name_of_the_fit_method_argument", 10 | "description": "each input can be described", 11 | "type": "pandas.DataFrame" 12 | }, 13 | { 14 | "name": "y", 15 | "description": "each input can be described", 16 | "default": "default_value_for_this_argument", 17 | "type": "pandas.Series" 18 | } 19 | ] 20 | }, 21 | "produce": { 22 | "method": "predict", 23 | "args": [ 24 | { 25 | "name": "X", 26 | "keyword": "optional_name_of_the_produce_method_argument", 27 | "description": "each input can be described", 28 | "type": "DataFrame" 29 | } 30 | ], 31 | "output": [ 32 | { 33 | "name": "y", 34 | "description": "each output argument can be described", 35 | "type": "Series" 36 | } 37 | ] 38 | }, 39 | "hyperparameters": { 40 | "fixed": { 41 | "a_required_hyperparameter": { 42 | "description": "this is a non-tunable hyperparameter that needs to be specified by the user because it does not have a default value", 43 | "type": "int" 44 | }, 45 | "an_optional_hyperparameter": { 46 | "description": "this is a non-tunable hyperparameter that is optional because it has a default value", 47 | "type": "int", 48 | "default": 1 49 | } 50 | }, 51 | "tunable": { 52 | "a_simple_range_hyperparameter": { 53 | "description": "hyperparameter documentation can be put here", 54 | "default": 1, 55 | "type": "int", 56 | "range": [1, 10] 57 | }, 58 | "a_categorical_hyperparameter_of_type_int": { 59 | "description": "Note that it has the field `values` instead of `range`", 60 | "default": 1, 61 | "type": "int", 62 | "values": [1, 3, 7, 10] 63 | }, 64 | "a_categorical_hyperparameter_of_type_str": { 65 | "default": "a", 66 | "type": "str", 67 | "values": ["a", "b", "c"] 68 | }, 69 | "a_multi_type_hyperparameter": { 70 | "description": "this is a hyperparameter that allows more than one type", 71 | "type": "multitype", 72 | "default": "auto", 73 | "types": { 74 | "int": { 75 | "description": "documentation can
also be included here", 76 | "range": [1, 10] 77 | }, 78 | "string": { 79 | "values": ["some", "string", "values"] 80 | } 81 | } 82 | }, 83 | "conditional_hyperparameter": { 84 | "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter", 85 | "type": "conditional", 86 | "condition": "the_name_of_the_other_hyperparameter", 87 | "values": { 88 | "a": { 89 | "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`", 90 | "type": "int", 91 | "default": 0, 92 | "range": [0, 10] 93 | }, 94 | "*": { 95 | "description": "this will be used only if the value does not match any other definition", 96 | "type": "float", 97 | "default": 0.0, 98 | "range": [0.0, 1.0] 99 | } 100 | } 101 | } 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /examples/primitives/mlblocks.examples.function_primitive.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "the_primitive_name", 3 | "primitive": "full.python.path.to.a_function", 4 | "produce": { 5 | "args": [ 6 | { 7 | "name": "X", 8 | "keyword": "optional_name_of_the_produce_method_argument", 9 | "description": "each input can be described", 10 | "type": "DataFrame" 11 | } 12 | ], 13 | "output": [ 14 | { 15 | "description": "each output argument can be described", 16 | "name": "y", 17 | "type": "Series" 18 | } 19 | ] 20 | }, 21 | "hyperparameters": { 22 | "fixed": { 23 | "a_required_hyperparameter": { 24 | "description": "this is a non-tunable hyperparameter that needs to be specified by the user, because it does not have a default value", 25 | "type": "int" 26 | }, 27 | "an_optional_hyperparameter": { 28 | "description": "this is a non-tunable hyperparameter that is optional, because it has a default value", 29 | "type": "int", 30 | "default": 1 31 | } 32 | }, 33 | "tunable": { 34 | "a_simple_range_hyperparameter": { 35 | "description": "hyperparameter documentation can be put here", 36 | "default": 1, 37 | "type": "int", 38 | "range": [1, 10] 39 | }, 40 | "a_categorical_hyperparameter_of_type_int": { 41 | "description": "Note that it has the field `values` instead of `range`", 42 | "default": 1, 43 | "type": "int", 44 | "values": [1, 3, 7, 10] 45 | }, 46 | "a_categorical_hyperparameter_of_type_str": { 47 | "default": "a", 48 | "type": "str", 49 | "values": ["a", "b", "c"] 50 | }, 51 | "a_multi_type_hyperparameter": { 52 | "description": "this is a hyperparameter that allows more than one type", 53 | "type": "multitype", 54 | "default": "auto", 55 | "types": { 56 | "int": { 57 | "description": "documentation can also be included here", 58 | "range": [1, 10] 59 | }, 60 | "string": { 61 | "values": ["some", "string", "values"] 62 | } 63 | } 64 | }, 65 | "conditional_hyperparameter": { 66 | "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter", 67 | "type": "conditional", 68 | "condition": "the_name_of_the_other_hyperparameter", 69 | "values": { 70 | "a": { 71 | "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`", 72 | "type": "int", 73 | "default": 0, 74 | "range": [0, 10] 75 | }, 76 | "*": { 77 | "description": "this will be used only if the value does not match any other definition", 78 | "type": "float", 79 | "default": 0.0, 80 | "range": [0.0, 1.0] 81 | } 82 | } 83 | } 84 | } 85 | } 86 | } 87 |
-------------------------------------------------------------------------------- /examples/tutorials/2. Finding and Loading a Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding and Loading a Pipeline\n", 8 | "\n", 9 | "In this short tutorial we will show you how to search for pipelines suitable to solve\n", 10 | "your prediction problem." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "In order to find a suitable pipeline, the first thing we need is to identify\n", 18 | "the type of problem (data modality + task type) that we are facing.\n", 19 | "\n", 20 | "This is a full list of current data modalities and task types that we cover:\n", 21 | "\n", 22 | "| Problem Type | Data Modality | Task Type |\n", 23 | "|:-------------------------------------|:--------------|:------------------------|\n", 24 | "| Single Table Classification | single_table | classification |\n", 25 | "| Single Table Regression | single_table | regression |\n", 26 | "| Single Table Collaborative Filtering | single_table | collaborative_filtering |\n", 27 | "| Multi Table Classification | multi_table | classification |\n", 28 | "| Multi Table Regression | multi_table | regression |\n", 29 | "| Time Series Classification | timeseries | classification |\n", 30 | "| Time Series Regression | timeseries | regression |\n", 31 | "| Time Series Forecasting | timeseries | forecasting |\n", 32 | "| Time Series Anomaly Detection | timeseries | anomaly_detection |\n", 33 | "| Image Classification | image | classification |\n", 34 | "| Image Regression | image | regression |\n", 35 | "| Graph Link Prediction | graph | link_prediction |\n", 36 | "| Graph Vertex Nomination | graph | vertex_nomination |\n", 37 | "| Graph Community Detection | graph | community_detection |\n", 38 | "| Graph Matching | graph | graph_matching |" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Once we have identified our data modality and task type we can use the\n", 46 | "`mlblocks.discovery.find_pipelines` function to find all the pipelines\n", 47 | "that support this particular problem type.\n", 48 | "\n", 49 | "For example, if we are looking for a pipeline to work on Image Classification\n", 50 | "we will do the following query." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "['image.classification.hog.rf',\n", 62 | " 'image.classification.hog.xgb',\n", 63 | " 'image.classification.resnet50.xgb',\n", 64 | " 'keras.Sequential.SingleLayerCNNImageClassifier',\n", 65 | " 'keras.Sequential.VGGCNNClassifier']" 66 | ] 67 | }, 68 | "execution_count": 1, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "from mlblocks.discovery import find_pipelines\n", 75 | "\n", 76 | "filters = {\n", 77 | " 'metadata.data_type': 'image',\n", 78 | " 'metadata.task_type': 'classification',\n", 79 | "}\n", 80 | "\n", 81 | "find_pipelines(filters=filters)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "After finding and choosing a pipeline, we can load it as an `MLPipeline`\n", 89 | "by passing its name to the `MLPipeline`." 
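,
    "\n",
    "Alternatively, if we only want to inspect a pipeline before building it, we can\n",
    "load its JSON annotation as a plain dictionary with `mlblocks.discovery.load_pipeline`.\n",
    "As a minimal sketch (the `primitives` entry is the list of primitive names that\n",
    "the pipeline annotation format defines):\n",
    "\n",
    "```python\n",
    "from mlblocks.discovery import load_pipeline\n",
    "\n",
    "annotation = load_pipeline('image.classification.resnet50.xgb')\n",
    "annotation['primitives']\n",
    "```"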
90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "Using TensorFlow backend.\n", 102 | "2020-09-16 16:03:19,939 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", 103 | "Instructions for updating:\n", 104 | "If using Keras pass *_constraint arguments to layers.\n", 105 | "2020-09-16 16:03:20,025 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", 106 | "\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "from mlblocks import MLPipeline\n", 112 | "\n", 113 | "pipeline = MLPipeline('image.classification.resnet50.xgb')" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.9" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Setting MLPipeline Hyperparameters\n", 8 | "\n", 9 | "In this short guide we will see how to modify the hyperparameters\n", 10 | "of an MLPipeline in order to change its behavior or improve its performance.\n", 11 | "\n", 12 | "Note that some steps are not explained for simplicity. Full details\n", 13 | "about them can be found in the previous parts of the tutorial.\n", 14 | "\n", 15 | "We will:\n", 16 | "\n", 17 | "1. Load a dataset and a Pipeline.\n", 18 | "2. Explore the pipeline hyperparameters.\n", 19 | "3. Reload the pipeline with different hyperparameters.\n", 20 | "4. Evaluate the pipeline performance on the dataset.\n", 21 | "5. Set different pipeline hyperparameters.\n", 22 | "6. Re-evaluate the pipeline performance on the dataset." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Load the Dataset and the Pipeline\n", 30 | "\n", 31 | "The first step will be to load the dataset and the pipeline that we will be using."
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from utils import load_census\n", 41 | "\n", 42 | "dataset = load_census()\n", 43 | "X_train, X_test, y_train, y_test = dataset.get_splits(1)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from mlblocks import MLPipeline\n", 53 | "\n", 54 | "primitives = [\n", 55 | " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", 56 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", 57 | " 'sklearn.impute.SimpleImputer',\n", 58 | " 'xgboost.XGBClassifier',\n", 59 | " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", 60 | "]\n", 61 | "pipeline = MLPipeline(primitives)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Explore the Pipeline Hyperparameters" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Once we have loaded the pipeline, we can see the hyperparameters that it is using by\n", 76 | "calling its `get_hyperparameters` method." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", 88 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", 89 | " 'copy': True,\n", 90 | " 'features': 'auto',\n", 91 | " 'max_unique_ratio': 0,\n", 92 | " 'max_labels': 0},\n", 93 | " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", 94 | " 'fill_value': None,\n", 95 | " 'verbose': False,\n", 96 | " 'copy': True,\n", 97 | " 'strategy': 'mean'},\n", 98 | " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", 99 | " 'n_estimators': 100,\n", 100 | " 'max_depth': 3,\n", 101 | " 'learning_rate': 0.1,\n", 102 | " 'gamma': 0,\n", 103 | " 'min_child_weight': 1},\n", 104 | " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" 105 | ] 106 | }, 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "pipeline.get_hyperparameters()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "This will return us a dictionary that contains one entry for each step in the pipeline.\n", 121 | "Each entry will also be a dictionary, indicating the names and the values of the hyperparameters of that step.\n", 122 | "\n", 123 | "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n", 124 | "\n", 125 | "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/MLBazaar/BTB)\n", 126 | "that work with flat, one-level, dictionaries, the argument `flat=True` can be passed." 
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 138 | "  'keep'): False,\n", 139 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'copy'): True,\n", 140 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 141 | "  'features'): 'auto',\n", 142 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 143 | "  'max_unique_ratio'): 0,\n", 144 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 145 | "  'max_labels'): 0,\n", 146 | " ('sklearn.impute.SimpleImputer#1', 'missing_values'): nan,\n", 147 | " ('sklearn.impute.SimpleImputer#1', 'fill_value'): None,\n", 148 | " ('sklearn.impute.SimpleImputer#1', 'verbose'): False,\n", 149 | " ('sklearn.impute.SimpleImputer#1', 'copy'): True,\n", 150 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", 151 | " ('xgboost.XGBClassifier#1', 'n_jobs'): -1,\n", 152 | " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", 153 | " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", 154 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", 155 | " ('xgboost.XGBClassifier#1', 'gamma'): 0,\n", 156 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" 157 | ] 158 | }, 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "pipeline.get_hyperparameters(flat=True)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "This will return us the same information as before, but organized as a single one-level\n", 173 | "dictionary where each key is a `tuple` containing both the name of the step and the hyperparameter." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Setting Pipeline hyperparameter values\n", 181 | "\n", 182 | "We can set some different hyperparameter values when loading the pipeline by adding the\n", 183 | "`init_params` argument to `MLPipeline`.\n", 184 | "\n", 185 | "The `init_params` has to be a dictionary where each entry corresponds to the name of one of the\n", 186 | "pipeline steps and each value is another dictionary indicating the hyperparameter values that we\n", 187 | "want to use on that step.\n", 188 | "\n", 189 | "As an example, we will set a different imputer strategy and a different xgboost `max_depth`." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "init_params = {\n", 199 | "    'sklearn.impute.SimpleImputer#1': {\n", 200 | "        'strategy': 'median'\n", 201 | "    },\n", 202 | "    'xgboost.XGBClassifier#1': {\n", 203 | "        'max_depth': 4\n", 204 | "    }\n", 205 | "}\n", 206 | "pipeline = MLPipeline(\n", 207 | "    primitives,\n", 208 | "    init_params=init_params\n", 209 | ")" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "We can now see how the hyperparameters are different from before."
217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", 228 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", 229 | " 'copy': True,\n", 230 | " 'features': 'auto',\n", 231 | " 'max_unique_ratio': 0,\n", 232 | " 'max_labels': 0},\n", 233 | " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", 234 | " 'fill_value': None,\n", 235 | " 'verbose': False,\n", 236 | " 'copy': True,\n", 237 | " 'strategy': 'median'},\n", 238 | " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", 239 | " 'max_depth': 4,\n", 240 | " 'n_estimators': 100,\n", 241 | " 'learning_rate': 0.1,\n", 242 | " 'gamma': 0,\n", 243 | " 'min_child_weight': 1},\n", 244 | " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" 245 | ] 246 | }, 247 | "execution_count": 6, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "pipeline.get_hyperparameters()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Evaluate the Pipeline performance\n", 261 | "\n", 262 | "We can now evaluate the pipeline performance to see what results these\n", 263 | "hyperparameters produce." 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 7, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stderr", 273 | "output_type": "stream", 274 | "text": [ 275 | "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", 276 | " warnings.warn(\n" 277 | ] 278 | }, 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "0.8647586291610367" 283 | ] 284 | }, 285 | "execution_count": 7, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "pipeline.fit(X_train, y_train)\n", 292 | "y_pred = pipeline.predict(X_test)\n", 293 | "\n", 294 | "dataset.score(y_test, y_pred)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Setting hyperparameter values\n", 302 | "\n", 303 | "Another way of setting the pipeline hyperparameters without having to recreate it\n", 304 | "from scratch, is to use its `set_hyperparameters` method.\n", 305 | "\n", 306 | "In this case, we will change the CategoricalEncoder `max_labels` and the xgboost `learning_rate`." 
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 8, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "hyperparameters = {\n", 316 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {\n", 317 | " 'max_labels': 10\n", 318 | " },\n", 319 | " 'xgboost.XGBClassifier#1': {\n", 320 | " 'learning_rate': 0.3\n", 321 | " }\n", 322 | "}\n", 323 | "pipeline.set_hyperparameters(hyperparameters)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Alternatively, the hyperparameters can be set using the `flat` format:" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 9, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "hyperparameters = {\n", 340 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 10,\n", 341 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3\n", 342 | "}\n", 343 | "pipeline.set_hyperparameters(hyperparameters)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "And we can see how these hyperparameters now are different than before:" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 10, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", 362 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", 363 | " 'copy': True,\n", 364 | " 'features': 'auto',\n", 365 | " 'max_unique_ratio': 0,\n", 366 | " 'max_labels': 10},\n", 367 | " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", 368 | " 'fill_value': None,\n", 369 | " 'verbose': False,\n", 370 | " 'copy': True,\n", 371 | " 'strategy': 'median'},\n", 372 | " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", 373 | " 'max_depth': 4,\n", 374 | " 'n_estimators': 100,\n", 375 | " 'learning_rate': 0.3,\n", 376 | " 'gamma': 0,\n", 377 | " 'min_child_weight': 1},\n", 378 | " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" 379 | ] 380 | }, 381 | "execution_count": 10, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "pipeline.get_hyperparameters()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Evaluate the Pipeline performance\n", 395 | "\n", 396 | "We can now evaluate again the pipeline performance and see how the hyperparameter\n", 397 | "change affected the pipeline performance." 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 11, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stderr", 407 | "output_type": "stream", 408 | "text": [ 409 | "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", 410 | "  warnings.warn(\n" 411 | ] 412 | }, 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "0.870531875690947" 417 | ] 418 | }, 419 | "execution_count": 11, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "pipeline.fit(X_train, y_train)\n", 426 | "y_pred = pipeline.predict(X_test)\n", 427 | "\n", 428 | "dataset.score(y_test, y_pred)" 429 | ] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3 (ipykernel)", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.8.16" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 4 453 | } 454 | -------------------------------------------------------------------------------- /examples/tutorials/4. Saving and Loading a Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Saving and Loading a Pipeline\n", 8 | "\n", 9 | "This short guide shows how to serialize a Pipeline into a file and later on load it\n", 10 | "to make predictions.\n", 11 | "\n", 12 | "Note that some steps are not explained for simplicity. Full details\n", 13 | "about them can be found in the previous parts of the tutorial.\n", 14 | "\n", 15 | "We will:\n", 16 | "\n", 17 | "1. Load and fit a pipeline to a dataset.\n", 18 | "2. Save the pipeline to a file.\n", 19 | "3. Load the pipeline as a new object.\n", 20 | "4. Make predictions using the new pipeline object." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Fit the pipeline\n", 28 | "\n", 29 | "The first step will be to load and fit the pipeline to the dataset." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from utils import load_census\n", 39 | "\n", 40 | "dataset = load_census()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "X_train, X_test, y_train, y_test = dataset.get_splits(1)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from mlblocks import MLPipeline\n", 59 | "\n", 60 | "primitives = [\n", 61 | "    'mlprimitives.custom.preprocessing.ClassEncoder',\n", 62 | "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", 63 | "    'sklearn.impute.SimpleImputer',\n", 64 | "    'xgboost.XGBClassifier',\n", 65 | "    'mlprimitives.custom.preprocessing.ClassDecoder'\n", 66 | "]\n", 67 | "pipeline = MLPipeline(primitives)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3.
A warning will always be raised upon the removal of empty columns in the future version.\n", 80 | "  warnings.warn(\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "pipeline.fit(X_train, y_train)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Save the Pipeline\n", 93 | "\n", 94 | "Once the pipeline is fit and ready to make predictions we can store it in a file.\n", 95 | "We will do so using [pickle](https://docs.python.org/3/library/pickle.html)." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import pickle\n", 105 | "\n", 106 | "with open('pipeline.pkl', 'wb') as f:\n", 107 | "    pickle.dump(pipeline, f)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Load the Pipeline\n", 115 | "\n", 116 | "The saved pipeline can then be moved to another system where we can load it back to\n", 117 | "memory using pickle again." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "with open('pipeline.pkl', 'rb') as f:\n", 127 | "    loaded_pipeline = pickle.load(f)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "**IMPORTANT**: All the dependencies need to also be installed in the system that is loading the pipeline. This includes **MLBlocks** and **MLPrimitives** or any other libraries required by the pipeline primitives." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## Make Predictions\n", 142 | "\n", 143 | "Once the pipeline is loaded it is ready to make predictions again." 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "pred = loaded_pipeline.predict(X_test)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" 164 | ] 165 | }, 166 | "execution_count": 8, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "pred[0:5]" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.8.16" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /examples/tutorials/7. Tuning a Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tuning a Pipeline\n", 8 | "\n", 9 | "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/MLBazaar/BTB) Tuner.\n", 10 | "\n", 11 | "Note that some steps are not explained for simplicity.
Full details\n", 12 | "about them can be found in the previous parts of the tutorial.\n", 13 | "\n", 14 | "Here we will:\n", 15 | "1. Load a dataset and a pipeline\n", 16 | "2. Explore the pipeline tunable hyperparameters\n", 17 | "3. Write a scoring function\n", 18 | "4. Build a BTB Tunable and BTB Tuner.\n", 19 | "5. Write a tuning loop" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Load dataset and the pipeline\n", 27 | "\n", 28 | "The first step will be to load the dataset that we were using in previous tutorials." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from utils import load_census\n", 38 | "\n", 39 | "dataset = load_census()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "And load a suitable pipeline.\n", 47 | "\n", 48 | "Note how in this case we are using the variable name `template` instead of `pipeline`,\n", 49 | "because this will only be used as a template for the pipelines that we will create\n", 50 | "and evaluate during the later tuning loop." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from mlblocks import MLPipeline\n", 60 | "\n", 61 | "template = MLPipeline('single_table.classification.xgb')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Explore the pipeline tunable hyperparameters" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Once we have loaded the pipeline, we can now extract the hyperparameters that we will tune\n", 76 | "by calling the `get_tunable_hyperparameters` method.\n", 77 | "\n", 78 | "In this case we will call it using `flat=True` to obtain the hyperparameters in a format\n", 79 | "that is compatible with BTB." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "tunable_hyperparameters = template.get_tunable_hyperparameters(flat=True)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 100 | " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", 101 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", 102 | " 'default': 'mean',\n", 103 | " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", 104 | " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", 105 | " 'default': 100,\n", 106 | " 'range': [10, 1000]},\n", 107 | " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", 108 | " 'default': 3,\n", 109 | " 'range': [3, 10]},\n", 110 | " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", 111 | " 'default': 0.1,\n", 112 | " 'range': [0, 1]},\n", 113 | " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", 114 | " 'default': 0,\n", 115 | " 'range': [0, 1]},\n", 116 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", 117 | " 'default': 1,\n", 118 | " 'range': [1, 10]}}" 119 | ] 120 | }, 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "tunable_hyperparameters" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Write a scoring function\n", 135 | "\n", 136 | "To tune the pipeline we will need to evaluate its performance multiple times with different hyperparameters.\n", 137 | "\n", 138 | "For this reason, we will start by writing a scoring function that will expect only one\n", 139 | "input, the hyperparameters dictionary, and evaluate the performance of the pipeline using them.\n", 140 | "\n", 141 | "In this case, the evaluation will be done using 5-fold cross validation based on the `get_splits`\n", 142 | "method from the dataset." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "import numpy as np\n", 152 | "\n", 153 | "def cross_validate(hyperparameters=None):\n", 154 | " scores = []\n", 155 | " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", 156 | " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", 157 | " if hyperparameters:\n", 158 | " pipeline.set_hyperparameters(hyperparameters)\n", 159 | "\n", 160 | " pipeline.fit(X_train, y_train)\n", 161 | " y_pred = pipeline.predict(X_test)\n", 162 | " \n", 163 | " scores.append(dataset.score(y_test, y_pred))\n", 164 | " \n", 165 | " return np.mean(scores)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "By calling this function without any arguments we will obtain the score obtained\n", 173 | "with the default hyperparameters." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 6, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "0.863978563379761" 185 | ] 186 | }, 187 | "execution_count": 6, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "default_score = cross_validate()\n", 194 | "default_score" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Optionally, we can verify that, by passing a hyperparameters dictionary, the new hyperparameters\n", 202 | "will be used, resulting in a different score." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 7, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "0.868554574842" 214 | ] 215 | }, 216 | "execution_count": 7, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "hyperparameters = {\n", 223 | "    ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", 224 | "}\n", 225 | "cross_validate(hyperparameters)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Create a BTB Tunable\n", 233 | "\n", 234 | "The next step is to create the BTB Tunable instance that will be tuned by the BTB Tuner.\n", 235 | "\n", 236 | "For this we will use its `from_dict` method, passing our hyperparameters dict." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "from baytune.tuning import Tunable\n", 246 | "\n", 247 | "tunable = Tunable.from_dict(tunable_hyperparameters)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Create the BTB Tuner\n", 255 | "\n", 256 | "After creating the Tunable, we need to create a Tuner to tune it.\n", 257 | "\n", 258 | "In this case we will use the GPTuner, a Meta-model based tuner that uses a Gaussian Process Regressor\n", 259 | "for the optimization." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 9, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "from baytune.tuning import GPTuner\n", 269 | "\n", 270 | "tuner = GPTuner(tunable)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Optionally, since we already know the score obtained by the default arguments and\n", 278 | "these have a high probability of being already decent, we will inform the tuner\n", 279 | "about their performance.\n", 280 | "\n", 281 | "In order to obtain the default hyperparameters used before, we can either call\n", 282 | "the template `get_hyperparameters(flat=True)` method or use `tunable.get_defaults()`."
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 10, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 294 | "  'max_labels'): 0,\n", 295 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", 296 | " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", 297 | " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", 298 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", 299 | " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", 300 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" 301 | ] 302 | }, 303 | "execution_count": 10, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "defaults = tunable.get_defaults()\n", 310 | "defaults" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 11, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "tuner.record(defaults, default_score)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Start the Tuning loop\n", 327 | "\n", 328 | "Once we have the tuner ready we can start the tuning loop.\n", 329 | "\n", 330 | "During this loop we will:\n", 331 | "\n", 332 | "1. Ask the tuner for a new hyperparameter proposal.\n", 333 | "2. Run the `cross_validate` function to evaluate these hyperparameters.\n", 334 | "3. Record the obtained score back to the tuner.\n", 335 | "4. If the obtained score is better than the previous one, store the proposal." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 12, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "scoring pipeline 1\n", 348 | "New best found: 0.871994161365419\n", 349 | "scoring pipeline 2\n", 350 | "New best found: 0.8723319756253888\n", 351 | "scoring pipeline 3\n", 352 | "scoring pipeline 4\n", 353 | "scoring pipeline 5\n", 354 | "scoring pipeline 6\n", 355 | "scoring pipeline 7\n", 356 | "scoring pipeline 8\n", 357 | "scoring pipeline 9\n", 358 | "scoring pipeline 10\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "best_score = default_score\n", 364 | "best_proposal = defaults\n", 365 | "\n", 366 | "for iteration in range(10):\n", 367 | "    print(\"scoring pipeline {}\".format(iteration + 1))\n", 368 | "    \n", 369 | "    proposal = tuner.propose()\n", 370 | "    score = cross_validate(proposal)\n", 371 | "    \n", 372 | "    tuner.record(proposal, score)\n", 373 | "    \n", 374 | "    if score > best_score:\n", 375 | "        print(\"New best found: {}\".format(score))\n", 376 | "        best_score = score\n", 377 | "        best_proposal = proposal" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "After the loop has finished, the best proposal will be stored in the `best_proposal` variable,\n", 385 | "which can be used to generate a new pipeline instance."
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 13, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 397 | " 'max_labels'): 60,\n", 398 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", 399 | " ('xgboost.XGBClassifier#1', 'n_estimators'): 190,\n", 400 | " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", 401 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.13575511242790694,\n", 402 | " ('xgboost.XGBClassifier#1', 'gamma'): 0.6326488945712287,\n", 403 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): 8}" 404 | ] 405 | }, 406 | "execution_count": 13, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "best_proposal" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 14, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "best_pipeline = MLPipeline(template.to_dict())" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 15, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "best_pipeline.set_hyperparameters(best_proposal)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 16, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "best_pipeline.fit(dataset.data, dataset.target)" 440 | ] 441 | } 442 | ], 443 | "metadata": { 444 | "kernelspec": { 445 | "display_name": "Python 3 (ipykernel)", 446 | "language": "python", 447 | "name": "python3" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.10.15" 460 | } 461 | }, 462 | "nbformat": 4, 463 | "nbformat_minor": 4 464 | } 465 | -------------------------------------------------------------------------------- /examples/tutorials/utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import pandas as pd 5 | from sklearn.metrics import accuracy_score 6 | from mlprimitives.datasets import Dataset 7 | 8 | DATA_PATH = os.path.join( 9 | os.path.dirname(__file__), 10 | 'data' 11 | ) 12 | 13 | DATA_URL = 'http://mlblocks.s3.amazonaws.com/{}.csv' 14 | 15 | def _download(dataset_name, dataset_path): 16 | url = DATA_URL.format(dataset_name) 17 | 18 | data = pd.read_csv(url) 19 | data.to_csv(dataset_path, index=False) 20 | 21 | def _load(dataset_name): 22 | if not os.path.exists(DATA_PATH): 23 | os.makedirs(DATA_PATH) 24 | 25 | dataset_path = os.path.join(DATA_PATH, dataset_name + '.csv') 26 | if not os.path.exists(dataset_path): 27 | _download(dataset_name, dataset_path) 28 | 29 | return dataset_path 30 | 31 | def load_census(): 32 | """Adult Census dataset. 33 | 34 | Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset. 35 | 36 | Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean 37 | records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && 38 | (AFNLWGT>1)&& (HRSWK>0)) 39 | 40 | Prediction task is to determine whether a person makes over 50K a year. 
41 | 42 | source: "UCI" 43 | sourceURI: "https://archive.ics.uci.edu/ml/datasets/census+income" 44 | """ 45 | 46 | dataset_path = _load('census_train') 47 | 48 | X = pd.read_csv(dataset_path) 49 | y = X.pop('label').values 50 | 51 | return Dataset(load_census.__doc__, X, y, accuracy_score, 'single_table', 52 | 'classification', 'binary', stratify=True) -------------------------------------------------------------------------------- /mlblocks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | MLBlocks top module. 5 | 6 | MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by 7 | seamlessly combining tools from any Python library with a simple, common and uniform interface. 8 | 9 | * Free software: MIT license 10 | * Documentation: https://MLBazaar.github.io/MLBlocks 11 | """ 12 | 13 | from mlblocks.discovery import ( 14 | add_pipelines_path, add_primitives_path, find_pipelines, find_primitives, get_pipelines_paths, 15 | get_primitives_paths, load_pipeline, load_primitive) 16 | from mlblocks.mlblock import MLBlock 17 | from mlblocks.mlpipeline import MLPipeline 18 | 19 | __author__ = 'MIT Data To AI Lab' 20 | __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' 21 | __email__ = 'dailabmit@gmail.com' 22 | __license__ = 'MIT' 23 | __version__ = '0.6.3.dev0' 24 | 25 | __all__ = [ 26 | 'MLBlock', 27 | 'MLPipeline', 28 | 'add_pipelines_path', 29 | 'add_primitives_path', 30 | 'find_pipelines', 31 | 'find_primitives', 32 | 'get_pipelines_paths', 33 | 'get_primitives_paths', 34 | 'load_pipeline', 35 | 'load_primitive' 36 | ] 37 | -------------------------------------------------------------------------------- /mlblocks/discovery.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Primitives and Pipelines discovery module. 5 | 6 | This module contains functions to load primitive and pipeline 7 | annotations, as well as to configure how MLBlocks finds the 8 | primitives and pipelines. 9 | """ 10 | 11 | import json 12 | import logging 13 | import os 14 | import re 15 | import sys 16 | 17 | import pkg_resources 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | _PRIMITIVES_PATHS = [ 22 | os.path.join(os.getcwd(), 'mlprimitives'), 23 | os.path.join(sys.prefix, 'mlprimitives'), 24 | os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy 25 | os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy 26 | ] 27 | 28 | _PIPELINES_PATHS = [ 29 | os.path.join(os.getcwd(), 'mlpipelines'), 30 | ] 31 | 32 | 33 | def _add_lookup_path(path, paths): 34 | """Add a new path to the lookup list. 35 | 36 | The new path will be inserted in the first place of the list, 37 | so any element found in this new folder will take precedence 38 | over any other element with the same name that existed in the 39 | system before. 40 | 41 | Args: 42 | path (str): 43 | path to add 44 | paths (list): 45 | list where the new path will be added. 46 | 47 | Raises: 48 | ValueError: 49 | A ``ValueError`` will be raised if the path is not valid. 50 | 51 | Returns: 52 | bool: 53 | Whether the new path was added or not. 54 | """ 55 | if path not in paths: 56 | if not os.path.isdir(path): 57 | raise ValueError('Invalid path: {}'.format(path)) 58 | 59 | paths.insert(0, os.path.abspath(path)) 60 | return True 61 | 62 | return False 63 | 64 | 65 | def add_primitives_path(path): 66 | """Add a new path to look for primitives.
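
    For instance, assuming ``path/to/my/primitives`` is a folder that contains
    primitive JSON annotations::

        from mlblocks import add_primitives_path

        add_primitives_path('path/to/my/primitives')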
67 | 68 | The new path will be inserted in the first place of the list, 69 | so any primitive found in this new folder will take precedence 70 | over any other primitive with the same name that existed in the 71 | system before. 72 | 73 | Args: 74 | path (str): 75 | path to add 76 | 77 | Raises: 78 | ValueError: 79 | A ``ValueError`` will be raised if the path is not valid. 80 | """ 81 | added = _add_lookup_path(path, _PRIMITIVES_PATHS) 82 | if added: 83 | LOGGER.debug('New primitives path added: %s', path) 84 | 85 | 86 | def add_pipelines_path(path): 87 | """Add a new path to look for pipelines. 88 | 89 | The new path will be inserted in the first place of the list, 90 | so any pipeline found in this new folder will take precedence 91 | over any other pipeline with the same name that existed in the 92 | system before. 93 | 94 | Args: 95 | path (str): 96 | path to add 97 | 98 | Raises: 99 | ValueError: 100 | A ``ValueError`` will be raised if the path is not valid. 101 | """ 102 | added = _add_lookup_path(path, _PIPELINES_PATHS) 103 | if added: 104 | LOGGER.debug('New pipelines path added: %s', path) 105 | 106 | 107 | def _load_entry_points(entry_point_name, entry_point_group='mlblocks'): 108 | """Get a list of folders from entry points. 109 | 110 | This list will include the value of any entry point named after the given 111 | ``entry_point_name`` published under the given ``entry_point_group``. 112 | 113 | An example of such an entry point would be:: 114 | 115 | entry_points = { 116 | 'mlblocks': [ 117 | 'primitives=some_module:SOME_VARIABLE' 118 | ] 119 | } 120 | 121 | where the module ``some_module`` contains a variable such as:: 122 | 123 | SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') 124 | 125 | Args: 126 | entry_point_name (str): 127 | The name of the entry point to look for. 128 | 129 | Returns: 130 | list: 131 | The list of folders. 132 | """ 133 | lookup_paths = list() 134 | entry_points = pkg_resources.iter_entry_points(entry_point_group) 135 | for entry_point in entry_points: 136 | if entry_point.name == entry_point_name: 137 | paths = entry_point.load() 138 | if isinstance(paths, str): 139 | lookup_paths.append(paths) 140 | elif isinstance(paths, (list, tuple)): 141 | lookup_paths.extend(paths) 142 | 143 | return lookup_paths 144 | 145 | 146 | def get_primitives_paths(): 147 | """Get the list of folders where primitives will be looked for. 148 | 149 | This list will include the values of all the entry points named ``primitives`` 150 | published under the entry point group ``mlblocks``. 151 | 152 | Also, for backwards compatibility reasons, the paths from the entry points 153 | named ``jsons_path`` published under the ``mlprimitives`` group will also 154 | be included. 155 | 156 | An example of such an entry point would be:: 157 | 158 | entry_points = { 159 | 'mlblocks': [ 160 | 'primitives=some_module:SOME_VARIABLE' 161 | ] 162 | } 163 | 164 | where the module ``some_module`` contains a variable such as:: 165 | 166 | SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') 167 | 168 | Returns: 169 | list: 170 | The list of folders. 171 | """ 172 | paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives') 173 | return _PRIMITIVES_PATHS + list(set(paths)) 174 | 175 | 176 | def get_pipelines_paths(): 177 | """Get the list of folders where pipelines will be looked for. 178 | 179 | This list will include the values of all the entry points named ``pipelines`` 180 | published under the entry point group ``mlblocks``.
181 | 182 | An example of such an entry point would be:: 183 | 184 | entry_points = { 185 | 'mlblocks': [ 186 | 'pipelines=some_module:SOME_VARIABLE' 187 | ] 188 | } 189 | 190 | where the module ``some_module`` contains a variable such as:: 191 | 192 | SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') 193 | 194 | Returns: 195 | list: 196 | The list of folders. 197 | """ 198 | return _PIPELINES_PATHS + _load_entry_points('pipelines') 199 | 200 | 201 | def _load_json(json_path): 202 | with open(json_path, 'r') as json_file: 203 | LOGGER.debug('Loading %s', json_path) 204 | return json.load(json_file) 205 | 206 | 207 | def _load(name, paths): 208 | """Locate and load the JSON annotation in any of the given paths. 209 | 210 | All the given paths will be scanned to find a JSON file with the given name, 211 | and as soon as a JSON with the given name is found it is returned. 212 | 213 | Args: 214 | name (str): 215 | Path to a JSON file or name of the JSON to look for without the ``.json`` extension. 216 | paths (list): 217 | list of paths where the primitives will be looked for. 218 | 219 | Returns: 220 | dict: 221 | The content of the JSON annotation file loaded into a dict. 222 | """ 223 | if os.path.isfile(name): 224 | return _load_json(name) 225 | 226 | for base_path in paths: 227 | parts = name.split('.') 228 | number_of_parts = len(parts) 229 | 230 | for folder_parts in range(number_of_parts): 231 | folder = os.path.join(base_path, *parts[:folder_parts]) 232 | filename = '.'.join(parts[folder_parts:]) + '.json' 233 | json_path = os.path.join(folder, filename) 234 | 235 | if os.path.isfile(json_path): 236 | return _load_json(json_path) 237 | 238 | 239 | def load_primitive(name): 240 | """Locate and load the primitive JSON annotation. 241 | 242 | All the primitive paths will be scanned to find a JSON file with the given name, 243 | and as soon as a JSON with the given name is found it is returned. 244 | 245 | Args: 246 | name (str): 247 | Path to a JSON file or name of the JSON to look for without the ``.json`` extension. 248 | 249 | Returns: 250 | dict: 251 | The content of the JSON annotation file loaded into a dict. 252 | 253 | Raises: 254 | ValueError: 255 | A ``ValueError`` will be raised if the primitive cannot be found. 256 | """ 257 | primitive = _load(name, get_primitives_paths()) 258 | if primitive is None: 259 | raise ValueError("Unknown primitive: {}".format(name)) 260 | 261 | return primitive 262 | 263 | 264 | def load_pipeline(name): 265 | """Locate and load the pipeline JSON annotation. 266 | 267 | All the pipeline paths will be scanned to find a JSON file with the given name, 268 | and as soon as a JSON with the given name is found it is returned. 269 | 270 | Args: 271 | name (str): 272 | Path to a JSON file or name of the JSON to look for without the ``.json`` extension. 273 | 274 | Returns: 275 | dict: 276 | The content of the JSON annotation file loaded into a dict. 277 | 278 | Raises: 279 | ValueError: 280 | A ``ValueError`` will be raised if the pipeline cannot be found. 281 | """ 282 | pipeline = _load(name, get_pipelines_paths()) 283 | if pipeline is None: 284 | raise ValueError("Unknown pipeline: {}".format(name)) 285 | 286 | return pipeline 287 | 288 | 289 | def _search_annotations(base_path, pattern, parts=None): 290 | """Search for annotations within the given path. 291 | 292 | If the indicated path has subfolders, search recursively within them. 293 | 294 | If a pattern is given, return only the annotations whose name 295 | matches the pattern.
296 | 
297 | Args: 
298 | base_path (str): 
299 | path to the folder to be searched for annotations. 
300 | pattern (str): 
301 | Regular expression to search in the annotation names. 
302 | parts (list): 
303 | Optional. List containing the parent folders that are also part 
304 | of the annotation name. Used during recursion to be able to 
305 | build the final annotation name before returning it. 
306 | 
307 | Returns: 
308 | dict: 
309 | dictionary containing paths as keys and annotation names as 
310 | values. 
311 | """ 
312 | pattern = re.compile(pattern) 
313 | annotations = dict() 
314 | parts = parts or list() 
315 | if os.path.exists(base_path): 
316 | for name in os.listdir(base_path): 
317 | path = os.path.abspath(os.path.join(base_path, name)) 
318 | if os.path.isdir(path): 
319 | annotations.update(_search_annotations(path, pattern, parts + [name])) 
320 | elif path not in annotations: 
321 | name = '.'.join(parts + [name]) 
322 | if pattern.search(name) and name.endswith('.json'): 
323 | annotations[path] = name[:-5] 
324 | 
325 | return annotations 
326 | 
327 | 
328 | def _match(annotation, key, values): 
329 | """Check if the annotation has the key and it matches any of the values. 
330 | 
331 | If the given key is not found but it contains dots, split by the dots 
332 | and consider each part a sublevel in the annotation. 
333 | 
334 | If the key value within the annotation is a list or a dict, check 
335 | whether any of the given values is contained within it instead of 
336 | checking for equality. 
337 | 
338 | Args: 
339 | annotation (dict): 
340 | Dictionary annotation. 
341 | key (str): 
342 | Key to search within the annotation. It can contain dots to 
343 | separate nested subdictionary levels within the annotation. 
344 | values (object or list): 
345 | Value or list of values to search for. 
346 | 
347 | Returns: 
348 | bool: 
349 | whether there is a match or not. 
350 | """ 
351 | if not isinstance(values, list): 
352 | values = [values] 
353 | 
354 | if key not in annotation: 
355 | if '.' in key: 
356 | name, key = key.split('.', 1) 
357 | part = annotation.get(name) or dict() 
358 | return _match(part, key, values) 
359 | else: 
360 | return False 
361 | 
362 | annotation_value = annotation[key] 
363 | 
364 | for value in values: 
365 | if isinstance(annotation_value, (list, dict)) and value in annotation_value: 
366 | return True 
367 | elif annotation_value == value: 
368 | return True 
369 | 
370 | return False 
371 | 
372 | 
373 | def _find_annotations(paths, loader, pattern, filters): 
374 | """Find matching annotations within the given paths. 
375 | 
376 | Match annotations by both name pattern and filters. 
377 | 
378 | Args: 
379 | paths (list): 
380 | List of paths to search annotations in. 
381 | loader (callable): 
382 | Function to use to load the annotation contents. 
383 | pattern (str): 
384 | Pattern to match against the annotation name. 
385 | filters (dict): 
386 | Dictionary containing key/value filters. 
387 | 
388 | Returns: 
389 | list: 
390 | names of the matching annotations. 
391 | """ 
392 | annotations = dict() 
393 | for base_path in paths: 
394 | annotations.update(_search_annotations(base_path, pattern)) 
395 | 
396 | matching = list() 
397 | for name in sorted(annotations.values()): 
398 | annotation = loader(name) 
399 | for key, value in filters.items(): 
400 | if not _match(annotation, key, value): 
401 | break 
402 | 
403 | else: 
404 | matching.append(name) 
405 | 
406 | return matching 
407 | 
408 | 
409 | def find_primitives(pattern='', filters=None): 
410 | """Find primitives by name and filters. 
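
A minimal usage sketch, following the ``classifiers`` metadata convention
used by this project's tests (the primitive names shown are hypothetical and
actual results depend on the annotations installed)::

    >>> from mlblocks.discovery import find_primitives
    >>> find_primitives('primitive', {'classifiers.subtype': 'regressor'})  # doctest: +SKIP
    ['regressor.primitive']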
411 | 
412 | If a pattern is given, only the primitives whose name matches 
413 | the pattern will be returned. 
414 | 
415 | If filters are given, they should be a dictionary containing key/value 
416 | filters that will have to be matched within the primitive annotation 
417 | for it to be included in the results. 
418 | 
419 | If the given key is not found but it contains dots, split by the dots 
420 | and consider each part a sublevel in the annotation. 
421 | 
422 | If the key value within the annotation is a list or a dict, check 
423 | whether any of the given values is contained within it instead of 
424 | checking for equality. 
425 | 
426 | Args: 
427 | pattern (str): 
428 | Regular expression to match against the primitive names. 
429 | filters (dict): 
430 | Dictionary containing the filters to apply over the matching 
431 | primitives. 
432 | 
433 | Returns: 
434 | list: 
435 | Names of the matching primitives. 
436 | """ 
437 | filters = filters or dict() 
438 | return _find_annotations(get_primitives_paths(), load_primitive, pattern, filters) 
439 | 
440 | 
441 | def find_pipelines(pattern='', filters=None): 
442 | """Find pipelines by name and filters. 
443 | 
444 | If a pattern is given, only the pipelines whose name matches 
445 | the pattern will be returned. 
446 | 
447 | If filters are given, they should be a dictionary containing key/value 
448 | filters that will have to be matched within the pipeline annotation 
449 | for it to be included in the results. 
450 | 
451 | If the given key is not found but it contains dots, split by the dots 
452 | and consider each part a sublevel in the annotation. 
453 | 
454 | If the key value within the annotation is a list or a dict, check 
455 | whether any of the given values is contained within it instead of 
456 | checking for equality. 
457 | 
458 | Args: 
459 | pattern (str): 
460 | Regular expression to match against the pipeline names. 
461 | filters (dict): 
462 | Dictionary containing the filters to apply over the matching 
463 | pipelines. 
464 | 
465 | Returns: 
466 | list: 
467 | Names of the matching pipelines. 
468 | """ 
469 | filters = filters or dict() 
470 | return _find_annotations(get_pipelines_paths(), load_pipeline, pattern, filters) 
471 | 
-------------------------------------------------------------------------------- /mlblocks/mlblock.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 
2 | 
3 | """Package where the MLBlock class is defined.""" 
4 | 
5 | import importlib 
6 | import logging 
7 | from copy import deepcopy 
8 | 
9 | from mlblocks.discovery import load_primitive 
10 | 
11 | LOGGER = logging.getLogger(__name__) 
12 | 
13 | 
14 | def import_object(object_name): 
15 | """Import an object from its Fully Qualified Name.""" 
16 | 
17 | if isinstance(object_name, str): 
18 | parent_name, attribute = object_name.rsplit('.', 1) 
19 | try: 
20 | parent = importlib.import_module(parent_name) 
21 | except ImportError: 
22 | grand_parent_name, parent_name = parent_name.rsplit('.', 1) 
23 | grand_parent = importlib.import_module(grand_parent_name) 
24 | parent = getattr(grand_parent, parent_name) 
25 | 
26 | return getattr(parent, attribute) 
27 | 
28 | return object_name 
29 | 
30 | 
31 | class MLBlock(): 
32 | """MLBlock Class. 
33 | 
34 | The MLBlock class represents a single step within an MLPipeline. 
35 | 
36 | It is responsible for loading and interpreting JSON primitives, as well 
37 | as wrapping them and providing a common interface to run them. 
38 | 
39 | Attributes: 
40 | name (str): 
41 | Primitive name. 
42 | metadata (dict): 
43 | Additional information about this primitive. 
44 | primitive (object): 
45 | the actual function or instance which this MLBlock wraps. 
46 | fit_args (dict): 
47 | specification of the arguments expected by the ``fit`` method. 
48 | fit_method (str): 
49 | name of the primitive method to call on ``fit``. ``None`` if the 
50 | primitive is a function. 
51 | produce_args (dict): 
52 | specification of the arguments expected by the ``produce`` method. 
53 | produce_output (dict): 
54 | specification of the outputs of the ``produce`` method. 
55 | produce_method (str): 
56 | name of the primitive method to call on ``produce``. ``None`` if the primitive is a 
57 | function. 
58 | 
59 | Args: 
60 | primitive (str or dict): 
61 | primitive name or primitive dictionary. 
62 | **kwargs: 
63 | Any additional arguments that will be used as hyperparameters or passed to the 
64 | ``fit`` or ``produce`` methods. 
65 | 
66 | Raises: 
67 | TypeError: 
68 | A ``TypeError`` is raised if a required argument is not found within the ``kwargs`` 
69 | or if an unexpected argument has been given. 
70 | """ # pylint: disable=too-many-instance-attributes 
71 | 
72 | def _extract_params(self, kwargs, hyperparameters): 
73 | """Extract init, fit and produce params from kwargs. 
74 | 
75 | The ``init_params``, ``fit_params`` and ``produce_params`` are extracted 
76 | from the passed ``kwargs`` taking the metadata hyperparameters as a 
77 | reference. 
78 | 
79 | During this extraction, make sure that all the required hyperparameters 
80 | have been given and that nothing unexpected exists in the input. 
81 | 
82 | Args: 
83 | kwargs (dict): 
84 | dict containing the keyword arguments that have been passed to the ``__init__`` 
85 | method upon initialization. 
86 | hyperparameters (dict): 
87 | hyperparameters dictionary, as found in the JSON annotation. 
88 | 
89 | Raises: 
90 | TypeError: 
91 | A ``TypeError`` is raised if a required argument is not found in the 
92 | ``kwargs`` dict, or if an unexpected argument has been given. 
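
Example: 
An illustrative sketch (hypothetical annotation): given 
``kwargs = {'max_depth': 3, 'X': data}``, a ``tunable`` hyperparameter 
named ``max_depth`` and a produce argument named ``X``, the method 
returns ``({'max_depth': 3}, {}, {'X': data})``. 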
93 | """ 94 | init_params = dict() 95 | fit_params = dict() 96 | produce_params = dict() 97 | 98 | for name, param in hyperparameters.get('fixed', dict()).items(): 99 | if name in kwargs: 100 | value = kwargs.pop(name) 101 | 102 | elif 'default' in param: 103 | value = param['default'] 104 | 105 | else: 106 | raise TypeError("{} required argument '{}' not found".format(self.name, name)) 107 | 108 | init_params[name] = value 109 | 110 | for name, param in hyperparameters.get('tunable', dict()).items(): 111 | if name in kwargs: 112 | init_params[name] = kwargs.pop(name) 113 | 114 | if not isinstance(self.fit_args, str): 115 | fit_args = [arg['name'] for arg in self.fit_args] 116 | else: 117 | fit_args = [] 118 | 119 | if not isinstance(self.produce_args, str): 120 | produce_args = [arg['name'] for arg in self.produce_args] 121 | else: 122 | produce_args = [] 123 | 124 | for name in list(kwargs.keys()): 125 | if name in fit_args: 126 | fit_params[name] = kwargs.pop(name) 127 | 128 | elif name in produce_args: 129 | produce_params[name] = kwargs.pop(name) 130 | 131 | if kwargs: 132 | error = "Unexpected hyperparameters '{}'".format(', '.join(kwargs.keys())) 133 | raise TypeError(error) 134 | 135 | return init_params, fit_params, produce_params 136 | 137 | @staticmethod 138 | def _filter_conditional(conditional, init_params): 139 | condition = conditional['condition'] 140 | default = conditional.get('default') 141 | 142 | if condition not in init_params: 143 | return default 144 | 145 | condition_value = init_params[condition] 146 | values = conditional['values'] 147 | return values.get(condition_value, default) 148 | 149 | @classmethod 150 | def _get_tunable(cls, hyperparameters, init_params): 151 | tunable = dict() 152 | for name, param in hyperparameters.get('tunable', dict()).items(): 153 | if name not in init_params: 154 | if param['type'] == 'conditional': 155 | param = cls._filter_conditional(param, init_params) 156 | if param is not None: 157 | tunable[name] = param 158 | 159 | else: 160 | tunable[name] = param 161 | 162 | return tunable 163 | 164 | def __init__(self, primitive, **kwargs): 165 | if isinstance(primitive, str): 166 | primitive = load_primitive(primitive) 167 | 168 | self.metadata = primitive 169 | self.name = primitive['name'] 170 | 171 | self.primitive = import_object(self.metadata['primitive']) 172 | 173 | self._fit = self.metadata.get('fit', dict()) 174 | self.fit_args = self._fit.get('args', []) 175 | self.fit_method = self._fit.get('method') 176 | 177 | self._produce = self.metadata['produce'] 178 | self.produce_args = self._produce['args'] 179 | self.produce_output = self._produce['output'] 180 | self.produce_method = self._produce.get('method') 181 | 182 | self._class = bool(self.produce_method) 183 | 184 | hyperparameters = self.metadata.get('hyperparameters', dict()) 185 | init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters) 186 | 187 | self._hyperparameters = init_params 188 | self._fit_params = fit_params 189 | self._produce_params = produce_params 190 | 191 | self._tunable = self._get_tunable(hyperparameters, init_params) 192 | 193 | default = { 194 | name: param['default'] 195 | for name, param in self._tunable.items() 196 | # TODO: support undefined defaults 197 | } 198 | 199 | self.set_hyperparameters(default) 200 | 201 | def __str__(self): 202 | """Return a string that represents this block.""" 203 | return 'MLBlock - {}'.format(self.name) 204 | 205 | def get_tunable_hyperparameters(self): 206 | """Get the hyperparameters 
that can be tuned for this MLBlock. 
207 | 
208 | The list of hyperparameters is taken from the JSON annotation, 
209 | filtering out any hyperparameter for which a value has been given 
210 | during the initialization. 
211 | 
212 | Returns: 
213 | dict: 
214 | the dictionary containing the hyperparameters that can be 
215 | tuned, their types and, if applicable, the accepted 
216 | ranges or values. 
217 | """ 
218 | return deepcopy(self._tunable) 
219 | 
220 | def get_hyperparameters(self): 
221 | """Get the hyperparameter values that the current MLBlock is using. 
222 | 
223 | Returns: 
224 | dict: 
225 | the dictionary containing the hyperparameter values that the 
226 | MLBlock is currently using. 
227 | """ 
228 | return deepcopy(self._hyperparameters) 
229 | 
230 | def set_hyperparameters(self, hyperparameters): 
231 | """Set new hyperparameters. 
232 | 
233 | Only the specified hyperparameters are modified, so any other 
234 | hyperparameter keeps the value that had been previously given. 
235 | 
236 | If necessary, a new instance of the primitive is created. 
237 | 
238 | Args: 
239 | hyperparameters (dict): 
240 | Dictionary containing as keys the name of the hyperparameters and as 
241 | values the values to be used. 
242 | """ 
243 | self._hyperparameters.update(hyperparameters) 
244 | 
245 | if self._class: 
246 | LOGGER.debug('Creating a new primitive instance for %s', self.name) 
247 | self.instance = self.primitive(**self.get_hyperparameters()) 
248 | 
249 | def _get_method_kwargs(self, kwargs, method_args): 
250 | """Prepare the kwargs for the method. 
251 | 
252 | The kwargs dict will be altered according to the ``method_args`` 
253 | specification to make them ready for the primitive method to 
254 | accept them. 
255 | 
256 | Args: 
257 | kwargs (dict): 
258 | keyword arguments that have been passed to the block method. 
259 | method_args (list): 
260 | method arguments as specified in the JSON annotation. 
261 | 
262 | Returns: 
263 | dict: 
264 | A dictionary containing the argument names and values to pass 
265 | to the primitive method. 
266 | """ 
267 | if isinstance(method_args, str): 
268 | method_args = getattr(self.instance, method_args)() 
269 | 
270 | method_kwargs = dict() 
271 | for arg in method_args: 
272 | name = arg['name'] 
273 | keyword = arg.get('keyword', name) 
274 | 
275 | if name in kwargs or 'default' in arg: 
276 | value = kwargs.get(name, arg.get('default')) 
277 | elif arg.get('required', True): 
278 | raise TypeError("missing expected argument '{}'".format(name)) 
279 | else: 
280 | continue  # optional argument that was not given and has no default: skip it 
281 | 
282 | method_kwargs[keyword] = value 
283 | 
284 | return method_kwargs 
285 | 
286 | def fit(self, **kwargs): 
287 | """Call the fit method of the primitive. 
288 | 
289 | The given keyword arguments will be passed directly to the ``fit`` 
290 | method of the primitive instance specified in the JSON annotation. 
291 | 
292 | If any of the arguments expected by the fit method had been 
293 | given during the MLBlock initialization, they will be passed as well. 
294 | 
295 | If the fit method was not specified in the JSON annotation, or if 
296 | the primitive is a simple function, this will be a noop. 
297 | 
298 | Args: 
299 | **kwargs: 
300 | Any given keyword argument will be directly passed to the primitive fit method. 
301 | 
302 | Raises: 
303 | TypeError: 
304 | A ``TypeError`` might be raised if any argument not expected by the primitive fit 
305 | method is given. 
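
Example: 
A minimal usage sketch, assuming the MLPrimitives annotation for 
``sklearn.preprocessing.StandardScaler`` is available and that 
``training_data`` is a hypothetical feature matrix:: 

>>> block = MLBlock('sklearn.preprocessing.StandardScaler')   # doctest: +SKIP 
>>> block.fit(X=training_data)                                # doctest: +SKIP 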
306 | """ 
307 | if self.fit_method is not None: 
308 | fit_kwargs = self._fit_params.copy() 
309 | fit_kwargs.update(kwargs) 
310 | fit_kwargs = self._get_method_kwargs(fit_kwargs, self.fit_args) 
311 | getattr(self.instance, self.fit_method)(**fit_kwargs) 
312 | 
313 | def produce(self, **kwargs): 
314 | """Call the primitive function, or the produce method of the primitive. 
315 | 
316 | The given keyword arguments will be passed directly to the primitive, 
317 | if it is a simple function, or to the ``produce`` method of the 
318 | primitive instance specified in the JSON annotation, if it is a class. 
319 | 
320 | If any of the arguments expected by the produce method had been given 
321 | during the MLBlock initialization, they will be passed as well. 
322 | 
323 | Returns: 
324 | The output of the call to the primitive function or primitive 
325 | produce method. 
326 | """ 
327 | produce_kwargs = self._produce_params.copy() 
328 | produce_kwargs.update(kwargs) 
329 | produce_kwargs = self._get_method_kwargs(produce_kwargs, self.produce_args) 
330 | if self._class: 
331 | return getattr(self.instance, self.produce_method)(**produce_kwargs) 
332 | 
333 | produce_kwargs.update(self.get_hyperparameters()) 
334 | return self.primitive(**produce_kwargs) 
335 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 
1 | # Requirements for development and mybinder environment 
2 | -e .[dev] 
3 | docutils<0.16,>=0.10 # Fix dependency conflict on mybinder 
4 | 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 
1 | [bumpversion] 
2 | current_version = 0.6.3.dev0 
3 | commit = True 
4 | tag = True 
5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? 
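# The named groups in 'parse' above must match the placeholders used in 'serialize' below. 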
6 | serialize = 7 | {major}.{minor}.{patch}.{release}{candidate} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = release 12 | first_value = dev 13 | values = 14 | dev 15 | release 16 | 17 | [bumpversion:part:candidate] 18 | 19 | [bumpversion:file:setup.py] 20 | search = version='{current_version}' 21 | replace = version='{new_version}' 22 | 23 | [bumpversion:file:mlblocks/__init__.py] 24 | search = __version__ = '{current_version}' 25 | replace = __version__ = '{new_version}' 26 | 27 | [bdist_wheel] 28 | universal = 1 29 | 30 | [flake8] 31 | max-line-length = 99 32 | exclude = .tox, .git, __pycache__, .ipynb_checkpoints 33 | ignore = # Keep empty to prevent default ignores 34 | 35 | [isort] 36 | line_length = 99 37 | lines_between_types = 0 38 | multi_line_output = 4 39 | use_parentheses = True 40 | not_skip = __init__.py 41 | skip_glob = *.bak 42 | 43 | [metadata] 44 | description-file = README.md 45 | 46 | [aliases] 47 | test = pytest 48 | 49 | [tool:pytest] 50 | collect_ignore = ['setup.py'] 51 | 52 | [tool:pylint] 53 | good-names = X,y 54 | 55 | [doc8] 56 | max-line-length = 99 57 | 58 | [pydocstyle] 59 | add-ignore = D403,D413,D105,D107 60 | 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import find_packages, setup 7 | 8 | with open('README.md', encoding='utf-8') as readme_file: 9 | readme = readme_file.read() 10 | 11 | with open('HISTORY.md', encoding='utf-8') as history_file: 12 | history = history_file.read() 13 | 14 | 15 | install_requires = [ 16 | 'graphviz>=0.9,<1', 17 | 'numpy>=1.17.1,<3', 18 | 'psutil>=5,<7', 19 | ] 20 | 21 | 22 | mlprimitives_requires = [ 23 | 'mlprimitives>=0.4.0,<0.5', 24 | 'h5py<4,>=2.10.0', # <- tensorflow 2.3.2 conflict 25 | 'matplotlib<4,>=2.2.2', # <- copulas 0.3.3 26 | 'protobuf<4', # <- importlib 27 | ] 28 | 29 | examples_require = mlprimitives_requires + [ 30 | 'jupyter==1.0.0', 31 | 'baytune>=0.5.0,<0.6', 32 | 'copulas<0.12', 33 | ] 34 | 35 | 36 | tests_require = [ 37 | 'pytest>=3.4.2', 38 | 'pytest-cov>=2.6.0', 39 | 'setuptools>=41.0.0', 40 | 'rundoc>=0.4.3', 41 | 'prompt-toolkit>=2.0,<3.0', 42 | ] 43 | 44 | 45 | setup_requires = [ 46 | 'pytest-runner>=2.11.1', 47 | ] 48 | 49 | 50 | development_requires = [ 51 | # general 52 | 'bumpversion>=0.5.3,<0.6', 53 | 'pip>=9.0.1', 54 | 'watchdog>=0.8.3,<5', 55 | 56 | # docs 57 | 'm2r>=0.2.0,<0.3', 58 | 'Sphinx>=1.7.1,<3', 59 | 'sphinx_rtd_theme>=0.2.4,<0.5', 60 | 'docutils>=0.12,<0.18', 61 | 'ipython>=6.5.0', 62 | 'autodocsumm>=0.1.10', 63 | 'Jinja2>=2,<3', # >=3 makes sphinx theme fail 64 | 'markupsafe<2.1.0', 65 | 66 | # fails on Sphinx < v3.4 67 | 'alabaster<=0.7.12', 68 | # fails on Sphins < v5.0 69 | 'sphinxcontrib-applehelp<1.0.8', 70 | 'sphinxcontrib-devhelp<1.0.6', 71 | 'sphinxcontrib-htmlhelp<2.0.5', 72 | 'sphinxcontrib-serializinghtml<1.1.10', 73 | 'sphinxcontrib-qthelp<1.0.7', 74 | 75 | # style check 76 | 'flake8>=3.7.7,<4', 77 | 'isort>=4.3.4,<5', 78 | 79 | # fix style issues 80 | 'autoflake>=1.1,<2', 81 | 'autopep8>=1.4.3,<2', 82 | 83 | # distribute on PyPI 84 | 'twine>=1.10.0,<4', 85 | 'wheel>=0.30.0', 86 | 87 | # Advanced testing 88 | 'coverage>=4.5.1,<6', 89 | 'tox>=2.9.1,<4', 90 | 91 | # Documentation style 92 | 'doc8>=0.8.0', 93 | 'pydocstyle>=3.0.0', 94 | ] 95 | 96 | 97 | setup( 98 | author='MIT Data To AI Lab', 
99 | author_email='dailabmit@gmail.com', 100 | classifiers=[ 101 | 'Development Status :: 2 - Pre-Alpha', 102 | 'Intended Audience :: Developers', 103 | 'License :: OSI Approved :: MIT License', 104 | 'Natural Language :: English', 105 | 'Programming Language :: Python :: 3', 106 | 'Programming Language :: Python :: 3.8', 107 | 'Programming Language :: Python :: 3.9', 108 | 'Programming Language :: Python :: 3.10', 109 | 'Programming Language :: Python :: 3.11', 110 | 'Programming Language :: Python :: 3.12', 111 | 'Programming Language :: Python :: 3.13', 112 | ], 113 | description='Pipelines and primitives for machine learning and data science.', 114 | extras_require={ 115 | 'dev': development_requires + tests_require + examples_require, 116 | 'unit': tests_require, 117 | 'test': tests_require + examples_require, 118 | 'examples': examples_require, 119 | 'mlprimitives': mlprimitives_requires, 120 | }, 121 | include_package_data=True, 122 | install_requires=install_requires, 123 | keywords='auto machine learning classification regression data science pipeline', 124 | license='MIT license', 125 | long_description=readme + '\n\n' + history, 126 | long_description_content_type='text/markdown', 127 | name='mlblocks', 128 | packages=find_packages(include=['mlblocks', 'mlblocks.*']), 129 | python_requires='>=3.8,<3.14', 130 | setup_requires=setup_requires, 131 | test_suite='tests', 132 | tests_require=tests_require, 133 | url='https://github.com/MLBazaar/MLBlocks', 134 | version='0.6.3.dev0', 135 | zip_safe=False, 136 | ) 137 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/diagrams/diagram_fit.txt: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [splines=ortho] 3 | tooltip=" " 4 | node [penwidth=0 shape=box] 5 | subgraph cluster_outputs { 6 | tooltip="Output variables" 7 | graph [bgcolor=azure3 penwidth=0 rank=source] 8 | node [fontsize=20 penwidth=0] 9 | edge [arrowhead=none penwidth=0] 10 | Output [label=Output fontsize=14 tooltip="Output variables"] 11 | output_variable_output [label=output_variable] 12 | output_variable_output -> Output 13 | { 14 | rank=same 15 | rankdir=LR 16 | } 17 | } 18 | "a_primitive#1" [label=a_primitive penwidth=1] 19 | "a_primitive#1 output_variable" [label=output_variable] 20 | "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] 21 | "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] 22 | input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] 23 | subgraph cluster_inputs { 24 | tooltip="Input variables" 25 | graph [bgcolor=azure3 penwidth=0 rank=source] 26 | node [fontsize=20 penwidth=0] 27 | edge [arrowhead=none penwidth=0] 28 | Input [label=Input fontsize=14 tooltip="Input variables"] 29 | input_variable_input [label=input_variable] 30 | Input -> input_variable_input 31 | { 32 | rank=same 33 | } 34 | } 35 | { 36 | graph [penwidth=0] 37 | node [penwidth=0] 38 | edge [len=1 minlen=1 penwidth=1] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/data/diagrams/diagram_multiple_blocks.txt: 
-------------------------------------------------------------------------------- 1 | digraph { 2 | graph [splines=ortho] 3 | tooltip=" " 4 | node [penwidth=0 shape=box] 5 | subgraph cluster_outputs { 6 | tooltip="Output variables" 7 | graph [bgcolor=azure3 penwidth=0 rank=source] 8 | node [fontsize=20 penwidth=0] 9 | edge [arrowhead=none penwidth=0] 10 | Output [label=Output fontsize=14 tooltip="Output variables"] 11 | output_variable_b_output [label=output_variable_b] 12 | output_variable_b_output -> Output 13 | { 14 | rank=same 15 | rankdir=LR 16 | } 17 | } 18 | "b_primitive#1" [label=b_primitive penwidth=1] 19 | "b_primitive#1 output_variable_b" [label=output_variable_b] 20 | "b_primitive#1" -> "b_primitive#1 output_variable_b" [arrowhead=none] 21 | "b_primitive#1 output_variable_b" -> output_variable_b_output [arrowhead=normal] 22 | "a_primitive#1" [label=a_primitive penwidth=1] 23 | "a_primitive#1 output_variable_a" [label=output_variable_a] 24 | "a_primitive#1" -> "a_primitive#1 output_variable_a" [arrowhead=none] 25 | "a_primitive#1 output_variable_a" -> "b_primitive#1" [arrowhead=normal] 26 | input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] 27 | subgraph cluster_inputs { 28 | tooltip="Input variables" 29 | graph [bgcolor=azure3 penwidth=0 rank=source] 30 | node [fontsize=20 penwidth=0] 31 | edge [arrowhead=none penwidth=0] 32 | Input [label=Input fontsize=14 tooltip="Input variables"] 33 | input_variable_input [label=input_variable] 34 | Input -> input_variable_input 35 | { 36 | rank=same 37 | } 38 | } 39 | { 40 | graph [penwidth=0] 41 | node [penwidth=0] 42 | edge [len=1 minlen=1 penwidth=1] 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/data/diagrams/diagram_simple.txt: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [splines=ortho] 3 | tooltip=" " 4 | node [penwidth=0 shape=box] 5 | subgraph cluster_outputs { 6 | tooltip="Output variables" 7 | graph [bgcolor=azure3 penwidth=0 rank=source] 8 | node [fontsize=20 penwidth=0] 9 | edge [arrowhead=none penwidth=0] 10 | Output [label=Output fontsize=14 tooltip="Output variables"] 11 | output_variable_output [label=output_variable] 12 | output_variable_output -> Output 13 | { 14 | rank=same 15 | rankdir=LR 16 | } 17 | } 18 | "a_primitive#1" [label=a_primitive penwidth=1] 19 | "a_primitive#1 output_variable" [label=output_variable] 20 | "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] 21 | "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] 22 | input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] 23 | subgraph cluster_inputs { 24 | tooltip="Input variables" 25 | graph [bgcolor=azure3 penwidth=0 rank=source] 26 | node [fontsize=20 penwidth=0] 27 | edge [arrowhead=none penwidth=0] 28 | Input [label=Input fontsize=14 tooltip="Input variables"] 29 | input_variable_input [label=input_variable] 30 | Input -> input_variable_input 31 | { 32 | rank=same 33 | } 34 | } 35 | { 36 | graph [penwidth=0] 37 | node [penwidth=0] 38 | edge [len=1 minlen=1 penwidth=1] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/features/test_fit_predicr_args.py: -------------------------------------------------------------------------------- 1 | from mlblocks.mlpipeline import MLPipeline 2 | 3 | 4 | def test_fit_predict_args_in_init(): 5 | 6 | def add(a, b): 7 | return a + b 8 | 9 | primitive = { 10 | 'name': 'add', 
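# 'primitive' below is passed as a callable; MLBlock also accepts a fully qualified name string. 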
11 | 'primitive': add, 12 | 'produce': { 13 | 'args': [ 14 | { 15 | 'name': 'a', 16 | 'type': 'float', 17 | }, 18 | { 19 | 'name': 'b', 20 | 'type': 'float', 21 | }, 22 | ], 23 | 'output': [ 24 | { 25 | 'type': 'float', 26 | 'name': 'out' 27 | } 28 | ] 29 | } 30 | } 31 | 32 | primitives = [primitive] 33 | init_params = { 34 | 'add': { 35 | 'b': 10 36 | } 37 | } 38 | pipeline = MLPipeline(primitives, init_params=init_params) 39 | 40 | out = pipeline.predict(a=3) 41 | 42 | assert out == 13 43 | -------------------------------------------------------------------------------- /tests/features/test_partial_outputs.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from unittest.mock import Mock 3 | 4 | import numpy as np 5 | 6 | from mlblocks.mlpipeline import MLPipeline 7 | 8 | 9 | def almost_equal(obj1, obj2): 10 | if isinstance(obj1, dict): 11 | if not isinstance(obj2, dict): 12 | raise AssertionError("{} is not equal to {}".format(type(obj2), dict)) 13 | 14 | for key, value in obj1.items(): 15 | if key not in obj2: 16 | raise AssertionError("{} not in {}".format(key, obj2)) 17 | 18 | almost_equal(value, obj2[key]) 19 | 20 | else: 21 | np.testing.assert_almost_equal(obj1, obj2) 22 | 23 | 24 | class TestPartialOutputs(TestCase): 25 | def setUp(self): 26 | self.X = np.array([ 27 | [1, 0, 0, 0, 0], 28 | [0, 1, 0, 0, 0], 29 | [0, 0, 1, 0, 0], 30 | [0, 0, 0, 1, 0], 31 | [0, 0, 0, 0, 1], 32 | ]) 33 | self.y = np.array([0, 0, 0, 0, 1]) 34 | 35 | def test_fit_output(self): 36 | 37 | # Setup variables 38 | primitives = [ 39 | 'sklearn.preprocessing.StandardScaler', 40 | 'sklearn.linear_model.LogisticRegression' 41 | ] 42 | pipeline = MLPipeline(primitives) 43 | 44 | named = 'default' 45 | list_ = ['default', 0] 46 | int_block = 0 47 | invalid_int = 10 48 | str_block = 'sklearn.preprocessing.StandardScaler#1' 49 | invalid_block = 'InvalidBlockName' 50 | str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X' 51 | invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' 52 | 53 | # Run 54 | named_out = pipeline.fit(self.X, self.y, output_=named) 55 | list_out = pipeline.fit(self.X, self.y, output_=list_) 56 | int_out = pipeline.fit(self.X, self.y, output_=int_block) 57 | str_out = pipeline.fit(self.X, self.y, output_=str_block) 58 | str_out_variable = pipeline.fit(self.X, self.y, 59 | output_=str_block_variable) 60 | no_output = pipeline.fit(self.X, self.y) 61 | 62 | # Assert successful calls 63 | X = np.array([ 64 | [2., -0.5, -0.5, -0.5, -0.5], 65 | [-0.5, 2., -0.5, -0.5, -0.5], 66 | [-0.5, -0.5, 2., -0.5, -0.5], 67 | [-0.5, -0.5, -0.5, 2., -0.5], 68 | [-0.5, -0.5, -0.5, -0.5, 2.], 69 | ]) 70 | y = np.array([ 71 | 0, 0, 0, 0, 1 72 | ]) 73 | context = {'X': X, 'y': y} 74 | 75 | almost_equal(named_out, y) 76 | assert len(list_out) == 2 77 | almost_equal(list_out[0], y) 78 | almost_equal(list_out[1], context) 79 | almost_equal(context, int_out) 80 | almost_equal(context, str_out) 81 | almost_equal(X, str_out_variable) 82 | assert no_output is None 83 | 84 | # Run asserting exceptions 85 | with self.assertRaises(IndexError): 86 | pipeline.fit(self.X, self.y, output_=invalid_int) 87 | 88 | with self.assertRaises(ValueError): 89 | pipeline.fit(self.X, self.y, output_=invalid_block) 90 | 91 | with self.assertRaises(ValueError): 92 | pipeline.fit(self.X, self.y, output_=invalid_variable) 93 | 94 | def test_fit_start(self): 95 | # Setup variables 96 | primitives = [ 97 | 'sklearn.preprocessing.StandardScaler', 98 | 
'sklearn.linear_model.LogisticRegression' 99 | ] 100 | pipeline = MLPipeline(primitives) 101 | 102 | # Mock the first block 103 | block_mock = Mock() 104 | pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock 105 | 106 | # Run first block 107 | context = { 108 | 'X': self.X, 109 | 'y': self.y 110 | } 111 | int_start = 1 112 | str_start = 'sklearn.linear_model.LogisticRegression#1' 113 | 114 | pipeline.fit(start_=int_start, **context) 115 | pipeline.fit(start_=str_start, **context) 116 | 117 | # Assert that mock has not been called 118 | block_mock.fit.assert_not_called() 119 | 120 | def test_predict_start(self): 121 | # Setup variables 122 | primitives = [ 123 | 'sklearn.preprocessing.StandardScaler', 124 | 'sklearn.linear_model.LogisticRegression' 125 | ] 126 | pipeline = MLPipeline(primitives) 127 | pipeline.fit(self.X, self.y) 128 | 129 | # Mock the first block 130 | block_mock = Mock() 131 | pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock 132 | 133 | # Run first block 134 | context = { 135 | 'X': self.X, 136 | } 137 | int_start = 1 138 | str_start = 'sklearn.linear_model.LogisticRegression#1' 139 | 140 | pipeline.predict(start_=int_start, **context) 141 | pipeline.predict(start_=str_start, **context) 142 | 143 | # Assert that mock has not been called 144 | block_mock.predict.assert_not_called() 145 | -------------------------------------------------------------------------------- /tests/features/test_pipeline_loading.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from mlblocks import MLPipeline 4 | 5 | 6 | class TestMLPipeline(TestCase): 7 | 8 | def test_dict(self): 9 | pipeline_dict = { 10 | 'primitives': [ 11 | 'sklearn.ensemble.RandomForestClassifier' 12 | ], 13 | 'init_params': { 14 | 'sklearn.ensemble.RandomForest#1': { 15 | 'n_estimators': 500 16 | } 17 | }, 18 | 'input_names': { 19 | 'sklearn.ensemble.RandomForest#1': { 20 | 'X': 'X1' 21 | } 22 | }, 23 | 'output_names': { 24 | 'sklearn.ensemble.RandomForest#1': { 25 | 'y': 'y1' 26 | } 27 | } 28 | } 29 | 30 | pipeline = MLPipeline(pipeline_dict) 31 | 32 | assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] 33 | assert pipeline.init_params == { 34 | 'sklearn.ensemble.RandomForest#1': { 35 | 'n_estimators': 500 36 | } 37 | } 38 | assert pipeline.input_names == { 39 | 'sklearn.ensemble.RandomForest#1': { 40 | 'X': 'X1' 41 | } 42 | } 43 | assert pipeline.output_names == { 44 | 'sklearn.ensemble.RandomForest#1': { 45 | 'y': 'y1' 46 | } 47 | } 48 | 49 | def test_list(self): 50 | primitives = [ 51 | 'sklearn.ensemble.RandomForestClassifier' 52 | ] 53 | init_params = { 54 | 'sklearn.ensemble.RandomForest#1': { 55 | 'n_estimators': 500 56 | } 57 | } 58 | 59 | pipeline = MLPipeline(primitives, init_params=init_params) 60 | 61 | assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] 62 | assert pipeline.init_params == { 63 | 'sklearn.ensemble.RandomForest#1': { 64 | 'n_estimators': 500 65 | } 66 | } 67 | 68 | def test_none(self): 69 | primitives = [ 70 | 'sklearn.ensemble.RandomForestClassifier' 71 | ] 72 | init_params = { 73 | 'sklearn.ensemble.RandomForest#1': { 74 | 'n_estimators': 500 75 | } 76 | } 77 | 78 | pipeline = MLPipeline(primitives=primitives, init_params=init_params) 79 | 80 | assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] 81 | assert pipeline.init_params == { 82 | 'sklearn.ensemble.RandomForest#1': { 83 | 'n_estimators': 500 84 | } 85 | } 
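# MLPipeline can also be built from an existing MLPipeline instance, as the following test shows. 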
86 | 
87 | def test_mlpipeline(self): 
88 | primitives = [ 
89 | 'sklearn.ensemble.RandomForestClassifier' 
90 | ] 
91 | init_params = { 
92 | 'sklearn.ensemble.RandomForest#1': { 
93 | 'n_estimators': 500 
94 | } 
95 | } 
96 | 
97 | pipeline = MLPipeline(primitives=primitives, init_params=init_params) 
98 | pipeline2 = MLPipeline(pipeline) 
99 | 
100 | assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier'] 
101 | assert pipeline2.init_params == { 
102 | 'sklearn.ensemble.RandomForest#1': { 
103 | 'n_estimators': 500 
104 | } 
105 | } 
106 | 
-------------------------------------------------------------------------------- /tests/test_discovery.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 
2 | 
3 | import json 
4 | import os 
5 | import tempfile 
6 | import uuid 
7 | from unittest.mock import Mock, call, patch 
8 | 
9 | import pytest 
10 | from pkg_resources import Distribution, EntryPoint 
11 | 
12 | from mlblocks import discovery 
13 | 
14 | FAKE_PRIMITIVES_PATH = 'this/is/a/fake' 
15 | FAKE_PRIMITIVES_PATHS = [ 
16 | 'this/is/another/fake', 
17 | 'this/is/yet/another/fake', 
18 | ] 
19 | 
20 | 
21 | def test__add_lookup_path_do_nothing(): 
22 | paths = ['a', 'b'] 
23 | discovery._add_lookup_path('a', paths) 
24 | 
25 | assert paths == ['a', 'b'] 
26 | 
27 | 
28 | def test__add_lookup_path_exception(): 
29 | paths = ['a', 'b'] 
30 | invalid_path = str(uuid.uuid4()) 
31 | 
32 | with pytest.raises(ValueError): 
33 | discovery._add_lookup_path(invalid_path, paths) 
34 | 
35 | 
36 | def test__add_lookup_path(): 
37 | paths = ['a', 'b'] 
38 | discovery._add_lookup_path('tests', paths) 
39 | 
40 | expected_path = os.path.abspath('tests') 
41 | 
42 | assert paths == [expected_path, 'a', 'b'] 
43 | 
44 | 
45 | @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) 
46 | def test_add_primitives_path(): 
47 | discovery.add_primitives_path(os.path.abspath('tests')) 
48 | 
49 | expected_path = os.path.abspath('tests') 
50 | assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] 
51 | 
52 | 
53 | @patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) 
54 | def test_add_pipelines_path(): 
55 | discovery.add_pipelines_path('tests') 
56 | 
57 | expected_path = os.path.abspath('tests') 
58 | assert discovery._PIPELINES_PATHS == [expected_path, 'a', 'b'] 
59 | 
60 | 
61 | @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) 
62 | @patch('mlblocks.discovery.pkg_resources.iter_entry_points') 
63 | def test__load_entry_points_no_entry_points(iep_mock): 
64 | # setup 
65 | iep_mock.return_value = [] 
66 | 
67 | # run 
68 | paths = discovery._load_entry_points('jsons_path', 'mlprimitives') 
69 | 
70 | # assert 
71 | assert paths == [] 
72 | expected_calls = [ 
73 | call('mlprimitives'), 
74 | ] 
75 | assert iep_mock.call_args_list == expected_calls 
76 | 
77 | 
78 | @patch('mlblocks.discovery.pkg_resources.iter_entry_points') 
79 | def test__load_entry_points_entry_points(iep_mock): 
80 | # setup 
81 | something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') 
82 | primitives_ep = EntryPoint( 
83 | 'primitives', 
84 | 'tests.test_discovery', 
85 | attrs=['FAKE_PRIMITIVES_PATH'], 
86 | dist=Distribution() 
87 | ) 
88 | another_primitives_ep = EntryPoint( 
89 | 'primitives', 
90 | 'tests.test_discovery', 
91 | attrs=['FAKE_PRIMITIVES_PATHS'], 
92 | dist=Distribution() 
93 | ) 
94 | iep_mock.return_value = [ 
95 | something_else_ep, 
96 | primitives_ep, 
97 | another_primitives_ep 
98 | ] 
99 | 
100 | # run 
101 | paths = discovery._load_entry_points('primitives') 
102 | 
103 | # assert 
104 | 
expected = [ 105 | 'this/is/a/fake', 106 | 'this/is/another/fake', 107 | 'this/is/yet/another/fake', 108 | ] 109 | assert paths == expected 110 | 111 | expected_calls = [ 112 | call('mlblocks'), 113 | ] 114 | assert iep_mock.call_args_list == expected_calls 115 | 116 | 117 | @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) 118 | @patch('mlblocks.discovery._load_entry_points') 119 | def test_get_primitives_paths(lep_mock): 120 | lep_mock.side_effect = [['c'], []] 121 | 122 | paths = discovery.get_primitives_paths() 123 | 124 | assert paths == ['a', 'b', 'c'] 125 | expected_calls = [ 126 | call('primitives'), 127 | call('jsons_path', 'mlprimitives'), 128 | ] 129 | assert lep_mock.call_args_list == expected_calls 130 | 131 | 132 | @patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) 133 | @patch('mlblocks.discovery._load_entry_points') 134 | def test_get_pipelines_paths(lep_mock): 135 | lep_mock.return_value = ['c'] 136 | 137 | paths = discovery.get_pipelines_paths() 138 | 139 | assert paths == ['a', 'b', 'c'] 140 | lep_mock.assert_called_once_with('pipelines') 141 | 142 | 143 | def test__load_value_error(): 144 | primitive = discovery._load('invalid.primitive', ['a', 'b']) 145 | 146 | assert primitive is None 147 | 148 | 149 | def test__load_success(): 150 | primitive = { 151 | 'name': 'temp.primitive', 152 | 'primitive': 'temp.primitive' 153 | } 154 | 155 | with tempfile.TemporaryDirectory() as tempdir: 156 | paths = [tempdir] 157 | primitive_path = os.path.join(tempdir, 'temp.primitive.json') 158 | with open(primitive_path, 'w') as primitive_file: 159 | json.dump(primitive, primitive_file, indent=4) 160 | 161 | loaded = discovery._load('temp.primitive', paths) 162 | 163 | assert primitive == loaded 164 | 165 | 166 | def test__load_json_path(): 167 | primitive = { 168 | 'name': 'temp.primitive', 169 | 'primitive': 'temp.primitive' 170 | } 171 | 172 | with tempfile.TemporaryDirectory() as tempdir: 173 | paths = [tempdir] 174 | primitive_path = os.path.join(tempdir, 'temp.primitive.json') 175 | with open(primitive_path, 'w') as primitive_file: 176 | json.dump(primitive, primitive_file, indent=4) 177 | 178 | loaded = discovery._load(primitive_path, paths) 179 | 180 | assert primitive == loaded 181 | 182 | 183 | @patch('mlblocks.discovery.get_primitives_paths') 184 | @patch('mlblocks.discovery._load') 185 | def test__load_primitive_value_error(load_mock, gpp_mock): 186 | load_mock.return_value = None 187 | gpp_mock.return_value = ['a', 'b'] 188 | 189 | with pytest.raises(ValueError): 190 | discovery.load_primitive('invalid.primitive') 191 | 192 | load_mock.assert_called_once_with('invalid.primitive', ['a', 'b']) 193 | 194 | 195 | @patch('mlblocks.discovery.get_primitives_paths') 196 | @patch('mlblocks.discovery._load') 197 | def test__load_primitive_success(load_mock, gpp_mock): 198 | gpp_mock.return_value = ['a', 'b'] 199 | 200 | primitive = discovery.load_primitive('valid.primitive') 201 | 202 | load_mock.assert_called_once_with('valid.primitive', ['a', 'b']) 203 | 204 | assert primitive == load_mock.return_value 205 | 206 | 207 | @patch('mlblocks.discovery.get_pipelines_paths') 208 | @patch('mlblocks.discovery._load') 209 | def test__load_pipeline_value_error(load_mock, gpp_mock): 210 | load_mock.return_value = None 211 | gpp_mock.return_value = ['a', 'b'] 212 | 213 | with pytest.raises(ValueError): 214 | discovery.load_pipeline('invalid.pipeline') 215 | 216 | load_mock.assert_called_once_with('invalid.pipeline', ['a', 'b']) 217 | 218 | 219 | 
@patch('mlblocks.discovery.get_pipelines_paths') 
220 | @patch('mlblocks.discovery._load') 
221 | def test__load_pipeline_success(load_mock, gpp_mock): 
222 | gpp_mock.return_value = ['a', 'b'] 
223 | 
224 | pipeline = discovery.load_pipeline('valid.pipeline') 
225 | 
226 | load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) 
227 | 
228 | assert pipeline == load_mock.return_value 
229 | 
230 | 
231 | @patch('mlblocks.discovery.os') 
232 | def test__search_annotations(os_mock): 
233 | os_mock.path.abspath = os.path.abspath 
234 | os_mock.path.join = os.path.join 
235 | os_mock.path.exists.return_value = True 
236 | os_mock.listdir.side_effect = [ 
237 | [ 
238 | 'a.primitive.json', 
239 | 'another.primitive.json', 
240 | 'some', 
241 | ], 
242 | [ 
243 | 'other', 
244 | ], 
245 | [ 
246 | 'primitive.json' 
247 | ] 
248 | ] 
249 | os_mock.path.isdir.return_value = False 
250 | os_mock.path.isdir.side_effect = [ 
251 | False, 
252 | False, 
253 | True, 
254 | True, 
255 | False 
256 | ] 
257 | 
258 | annotations = discovery._search_annotations('/path/to', 'other') 
259 | 
260 | assert annotations == { 
261 | '/path/to/another.primitive.json': 'another.primitive', 
262 | '/path/to/some/other/primitive.json': 'some.other.primitive' 
263 | } 
264 | 
265 | 
266 | def test__match_no_match(): 
267 | annotation = { 
268 | 'name': 'a.primitive', 
269 | } 
270 | 
271 | matches = discovery._match(annotation, 'key', 'value') 
272 | 
273 | assert not matches 
274 | 
275 | 
276 | def test__match_root(): 
277 | annotation = { 
278 | 'name': 'a.primitive', 
279 | 'key': 'value' 
280 | } 
281 | 
282 | matches = discovery._match(annotation, 'key', 'value') 
283 | 
284 | assert matches 
285 | 
286 | 
287 | def test__match_sublevel(): 
288 | annotation = { 
289 | 'name': 'a.primitive', 
290 | 'some': { 
291 | 'sublevel': { 
292 | 'key': 'value' 
293 | } 
294 | } 
295 | } 
296 | 
297 | matches = discovery._match(annotation, 'some.sublevel.key', 'value') 
298 | 
299 | assert matches 
300 | 
301 | 
302 | def test__match_list_no_match(): 
303 | annotation = { 
304 | 'name': 'a.primitive', 
305 | 'key': [ 
306 | 'another_value', 
307 | 'yet_another_value' 
308 | ] 
309 | } 
310 | 
311 | matches = discovery._match(annotation, 'key', 'value') 
312 | 
313 | assert not matches 
314 | 
315 | 
316 | def test__match_list(): 
317 | annotation = { 
318 | 'name': 'a.primitive', 
319 | 'key': [ 
320 | 'value', 
321 | 'another_value' 
322 | ] 
323 | } 
324 | 
325 | matches = discovery._match(annotation, 'key', 'value') 
326 | 
327 | assert matches 
328 | 
329 | 
330 | def test__match_dict(): 
331 | annotation = { 
332 | 'name': 'a.primitive', 
333 | 'key': { 
334 | 'value': 'subvalue', 
335 | 'another_value': 'another_subvalue' 
336 | } 
337 | } 
338 | 
339 | matches = discovery._match(annotation, 'key', 'value') 
340 | 
341 | assert matches 
342 | 
343 | 
344 | def test__match_multiple_keys(): 
345 | annotation = { 
346 | 'name': 'a.primitive', 
347 | 'key': 'value' 
348 | } 
349 | 
350 | matches = discovery._match(annotation, 'key', ['value', 'another_value']) 
351 | 
352 | assert matches 
353 | 
354 | 
355 | @patch('mlblocks.discovery._search_annotations') 
356 | def test__find_annotations(search_annotations_mock): 
357 | search_annotations_mock.return_value = { 
358 | '/path/to/a/classifier.primitive.json': 'classifier.primitive', 
359 | '/path/to/a/regressor.primitive.json': 'regressor.primitive', 
360 | } 
361 | 
362 | loader = Mock() 
363 | loader.side_effect = [ 
364 | { 
365 | 'name': 'classifier.primitive', 
366 | 'classifiers': { 
367 | 'type': 'estimator', 
368 | 'subtype': 'classifier', 
369 | } 
370 | }, 
371 | { 
372 | 'name': 'regressor.primitive', 
373 | 'classifiers': { 
374 | 'type': 'estimator', 
375 | 'subtype': 'regressor', 
376 | } 
377 | } 
378 | ] 
379 | 
380 | filters = { 
381 | 'classifiers.subtype': 'regressor' 
382 | } 
383 | annotations = discovery._find_annotations(['/a/path'], loader, 'pattern', filters) 
384 | 
385 | assert annotations == ['regressor.primitive'] 
386 | search_annotations_mock.assert_called_once_with('/a/path', 'pattern') 
387 | 
388 | 
389 | @patch('mlblocks.discovery._find_annotations') 
390 | @patch('mlblocks.discovery.get_primitives_paths') 
391 | def test_find_primitives(gpp_mock, fa_mock): 
392 | primitives = discovery.find_primitives('pattern') 
393 | 
394 | fa_mock.assert_called_once_with( 
395 | gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) 
396 | 
397 | assert primitives == fa_mock.return_value 
398 | 
399 | 
400 | @patch('mlblocks.discovery._find_annotations') 
401 | @patch('mlblocks.discovery.get_pipelines_paths') 
402 | def test_find_pipelines(gpp_mock, fa_mock): 
403 | primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) 
404 | 
405 | fa_mock.assert_called_once_with( 
406 | gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) 
407 | 
408 | assert primitives == fa_mock.return_value 
409 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 
1 | [tox] 
2 | envlist = py3{8,9,10,11,12,13}, test-devel 
3 | 
4 | [travis] 
5 | python = 
6 | 3.13: py313 
7 | 3.12: py312 
8 | 3.11: py311 
9 | 3.10: py310 
10 | 3.9: py39 
11 | 3.8: py38, test-devel 
12 | 
13 | [testenv] 
14 | passenv = CI TRAVIS TRAVIS_* 
15 | allowlist_externals = rm 
16 | skipsdist = false 
17 | skip_install = false 
18 | extras = test 
19 | commands = 
20 | /usr/bin/env make test 
21 | rm -r {envdir} 
22 | 
23 | [testenv:test-devel] 
24 | extras = dev 
25 | commands = 
26 | /usr/bin/env make test-devel 
27 | rm -r {envdir} 
28 | 
--------------------------------------------------------------------------------