├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ ├── docs.yml │ └── tests.yml ├── .gitignore ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── apt.txt ├── docs ├── Makefile ├── advanced_usage │ ├── adding_primitives.rst │ ├── hyperparameters.rst │ ├── pipelines.rst │ └── primitives.rst ├── api │ └── mlblocks.rst ├── authors.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── getting_started │ ├── install.rst │ └── quickstart.rst ├── images │ ├── favicon.ico │ ├── mlblocks-icon.png │ ├── mlblocks-logo-small.png │ └── mlblocks-logo.png ├── index.rst ├── make.bat └── pipeline_examples │ ├── graph.rst │ ├── image.rst │ ├── multi_table.rst │ ├── single_table.rst │ └── text.rst ├── examples ├── README.md ├── pipelines │ └── single_table.classification.categorical_encoder.xgboost.json ├── primitives │ ├── mlblocks.examples.ClassPrimitive.json │ └── mlblocks.examples.function_primitive.json └── tutorials │ ├── 1. Using and MLPipeline.ipynb │ ├── 2. Finding and Loading a Pipeline.ipynb │ ├── 3. Setting MLPipeline Hyperparameters.ipynb │ ├── 4. Saving and Loading a Pipeline.ipynb │ ├── 5. Partial execution and pipeline debugging.ipynb │ ├── 6. Flexible outputs specification.ipynb │ ├── 7. Tuning a Pipeline.ipynb │ ├── 8. Searching for the best pipeline with BTBSession.ipynb │ └── utils.py ├── mlblocks ├── __init__.py ├── discovery.py ├── mlblock.py └── mlpipeline.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── data │ └── diagrams │ │ ├── diagram_fit.txt │ │ ├── diagram_multiple_blocks.txt │ │ └── diagram_simple.txt ├── features │ ├── test_fit_predicr_args.py │ ├── test_partial_outputs.py │ └── test_pipeline_loading.py ├── test_discovery.py ├── test_mlblock.py └── test_mlpipeline.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * MLBlocks version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Docs 2 | 3 | on: 4 | push: 5 | branches: [ stable ] 6 | 7 | jobs: 8 | 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | 14 | - name: Python 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: 3.8 18 | 19 | - name: Build 20 | run: | 21 | sudo apt-get install graphviz pandoc 22 | python -m pip install --upgrade pip 23 | pip install -e .[dev] 24 | make docs 25 | - name: Deploy 26 | uses: peaceiris/actions-gh-pages@v3 27 | with: 28 | github_token: ${{secrets.GITHUB_TOKEN}} 29 | publish_dir: docs/_build/html 30 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: [ '*' ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | devel: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | python-version: ['3.10'] 15 | os: [ubuntu-latest] 16 | steps: 17 | - uses: actions/checkout@v1 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Upgrade pip 23 | run: pip install -U "pip<=24.1" setuptools wheel 24 | - name: Install lightfm 25 | run: python -m pip install --no-use-pep517 'lightfm<2' 26 | - name: Install package 27 | run: pip install .[dev] 28 | - name: make test-devel 29 | run: make test-devel 30 | 31 | readme: 32 | runs-on: ${{ matrix.os }} 33 | strategy: 34 | matrix: 35 | python-version: ['3.8', '3.9', '3.10', '3.11'] 36 | os: [ubuntu-20.04, macos-latest] 37 | steps: 38 | - uses: actions/checkout@v1 39 | - name: Set up Python ${{ matrix.python-version }} 40 | uses: actions/setup-python@v2 41 | with: 42 | python-version: ${{ matrix.python-version }} 43 | - name: Upgrade pip 44 | run: pip install -U pip setuptools wheel 45 | - name: Install lightfm 46 | run: python -m pip install --no-use-pep517 'lightfm<2' 47 | - name: Install package and dependencies 48 | run: pip install rundoc .[mlprimitives] 49 | - name: make test-readme 50 | run: make test-readme 51 | 52 | unit: 53 | runs-on: ${{ matrix.os }} 54 | strategy: 55 | matrix: 56 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 57 | os: [ubuntu-20.04, macos-latest] 58 | steps: 59 | - uses: actions/checkout@v1 60 | - name: Set up Python ${{ matrix.python-version }} 61 | uses: actions/setup-python@v2 62 | with: 63 | python-version: ${{ matrix.python-version }} 64 | - name: Install package and dependencies 65 | run: pip install .[unit] 66 | - name: make test-unit 67 | run: make test-unit 68 | 69 | unit-mlprimitives: 70 | runs-on: ${{ matrix.os }} 71 | strategy: 72 | matrix: 73 | python-version: ['3.8', '3.9', '3.10', '3.11'] 74 | os: [ubuntu-20.04, macos-latest] 75 | steps: 76 | - uses: actions/checkout@v1 77 | - name: Set up Python ${{ matrix.python-version }} 78 | uses: actions/setup-python@v2 79 | with: 80 | python-version: ${{ matrix.python-version }} 81 | - name: Upgrade pip 82 | run: pip install -U pip setuptools wheel 83 | - name: Install lightfm 84 | run: python -m pip install --no-use-pep517 'lightfm<2' 85 | - name: Install package and dependencies 86 | run: pip install .[test] 87 | - name: make test-mlprimitives 88 | run: make 
test-mlprimitives 89 | 90 | tutorials: 91 | runs-on: ${{ matrix.os }} 92 | strategy: 93 | matrix: 94 | python-version: ['3.8', '3.9', '3.10', '3.11'] 95 | os: [ubuntu-20.04] 96 | steps: 97 | - uses: actions/checkout@v1 98 | - name: Set up Python ${{ matrix.python-version }} 99 | uses: actions/setup-python@v2 100 | with: 101 | python-version: ${{ matrix.python-version }} 102 | - if: matrix.os == 'ubuntu-20.04' 103 | name: Install dependencies - Ubuntu 104 | run: sudo apt-get install graphviz 105 | - name: Upgrade pip 106 | run: pip install -U pip setuptools wheel 107 | - name: Install lightfm 108 | run: python -m pip install --no-use-pep517 'lightfm<2' 109 | - name: Install package and dependencies 110 | run: pip install .[examples] 111 | - name: make test-tutorials 112 | run: make test-tutorials 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | docs/pipeline.json 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # other 106 | .DS_Store 107 | 108 | # vim 109 | .*.swp 110 | 111 | mlblocks/data 112 | examples/tutorials/pipeline.pkl 113 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Contributors 6 | ------------ 7 | 8 | * Carles Sala 9 | * Kalyan Veeramachaneni 10 | * William Xue 11 | * Akshay Ravikumar 12 | * Laura Gustafson 13 | * Erica Chiu 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! 
Every little bit
8 | helps, and credit will always be given.
9 | 
10 | You can contribute in many ways:
11 | 
12 | Types of Contributions
13 | ----------------------
14 | 
15 | Report Bugs
16 | ~~~~~~~~~~~
17 | 
18 | Report bugs at https://github.com/MLBazaar/MLBlocks/issues.
19 | 
20 | If you are reporting a bug, please include:
21 | 
22 | * Your operating system name and version.
23 | * Any details about your local setup that might be helpful in troubleshooting.
24 | * Detailed steps to reproduce the bug.
25 | 
26 | Fix Bugs
27 | ~~~~~~~~
28 | 
29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30 | wanted" is open to whoever wants to implement it.
31 | 
32 | Implement Features
33 | ~~~~~~~~~~~~~~~~~~
34 | 
35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
36 | and "help wanted" is open to whoever wants to implement it.
37 | 
38 | Write Documentation
39 | ~~~~~~~~~~~~~~~~~~~
40 | 
41 | MLBlocks could always use more documentation, whether as part of the
42 | official MLBlocks docs, in docstrings, or even on the web in blog posts,
43 | articles, and such.
44 | 
45 | Submit Feedback
46 | ~~~~~~~~~~~~~~~
47 | 
48 | The best way to send feedback is to file an issue at https://github.com/MLBazaar/MLBlocks/issues.
49 | 
50 | If you are proposing a feature:
51 | 
52 | * Explain in detail how it would work.
53 | * Keep the scope as narrow as possible, to make it easier to implement.
54 | * Remember that this is a volunteer-driven project, and that contributions
55 | are welcome :)
56 | 
57 | Get Started!
58 | ------------
59 | 
60 | Ready to contribute? Here's how to set up `MLBlocks` for local development.
61 | 
62 | 1. Fork the `MLBlocks` repo on GitHub.
63 | 2. Clone your fork locally::
64 | 
65 | $ git clone git@github.com:your_name_here/MLBlocks.git
66 | 
67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed,
68 | this is how you set up your fork for local development::
69 | 
70 | $ mkvirtualenv MLBlocks
71 | $ cd MLBlocks/
72 | $ make install-develop
73 | 
74 | 4. Create a branch for local development::
75 | 
76 | $ git checkout -b name-of-your-bugfix-or-feature
77 | 
78 | Now you can make your changes locally.
79 | 
80 | 5. While hacking on your changes, make sure to cover all your developments with the required
81 | unit tests, and that none of the old tests fail as a consequence of your changes.
82 | For this, make sure to run the test suite and check the code coverage::
83 | 
84 | $ make test # Run the tests
85 | $ make coverage # Get the coverage report
86 | 
87 | 6. When you're done making changes, check that your changes pass flake8 and the
88 | tests, including testing other Python versions with tox::
89 | 
90 | $ make lint # Check code styling
91 | $ make test-all # Execute tests on all python versions
92 | 
93 | 7. Also make sure to include the necessary documentation in the code as docstrings following
94 | the `google docstring`_ style.
95 | If you want to see how your documentation will look when it is published, you can
96 | generate and view the docs with this command::
97 | 
98 | $ make view-docs
99 | 
100 | 8. Commit your changes and push your branch to GitHub::
101 | 
102 | $ git add .
103 | $ git commit -m "Your detailed description of your changes."
104 | $ git push origin name-of-your-bugfix-or-feature
105 | 
106 | 9. Submit a pull request through the GitHub website.
107 | 
108 | ..
_google docstring: https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
109 | 
110 | Pull Request Guidelines
111 | -----------------------
112 | 
113 | Before you submit a pull request, check that it meets these guidelines:
114 | 
115 | 1. It resolves an open GitHub Issue and contains its reference in the title or
116 | the comment. If there is no associated issue, feel free to create one.
117 | 2. Whenever possible, it resolves only **one** issue. If your PR resolves more than
118 | one issue, try to split it into more than one pull request.
119 | 3. The pull request should include unit tests that cover all the changed code.
120 | 4. If the pull request adds functionality, the docs should be updated. Put
121 | your new functionality into a function with a docstring, and add the
122 | feature to the list in README.md.
123 | 5. The pull request should work for all the supported Python versions. Check
124 | https://github.com/MLBazaar/MLBlocks/actions
125 | and make sure that all the checks pass.
126 | 
127 | Unit Testing Guidelines
128 | -----------------------
129 | 
130 | All the Unit Tests should comply with the following requirements:
131 | 
132 | 1. Unit Tests should be based only on the unittest and pytest modules.
133 | 
134 | 2. The tests that cover a module called ``mlblocks/path/to/a_module.py`` should be
135 | implemented in a separate module called ``tests/mlblocks/path/to/test_a_module.py``.
136 | Note that the module name has the ``test_`` prefix and is located in a path similar
137 | to the one of the tested module, just inside the ``tests`` folder.
138 | 
139 | 3. Each method of the tested module should have at least one associated test method, and
140 | each test method should cover only **one** use case or scenario.
141 | 
142 | 4. Test case methods should start with the ``test_`` prefix and have descriptive names
143 | that indicate which scenario they cover.
144 | Names such as ``test_some_method_input_none``, ``test_some_method_value_error`` or
145 | ``test_some_method_timeout`` are good, but names like ``test_some_method_1``,
146 | ``some_method`` or ``test_error`` are not.
147 | 
148 | 5. Each test should validate only what the code of the method being tested does, and not
149 | cover the behavior of any third party package or tool being used, which is assumed to
150 | work properly as long as it is being passed the right values.
151 | 
152 | 6. Any third party tool that may have any kind of random behavior, such as some Machine
153 | Learning models, databases or Web APIs, will be mocked using the ``mock`` library, and
154 | the only thing that will be tested is that our code passes the right values to them.
155 | 
156 | 7. Unit tests should not use anything from outside the test and the code being tested. This
157 | includes not reading from or writing to any filesystem or database, which will be properly
158 | mocked.
159 | 
160 | Tips
161 | ----
162 | 
163 | To run a subset of tests::
164 | 
165 | $ pytest tests/test_mlblock.py
166 | 
167 | Release Workflow
168 | ----------------
169 | 
170 | The process of releasing a new version involves several steps, combining both ``git`` and
171 | ``bumpversion``, which, briefly, are:
172 | 
173 | 1. Merge what is in ``master`` branch into ``stable`` branch.
174 | 2. Update the version in ``setup.cfg``, ``mlblocks/__init__.py`` and ``HISTORY.md`` files.
175 | 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
176 | 4. Merge the new commit from ``stable`` into ``master``.
177 | 5. 
Update the version in ``setup.cfg`` and ``mlblocks/__init__.py``
178 | to open the next development iteration.
179 | 
180 | .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
181 | entry that explains the changes that will be included in the new version.
182 | Normally this is just a list of the Pull Requests that have been merged to master
183 | since the last release.
184 | 
185 | Once this is done, run one of the following commands:
186 | 
187 | 1. If you are releasing a patch version::
188 | 
189 | make release
190 | 
191 | 2. If you are releasing a minor version::
192 | 
193 | make release-minor
194 | 
195 | 3. If you are releasing a major version::
196 | 
197 | make release-major
198 | 
199 | Release Candidates
200 | ~~~~~~~~~~~~~~~~~~
201 | 
202 | Sometimes it is necessary or convenient to upload a release candidate to PyPI as a pre-release,
203 | in order to make some of the new features available for testing on other projects before they
204 | are included in an actual full-blown release.
205 | 
206 | In order to perform such an action, you can execute::
207 | 
208 | make release-candidate
209 | 
210 | This will perform the following actions:
211 | 
212 | 1. Build and upload the current version to PyPI as a pre-release, with the format ``X.Y.Z.devN``
213 | 
214 | 2. Bump the current version to the next release candidate, ``X.Y.Z.dev(N+1)``
215 | 
216 | After this is done, the new pre-release can be installed by including the ``dev`` section in the
217 | dependency specification, either in ``setup.py``::
218 | 
219 | install_requires = [
220 | ...
221 | 'mlblocks>=X.Y.Z.dev',
222 | ...
223 | ]
224 | 
225 | or on the command line::
226 | 
227 | pip install 'mlblocks>=X.Y.Z.dev'
228 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Changelog
2 | =========
3 | 
4 | 0.6.2 - 2024-11-18
5 | ------------------
6 | 
7 | * Upgrade python version to include 3.12 and 3.13 - [Issue #144](https://github.com/MLBazaar/MLBlocks/issues/144) by @sarahmish
8 | 
9 | 0.6.1 - 2023-09-26
10 | ------------------
11 | 
12 | * Add python 3.11 to MLBlocks - [Issue #143](https://github.com/MLBazaar/MLBlocks/issues/143) by @sarahmish
13 | 
14 | 0.6.0 - 2023-04-14
15 | ------------------
16 | 
17 | * Support python 3.9 and 3.10 - [Issue #141](https://github.com/MLBazaar/MLBlocks/issues/141) by @sarahmish
18 | 
19 | 0.5.0 - 2023-01-22
20 | ------------------
21 | 
22 | * Update `numpy` dependency and isolate tests - [Issue #139](https://github.com/MLBazaar/MLBlocks/issues/139) by @sarahmish
23 | 
24 | 0.4.1 - 2021-10-08
25 | ------------------
26 | 
27 | * Update NumPy dependency - [Issue #136](https://github.com/MLBazaar/MLBlocks/issues/136) by @sarahmish
28 | * Support dynamic inputs and outputs - [Issue #134](https://github.com/MLBazaar/MLBlocks/issues/134) by @pvk-developer
29 | 
30 | 0.4.0 - 2021-01-09
31 | ------------------
32 | 
33 | * Stop pipeline fitting after the last block - [Issue #131](https://github.com/MLBazaar/MLBlocks/issues/131) by @sarahmish
34 | * Add memory debug and profiling - [Issue #130](https://github.com/MLBazaar/MLBlocks/issues/130) by @pvk-developer
35 | * Update Python support - [Issue #129](https://github.com/MLBazaar/MLBlocks/issues/129) by @csala
36 | * Get execution time for each block - [Issue #127](https://github.com/MLBazaar/MLBlocks/issues/127) by @sarahmish
37 | * Allow loading a primitive or pipeline directly from the JSON path - 
[Issue #114](https://github.com/MLBazaar/MLBlocks/issues/114) by @csala
38 | * Pipeline Diagrams - [Issue #113](https://github.com/MLBazaar/MLBlocks/issues/113) by @erica-chiu
39 | * Get Pipeline Inputs - [Issue #112](https://github.com/MLBazaar/MLBlocks/issues/112) by @erica-chiu
40 | 
41 | 0.3.4 - 2019-11-01
42 | ------------------
43 | 
44 | * Ability to return intermediate context - [Issue #110](https://github.com/MLBazaar/MLBlocks/issues/110) by @csala
45 | * Support for static or class methods - [Issue #107](https://github.com/MLBazaar/MLBlocks/issues/107) by @csala
46 | 
47 | 0.3.3 - 2019-09-09
48 | ------------------
49 | 
50 | * Improved intermediate outputs management - [Issue #105](https://github.com/MLBazaar/MLBlocks/issues/105) by @csala
51 | 
52 | 0.3.2 - 2019-08-12
53 | ------------------
54 | 
55 | * Allow passing fit and produce arguments as `init_params` - [Issue #96](https://github.com/MLBazaar/MLBlocks/issues/96) by @csala
56 | * Support optional fit and produce args and arg defaults - [Issue #95](https://github.com/MLBazaar/MLBlocks/issues/95) by @csala
57 | * Isolate primitives from their hyperparameters dictionary - [Issue #94](https://github.com/MLBazaar/MLBlocks/issues/94) by @csala
58 | * Add functions to explore the available primitives and pipelines - [Issue #90](https://github.com/MLBazaar/MLBlocks/issues/90) by @csala
59 | * Add primitive caching - [Issue #22](https://github.com/MLBazaar/MLBlocks/issues/22) by @csala
60 | 
61 | 0.3.1 - Pipelines Discovery
62 | ---------------------------
63 | 
64 | * Support flat hyperparameter dictionaries - [Issue #92](https://github.com/MLBazaar/MLBlocks/issues/92) by @csala
65 | * Load pipelines by name and register them as `entry_points` - [Issue #88](https://github.com/MLBazaar/MLBlocks/issues/88) by @csala
66 | * Implement partial re-fit - [Issue #61](https://github.com/MLBazaar/MLBlocks/issues/61) by @csala
67 | * Move argument parsing to MLBlock - [Issue #86](https://github.com/MLBazaar/MLBlocks/issues/86) by @csala
68 | * Allow getting intermediate outputs - [Issue #58](https://github.com/MLBazaar/MLBlocks/issues/58) by @csala
69 | 
70 | 0.3.0 - New Primitives Discovery
71 | --------------------------------
72 | 
73 | * New primitives discovery system based on `entry_points`.
74 | * Conditional Hyperparameters filtering in MLBlock initialization.
75 | * Improved logging and exception reporting.
76 | 
77 | 0.2.4 - New Datasets and Unit Tests
78 | -----------------------------------
79 | 
80 | * Add a new multi-table dataset.
81 | * Add Unit Tests up to 50% coverage.
82 | * Improve documentation.
83 | * Fix minor bug in newsgroups dataset.
84 | 
85 | 0.2.3 - Demo Datasets
86 | ---------------------
87 | 
88 | * Add new methods to Dataset class.
89 | * Add documentation for the datasets module.
90 | 
91 | 0.2.2 - MLPipeline Load/Save
92 | ----------------------------
93 | 
94 | * Implement save and load methods for MLPipelines.
95 | * Add more datasets.
96 | 
97 | 0.2.1 - New Documentation
98 | -------------------------
99 | 
100 | * Add mlblocks.datasets module with demo data download functions.
101 | * Extensive documentation, including multiple pipeline examples.
102 | 
103 | 0.2.0 - New MLBlocks API
104 | ------------------------
105 | 
106 | A new MLBlocks API and Primitive format.
107 | 
108 | This is a summary of the changes:
109 | 
110 | * Primitives JSONs and Python code have been moved to a different repository, called MLPrimitives.
111 | * Optional usage of multiple JSON primitive folders. 
112 | * JSON format has been changed to allow more flexibility and features:
113 | * input and output arguments, as well as argument types, can be specified for each method
114 | * both classes and functions are supported as primitives
115 | * multitype and conditional hyperparameters fully supported
116 | * data modalities and primitive classifiers introduced
117 | * metadata such as documentation, description and author fields added
118 | * Parsers are removed, and now the MLBlock class is responsible for loading and reading the
119 | JSON primitive.
120 | * Multiple blocks of the same primitive are supported within the same pipeline.
121 | * Arbitrary inputs and outputs for both pipelines and blocks are allowed.
122 | * Shared variables during pipeline execution, usable by multiple blocks.
123 | 
124 | 0.1.9 - Bugfix Release
125 | ----------------------
126 | 
127 | * Disable some NetworkX functions for incompatibilities with some types of graphs.
128 | 
129 | 0.1.8 - New primitives and some improvements
130 | --------------------------------------------
131 | 
132 | * Improve the NetworkX primitives.
133 | * Add String Vectorization and Datetime Featurization primitives.
134 | * Refactor some Keras primitives to work with single dimension `y` arrays and be compatible with `pickle`.
135 | * Add XGBClassifier and XGBRegressor primitives.
136 | * Add some `keras.applications` pretrained networks as preprocessing primitives.
137 | * Add helper class to allow function primitives.
138 | 
139 | 0.1.7 - Nested hyperparams dicts
140 | --------------------------------
141 | 
142 | * Support passing hyperparams as nested dicts.
143 | 
144 | 0.1.6 - Text and Graph Pipelines
145 | --------------------------------
146 | 
147 | * Add LSTM classifier and regressor primitives.
148 | * Add OneHotEncoder and MultiLabelEncoder primitives.
149 | * Add several NetworkX graph featurization primitives.
150 | * Add `community.best_partition` primitive.
151 | 
152 | 0.1.5 - Collaborative Filtering Pipelines
153 | -----------------------------------------
154 | 
155 | * Add LightFM primitive.
156 | 
157 | 0.1.4 - Image pipelines improved
158 | --------------------------------
159 | 
160 | * Allow passing `init_params` on `MLPipeline` creation.
161 | * Fix bug with MLHyperparam types and Keras.
162 | * Rename `produce_params` as `predict_params`.
163 | * Add SingleCNN Classifier and Regressor primitives.
164 | * Simplify and improve Trivial Predictor.
165 | 
166 | 0.1.3 - Multi Table pipelines improved
167 | --------------------------------------
168 | 
169 | * Improve RandomForest primitive ranges
170 | * Improve DFS primitive
171 | * Add Tree Based Feature Selection primitives
172 | * Fix bugs in TrivialPredictor
173 | * Improved documentation
174 | 
175 | 0.1.2 - Bugfix release
176 | ----------------------
177 | 
178 | * Fix bug in TrivialMedianPredictor
179 | * Fix bug in OneHotLabelEncoder
180 | 
181 | 0.1.1 - Single Table pipelines improved
182 | ---------------------------------------
183 | 
184 | * New project structure and primitives for integration into MIT-TA2.
185 | * MIT-TA2 default pipelines and single table pipelines fully working.
186 | 
187 | 0.1.0
188 | -----
189 | 
190 | * First release on PyPI. 
191 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, MIT Data To AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.md 4 | include LICENSE 5 | include README.md 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.md *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | recursive-include mlblocks *.json 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | define BROWSER_PYSCRIPT 4 | import os, webbrowser, sys 5 | 6 | try: 7 | from urllib import pathname2url 8 | except: 9 | from urllib.request import pathname2url 10 | 11 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 12 | endef 13 | export BROWSER_PYSCRIPT 14 | 15 | define PRINT_HELP_PYSCRIPT 16 | import re, sys 17 | 18 | for line in sys.stdin: 19 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 20 | if match: 21 | target, help = match.groups() 22 | print("%-20s %s" % (target, help)) 23 | endef 24 | export PRINT_HELP_PYSCRIPT 25 | 26 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 27 | 28 | .PHONY: help 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | 33 | # CLEAN TARGETS 34 | 35 | .PHONY: clean-build 36 | clean-build: ## remove build artifacts 37 | rm -fr build/ 38 | rm -fr dist/ 39 | rm -fr .eggs/ 40 | find . -name '*.egg-info' -exec rm -fr {} + 41 | find . -name '*.egg' -exec rm -f {} + 42 | 43 | .PHONY: clean-pyc 44 | clean-pyc: ## remove Python file artifacts 45 | find . -name '*.pyc' -exec rm -f {} + 46 | find . -name '*.pyo' -exec rm -f {} + 47 | find . -name '*~' -exec rm -f {} + 48 | find . 
-name '__pycache__' -exec rm -fr {} + 49 | 50 | .PHONY: clean-docs 51 | clean-docs: ## remove previously built docs 52 | -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed 53 | 54 | .PHONY: clean-coverage 55 | clean-coverage: ## remove coverage artifacts 56 | rm -f .coverage 57 | rm -f .coverage.* 58 | rm -fr htmlcov/ 59 | 60 | .PHONY: clean-test 61 | clean-test: ## remove test artifacts 62 | rm -fr .tox/ 63 | rm -fr .pytest_cache 64 | 65 | .PHONY: clean 66 | clean: clean-build clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts 67 | 68 | 69 | # INSTALL TARGETS 70 | 71 | .PHONY: install 72 | install: clean-build clean-pyc ## install the package to the active Python's site-packages 73 | pip install . 74 | 75 | .PHONY: install-examples 76 | install-examples: clean-build clean-pyc ## install the package and the examples dependencies 77 | pip install .[examples] 78 | 79 | .PHONY: install-unit 80 | install-unit: clean-build clean-pyc ## install the package and dependencies for unit tests 81 | pip install .[unit] 82 | 83 | .PHONY: install-test 84 | install-test: clean-build clean-pyc ## install the package and test dependencies 85 | pip install .[test] 86 | 87 | .PHONY: install-develop 88 | install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development 89 | pip install -e .[dev] 90 | 91 | MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') 92 | 93 | .PHONY: install-minimum 94 | install-minimum: ## install the minimum supported versions of the package dependencies 95 | pip install $(MINIMUM) 96 | 97 | 98 | # LINT TARGETS 99 | 100 | .PHONY: lint 101 | lint: ## check style with flake8 and isort 102 | flake8 mlblocks tests 103 | isort -c --recursive mlblocks tests 104 | 105 | .PHONY: fix-lint 106 | fix-lint: ## fix lint issues using autoflake, autopep8, and isort 107 | find mlblocks -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables 108 | autopep8 --in-place --recursive --aggressive mlblocks 109 | isort --apply --atomic --recursive mlblocks 110 | 111 | find tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables 112 | autopep8 --in-place --recursive --aggressive tests 113 | isort --apply --atomic --recursive tests 114 | 115 | .PHONY: lint-docs 116 | lint-docs: ## check docs formatting with doc8 and pydocstyle 117 | doc8 mlblocks/ 118 | pydocstyle mlblocks/ 119 | 120 | 121 | # TEST TARGETS 122 | 123 | .PHONY: test-unit 124 | test-unit: ## run tests quickly with the default Python 125 | python -m pytest --cov=mlblocks --ignore=tests/features/ 126 | 127 | .PHONY: test-mlprimitives 128 | test-mlprimitives: ## run tests quickly with the default Python 129 | python -m pytest --cov=mlblocks 130 | 131 | .PHONY: test-readme 132 | test-readme: ## run the readme snippets 133 | rm -rf tests/readme_test && mkdir tests/readme_test 134 | cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md 135 | rm -rf tests/readme_test 136 | 137 | .PHONY: test-tutorials 138 | test-tutorials: ## run the tutorial notebooks 139 | find examples/tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ 140 | jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --stdout --to html {} > /dev/null + 141 | 142 | .PHONY: test 143 | test: test-unit test-mlprimitives test-readme 
## test everything that needs test dependencies
144 | 
145 | .PHONY: check-dependencies
146 | check-dependencies: ## test if there are any broken dependencies
147 | pip check
148 | 
149 | .PHONY: test-devel
150 | test-devel: check-dependencies lint docs ## test everything that needs development dependencies
151 | 
152 | .PHONY: test-all
153 | test-all: ## run tests on every Python version with tox
154 | tox -r
155 | 
156 | .PHONY: coverage
157 | coverage: ## check code coverage quickly with the default Python
158 | coverage run --source mlblocks -m pytest
159 | coverage report -m
160 | coverage html
161 | $(BROWSER) htmlcov/index.html
162 | 
163 | 
164 | # DOCS TARGETS
165 | 
166 | .PHONY: docs
167 | docs: clean-docs ## generate Sphinx HTML documentation, including API docs
168 | $(MAKE) -C docs html
169 | 
170 | .PHONY: view-docs
171 | view-docs: ## view the docs in a browser
172 | $(BROWSER) docs/_build/html/index.html
173 | 
174 | .PHONY: serve-docs
175 | serve-docs: ## compile the docs watching for changes
176 | watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' docs
177 | 
178 | 
179 | # RELEASE TARGETS
180 | 
181 | .PHONY: dist
182 | dist: clean ## builds source and wheel package
183 | python setup.py sdist
184 | python setup.py bdist_wheel
185 | ls -l dist
186 | 
187 | .PHONY: publish-confirm
188 | publish-confirm:
189 | @echo "WARNING: This will irreversibly upload a new version to PyPI!"
190 | @echo -n "Please type 'confirm' to proceed: " \
191 | && read answer \
192 | && [ "$${answer}" = "confirm" ]
193 | 
194 | .PHONY: publish-test
195 | publish-test: dist publish-confirm ## package and upload a release on TestPyPI
196 | twine upload --repository-url https://test.pypi.org/legacy/ dist/*
197 | 
198 | .PHONY: publish
199 | publish: dist publish-confirm ## package and upload a release
200 | twine upload dist/*
201 | 
202 | .PHONY: bumpversion-release
203 | bumpversion-release: ## Merge master to stable and bumpversion release
204 | git checkout stable || git checkout -b stable
205 | git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
206 | bumpversion release
207 | git push --tags origin stable
208 | 
209 | .PHONY: bumpversion-patch
210 | bumpversion-patch: ## Merge stable to master and bumpversion patch
211 | git checkout master
212 | git merge stable
213 | bumpversion --no-tag patch
214 | git push
215 | 
216 | .PHONY: bumpversion-candidate
217 | bumpversion-candidate: ## Bump the version to the next candidate
218 | bumpversion candidate --no-tag
219 | 
220 | .PHONY: bumpversion-minor
221 | bumpversion-minor: ## Bump the version to the next minor, skipping the release
222 | bumpversion --no-tag minor
223 | 
224 | .PHONY: bumpversion-major
225 | bumpversion-major: ## Bump the version to the next major, skipping the release
226 | bumpversion --no-tag major
227 | 
228 | .PHONY: bumpversion-revert
229 | bumpversion-revert: ## Undo a previous bumpversion-release
230 | git checkout master
231 | git branch -D stable
232 | 
233 | CLEAN_DIR := $(shell git status --short | grep -v ??)
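# CLEAN_DIR is non-empty when tracked files have uncommitted changes (untracked '??' entries are filtered out);
# together with CURRENT_BRANCH and CHANGELOG_LINES below, it feeds the check-clean, check-master and check-history guards.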
234 | CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) 235 | CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) 236 | 237 | .PHONY: check-clean 238 | check-clean: ## Check if the directory has uncommitted changes 239 | ifneq ($(CLEAN_DIR),) 240 | $(error There are uncommitted changes) 241 | endif 242 | 243 | .PHONY: check-master 244 | check-master: ## Check if we are in master branch 245 | ifneq ($(CURRENT_BRANCH),master) 246 | $(error Please make the release from master branch\n) 247 | endif 248 | 249 | .PHONY: check-history 250 | check-history: ## Check if HISTORY.md has been modified 251 | ifeq ($(CHANGELOG_LINES),0) 252 | $(error Please insert the release notes in HISTORY.md before releasing) 253 | endif 254 | 255 | .PHONY: check-release 256 | check-release: check-clean check-master check-history ## Check if the release can be made 257 | @echo "A new release can be made" 258 | 259 | .PHONY: release 260 | release: check-release bumpversion-release publish bumpversion-patch 261 | 262 | .PHONY: release-test 263 | release-test: check-release bumpversion-release-test publish-test bumpversion-revert 264 | 265 | .PHONY: release-candidate 266 | release-candidate: check-master publish bumpversion-candidate 267 | 268 | .PHONY: release-candidate-test 269 | release-candidate-test: check-clean check-master publish-test 270 | 271 | .PHONY: release-minor 272 | release-minor: check-release bumpversion-minor release 273 | 274 | .PHONY: release-major 275 | release-major: check-release bumpversion-major release 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [header images: DAI-Lab logo ("An Open Source Project from the Data to AI Lab, at MIT") and the "MLBlocks" logo]
13 | Pipelines and Primitives for Machine Learning and Data Science.
14 | 
15 | 16 | [![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha) 17 | [![PyPi](https://img.shields.io/pypi/v/mlblocks.svg)](https://pypi.python.org/pypi/mlblocks) 18 | [![Tests](https://github.com/MLBazaar/MLBlocks/workflows/Run%20Tests/badge.svg)](https://github.com/MLBazaar/MLBlocks/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster) 19 | [![CodeCov](https://codecov.io/gh/MLBazaar/MLBlocks/branch/master/graph/badge.svg)](https://codecov.io/gh/MLBazaar/MLBlocks) 20 | [![Downloads](https://pepy.tech/badge/mlblocks)](https://pepy.tech/project/mlblocks) 21 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/MLBazaar/MLBlocks/master?filepath=examples/tutorials) 22 | 23 |
23 | 
24 | # MLBlocks
25 | 
26 | * Documentation: https://mlbazaar.github.io/MLBlocks
27 | * Github: https://github.com/MLBazaar/MLBlocks
28 | * License: [MIT](https://github.com/MLBazaar/MLBlocks/blob/master/LICENSE)
29 | * Development Status: [Pre-Alpha](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
30 | 
31 | ## Overview
32 | 
33 | MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by
34 | seamlessly combining tools from any Python library with a simple, common and uniform interface.
35 | 
36 | Features include:
37 | 
38 | * Build Machine Learning Pipelines combining **any Machine Learning Library in Python**.
39 | * Access a repository with hundreds of primitives and pipelines ready to be used with little to
40 | no Python code to write, carefully curated by Machine Learning and Domain experts.
41 | * Extract machine-readable information about which hyperparameters can be tuned and within
42 | which ranges, allowing automated integration with Hyperparameter Optimization tools like
43 | [BTB](https://github.com/MLBazaar/BTB).
44 | * Complex multi-branch pipelines and DAG configurations, with an unlimited number of inputs and
45 | outputs per primitive.
46 | * Easily save and load Pipelines using JSON Annotations.
47 | 
48 | # Install
49 | 
50 | ## Requirements
51 | 
52 | **MLBlocks** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12 and 3.13](https://www.python.org/downloads/).
53 | 
54 | ## Install with `pip`
55 | 
56 | The easiest and recommended way to install **MLBlocks** is using [pip](
57 | https://pip.pypa.io/en/stable/):
58 | 
59 | ```bash
60 | pip install mlblocks
61 | ```
62 | 
63 | This will pull and install the latest stable release from [PyPI](https://pypi.org/).
64 | 
65 | If you want to install from source or contribute to the project, please read the
66 | [Contributing Guide](https://mlbazaar.github.io/MLBlocks/contributing.html#get-started).
67 | 
68 | ## MLPrimitives
69 | 
70 | In order to be usable, MLBlocks requires a compatible primitives library.
71 | 
72 | The official library, which is required in order to follow the rest of this tutorial,
73 | is [MLPrimitives](https://github.com/MLBazaar/MLPrimitives), which you can install
74 | with this command:
75 | 
76 | ```bash
77 | pip install mlprimitives
78 | ```
79 | 
80 | # Quickstart
81 | 
82 | Below is a short example of how to use **MLBlocks** to solve the [Adult Census
83 | Dataset](https://archive.ics.uci.edu/ml/datasets/Adult) classification problem using a
84 | pipeline which combines primitives from [MLPrimitives](https://github.com/MLBazaar/MLPrimitives),
85 | [scikit-learn](https://scikit-learn.org/) and [xgboost](https://xgboost.readthedocs.io/). 
87 | 
88 | ```python3
89 | import pandas as pd
90 | from mlblocks import MLPipeline
91 | from sklearn.model_selection import train_test_split
92 | from sklearn.metrics import accuracy_score
93 | 
94 | dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv')
95 | label = dataset.pop('label')
96 | 
97 | X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label)
98 | 
99 | primitives = [
100 | 'mlprimitives.custom.preprocessing.ClassEncoder',
101 | 'mlprimitives.custom.feature_extraction.CategoricalEncoder',
102 | 'sklearn.impute.SimpleImputer',
103 | 'xgboost.XGBClassifier',
104 | 'mlprimitives.custom.preprocessing.ClassDecoder'
105 | ]
106 | pipeline = MLPipeline(primitives)
107 | 
108 | pipeline.fit(X_train, y_train)
109 | predictions = pipeline.predict(X_test)
110 | 
111 | accuracy_score(y_test, predictions)
112 | ```
113 | 
114 | # What's Next?
115 | 
116 | If you want to learn more about how to tune the pipeline hyperparameters, save and load
117 | the pipelines using JSON annotations or build complex multi-branched pipelines, please
118 | check our [documentation site](https://mlbazaar.github.io/MLBlocks).
119 | 
120 | Also do not forget to have a look at the [notebook tutorials](
121 | https://github.com/MLBazaar/MLBlocks/tree/master/examples/tutorials)!
122 | 
123 | # Citing MLBlocks
124 | 
125 | If you use MLBlocks for your research, please consider citing our related papers.
126 | 
127 | For the current design of MLBlocks and its usage within the larger *Machine Learning Bazaar* project at
128 | the MIT Data To AI Lab, please see:
129 | 
130 | Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. ["The Machine Learning Bazaar:
131 | Harnessing the ML Ecosystem for Effective System Development."](https://arxiv.org/abs/1905.08942) arXiv
132 | Preprint 1905.08942. 2019.
133 | 
134 | ```bibtex
135 | @article{smith2019mlbazaar,
136 | author = {Smith, Micah J. and Sala, Carles and Kanter, James Max and Veeramachaneni, Kalyan},
137 | title = {The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development},
138 | journal = {arXiv e-prints},
139 | year = {2019},
140 | eid = {arXiv:1905.08942},
141 | pages = {arXiv:1905.08942},
142 | archivePrefix = {arXiv},
143 | eprint = {1905.08942},
144 | }
145 | ```
146 | 
147 | For the first MLBlocks version from 2015, designed only for multi-table, multi-entity temporal data, please
148 | refer to Bryan Collazo’s thesis:
149 | 
150 | * [Machine learning blocks](https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf).
151 | Bryan Collazo. Master's thesis, MIT EECS, 2015.
152 | 
153 | With the recent availability of a multitude of libraries and tools, we decided it was time to integrate
154 | them and expand the library to address other data types (images, text, graphs, time series) and to
155 | integrate with deep learning libraries.
156 | -------------------------------------------------------------------------------- /apt.txt: -------------------------------------------------------------------------------- 1 | # apt-get requirements for development and mybinder environment
2 | graphviz
3 | pandoc
4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line. 
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = mlblocks
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | -------------------------------------------------------------------------------- /docs/advanced_usage/adding_primitives.rst: -------------------------------------------------------------------------------- 1 | Adding Primitives
2 | =================
3 | 
4 | The **MLBlocks** library is only the engine, and it is of no use without primitives, so here we
5 | explain how to add new primitives to **MLBlocks**.
6 | 
7 | MLPrimitives
8 | ------------
9 | 
10 | **MLBlocks** has a related project, `MLPrimitives`_, which already includes a huge list of
11 | integrated primitives, so the easiest and recommended way to add primitives for **MLBlocks**
12 | is to install **MLPrimitives**.
13 | 
14 | This can be achieved by running the command::
15 | 
16 | pip install mlprimitives
17 | 
18 | For further details, please refer to the `MLPrimitives Documentation`_.
19 | 
20 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives
21 | .. _MLPrimitives Documentation: https://mlbazaar.github.io/MLPrimitives/
22 | 
23 | Writing Primitives
24 | ------------------
25 | 
26 | Sometimes you will find that you want to use a primitive that is not in the list of
27 | `MLPrimitives integrated primitives`_, so you will have to integrate the primitive yourself
28 | by writing the corresponding `JSON annotation <primitives.html#json-annotations>`_.
29 | 
30 | .. _MLPrimitives integrated primitives: https://github.com/MLBazaar/MLPrimitives/tree/master/mlblocks_primitives
31 | 
32 | .. note:: If you create new primitives for MLBlocks, please consider contributing them to the
33 | **MLPrimitives** project!
34 | 
35 | The first thing to do when adding a new primitive is making sure that it complies with the
36 | necessary requirements, which depend on whether the primitive is a function or a class.
37 | 
38 | For `Function Primitives`_, the only requirement is that they have to be a single function.
39 | Calling multiple functions sequentially as part of a single primitive is not supported, and
40 | in order to achieve this you are expected to write a separate primitive for each function.
41 | 
42 | For `Class Primitives`_, just like the function primitives, the `fit` and `produce` phases
43 | must consist of a single method each. Calling multiple methods sequentially within a single
44 | primitive is not supported either.
45 | 
46 | `Class Primitives`_ also need to be fully initialized with a single instantiation call. Running setup or compilation
47 | calls after the instance creation is not possible.
48 | 
49 | .. _Function Primitives: primitives.html#function-primitives
50 | .. _Class Primitives: primitives.html#class-primitives
51 | 
52 | Primitives Lookup
53 | -----------------
54 | 
55 | Once you have written the JSON annotation for your primitive, you will need to put it in a
56 | place known to **MLBlocks**.
57 | 
58 | **MLBlocks** looks for primitives in the following folders, in this order:
59 | 
60 | 1. Any folder specified by the user, starting with the most recently added one.
61 | 2. 
A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the current working directory.
62 | 3. A folder named ``mlblocks_primitives`` or ``mlprimitives`` in the `system prefix`_.
63 | 
64 | .. _system prefix: https://docs.python.org/3/library/sys.html#sys.prefix
65 | 
66 | The list of folders where **MLBlocks** will search for primitives can be seen at any time
67 | by calling the method `mlblocks.get_primitives_paths`_.
68 | 
69 | .. _mlblocks.get_primitives_paths: ../api_reference.html#mlblocks.get_primitives_paths
70 | 
71 | Adding a Primitives Folder
72 | --------------------------
73 | 
74 | The simplest way to quickly add new primitives is to put their JSON annotations
75 | in a folder called `mlblocks_primitives` in the root of your project, or in your current
76 | working directory.
77 | 
78 | However, sometimes you will want to add a custom directory.
79 | 
80 | This can be easily done by using the `mlblocks.add_primitives_path`_ method.
81 | 
82 | .. _mlblocks.add_primitives_path: ../api_reference.html#mlblocks.add_primitives_path
83 | 
84 | Developing a Primitives Library
85 | -------------------------------
86 | 
87 | Another option to add multiple primitives is to create a primitives library, such as
88 | `MLPrimitives`_.
89 | 
90 | In order to make **MLBlocks** able to find the primitives defined in such a library,
91 | all you need to do is set up an `Entry Point`_ in your `setup.py` script with the
92 | following specification:
93 | 
94 | 1. It has to be published under the group ``mlblocks``.
95 | 2. It has to be named exactly ``primitives``.
96 | 3. It has to point at a variable that contains a path or a list of paths to the JSON folder(s).
97 | 
98 | An example of such an entry point would be::
99 | 
100 | entry_points = {
101 | 'mlblocks': [
102 | 'primitives=some_module:SOME_VARIABLE'
103 | ]
104 | }
105 | 
106 | where the module `some_module` contains a variable such as::
107 | 
108 | SOME_VARIABLE = 'path/to/primitives'
109 | 
110 | or::
111 | 
112 | SOME_VARIABLE = [
113 | 'path/to/primitives',
114 | 'path/to/more/primitives'
115 | ]
116 | 
117 | .. _Entry Point: https://packaging.python.org/specifications/entry-points/
118 | -------------------------------------------------------------------------------- /docs/advanced_usage/hyperparameters.rst: -------------------------------------------------------------------------------- 1 | Hyperparameters
2 | ===============
3 | 
4 | A very important element of both Function and Class primitives is their hyperparameters.
5 | 
6 | The hyperparameters are arguments that modify the behavior of the primitive and its learning
7 | process, which are set before the learning process starts and are not deduced from the data.
8 | These hyperparameters are usually passed as arguments to the primitive constructor or to the
9 | methods or functions that will be called during the fitting or predicting phase.
10 | 
11 | In **MLBlocks**, each primitive has all its hyperparameters and their valid values specified
12 | in their `JSON Annotations`_. 
13 | 
14 | Here, for example, we are looking at the ``hyperparameters`` section of the
15 | ``keras.preprocessing.text.Tokenizer`` primitive from `MLPrimitives`_::
16 | 
17 | "hyperparameters": {
18 | "fixed": {
19 | "filters": {
20 | "type": "str",
21 | "default": "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n"
22 | },
23 | "split": {
24 | "type": "str",
25 | "default": " "
26 | },
27 | "oov_token": {
28 | "type": "str",
29 | "default": null
30 | }
31 | },
32 | "tunable": {
33 | "num_words": {
34 | "type": "int",
35 | "default": null,
36 | "range": [1, 10000]
37 | },
38 | "lower": {
39 | "type": "bool",
40 | "default": true
41 | },
42 | "char_level": {
43 | "type": "bool",
44 | "default": false
45 | }
46 | }
47 | }
48 | 
49 | As can be seen, two types of hyperparameters exist: **fixed** and **tunable**.
50 | 
51 | Fixed Hyperparameters
52 | ---------------------
53 | 
54 | These hyperparameters do not alter the learning process, and their values modify
55 | the behavior of the primitive but not its prediction performance. In some cases these
56 | hyperparameters have a default value, but most of the time their values have to be explicitly
57 | set by the user.
58 | 
59 | In the `JSON Annotations`_, these hyperparameters are specified as a JSON that has the argument
60 | name as the keyword and a nested JSON that specifies its details::
61 | 
62 | "fixed": {
63 | "filters": {
64 | "type": "str",
65 | "default": "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n"
66 | },
67 | "split": {
68 | "type": "str",
69 | "default": " "
70 | },
71 | "oov_token": {
72 | "type": "str",
73 | "default": null
74 | }
75 | }
76 | 
77 | Each entry in the ``fixed`` hyperparameters contains:
78 | 
79 | * **default**: This indicates the default value that the argument will take if the user does
80 | not specify another value when the `MLPipeline`_ is created. This keyword is optional, and
81 | if it is not specified, the user is expected to always provide a value.
82 | * **type**: The type of the argument. This is only informative and is not used by MLBlocks, but
83 | it is always included in all the `MLPrimitives`_ annotations.
84 | 
85 | Tunable Hyperparameters
86 | -----------------------
87 | 
88 | These hyperparameters do not modify the primitive behavior, but they have a direct
89 | impact on the learning process and on how well the primitive learns from the data.
90 | For this reason, their values can be tuned to improve the prediction performance.
91 | 
92 | In the `JSON Annotations`_, these hyperparameters are specified as a JSON that has the argument
93 | name as the keyword and a nested JSON that specifies its details::
94 | 
95 | "tunable": {
96 | "num_words": {
97 | "type": "int",
98 | "default": null,
99 | "range": [1, 10000]
100 | },
101 | "lower": {
102 | "type": "bool",
103 | "default": true
104 | },
105 | "char_level": {
106 | "type": "bool",
107 | "default": false
108 | }
109 | }
110 | 
111 | Each entry in the ``tunable`` hyperparameters contains:
112 | 
113 | * **type**: The type of the argument. This can be one of the primitive variable types, ``int``,
114 | ``float``, ``str`` or ``bool``, or one of the special types, `multitype`_ or `conditional`_.
115 | * **default**: This indicates the default value that the argument will take if the user does
116 | not specify another value when the `MLPipeline`_ is created.
117 | * **range**: Optional - This is expected to be found in numeric hyperparameters, and specifies
118 | the minimum and maximum values that this primitive will work well with. 
119 | * **values**: Optional - This is expected to be found in categorical hyperparameters, and
120 | indicates the list of possible values that it can work with.
121 | 
122 | Special Hyperparameter Types
123 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
124 | 
125 | Sometimes, hyperparameters do not accept only one type of value, or their possible values may
126 | depend on the value of other hyperparameters.
127 | 
128 | Multitype Hyperparameters
129 | *************************
130 | 
131 | Some hyperparameters accept more than one type of value.
132 | 
133 | For example, suppose a primitive expects a hyperparameter called `max_features` that can take
134 | one of three types:
135 | 
136 | * An integer indicating the absolute number of features to use.
137 | * A float between 0 and 1 indicating the proportion of the maximum possible number of features.
138 | * The strings ``"min"``, ``"max"`` or ``"mean"``, indicating that the number needs to be computed
139 | by the primitive itself in some way.
140 | 
141 | In this case, the ``type`` of this hyperparameter is ``multitype``, and its specification could
142 | be as follows::
143 | 
144 | "max_features": {
145 | "type": "multitype",
146 | "default": "mean",
147 | "types": {
148 | "int": {
149 | "range": [1, 100]
150 | },
151 | "float": {
152 | "range": [0.1, 0.9]
153 | },
154 | "string": {
155 | "values": ["mean", "min", "max"]
156 | }
157 | }
158 | }
159 | 
160 | Note how a new keyword ``types`` exists, which holds the possible values for each one of the
161 | possible types that this hyperparameter can have.
162 | 
163 | Conditional Hyperparameters
164 | ***************************
165 | 
166 | In some other cases, the values that a hyperparameter can take depend on the value of another
167 | one.
168 | For example, sometimes a primitive has a hyperparameter that specifies a kernel, and depending
169 | on the kernel used some other hyperparameters may or may not be used, or they might be able
170 | to take only some specific values.
171 | 
172 | In this case, the ``type`` of the hyperparameter whose values depend on the other is specified
173 | as ``conditional``.
174 | When this happens, two additional entries are required:
175 | 
176 | * an entry called ``condition``, which specifies the name of the other hyperparameter, the value
177 | of which is evaluated to decide which values this hyperparameter can take.
178 | * an additional subdictionary called ``values``, which relates the possible values that the
179 | `condition` hyperparameter can have with the full specifications of the type and values that
180 | this hyperparameter can take in each case.
181 | 
182 | Suppose, for example, that the primitive explained in the previous point does not expect
183 | the ``mean``, ``min`` or ``max`` strings as values for the ``max_features`` hyperparameter,
184 | but as a separate one called ``max_features_aggregation``, which is only used when the
185 | ``max_features`` hyperparameter has been given the value ``auto``. 
186 | 187 | In this case, the hyperparameters would be annotated like this:: 188 | 189 | "max_features": { 190 | "type": "multitype", 191 | "default": "auto", 192 | "types": { 193 | "int": { 194 | "range": [1, 100] 195 | }, 196 | "float": { 197 | "range": [0.1, 0.9] 198 | }, 199 | "string": { 200 | "values": ["auto"] 201 | } 202 | } 203 | } 204 | "max_features_aggregation": { 205 | "type": "conditional", 206 | "condition": "max_features", 207 | "default": null, 208 | "values": { 209 | "auto": { 210 | "description": "this will be used only if the value of max_features is `auto`", 211 | "type": "str", 212 | "default": "mean", 213 | "values": ["mean", "max", "min"] 214 | } 215 | } 216 | } 217 | 218 | .. note:: Just like a regular hyperparameter, if there is no match the ``default`` entry is used. 219 | In this example, the ``null`` value indicates that the hyperparameter needs to be 220 | disabled if there is no match, but we could instead provide a full specification 221 | of type, range and default value as a nested dictionary to be used by default. 222 | 223 | .. _JSON Annotations: primitives.html#json-annotations 224 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives 225 | .. _BTB: https://github.com/MLBazaar/BTB 226 | .. _MLPipeline: ../api_reference.html#mlblocks.MLPipeline 227 | .. _multitype: #multitype-hyperparameters 228 | .. _conditional: #conditional-hyperparameters 229 | -------------------------------------------------------------------------------- /docs/advanced_usage/primitives.rst: -------------------------------------------------------------------------------- 1 | Primitives 2 | ========== 3 | 4 | The goal of MLBlocks is to seamlessly combine any possible set of Machine Learning tools developed 5 | in Python, whether they are custom developments or belong to third party libraries, and to 6 | build `pipelines`_ out of them that can be fitted and then used to make predictions. 7 | 8 | We call each one of these Machine Learning tools a **primitive**. 9 | 10 | What is a Primitive? 11 | -------------------- 12 | 13 | A valid MLBlocks primitive is an importable Python object that: 14 | 15 | * Must be either a function or a class. 16 | * If it is a class, it **might** have a `fitting` stage, where the primitive is passed some 17 | training data and it `learns` from it, and which can be executed with a single method call. 18 | Function primitives have no `fitting` stage. 19 | * **Must** have a `producing` stage, where the primitive is passed some data and it returns some 20 | other data, whether it is a transformation of the input data or some new data derived from it, 21 | such as a set of predictions. This `producing` stage must be executed with a single function or 22 | method call. 23 | * Might have `hyperparameters`_, additional arguments to be passed to either the function call or 24 | the class constructor in order to alter or control the way the fitting and producing stages work.
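To make these requirements concrete, below is a minimal sketch of what a hypothetical class
primitive could look like in plain Python, assuming a ``pandas.DataFrame`` input. The class and
method names are purely illustrative; any importable object with this shape can be annotated
and used by MLBlocks:

.. code-block:: python

    class MeanImputer:
        """Hypothetical primitive that fills in missing values with column means."""

        def fit(self, X):
            # fitting stage: learn the per-column means from the training data
            self._means = X.mean()

        def transform(self, X):
            # producing stage: apply the learned means to new data
            return X.fillna(self._means)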
25 | 26 | Here are some examples of primitives: 27 | 28 | +-----------------------------------------------+-----------+--------------+--------------------+ 29 | | primitive | type | fit | produce | 30 | +===============================================+===========+==============+====================+ 31 | | sklearn.preprocessing.StandardScaler | class | fit | transform | 32 | +-----------------------------------------------+-----------+--------------+--------------------+ 33 | | sklearn.ensemble.RandomForestClassifier | class | fit | predict | 34 | +-----------------------------------------------+-----------+--------------+--------------------+ 35 | | skimage.feature.hog | function | -- | -- | 36 | +-----------------------------------------------+-----------+--------------+--------------------+ 37 | | xgboost.XGBRegressor | class | fit | predict | 38 | +-----------------------------------------------+-----------+--------------+--------------------+ 39 | | keras.applications.resnet50.preprocess_input | function | -- | -- | 40 | +-----------------------------------------------+-----------+--------------+--------------------+ 41 | | keras.applications.resnet50.ResNet50 | class | -- | predict | 42 | +-----------------------------------------------+-----------+--------------+--------------------+ 43 | | keras.preprocessing.sequence.pad_sequences | function | -- | -- | 44 | +-----------------------------------------------+-----------+--------------+--------------------+ 45 | | keras.preprocessing.text.Tokenizer | class | fit_on_texts | texts_to_sequences | 46 | +-----------------------------------------------+-----------+--------------+--------------------+ 47 | | lightfm.LightFM | class | fit | predict | 48 | +-----------------------------------------------+-----------+--------------+--------------------+ 49 | 50 | JSON Annotations 51 | ---------------- 52 | 53 | Each integrated primitive has an associated JSON file that specifies its methods, their arguments, 54 | their types and, most importantly, any possible `hyperparameters`_ that the primitive has, as well 55 | as their types, ranges and conditions, if any. 56 | 57 | These JSON annotations can be: 58 | 59 | * **Installed** using the `MLPrimitives`_ related project, which is the recommended approach. 60 | * **Created by the user** and configured for MLBlocks to use them. 61 | 62 | And the primitives can be of two types: 63 | 64 | * Function Primitives: Simple functions that can be called directly. 65 | * Class Primitives: Class objects that need to be instantiated before they can be used. 66 | 67 | Here are some simplified examples of these JSONs, but for more detailed examples, please refer to 68 | the `examples folder`_ of the project. 69 | 70 | Function Primitives 71 | ~~~~~~~~~~~~~~~~~~~ 72 | 73 | The simplest type of primitives are plain functions that can be called directly, without 74 | the need to create any class instance first. 75 | 76 | In most cases, if not all, these functions do not have any associated learning process, 77 | and their behavior is always the same both during the fitting and the predicting phases 78 | of the pipeline. 79 | 80 | A simple example of such a primitive would be the ``numpy.argmax`` function, which expects a 2 81 | dimensional array as input, and returns a 1 dimensional array that indicates the indexes of the 82 | maximum values along an axis.
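For instance, with ``axis=1`` its behavior would be the following (a quick illustration, not
part of the annotation)::

    >>> import numpy as np
    >>> np.argmax(np.array([[0.1, 0.9], [0.8, 0.2]]), axis=1)
    array([1, 0])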
83 | 84 | The simplest JSON annotation for this primitive would look like this:: 85 | 86 | { 87 | "primitive": "numpy.argmax", 88 | "produce": { 89 | "args": [ 90 | { 91 | "name": "y", 92 | "type": "ndarray" 93 | } 94 | ], 95 | "output": [ 96 | { 97 | "name": "y", 98 | "type": "ndarray" 99 | } 100 | ] 101 | }, 102 | "hyperparameters": { 103 | "fixed": { 104 | "axis": { 105 | "type": "int", 106 | "default": 1 107 | } 108 | } 109 | } 110 | } 111 | 112 | The main elements of this JSON are: 113 | 114 | * **primitive**: The fully qualified, directly importable name of the function to be used:: 115 | 116 | "primitive": "numpy.argmax", 117 | 118 | * **produce**: A nested JSON that specifies the names and types of arguments and the output values 119 | of the primitive:: 120 | 121 | "produce": { 122 | "args": [ 123 | { 124 | "name": "y", 125 | "type": "ndarray" 126 | } 127 | ], 128 | "output": [ 129 | { 130 | "name": "y", 131 | "type": "ndarray" 132 | } 133 | ] 134 | } 135 | 136 | * **hyperparameters**: A nested JSON that specifies the `hyperparameters`_ of this primitive. 137 | Note that multiple types of hyperparameters exist, but that this primitive has only one ``fixed`` 138 | hyperparameter, which means that it is not tunable and that, even though the user can specify 139 | a value different from the default, changes are not expected during the MLBlock instance life 140 | cycle:: 141 | 142 | "hyperparameters": { 143 | "fixed": { 144 | "axis": { 145 | "type": "int", 146 | "default": 1 147 | } 148 | } 149 | } 150 | 151 | Class Primitives 152 | ~~~~~~~~~~~~~~~~ 153 | 154 | A more complex type of primitives are classes which need to be instantiated before they can 155 | be used. 156 | 157 | In most cases, these classes will have an associated learning process, and they will have some 158 | fit method or equivalent that will be called during the fitting phase but not during the 159 | predicting one. 160 | 161 | A simple example of such a primitive would be the ``sklearn.preprocessing.StandardScaler`` class, 162 | which is used to standardize a set of values by calculating their z-score, which means centering 163 | them around 0 and scaling them to unit variance. 164 | 165 | This primitive has an associated learning process, where it calculates the mean and standard 166 | deviation of the training data, to later use them to transform the prediction data to the 167 | same center and scale. 168 | 169 | The simplest JSON annotation for this primitive would look like this:: 170 | 171 | { 172 | "primitive": "sklearn.preprocessing.StandardScaler", 173 | "fit": { 174 | "method": "fit", 175 | "args": [ 176 | { 177 | "name": "X", 178 | "type": "ndarray" 179 | } 180 | ] 181 | }, 182 | "produce": { 183 | "method": "transform", 184 | "args": [ 185 | { 186 | "name": "X", 187 | "type": "ndarray" 188 | } 189 | ], 190 | "output": [ 191 | { 192 | "name": "X", 193 | "type": "ndarray" 194 | } 195 | ] 196 | }, 197 | "hyperparameters": { 198 | "tunable": { 199 | "with_mean": { 200 | "type": "bool", 201 | "default": true 202 | }, 203 | "with_std": { 204 | "type": "bool", 205 | "default": true 206 | } 207 | } 208 | } 209 | } 210 | 211 | Note that there are some details of this JSON annotation that make it different from the 212 | Function Primitive one explained above: 213 | 214 | * **primitive**: The fully qualified, directly importable name of the class to be used.
This 215 | class is the one that will be used to create the actual primitive instance:: 216 | 217 | "primitive": "sklearn.preprocessing.StandardScaler", 218 | 219 | * **fit**: A nested JSON that specifies the name of the method to call during the fitting phase, 220 | which in this case happens to also be ``fit``, as well as the names and types of 221 | arguments that this method expects:: 222 | 223 | "fit": { 224 | "method": "fit", 225 | "args": [ 226 | { 227 | "name": "X", 228 | "type": "ndarray" 229 | } 230 | ] 231 | } 232 | 233 | * **produce**: A nested JSON that specifies the name of the method to call during the predicting 234 | phase, in this case called ``transform``, as well as the names and types of 235 | arguments that this method expects and its outputs:: 236 | 237 | "produce": { 238 | "method": "transform", 239 | "args": [ 240 | { 241 | "name": "X", 242 | "type": "ndarray" 243 | } 244 | ], 245 | "output": [ 246 | { 247 | "name": "X", 248 | "type": "ndarray" 249 | } 250 | ] 251 | } 252 | 253 | * **hyperparameters**: A nested JSON that specifies the hyperparameters of this primitive. 254 | In this case, only ``tunable`` hyperparameters are specified, with their 255 | names and types. If the type was something other than ``bool``, a list or 256 | range of valid values would also be specified:: 257 | 258 | "hyperparameters": { 259 | "tunable": { 260 | "with_mean": { 261 | "type": "bool", 262 | "default": true 263 | }, 264 | "with_std": { 265 | "type": "bool", 266 | "default": true 267 | } 268 | } 269 | } 270 | 271 | The MLBlock Class 272 | ----------------- 273 | 274 | Within the **MLBlocks** library, a primitive is represented through the `mlblocks.MLBlock`_ class. 275 | 276 | This is used to wrap around the annotated primitives, offering a common and uniform interface to 277 | all of them. 278 | 279 | More specifically, the `mlblocks.MLBlock`_ class offers two public methods, `fit`_ and `produce`_, 280 | which are directly linked to the methods specified in the JSON Annotation. 281 | 282 | For example, we can look at the `keras.preprocessing.text.Tokenizer`_ primitive from 283 | `MLPrimitives`_, which calls the method ``fit_on_texts`` when ``fit`` is called, and 284 | ``texts_to_sequences`` when ``produce`` is called: 285 | 286 | .. graphviz:: 287 | 288 | digraph { 289 | { 290 | node [shape=box] 291 | fit_on_texts; 292 | texts_to_sequences; 293 | fit; 294 | produce; 295 | } 296 | subgraph cluster_1 { 297 | {rank=same; fit produce}; 298 | fit -> produce [style=invis]; 299 | fit -> fit_on_texts; 300 | produce -> texts_to_sequences; 301 | label = "mlblocks.MLBlock"; 302 | subgraph cluster_2 { 303 | fit_on_texts; 304 | texts_to_sequences; 305 | label = "keras.preprocessing.text.Tokenizer"; 306 | } 307 | } 308 | } 309 | 310 | For a more detailed description of this class, please check the corresponding 311 | section in the `API Reference`_ documentation. 312 | 313 | .. _API Reference: ../api_reference.html 314 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives 315 | .. _keras.preprocessing.text.Tokenizer: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.preprocessing.text.Tokenizer.json 316 | .. _hyperparameters: hyperparameters.html 317 | .. _mlblocks.MLBlock: ../api_reference.html#mlblocks.MLBlock 318 | .. _pipelines: pipelines.html 319 | .. _examples folder: https://github.com/MLBazaar/MLBlocks/tree/master/examples 320 | .. _fit: ../api_reference.html#mlblocks.MLBlock.fit 321 | ..
_produce: ../api_reference.html#mlblocks.MLBlock.produce 322 | -------------------------------------------------------------------------------- /docs/api/mlblocks.rst: -------------------------------------------------------------------------------- 1 | mlblocks 2 | ======== 3 | 4 | .. automodule:: mlblocks 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../HISTORY.md 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # MLBlocks documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | 21 | import sphinx_rtd_theme # For read the docs theme 22 | 23 | import mlblocks 24 | 25 | # -- General configuration --------------------------------------------- 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 32 | extensions = [ 33 | 'm2r', 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.githubpages', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.napoleon', 38 | 'sphinx.ext.graphviz', 39 | 'IPython.sphinxext.ipython_console_highlighting', 40 | 'IPython.sphinxext.ipython_directive', 41 | 'autodocsumm', 42 | ] 43 | 44 | autodoc_default_options = { 45 | 'autosummary': True, 46 | } 47 | 48 | ipython_execlines = ["import pandas as pd", "pd.set_option('display.width', 1000000)"] 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | source_suffix = ['.rst', '.md', '.ipynb'] 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # General information about the project. 60 | project = 'MLBlocks' 61 | slug = 'mlblocks' 62 | title = project + ' Documentation', 63 | copyright = '2018, MIT Data To AI Lab' 64 | author = 'MIT Data To AI Lab' 65 | description = 'Pipelines and Primitives for Machine Learning and Data Science.' 66 | user = 'MLBazaar' 67 | 68 | # The version info for the project you're documenting, acts as replacement 69 | # for |version| and |release|, also used in various other places throughout 70 | # the built documents. 71 | # 72 | # The short X.Y version. 
73 | version = mlblocks.__version__ 74 | # The full version, including alpha/beta/rc tags. 75 | release = mlblocks.__version__ 76 | 77 | # The language for content autogenerated by Sphinx. Refer to documentation 78 | # for a list of supported languages. 79 | # 80 | # This is also used if you do content translation via gettext catalogs. 81 | # Usually you set "language" from the command line for these cases. 82 | language = None 83 | 84 | # List of patterns, relative to source directory, that match files and 85 | # directories to ignore when looking for source files. 86 | # This patterns also effect to html_static_path and html_extra_path 87 | exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 88 | 89 | # The name of the Pygments (syntax highlighting) style to use. 90 | pygments_style = 'sphinx' 91 | 92 | # If true, `todo` and `todoList` produce output, else they produce nothing. 93 | todo_include_todos = False 94 | 95 | # -- Options for HTML output ------------------------------------------- 96 | 97 | # The theme to use for HTML and HTML Help pages. See the documentation for 98 | # a list of builtin themes. 99 | # 100 | html_theme = 'sphinx_rtd_theme' 101 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 102 | 103 | # Readthedocs additions 104 | html_context = { 105 | 'display_github': True, 106 | 'github_user': user, 107 | 'github_repo': project, 108 | 'github_version': 'master', 109 | 'conf_py_path': '/docs/', 110 | } 111 | 112 | # Theme options are theme-specific and customize the look and feel of a 113 | # theme further. For a list of options available for each theme, see the 114 | # documentation. 115 | html_theme_options = { 116 | 'collapse_navigation': False, 117 | 'display_version': True, 118 | } 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | # html_static_path = ['_static'] 124 | 125 | # The name of an image file (relative to this directory) to use as a favicon of 126 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 127 | # pixels large. 128 | html_favicon = 'images/favicon.ico' 129 | 130 | # If given, this must be the name of an image file (path relative to the 131 | # configuration directory) that is the logo of the docs. It is placed at 132 | # the top of the sidebar; its width should therefore not exceed 200 pixels. 133 | # html_logo = 'images/mlblocks-logo-small.png' 134 | 135 | # -- Options for HTMLHelp output --------------------------------------- 136 | 137 | # Output file base name for HTML help builder. 138 | htmlhelp_basename = slug + 'doc' 139 | 140 | 141 | # -- Options for LaTeX output ------------------------------------------ 142 | 143 | latex_elements = { 144 | # The paper size ('letterpaper' or 'a4paper'). 145 | # 146 | # 'papersize': 'letterpaper', 147 | 148 | # The font size ('10pt', '11pt' or '12pt'). 149 | # 150 | # 'pointsize': '10pt', 151 | 152 | # Additional stuff for the LaTeX preamble. 153 | # 154 | # 'preamble': '', 155 | 156 | # Latex figure (float) alignment 157 | # 158 | # 'figure_align': 'htbp', 159 | } 160 | 161 | # Grouping the document tree into LaTeX files. List of tuples 162 | # (source start file, target name, title, author, documentclass 163 | # [howto, manual, or own class]). 
164 | latex_documents = [( 165 | master_doc, 166 | slug + '.tex', 167 | title, 168 | author, 169 | 'manual' 170 | )] 171 | 172 | 173 | # -- Options for manual page output ------------------------------------ 174 | 175 | # One entry per manual page. List of tuples 176 | # (source start file, name, description, authors, manual section). 177 | man_pages = [( 178 | master_doc, 179 | slug, 180 | title, 181 | [author], 182 | 1 183 | )] 184 | 185 | 186 | # -- Options for Texinfo output ---------------------------------------- 187 | 188 | # Grouping the document tree into Texinfo files. List of tuples 189 | # (source start file, target name, title, author, 190 | # dir menu entry, description, category) 191 | texinfo_documents = [( 192 | master_doc, 193 | slug, 194 | title, 195 | author, 196 | slug, 197 | description, 198 | 'Miscellaneous' 199 | )] 200 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/getting_started/install.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | Installation 4 | ============ 5 | 6 | From PyPi 7 | --------- 8 | 9 | The simplest and recommended way to install MLBlocks is using `pip`: 10 | 11 | .. code-block:: console 12 | 13 | pip install mlblocks 14 | 15 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 16 | you through the process. 17 | 18 | .. _pip: https://pip.pypa.io 19 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 20 | 21 | Additional dependencies 22 | ----------------------- 23 | 24 | In order to be usable, MLBlocks requires a compatible primitives library. 25 | 26 | The official library, required in order to follow the MLBlocks tutorials and documentation examples, 27 | is `MLPrimitives`_, which you can install with this command: 28 | 29 | .. code-block:: console 30 | 31 | pip install mlprimitives 32 | 33 | .. _MLPrimitives: https://github.com/MLBazaar/MLPrimitives 34 | 35 | Install for development 36 | ----------------------- 37 | 38 | If you are installing **MLBlocks** in order to modify its code, the installation must be done 39 | from its sources, in the editable mode, and also including some additional dependencies in 40 | order to be able to run the tests and build the documentation. Instructions about this process 41 | can be found in the `Contributing guide`_. 42 | 43 | .. _Contributing guide: ../contributing.html#get-started 44 | -------------------------------------------------------------------------------- /docs/getting_started/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Below is a short tutorial that will show you how to get started using **MLBlocks**. 5 | 6 | In this tutorial we will learn how to: 7 | 8 | * Create a pipeline using multiple primitives 9 | * Obtain the list of tunable hyperparameters from the pipeline 10 | * Specify hyperparameters for each primitive in the pipeline 11 | * Fit the pipeline using training data 12 | * Use the pipeline to make predictions from new data 13 | 14 | .. note:: Some additional dependencies are required in order to run this Quickstart. 15 | Make sure that `you have already installed them`_. 
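Both libraries can be installed in one go with ``pip`` (equivalent to the separate commands
shown in the installation instructions):

.. code-block:: console

    pip install mlblocks mlprimitives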
16 | 17 | Creating a pipeline 18 | ------------------- 19 | 20 | With MLBlocks, creating a pipeline is as simple as specifying a list of primitives and passing 21 | them to the `MLPipeline class`_: 22 | 23 | .. ipython:: python 24 | 25 | from mlblocks import MLPipeline 26 | primitives = [ 27 | 'mlprimitives.custom.preprocessing.ClassEncoder', 28 | 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 29 | 'sklearn.impute.SimpleImputer', 30 | 'xgboost.XGBClassifier', 31 | 'mlprimitives.custom.preprocessing.ClassDecoder' 32 | ] 33 | pipeline = MLPipeline(primitives) 34 | 35 | Optionally, specific `hyperparameters`_ can also be set by specifying them in a dictionary and 36 | passing them as the ``init_params`` argument: 37 | 38 | .. ipython:: python 39 | 40 | init_params = { 41 | 'sklearn.impute.SimpleImputer': { 42 | 'strategy': 'median' 43 | } 44 | } 45 | pipeline = MLPipeline(primitives, init_params=init_params) 46 | 47 | Once the pipeline has been instantiated, we can easily see what `hyperparameters`_ have been set 48 | for each block, by calling the `get_hyperparameters method`_. 49 | 50 | The output of this method is a dictionary which has the name of each block as keys and 51 | a dictionary with the `hyperparameters`_ of the corresponding block as values. 52 | 53 | .. ipython:: python 54 | 55 | pipeline.get_hyperparameters() 56 | 57 | Tunable Hyperparameters 58 | ----------------------- 59 | 60 | One of the main features of `MLBlocks JSON Annotations`_ is the possibility to indicate 61 | the type and possible values that each primitive hyperparameter accepts. 62 | 63 | The list of possible hyperparameters and their details can easily be obtained from the pipeline 64 | instance by calling its `get_tunable_hyperparameters method`_. 65 | 66 | The output of this method is a dictionary that contains the list of tunable hyperparameters 67 | for each block in the pipeline, ready to be passed to any hyperparameter tuning library such 68 | as `BTB`_. 69 | 70 | .. ipython:: python 71 | 72 | pipeline.get_tunable_hyperparameters() 73 | 74 | Setting Hyperparameters 75 | ----------------------- 76 | 77 | Modifying the hyperparameters of an already instantiated pipeline can be done using the 78 | `set_hyperparameters method`_, which expects a dictionary with the same format as the one returned 79 | by the `get_hyperparameters method`_. 80 | 81 | Note that if a subset of the hyperparameters is passed, only these will be modified, and the 82 | other ones will remain unmodified. 83 | 84 | .. ipython:: python 85 | 86 | new_hyperparameters = { 87 | 'xgboost.XGBClassifier#1': { 88 | 'max_depth': 15 89 | } 90 | } 91 | pipeline.set_hyperparameters(new_hyperparameters) 92 | hyperparameters = pipeline.get_hyperparameters() 93 | hyperparameters['xgboost.XGBClassifier#1']['max_depth'] 94 | 95 | Making predictions 96 | ------------------ 97 | 98 | Once we have created the pipeline with the desired hyperparameters we can fit it 99 | and then use it to make predictions on new data. 100 | 101 | To do this, we first call the ``fit`` method passing the training data and the corresponding 102 | labels. 103 | 104 | ..
ipython:: python 105 | :okwarning: 106 | 107 | import pandas as pd 108 | from sklearn.model_selection import train_test_split 109 | 110 | dataset = pd.read_csv('http://mlblocks.s3.amazonaws.com/census.csv') 111 | label = dataset.pop('label') 112 | 113 | X_train, X_test, y_train, y_test = train_test_split(dataset, label, stratify=label) 114 | pipeline.fit(X_train, y_train) 115 | 116 | Once we have fitted our model to our data, we can call the ``predict`` method passing new data 117 | to obtain predictions from the pipeline. 118 | 119 | .. ipython:: python 120 | :okwarning: 121 | 122 | from sklearn.metrics import accuracy_score 123 | 124 | predictions = pipeline.predict(X_test) 125 | predictions 126 | accuracy_score(y_test, predictions) 127 | 128 | .. _you have already installed them: install.html#additional-dependencies 129 | .. _MLPipeline class: ../api_reference.html#mlblocks.MLPipeline 130 | .. _get_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_hyperparameters 131 | .. _hyperparameters: ../advanced_usage/hyperparameters.html 132 | .. _MLBlocks JSON Annotations: ../advanced_usage/primitives.html#json-annotations 133 | .. _get_tunable_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.get_tunable_hyperparameters 134 | .. _BTB: https://github.com/MLBazaar/BTB 135 | .. _set_hyperparameters method: ../api_reference.html#mlblocks.MLPipeline.set_hyperparameters 136 | -------------------------------------------------------------------------------- /docs/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/favicon.ico -------------------------------------------------------------------------------- /docs/images/mlblocks-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/mlblocks-icon.png -------------------------------------------------------------------------------- /docs/images/mlblocks-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/mlblocks-logo-small.png -------------------------------------------------------------------------------- /docs/images/mlblocks-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/docs/images/mlblocks-logo.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | What is MLBlocks? 2 | ================= 3 | 4 | .. image:: images/mlblocks-logo.png 5 | :width: 300 px 6 | :alt: MLBlocks 7 | :align: center 8 | 9 | * Documentation: https://mlbazaar.github.io/MLBlocks 10 | * Github: https://github.com/MLBazaar/MLBlocks 11 | * License: `MIT `_ 12 | 13 | MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning 14 | tools developed in Python, whether they are custom developments or belong to third party 15 | libraries, and build Pipelines out of them that can be fitted and then used to make predictions. 
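In a few lines, this is what working with MLBlocks looks like. This is a condensed sketch of the
Quickstart, assuming that scikit-learn, xgboost and the MLPrimitives annotations are installed:

.. code-block:: python

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    from mlblocks import MLPipeline

    # Load a demo dataset and split it into train and test partitions
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Build a pipeline out of two annotated primitives, fit it and predict
    pipeline = MLPipeline([
        'sklearn.preprocessing.StandardScaler',
        'xgboost.XGBClassifier'
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)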
16 | 17 | This is achieved by providing a simple and intuitive annotation language that allows the 18 | user to specify how to integrate with each tool, here called primitives, in order to provide 19 | a common uniform interface to each one of them. 20 | 21 | At a high level: 22 | 23 | * Each available primitive has been annotated using a standardized JSON file that specifies its 24 | native interface, as well as which hyperparameters can be used to tune its behavior. 25 | * A list of primitives that will be combined into a pipeline is provided by the user, optionally 26 | passing along the hyperparameters to use for each primitive. 27 | * An MLBlock instance is built for each primitive, offering a common interface for all of them. 28 | * The MLBlock instances are then combined into an MLPipeline instance, able to run them all in 29 | the right order, passing the output from each one as input to the next one. 30 | * The training data is passed to the `MLPipeline.fit` method, which sequentially fits each 31 | MLBlock instance following the JSON annotation specification. 32 | * The data used to make predictions is passed to the `MLPipeline.predict` method, which uses each 33 | MLBlock sequentially to obtain the desired predictions. 34 | 35 | History 36 | ------- 37 | 38 | In its first iteration, in 2015, MLBlocks was designed only for multi-table, multi-entity temporal 39 | data. A good reference to see our design rationale at that time is Bryan Collazo’s thesis, written 40 | under the supervision of Kalyan Veeramachaneni: 41 | 42 | * `Machine learning blocks`_. 43 | Bryan Collazo. Masters thesis, MIT EECS, 2015. 44 | 45 | In 2018, with the recent availability of a multitude of libraries and tools, we decided it was time to 46 | integrate them and expand the library to address other data types, like images, text, graphs or 47 | time series, as well as introduce the usage of deep learning libraries. A second iteration of our 48 | work was then started by William Xue: 49 | 50 | * `A Flexible Framework for Composing End to End Machine Learning Pipelines`_. 51 | William Xue. Masters thesis, MIT EECS, 2018. 52 | 53 | Later in 2018, Carles Sala joined the project to make it grow as a reliable open-source library 54 | that would become part of a bigger software ecosystem designed to facilitate the development of 55 | robust end-to-end solutions based on Machine Learning tools. This third iteration of our work 56 | was presented in 2019 as part of the Machine Learning Bazaar: 57 | 58 | * `The Machine Learning Bazaar: Harnessing the ML Ecosystem for Effective System Development`_. 59 | Micah J. Smith, Carles Sala, James Max Kanter, and Kalyan Veeramachaneni. SIGMOD 2020. 60 | 61 | .. toctree:: 62 | :caption: Getting Started 63 | :titlesonly: 64 | 65 | self 66 | getting_started/install 67 | getting_started/quickstart 68 | 69 | .. toctree:: 70 | :caption: Advanced Usage 71 | :maxdepth: 1 72 | 73 | advanced_usage/primitives 74 | advanced_usage/hyperparameters 75 | advanced_usage/pipelines 76 | advanced_usage/adding_primitives 77 | 78 | .. toctree:: 79 | :caption: Pipeline Examples 80 | :maxdepth: 1 81 | 82 | pipeline_examples/single_table 83 | pipeline_examples/multi_table 84 | pipeline_examples/text 85 | pipeline_examples/image 86 | pipeline_examples/graph 87 | 88 | .. toctree:: 89 | :caption: API Reference 90 | :titlesonly: 91 | 92 | api/mlblocks 93 | api/mlblocks.datasets 94 | api/mlblocks.discovery 95 | 96 | ..
toctree:: 97 | :caption: Resources 98 | :titlesonly: 99 | 100 | contributing 101 | authors 102 | changelog 103 | 104 | Indices and tables 105 | ================== 106 | * :ref:`genindex` 107 | * :ref:`modindex` 108 | * :ref:`search` 109 | 110 | .. _Machine learning blocks: https://dai.lids.mit.edu/wp-content/uploads/2018/06/Mlblocks_Bryan.pdf 111 | 112 | .. _A Flexible Framework for Composing End to End Machine Learning Pipelines: https://dai.lids.mit.edu/wp-content/uploads/2018/12/William_MEng.pdf 113 | .. _The Machine Learning Bazaar\: Harnessing the ML Ecosystem for Effective System Development: https://arxiv.org/abs/1905.08942 114 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=mlblocks 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/pipeline_examples/graph.rst: -------------------------------------------------------------------------------- 1 | Graph Pipelines 2 | =============== 3 | 4 | Here we will be showing some examples using **MLBlocks** to resolve graph problems. 5 | 6 | Link Prediction 7 | --------------- 8 | 9 | For the Graph Link Prediction examples we will be using the UMLS biomedical ontology dataset, 10 | which we will load using the ``mlblocks.dataset.load_umls`` function. 11 | 12 | The data consists of information about a graph of 135 nodes and the relations between those 13 | nodes, given as a DataFrame with three columns, `source`, `target` and `type`, indicating which 14 | nodes are related and with which type of link. 15 | The target is a 1d numpy binary integer array indicating whether the indicated link exists or not. 16 | 17 | 18 | NetworkX + MLPrimitives + Scikit-learn + XGBoost 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | In this example, we will use some `NetworkX Link Prediction`_ functions to extract attributes 22 | from the Graph, then encode the categorical features with the `CategoricalEncoder from 23 | MLPrimitives`_, scale the data using the `StandardScaler from scikit-learn`_ and finally feed 24 | the result into an `XGBClassifier`_. 25 | 26 | Note how in this example, the Graph objects and the names of the node columns are passed as 27 | additional variables to be added to the context, as the NetworkX primitive will need some 28 | additional information not found inside `X`. 29 | 30 | ..
code-block:: python 31 | 32 | from mlblocks import MLPipeline 33 | from mlprimitives.datasets import load_umls 34 | 35 | dataset = load_umls() 36 | dataset.describe() 37 | 38 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 39 | 40 | primitives = [ 41 | 'networkx.link_prediction_feature_extraction', 42 | 'mlprimitives.custom.feature_extraction.CategoricalEncoder', 43 | 'sklearn.preprocessing.StandardScaler', 44 | 'xgboost.XGBClassifier' 45 | ] 46 | init_params = { 47 | 'xgboost.XGBClassifier': { 48 | 'n_estimators': 300, 49 | 'learning_rate': 0.1 50 | } 51 | } 52 | pipeline = MLPipeline(primitives, init_params) 53 | 54 | node_columns = ['source', 'target'] 55 | pipeline.fit( 56 | X_train, 57 | y_train, 58 | graph=dataset.graph, # These will be set in the pipeline Context 59 | node_columns=node_columns # and made available for the networkx primitive 60 | ) 61 | 62 | predictions = pipeline.predict( 63 | X_test, 64 | graph=dataset.graph, # These will be set in the pipeline Context 65 | node_columns=node_columns # and made available for the networkx primitive 66 | ) 67 | 68 | dataset.score(y_test, predictions) 69 | 70 | 71 | .. _NetworkX Link Prediction: https://networkx.github.io/documentation/networkx-1.10/reference/algorithms.link_prediction.html 72 | .. _CategoricalEncoder from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/mlprimitives.custom.feature_extraction.CategoricalEncoder.json 73 | .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 74 | .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 75 | -------------------------------------------------------------------------------- /docs/pipeline_examples/image.rst: -------------------------------------------------------------------------------- 1 | Image Pipelines 2 | =============== 3 | 4 | Here we will be showing some examples using **MLBlocks** to resolve image problems. 5 | 6 | Image Classification 7 | -------------------- 8 | 9 | For the image classification examples we will be using the `USPS Dataset`_, which we will 10 | load using the ``mlblocks.dataset.load_usps`` function. 11 | 12 | The data of this dataset is a collection of 9298 images, each one a 3d numpy array with shape 13 | ``(224, 224, 3)`` representing a 224x224 RGB photo of a handwritten digit, and the target is a 1d 14 | numpy integer array containing the label of the digit represented in the image. 15 | 16 | OpenCV GaussianBlur + Scikit-image HOG + Scikit-Learn RandomForestClassifier 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | In this first example, we will attempt to resolve the problem using some basic preprocessing 20 | with the `OpenCV GaussianBlur function`_, then calculate the Histogram of Oriented 21 | Gradients using the corresponding `scikit-image function`_, and finally use a simple 22 | `RandomForestClassifier from scikit-learn`_ on the generated features. 23 | 24 | ..
code-block:: python 25 | 26 | from mlblocks import MLPipeline 27 | from mlprimitives.datasets import load_usps 28 | 29 | dataset = load_usps() 30 | dataset.describe() 31 | 32 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 33 | 34 | primitives = [ 35 | 'cv2.GaussianBlur', 36 | 'skimage.feature.hog', 37 | 'sklearn.ensemble.RandomForestClassifier' 38 | ] 39 | init_params = { 40 | 'skimage.feature.hog': { 41 | 'multichannel': True, 42 | 'visualize': False 43 | } 44 | } 45 | pipeline = MLPipeline(primitives, init_params) 46 | 47 | pipeline.fit(X_train, y_train) 48 | 49 | predictions = pipeline.predict(X_test) 50 | 51 | dataset.score(y_test, predictions) 52 | 53 | 54 | OpenCV GaussianBlur + Keras Single Layer CNN 55 | -------------------------------------------- 56 | 57 | In this example, we will preprocess the images using the `OpenCV GaussianBlur function`_ 58 | and directly after go into a Single Layer CNN Classifier built on Keras using the corresponding 59 | `MLPrimitives primitive`_. 60 | 61 | .. code-block:: python 62 | 63 | from mlblocks import MLPipeline 64 | from mlprimitives.datasets import load_usps 65 | 66 | dataset = load_usps() 67 | dataset.describe() 68 | 69 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 70 | 71 | primitives = [ 72 | 'cv2.GaussianBlur', 73 | 'keras.Sequential.SingleLayerCNNImageClassifier' 74 | ] 75 | init_params = { 76 | 'keras.Sequential.SingleLayerCNNImageClassifier': { 77 | 'dense_units': 11, 78 | 'epochs': 5 79 | } 80 | } 81 | pipeline = MLPipeline(primitives, init_params) 82 | 83 | pipeline.fit(X_train, y_train) 84 | 85 | predictions = pipeline.predict(X_test) 86 | 87 | dataset.score(y_test, predictions) 88 | 89 | 90 | Image Regression 91 | ---------------- 92 | 93 | For the image regression examples we will be using the Handgeometry Dataset, which we will 94 | load using the ``mlblocks.dataset.load_handgeometry`` function. 95 | 96 | The data of this dataset is a 3d numpy array vector with shape ``(224, 224, 3)`` containing 112 97 | 224x224 RGB photos of hands, and the target is a 1d numpy float array containing the width of 98 | the wrist in centimeters. 99 | 100 | Keras MobileNet + XGBRegressor 101 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 102 | 103 | Here we will introduce the usage of the `Pretrained Networks from Keras`_. 104 | In particular, we will be using the `MobileNet`_ for feature extraction, and pass its features 105 | to an `XGBRegressor`_ primitive. 106 | 107 | .. code-block:: python 108 | 109 | from mlblocks import MLPipeline 110 | from mlprimitives.datasets import load_handgeometry 111 | 112 | dataset = load_handgeometry() 113 | dataset.describe() 114 | 115 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 116 | 117 | primitives = [ 118 | 'keras.applications.mobilenet.preprocess_input', 119 | 'keras.applications.mobilenet.MobileNet', 120 | 'xgboost.XGBRegressor' 121 | ] 122 | init_params = { 123 | 'xgboost.XGBRegressor': { 124 | 'n_estimators': 300, 125 | 'learning_rate': 0.1 126 | } 127 | } 128 | pipeline = MLPipeline(primitives, init_params) 129 | 130 | pipeline.fit(X_train, y_train) 131 | 132 | predictions = pipeline.predict(X_test) 133 | 134 | dataset.score(y_test, predictions) 135 | 136 | 137 | .. _USPS Dataset: https://ieeexplore.ieee.org/document/291440/ 138 | .. _OpenCV GaussianBlur function: https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur 139 | .. 
_MLPrimitives primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.SingleLayerCNNImageClassifier.json 140 | .. _scikit-image function: http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog 141 | .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 142 | .. _Pretrained Networks from Keras: https://keras.io/applications/ 143 | .. _MobileNet: https://keras.io/applications/#mobilenet 144 | .. _XGBRegressor: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 145 | -------------------------------------------------------------------------------- /docs/pipeline_examples/multi_table.rst: -------------------------------------------------------------------------------- 1 | Multi Table Pipelines 2 | ===================== 3 | 4 | In the previous section we explored the simplest use cases, where the datasets 5 | consisted of a single table. 6 | 7 | In this section we will cover cases where the dataset consists of multiple tables 8 | related by foreign keys. 9 | 10 | Multi Table Classification Pipeline 11 | ----------------------------------- 12 | 13 | In this example, we will be using the `WikiQA dataset`_, which contains 4 different tables 14 | with simple parent/child relationships, and which we will load using the 15 | ``mlblocks.dataset.load_wikiqa`` function. 16 | 17 | In our pipeline, we will be using the `DeepFeatureSynthesis`_ primitive from `featuretools`_ 18 | for feature extraction over the various tables that we have, and later apply an 19 | `XGBClassifier`_ on the resulting feature matrix. 20 | 21 | Note how in this example we need to pass some additional information to the pipeline 22 | so that the DFS primitive knows what the relationships between the multiple 23 | tables are. 24 | 25 | .. code-block:: python 26 | 27 | from mlblocks import MLPipeline 28 | from mlprimitives.datasets import load_wikiqa 29 | 30 | dataset = load_wikiqa() 31 | dataset.describe() 32 | 33 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 34 | 35 | primitives = [ 36 | 'featuretools.dfs', 37 | 'xgboost.XGBClassifier' 38 | ] 39 | pipeline = MLPipeline(primitives) 40 | 41 | pipeline.fit(X_train, y_train, entities=dataset.entities, 42 | relationships=dataset.relationships, target_entity='data') 43 | 44 | predictions = pipeline.predict(X_test, entities=dataset.entities, 45 | relationships=dataset.relationships, target_entity='data') 46 | 47 | dataset.score(y_test, predictions) 48 | 49 | 50 | .. _WikiQA dataset: https://www.microsoft.com/en-us/research/publication/wikiqa-a-challenge-dataset-for-open-domain-question-answering/ 51 | .. _XGBClassifier: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 52 | .. _DeepFeatureSynthesis: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/featuretools.dfs.json 53 | .. _featuretools: https://www.featuretools.com/ 54 | -------------------------------------------------------------------------------- /docs/pipeline_examples/single_table.rst: -------------------------------------------------------------------------------- 1 | Single Table Pipelines 2 | ====================== 3 | 4 | In this section we will go over a few pipeline examples to show **MLBlocks** working 5 | in different scenarios and with different types of data.
6 | 7 | For each example, we will be using example datasets which can be downloaded using the 8 | various functions found in the ``mlprimitives.datasets`` module. 9 | 10 | .. note:: Even though the datasets are not especially big, some of the examples might 11 | use a considerable amount of resources, especially memory, and might take 12 | several minutes to run. 13 | 14 | Regression Pipeline 15 | ------------------- 16 | 17 | In the most simple example, we will be using a single `RandomForestRegressor`_ primitive over 18 | the numeric data from `The Boston Dataset`_, which we will load using the 19 | ``mlblocks.dataset.load_boston`` function. 20 | 21 | .. code-block:: python 22 | 23 | from mlblocks import MLPipeline 24 | from mlprimitives.datasets import load_boston 25 | 26 | dataset = load_boston() 27 | dataset.describe() 28 | 29 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 30 | 31 | primitives = [ 32 | 'sklearn.ensemble.RandomForestRegressor' 33 | ] 34 | pipeline = MLPipeline(primitives) 35 | 36 | pipeline.fit(X_train, y_train) 37 | 38 | predictions = pipeline.predict(X_test) 39 | 40 | dataset.score(y_test, predictions) 41 | 42 | Classification Pipeline 43 | ----------------------- 44 | 45 | As a Classification example, we will be using `The Iris Dataset`_, which we will load using the 46 | ``mlblocks.dataset.load_iris`` function. 47 | 48 | Here we will combine the `StandardScaler from scikit-learn`_ with an `XGBClassifier primitive`_. 49 | 50 | In this case, we will also be passing some initialization parameters for the XGBClassifier. 51 | 52 | .. code-block:: python 53 | 54 | from mlblocks import MLPipeline 55 | from mlprimitives.datasets import load_iris 56 | 57 | dataset = load_iris() 58 | dataset.describe() 59 | 60 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 61 | 62 | primitives = [ 63 | 'sklearn.preprocessing.StandardScaler', 64 | 'xgboost.XGBClassifier' 65 | ] 66 | init_params = { 67 | 'xgboost.XGBClassifier': { 68 | 'learning_rate': 0.1 69 | } 70 | } 71 | pipeline = MLPipeline(primitives, init_params) 72 | 73 | pipeline.fit(X_train, y_train) 74 | 75 | predictions = pipeline.predict(X_test) 76 | 77 | dataset.score(y_test, predictions) 78 | 79 | 80 | .. _The Boston Dataset: http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html 81 | .. _RandomForestRegressor: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html 82 | .. _XGBRegressor: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 83 | .. _The Iris Dataset: https://en.wikipedia.org/wiki/Iris_flower_data_set 84 | .. _StandardScaler from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html 85 | .. _XGBClassifier primitive: https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn 86 | -------------------------------------------------------------------------------- /docs/pipeline_examples/text.rst: -------------------------------------------------------------------------------- 1 | Text Pipelines 2 | ============== 3 | 4 | Here we will be showing some examples using **MLBlocks** to resolve text problems. 5 | 6 | Text Classification 7 | ------------------- 8 | 9 | For the text classification examples we will be using the `Twenty Newsgroups Dataset`_, 10 | which we will load using the ``mlblocks.dataset.load_newsgroups`` function. 
11 | 12 | The data of this dataset is a 1d numpy array containing the texts from 11314 newsgroups 13 | posts, and the target is a 1d numpy integer array containing the label of one of the 20 topics 14 | that they are about. 15 | 16 | MLPrimitives + Keras Preprocessing + Keras LSTM 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | In this example we will start by applying some text cleanup using the `TextCleaner primitive`_ 20 | from MLPrimitives, then go through some `keras text preprocessing`_ primitives, and end by 21 | using a `Keras LSTM Classifier from MLPrimitives`_. 22 | 23 | Note how in this case we are using the ``input_names`` and ``output_names`` to properly 24 | set up the pipeline and allow using the outputs from some primitives as additional inputs 25 | for later ones. 26 | 27 | .. code-block:: python 28 | 29 | import nltk 30 | from mlblocks import MLPipeline 31 | from mlprimitives.datasets import load_newsgroups 32 | 33 | dataset = load_newsgroups() 34 | dataset.describe() 35 | 36 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 37 | 38 | # Make sure that we have the necessary data 39 | nltk.download('stopwords') 40 | 41 | # set up the pipeline 42 | primitives = [ 43 | "mlprimitives.custom.counters.UniqueCounter", 44 | "mlprimitives.custom.text.TextCleaner", 45 | "mlprimitives.custom.counters.VocabularyCounter", 46 | "keras.preprocessing.text.Tokenizer", 47 | "keras.preprocessing.sequence.pad_sequences", 48 | "keras.Sequential.LSTMTextClassifier" 49 | ] 50 | input_names = { 51 | "mlprimitives.custom.counters.UniqueCounter#1": { 52 | "X": "y" 53 | } 54 | } 55 | output_names = { 56 | "mlprimitives.custom.counters.UniqueCounter#1": { 57 | "counts": "classes" 58 | }, 59 | "mlprimitives.custom.counters.VocabularyCounter#1": { 60 | "counts": "vocabulary_size" 61 | } 62 | } 63 | init_params = { 64 | "mlprimitives.custom.counters.VocabularyCounter#1": { 65 | "add": 1 66 | }, 67 | "mlprimitives.custom.text.TextCleaner#1": { 68 | "language": "en" 69 | }, 70 | "keras.preprocessing.sequence.pad_sequences#1": { 71 | "maxlen": 100 72 | }, 73 | "keras.Sequential.LSTMTextClassifier#1": { 74 | "input_length": 100 75 | } 76 | } 77 | pipeline = MLPipeline(primitives, init_params, input_names, output_names) 78 | 79 | pipeline.fit(X_train, y_train) 80 | 81 | predictions = pipeline.predict(X_test) 82 | 83 | dataset.score(y_test, predictions) 84 | 85 | 86 | Tabular Data with Text 87 | ---------------------- 88 | 89 | For these examples we will be using the `Personae Dataset`_, which we will load 90 | using the ``mlblocks.dataset.load_personae`` function. 91 | 92 | The data of this dataset is a 2d numpy array containing 145 entries that include 93 | texts written by Dutch users on Twitter, with some additional information about the author, 94 | and the target is a 1d numpy binary integer array indicating whether the author was an extrovert 95 | or not. 96 | 97 | MLPrimitives + Scikit-learn RandomForestClassifier 98 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 99 | 100 | In this example we again use the `TextCleaner primitive`_, then use a `StringVectorizer primitive`_ 101 | to encode all the string features, and go directly into the 102 | `RandomForestClassifier from scikit-learn`_. 103 | 104 | ..
code-block:: python 105 | 106 | import nltk 107 | from mlblocks import MLPipeline 108 | from mlprimitives.datasets import load_personae 109 | 110 | dataset = load_personae() 111 | dataset.describe() 112 | 113 | X_train, X_test, y_train, y_test = dataset.get_splits(1) 114 | 115 | # Make sure that we have the necessary data 116 | nltk.download('stopwords') 117 | 118 | primitives = [ 119 | 'mlprimitives.custom.text.TextCleaner', 120 | 'mlprimitives.custom.feature_extraction.StringVectorizer', 121 | 'sklearn.ensemble.RandomForestClassifier', 122 | ] 123 | init_params = { 124 | 'mlprimitives.custom.text.TextCleaner': { 125 | 'column': 'text', 126 | 'language': 'nl' 127 | }, 128 | 'sklearn.ensemble.RandomForestClassifier': { 129 | 'n_jobs': -1, 130 | 'n_estimators': 100 131 | } 132 | } 133 | pipeline = MLPipeline(primitives, init_params) 134 | 135 | pipeline.fit(X_train, y_train) 136 | 137 | predictions = pipeline.predict(X_test) 138 | 139 | dataset.score(y_test, predictions) 140 | 141 | 142 | .. _Twenty Newsgroups Dataset: http://scikit-learn.org/stable/datasets/twenty_newsgroups.html 143 | .. _TextCleaner primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/text.py 144 | .. _StringVectorizer primitive: https://github.com/MLBazaar/MLPrimitives/blob/master/mlprimitives/feature_extraction.py 145 | .. _keras text preprocessing: https://keras.io/preprocessing/text/ 146 | .. _Keras LSTM Classifier from MLPrimitives: https://github.com/MLBazaar/MLPrimitives/blob/master/mlblocks_primitives/keras.Sequential.LSTMTextClassifier.json 147 | .. _Personae Dataset: https://www.clips.uantwerpen.be/datasets/personae-corpus 148 | .. _RandomForestClassifier from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 149 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # MLBlocks Examples 2 | 3 | This folder contains Python code, Jupyter Notebooks and JSON examples to demonstrate MLBlocks 4 | functionality. 5 | 6 | Within this folder you will find: 7 | 8 | 9 | * `primitives`: Example primitive JSONs to demonstrate different MLBlocks functionalities. 10 | * `pipelines`: Example pipeline JSONs to demonstrate different MLBlocks functionalities. 11 | * `tutorials`: Collection of Jupyter Notebooks to show the usage of different MLBlocks functionalities. 12 | 13 | 14 | # Requirements 15 | 16 | In order to run the examples contained in this folder you should have [pip installed on your system 17 | ](https://pip.pypa.io/en/stable/installing/). 18 | 19 | Optionally, also install and activate a [virtualenv](https://virtualenv.pypa.io/en/latest/) to 20 | run them in an isolated environment. 21 | 22 | # Usage 23 | 24 | In order to run these tutorials on your computer, please follow these steps: 25 | 26 | 1. Clone this github repository: 27 | 28 | ```bash 29 | git clone git@github.com:MLBazaar/MLBlocks.git 30 | ``` 31 | 32 | 2. (Optional) Create a virtualenv to execute the examples in an environment isolated from the 33 | rest of your computer: 34 | 35 | ```bash 36 | pip install virtualenv 37 | virtualenv -p $(which python3.6) mlblocks-venv 38 | source mlblocks-venv/bin/activate 39 | ``` 40 | 41 | 3. 
Enter the repository and install the dependencies: 42 | 43 | ```bash 44 | cd MLBlocks 45 | make install-examples 46 | ``` 47 | 48 | This will install [MLBlocks](https://github.com/MLBazaar/MLBlocks.git) as well as [MLPrimitives]( 49 | https://github.com/MLBazaar/MLPrimitives.git) and [Jupyter](https://jupyter.org/). 50 | 51 | 4. Enter the `examples` folder and start a Jupyter Notebook: 52 | 53 | ```bash 54 | jupyter notebook 55 | ``` 56 | 57 | 5. Point your browser at the link shown in your console and run the examples from the `examples/tutorials` folder. 58 | -------------------------------------------------------------------------------- /examples/pipelines/single_table.classification.categorical_encoder.xgboost.json: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "data_modality": "single_table", 4 | "task_type": "classification" 5 | }, 6 | "validation": { 7 | "dataset": "census" 8 | }, 9 | "primitives": [ 10 | "mlprimitives.custom.preprocessing.ClassEncoder", 11 | "mlprimitives.custom.feature_extraction.CategoricalEncoder", 12 | "sklearn.impute.SimpleImputer", 13 | "xgboost.XGBClassifier", 14 | "mlprimitives.custom.preprocessing.ClassDecoder" 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /examples/primitives/mlblocks.examples.ClassPrimitive.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "the_primitive_name", 3 | "primitive": "full.python.path.to.AClass", 4 | "fit": { 5 | "method": "fit", 6 | "args": [ 7 | { 8 | "name": "X", 9 | "keyword": "optional_name_of_the_fit_method_argument", 10 | "description": "each input can be described", 11 | "type": "pandas.DataFrame" 12 | }, 13 | { 14 | "name": "y", 15 | "description": "each input can be described", 16 | "default": "default_value_for_this_argument", 17 | "type": "pandas.Series" 18 | } 19 | ] 20 | }, 21 | "produce": { 22 | "method": "predict", 23 | "args": [ 24 | { 25 | "name": "X", 26 | "keyword": "optional_name_of_the_produce_method_argument", 27 | "description": "each input can be described", 28 | "type": "DataFrame" 29 | } 30 | ], 31 | "output": [ 32 | { 33 | "name": "y", 34 | "description": "each output argument can be described", 35 | "type": "Series" 36 | } 37 | ] 38 | }, 39 | "hyperparameters": { 40 | "fixed": { 41 | "a_required_hyperparameter": { 42 | "description": "this is a non-tunable hyperparameter that needs to be specified by the user because it does not have a default value", 43 | "type": "int" 44 | }, 45 | "an_optional_hyperparameter": { 46 | "description": "this is a non-tunable hyperparameter that is optional because it has a default value", 47 | "type": "int", 48 | "default": 1 49 | } 50 | }, 51 | "tunable": { 52 | "a_simple_range_hyperparameter": { 53 | "description": "hyperparameter documentation can be put here", 54 | "default": 1, 55 | "type": "int", 56 | "range": [1, 10] 57 | }, 58 | "a_categorical_hyperparameter_of_type_int": { 59 | "description": "Note that it has the field `values` instead of `range`", 60 | "default": 1, 61 | "type": "int", 62 | "values": [1, 3, 7, 10] 63 | }, 64 | "a_categorical_hyperparameter_of_type_str": { 65 | "default": "a", 66 | "type": "str", 67 | "values": ["a", "b", "c"] 68 | }, 69 | "a_multi_type_hyperparameter": { 70 | "description": "this is a hyperparameter that allows more than one type", 71 | "type": "multitype", 72 | "default": "auto", 73 | "types": { 74 | "int": { 75 | "description": "documentation can
also be included here", 76 | "range": [1, 10] 77 | }, 78 | "string": { 79 | "values": ["some", "string", "values"] 80 | } 81 | } 82 | }, 83 | "conditional_hyperparameter": { 84 | "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter", 85 | "type": "conditional", 86 | "condition": "the_name_of_the_other_hyperparameter", 87 | "values": { 88 | "a": { 89 | "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`", 90 | "type": "int", 91 | "default": 0, 92 | "range": [0, 10] 93 | }, 94 | "*": { 95 | "description": "this will be used only if the value does not match any other definition", 96 | "type": "float", 97 | "default": 0.0, 98 | "range": [0.0, 1.0] 99 | } 100 | } 101 | } 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /examples/primitives/mlblocks.examples.function_primitive.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "the_primitive_name", 3 | "primitive": "full.python.path.to.a_function", 4 | "produce": { 5 | "args": [ 6 | { 7 | "name": "X", 8 | "keyword": "optional_name_of_the_produce_method_argument", 9 | "description": "each input can be described", 10 | "type": "DataFrame" 11 | } 12 | ], 13 | "output": [ 14 | { 15 | "description": "each output argument can be described", 16 | "name": "y", 17 | "type": "Series" 18 | } 19 | ] 20 | }, 21 | "hyperparameters": { 22 | "fixed": { 23 | "a_required_hyperparameter": { 24 | "description": "this is a non-tunable hyperparameter that needs to be specified by the user, because it does not have a default value", 25 | "type": "int" 26 | }, 27 | "an_optional_hyperparameter": { 28 | "description": "this is a non-tunable hyperparameter that is optional, because it has a default value", 29 | "type": "int", 30 | "default": 1 31 | } 32 | }, 33 | "tunable": { 34 | "a_simple_range_hyperparameter": { 35 | "description": "hyperparameter documentation can be put here", 36 | "default": 1, 37 | "type": "int", 38 | "range": [1, 10] 39 | }, 40 | "a_categorical_hyperparameter_of_type_int": { 41 | "description": "Note that it has the field `values` instead of `range`", 42 | "default": 1, 43 | "type": "int", 44 | "values": [1, 3, 7, 10] 45 | }, 46 | "a_categorical_hyperparameter_of_type_str": { 47 | "default": "a", 48 | "type": "str", 49 | "values": ["a", "b", "c"] 50 | }, 51 | "a_multi_type_hyperparameter": { 52 | "description": "this is a hyperparameter that allows more than one type", 53 | "type": "multitype", 54 | "default": "auto", 55 | "types": { 56 | "int": { 57 | "description": "documentation can also be included here", 58 | "range": [1, 10] 59 | }, 60 | "string": { 61 | "values": ["some", "string", "values"] 62 | } 63 | } 64 | }, 65 | "conditional_hyperparameter": { 66 | "description": "this is a hyperparameter whose valid values depend on the value of another hyperparameter", 67 | "type": "conditional", 68 | "condition": "the_name_of_the_other_hyperparameter", 69 | "values": { 70 | "a": { 71 | "description": "this hyperparameter definition will be used if the value of the other hyperparameter is `a`", 72 | "type": "int", 73 | "default": 0, 74 | "range": [0, 10] 75 | }, 76 | "*": { 77 | "description": "this will be used only if the value does not match any other definition", 78 | "type": "float", 79 | "default": 0.0, 80 | "range": [0.0, 1.0] 81 | } 82 | } 83 | } 84 | } 85 | } 86 | } 87 |
-------------------------------------------------------------------------------- /examples/tutorials/2. Finding and Loading a Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding and Loading a Pipeline\n", 8 | "\n", 9 | "In this short tutorial we will show you how to search for pipelines suitable to solve\n", 10 | "your prediction problem." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "In order to find a suitable pipeline, the first thing we need is to identify\n", 18 | "the type of problem (data modality + task type) that we are facing.\n", 19 | "\n", 20 | "This is a full list of current data modalities and task types that we cover:\n", 21 | "\n", 22 | "| Problem Type | Data Modality | Task Type |\n", 23 | "|:-------------------------------------|:--------------|:------------------------|\n", 24 | "| Single Table Classification | single_table | classification |\n", 25 | "| Single Table Regression | single_table | regression |\n", 26 | "| Single Table Collaborative Filtering | single_table | collaborative_filtering |\n", 27 | "| Multi Table Classification | multi_table | classification |\n", 28 | "| Multi Table Regression | multi_table | regression |\n", 29 | "| Time Series Classification | timeseries | classification |\n", 30 | "| Time Series Regression | timeseries | regression |\n", 31 | "| Time Series Forecasting | timeseries | forecasting |\n", 32 | "| Time Series Anomaly Detection | timeseries | anomaly_detection |\n", 33 | "| Image Classification | image | classification |\n", 34 | "| Image Regression | image | regression |\n", 35 | "| Graph Link Prediction | graph | link_prediction |\n", 36 | "| Graph Vertex Nomination | graph | vertex_nomination |\n", 37 | "| Graph Community Detection | graph | community_detection |\n", 38 | "| Graph Matching | graph | graph_matching |" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Once we have identified our data modality and task type we can use the\n", 46 | "`mlblocks.discovery.find_pipelines` function to find all the pipelines\n", 47 | "that support this particular problem type.\n", 48 | "\n", 49 | "For example, if we are looking for a pipeline to work on Image Classification\n", 50 | "we will do the following query." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "['image.classification.hog.rf',\n", 62 | " 'image.classification.hog.xgb',\n", 63 | " 'image.classification.resnet50.xgb',\n", 64 | " 'keras.Sequential.SingleLayerCNNImageClassifier',\n", 65 | " 'keras.Sequential.VGGCNNClassifier']" 66 | ] 67 | }, 68 | "execution_count": 1, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "from mlblocks.discovery import find_pipelines\n", 75 | "\n", 76 | "filters = {\n", 77 | " 'metadata.data_type': 'image',\n", 78 | " 'metadata.task_type': 'classification',\n", 79 | "}\n", 80 | "\n", 81 | "find_pipelines(filters=filters)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "After finding and choosing a pipeline, we can load it as an `MLPipeline`\n", 89 | "by passing its name to the `MLPipeline`." 
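,
    "\n",
    "Alternatively, if we only want to inspect a pipeline before building it, we can\n",
    "load its JSON annotation as a plain dictionary with `mlblocks.discovery.load_pipeline`.\n",
    "As a minimal sketch (the `primitives` entry is the list of primitive names that\n",
    "the pipeline annotation format defines):\n",
    "\n",
    "```python\n",
    "from mlblocks.discovery import load_pipeline\n",
    "\n",
    "annotation = load_pipeline('image.classification.resnet50.xgb')\n",
    "annotation['primitives']\n",
    "```"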
90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "Using TensorFlow backend.\n", 102 | "2020-09-16 16:03:19,939 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", 103 | "Instructions for updating:\n", 104 | "If using Keras pass *_constraint arguments to layers.\n", 105 | "2020-09-16 16:03:20,025 - WARNING - tensorflow - From /home/xals/.virtualenvs/MLBlocks.clean/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", 106 | "\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "from mlblocks import MLPipeline\n", 112 | "\n", 113 | "pipeline = MLPipeline('image.classification.resnet50.xgb')" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.9" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /examples/tutorials/3. Setting MLPipeline Hyperparameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Setting MLPipeline Hyperparameters\n", 8 | "\n", 9 | "In this short guide we will see how to modify the hyperparameters\n", 10 | "of an MLPipeline in order to change its behavior or improve its performance.\n", 11 | "\n", 12 | "Note that some steps are not explained for simplicity. Full details\n", 13 | "about them can be found in the previous parts of the tutorial.\n", 14 | "\n", 15 | "We will:\n", 16 | "\n", 17 | "1. Load a dataset and a Pipeline.\n", 18 | "2. Explore the pipeline hyperparameters.\n", 19 | "3. Reload the pipeline with different hyperparameters.\n", 20 | "4. Evaluate the pipeline performance on the dataset.\n", 21 | "5. Set different pipeline hyperparameters.\n", 22 | "6. Re-evaluate the pipeline performance on the dataset." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Load the Dataset and the Pipeline\n", 30 | "\n", 31 | "The first step will be to load the dataset and the pipeline that we will be using."
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from utils import load_census\n", 41 | "\n", 42 | "dataset = load_census()\n", 43 | "X_train, X_test, y_train, y_test = dataset.get_splits(1)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from mlblocks import MLPipeline\n", 53 | "\n", 54 | "primitives = [\n", 55 | " 'mlprimitives.custom.preprocessing.ClassEncoder',\n", 56 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", 57 | " 'sklearn.impute.SimpleImputer',\n", 58 | " 'xgboost.XGBClassifier',\n", 59 | " 'mlprimitives.custom.preprocessing.ClassDecoder'\n", 60 | "]\n", 61 | "pipeline = MLPipeline(primitives)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Explore the Pipeline Hyperparameters" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Once we have loaded the pipeline, we can see the hyperparameters that it is using by\n", 76 | "calling its `get_hyperparameters` method." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", 88 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", 89 | " 'copy': True,\n", 90 | " 'features': 'auto',\n", 91 | " 'max_unique_ratio': 0,\n", 92 | " 'max_labels': 0},\n", 93 | " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", 94 | " 'fill_value': None,\n", 95 | " 'verbose': False,\n", 96 | " 'copy': True,\n", 97 | " 'strategy': 'mean'},\n", 98 | " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", 99 | " 'n_estimators': 100,\n", 100 | " 'max_depth': 3,\n", 101 | " 'learning_rate': 0.1,\n", 102 | " 'gamma': 0,\n", 103 | " 'min_child_weight': 1},\n", 104 | " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" 105 | ] 106 | }, 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "pipeline.get_hyperparameters()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "This will return us a dictionary that contains one entry for each step in the pipeline.\n", 121 | "Each entry will also be a dictionary, indicating the names and the values of the hyperparameters of that step.\n", 122 | "\n", 123 | "**NOTE** that here we see the names of the pipeline steps, which are the primitive names with a numerical suffix that allows us to tell the difference between multiple steps that use the same primitive. \n", 124 | "\n", 125 | "Alternatively, for better compatibility with tuning systems like [BTB](https://github.com/MLBazaar/BTB)\n", 126 | "that work with flat, one-level, dictionaries, the argument `flat=True` can be passed." 
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 138 | "  'keep'): False,\n", 139 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'copy'): True,\n", 140 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 141 | "  'features'): 'auto',\n", 142 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 143 | "  'max_unique_ratio'): 0,\n", 144 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 145 | "  'max_labels'): 0,\n", 146 | " ('sklearn.impute.SimpleImputer#1', 'missing_values'): nan,\n", 147 | " ('sklearn.impute.SimpleImputer#1', 'fill_value'): None,\n", 148 | " ('sklearn.impute.SimpleImputer#1', 'verbose'): False,\n", 149 | " ('sklearn.impute.SimpleImputer#1', 'copy'): True,\n", 150 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", 151 | " ('xgboost.XGBClassifier#1', 'n_jobs'): -1,\n", 152 | " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", 153 | " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", 154 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", 155 | " ('xgboost.XGBClassifier#1', 'gamma'): 0,\n", 156 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" 157 | ] 158 | }, 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "pipeline.get_hyperparameters(flat=True)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "This will return us the same information as before, but organized as a single one-level\n", 173 | "dictionary where each key is a `tuple` containing both the name of the step and the hyperparameter." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Setting Pipeline hyperparameter values\n", 181 | "\n", 182 | "We can set some different hyperparameter values when loading the pipeline by adding the\n", 183 | "`init_params` argument to `MLPipeline`.\n", 184 | "\n", 185 | "The `init_params` has to be a dictionary where each entry corresponds to the name of one of the\n", 186 | "pipeline steps and each value is another dictionary indicating the hyperparameter values that we\n", 187 | "want to use on that step.\n", 188 | "\n", 189 | "As an example, we will set a different imputer strategy and a different xgboost `max_depth`." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "init_params = {\n", 199 | "    'sklearn.impute.SimpleImputer#1': {\n", 200 | "        'strategy': 'median'\n", 201 | "    },\n", 202 | "    'xgboost.XGBClassifier#1': {\n", 203 | "        'max_depth': 4\n", 204 | "    }\n", 205 | "}\n", 206 | "pipeline = MLPipeline(\n", 207 | "    primitives,\n", 208 | "    init_params=init_params\n", 209 | ")" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "We can now see how the hyperparameters are different from before."
217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", 228 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", 229 | " 'copy': True,\n", 230 | " 'features': 'auto',\n", 231 | " 'max_unique_ratio': 0,\n", 232 | " 'max_labels': 0},\n", 233 | " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", 234 | " 'fill_value': None,\n", 235 | " 'verbose': False,\n", 236 | " 'copy': True,\n", 237 | " 'strategy': 'median'},\n", 238 | " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", 239 | " 'max_depth': 4,\n", 240 | " 'n_estimators': 100,\n", 241 | " 'learning_rate': 0.1,\n", 242 | " 'gamma': 0,\n", 243 | " 'min_child_weight': 1},\n", 244 | " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" 245 | ] 246 | }, 247 | "execution_count": 6, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "pipeline.get_hyperparameters()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Evaluate the Pipeline performance\n", 261 | "\n", 262 | "We can now evaluate the pipeline performance to see what results these\n", 263 | "hyperparameters produce." 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 7, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stderr", 273 | "output_type": "stream", 274 | "text": [ 275 | "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", 276 | " warnings.warn(\n" 277 | ] 278 | }, 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "0.8647586291610367" 283 | ] 284 | }, 285 | "execution_count": 7, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "pipeline.fit(X_train, y_train)\n", 292 | "y_pred = pipeline.predict(X_test)\n", 293 | "\n", 294 | "dataset.score(y_test, y_pred)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Setting hyperparameter values\n", 302 | "\n", 303 | "Another way of setting the pipeline hyperparameters without having to recreate it\n", 304 | "from scratch, is to use its `set_hyperparameters` method.\n", 305 | "\n", 306 | "In this case, we will change the CategoricalEncoder `max_labels` and the xgboost `learning_rate`." 
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 8, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "hyperparameters = {\n", 316 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {\n", 317 | " 'max_labels': 10\n", 318 | " },\n", 319 | " 'xgboost.XGBClassifier#1': {\n", 320 | " 'learning_rate': 0.3\n", 321 | " }\n", 322 | "}\n", 323 | "pipeline.set_hyperparameters(hyperparameters)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Alternatively, the hyperparameters can be set using the `flat` format:" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 9, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "hyperparameters = {\n", 340 | " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 10,\n", 341 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3\n", 342 | "}\n", 343 | "pipeline.set_hyperparameters(hyperparameters)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "And we can see how these hyperparameters now are different than before:" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 10, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "{'mlprimitives.custom.preprocessing.ClassEncoder#1': {},\n", 362 | " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'keep': False,\n", 363 | " 'copy': True,\n", 364 | " 'features': 'auto',\n", 365 | " 'max_unique_ratio': 0,\n", 366 | " 'max_labels': 10},\n", 367 | " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", 368 | " 'fill_value': None,\n", 369 | " 'verbose': False,\n", 370 | " 'copy': True,\n", 371 | " 'strategy': 'median'},\n", 372 | " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", 373 | " 'max_depth': 4,\n", 374 | " 'n_estimators': 100,\n", 375 | " 'learning_rate': 0.3,\n", 376 | " 'gamma': 0,\n", 377 | " 'min_child_weight': 1},\n", 378 | " 'mlprimitives.custom.preprocessing.ClassDecoder#1': {}}" 379 | ] 380 | }, 381 | "execution_count": 10, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "pipeline.get_hyperparameters()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Evaluate the Pipeline performance\n", 395 | "\n", 396 | "We can now evaluate again the pipeline performance and see how the hyperparameter\n", 397 | "change affected the pipeline performance." 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 11, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stderr", 407 | "output_type": "stream", 408 | "text": [ 409 | "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", 410 | "  warnings.warn(\n" 411 | ] 412 | }, 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "0.870531875690947" 417 | ] 418 | }, 419 | "execution_count": 11, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "pipeline.fit(X_train, y_train)\n", 426 | "y_pred = pipeline.predict(X_test)\n", 427 | "\n", 428 | "dataset.score(y_test, y_pred)" 429 | ] 430 | } 431 | ], 432 | "metadata": { 433 | "kernelspec": { 434 | "display_name": "Python 3 (ipykernel)", 435 | "language": "python", 436 | "name": "python3" 437 | }, 438 | "language_info": { 439 | "codemirror_mode": { 440 | "name": "ipython", 441 | "version": 3 442 | }, 443 | "file_extension": ".py", 444 | "mimetype": "text/x-python", 445 | "name": "python", 446 | "nbconvert_exporter": "python", 447 | "pygments_lexer": "ipython3", 448 | "version": "3.8.16" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 4 453 | } 454 | -------------------------------------------------------------------------------- /examples/tutorials/4. Saving and Loading a Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Saving and Loading a Pipeline\n", 8 | "\n", 9 | "This short guide shows how to serialize a Pipeline into a file and later on load it\n", 10 | "to make predictions.\n", 11 | "\n", 12 | "Note that some steps are not explained for simplicity. Full details\n", 13 | "about them can be found in the previous parts of the tutorial.\n", 14 | "\n", 15 | "We will:\n", 16 | "\n", 17 | "1. Load and fit a pipeline to a dataset.\n", 18 | "2. Save the pipeline to a file.\n", 19 | "3. Load the pipeline as a new object.\n", 20 | "4. Make predictions using the new pipeline object." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Fit the pipeline\n", 28 | "\n", 29 | "The first step will be to load and fit the pipeline to the dataset." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from utils import load_census\n", 39 | "\n", 40 | "dataset = load_census()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "X_train, X_test, y_train, y_test = dataset.get_splits(1)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from mlblocks import MLPipeline\n", 59 | "\n", 60 | "primitives = [\n", 61 | "    'mlprimitives.custom.preprocessing.ClassEncoder',\n", 62 | "    'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", 63 | "    'sklearn.impute.SimpleImputer',\n", 64 | "    'xgboost.XGBClassifier',\n", 65 | "    'mlprimitives.custom.preprocessing.ClassDecoder'\n", 66 | "]\n", 67 | "pipeline = MLPipeline(primitives)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | "/Users/sarah/anaconda3/envs/mlp/lib/python3.8/site-packages/sklearn/impute/_base.py:382: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3.
A warning will always be raised upon the removal of empty columns in the future version.\n", 80 | "  warnings.warn(\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "pipeline.fit(X_train, y_train)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Save the Pipeline\n", 93 | "\n", 94 | "Once the pipeline is fit and ready to make predictions we can store it in a file.\n", 95 | "We will do so using [pickle](https://docs.python.org/3/library/pickle.html)." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import pickle\n", 105 | "\n", 106 | "with open('pipeline.pkl', 'wb') as f:\n", 107 | "    pickle.dump(pipeline, f)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Load the Pipeline\n", 115 | "\n", 116 | "The saved pipeline can then be moved to another system where we can load it back to\n", 117 | "memory using pickle again." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "with open('pipeline.pkl', 'rb') as f:\n", 127 | "    loaded_pipeline = pickle.load(f)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "**IMPORTANT**: All the dependencies need to also be installed in the system that is loading the pipeline. This includes **MLBlocks** and **MLPrimitives** or any other libraries required by the pipeline primitives." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## Make Predictions\n", 142 | "\n", 143 | "Once the pipeline is loaded it is ready to make predictions again." 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "pred = loaded_pipeline.predict(X_test)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "array([' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)" 164 | ] 165 | }, 166 | "execution_count": 8, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "pred[0:5]" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.8.16" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /examples/tutorials/7. Tuning a Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tuning a Pipeline\n", 8 | "\n", 9 | "This short guide shows how to tune a Pipeline using a [BTB](https://github.com/MLBazaar/BTB) Tuner.\n", 10 | "\n", 11 | "Note that some steps are not explained for simplicity.
Full details\n", 12 | "about them can be found in the previous parts of the tutorial.\n", 13 | "\n", 14 | "Here we will:\n", 15 | "1. Load a dataset and a pipeline\n", 16 | "2. Explore the pipeline tunable hyperparameters\n", 17 | "3. Write a scoring function\n", 18 | "4. Build a BTB Tunable and BTB Tuner.\n", 19 | "5. Write a tuning loop" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Load dataset and the pipeline\n", 27 | "\n", 28 | "The first step will be to load the dataset that we were using in previous tutorials." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from utils import load_census\n", 38 | "\n", 39 | "dataset = load_census()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "And load a suitable pipeline.\n", 47 | "\n", 48 | "Note how in this case we are using the variable name `template` instead of `pipeline`,\n", 49 | "because this will only be used as a template for the pipelines that we will create\n", 50 | "and evaluate during the later tuning loop." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from mlblocks import MLPipeline\n", 60 | "\n", 61 | "template = MLPipeline('single_table.classification.xgb')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Explore the pipeline tunable hyperparameters" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Once we have loaded the pipeline, we can now extract the hyperparameters that we will tune\n", 76 | "by calling the `get_tunable_hyperparameters` method.\n", 77 | "\n", 78 | "In this case we will call it using `flat=True` to obtain the hyperparameters in a format\n", 79 | "that is compatible with BTB." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "tunable_hyperparameters = template.get_tunable_hyperparameters(flat=True)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 100 | " 'max_labels'): {'type': 'int', 'default': 0, 'range': [0, 100]},\n", 101 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): {'type': 'str',\n", 102 | " 'default': 'mean',\n", 103 | " 'values': ['mean', 'median', 'most_frequent', 'constant']},\n", 104 | " ('xgboost.XGBClassifier#1', 'n_estimators'): {'type': 'int',\n", 105 | " 'default': 100,\n", 106 | " 'range': [10, 1000]},\n", 107 | " ('xgboost.XGBClassifier#1', 'max_depth'): {'type': 'int',\n", 108 | " 'default': 3,\n", 109 | " 'range': [3, 10]},\n", 110 | " ('xgboost.XGBClassifier#1', 'learning_rate'): {'type': 'float',\n", 111 | " 'default': 0.1,\n", 112 | " 'range': [0, 1]},\n", 113 | " ('xgboost.XGBClassifier#1', 'gamma'): {'type': 'float',\n", 114 | " 'default': 0,\n", 115 | " 'range': [0, 1]},\n", 116 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): {'type': 'int',\n", 117 | " 'default': 1,\n", 118 | " 'range': [1, 10]}}" 119 | ] 120 | }, 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "tunable_hyperparameters" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Write a scoring function\n", 135 | "\n", 136 | "To tune the pipeline we will need to evaluate its performance multiple times with different hyperparameters.\n", 137 | "\n", 138 | "For this reason, we will start by writing a scoring function that will expect only one\n", 139 | "input, the hyperparameters dictionary, and evaluate the performance of the pipeline using them.\n", 140 | "\n", 141 | "In this case, the evaluation will be done using 5-fold cross validation based on the `get_splits`\n", 142 | "method from the dataset." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "import numpy as np\n", 152 | "\n", 153 | "def cross_validate(hyperparameters=None):\n", 154 | " scores = []\n", 155 | " for X_train, X_test, y_train, y_test in dataset.get_splits(5):\n", 156 | " pipeline = MLPipeline(template.to_dict()) # Make a copy of the template\n", 157 | " if hyperparameters:\n", 158 | " pipeline.set_hyperparameters(hyperparameters)\n", 159 | "\n", 160 | " pipeline.fit(X_train, y_train)\n", 161 | " y_pred = pipeline.predict(X_test)\n", 162 | " \n", 163 | " scores.append(dataset.score(y_test, y_pred))\n", 164 | " \n", 165 | " return np.mean(scores)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "By calling this function without any arguments we will obtain the score obtained\n", 173 | "with the default hyperparameters." 
174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 6, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "0.863978563379761" 185 | ] 186 | }, 187 | "execution_count": 6, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "default_score = cross_validate()\n", 194 | "default_score" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Optionally, we can verify that, by passing a hyperparameters dictionary, the new hyperparameters\n", 202 | "will be used, resulting in a different score." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 7, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "0.868554574842" 214 | ] 215 | }, 216 | "execution_count": 7, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "hyperparameters = {\n", 223 | "    ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", 224 | "}\n", 225 | "cross_validate(hyperparameters)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Create a BTB Tunable\n", 233 | "\n", 234 | "The next step is to create the BTB Tunable instance that will be tuned by the BTB Tuner.\n", 235 | "\n", 236 | "For this we will use its `from_dict` method, passing our hyperparameters dict." 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 8, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "from baytune.tuning import Tunable\n", 246 | "\n", 247 | "tunable = Tunable.from_dict(tunable_hyperparameters)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Create the BTB Tuner\n", 255 | "\n", 256 | "After creating the Tunable, we need to create a Tuner to tune it.\n", 257 | "\n", 258 | "In this case we will use the GPTuner, a Meta-model based tuner that uses a Gaussian Process Regressor\n", 259 | "for the optimization." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 9, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "from baytune.tuning import GPTuner\n", 269 | "\n", 270 | "tuner = GPTuner(tunable)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Optionally, since we already know the score obtained by the default arguments and\n", 278 | "these have a high probability of being already decent, we will inform the tuner\n", 279 | "about their performance.\n", 280 | "\n", 281 | "In order to obtain the default hyperparameters used before, we can either call\n", 282 | "the template `get_hyperparameters(flat=True)` method or use `tunable.get_defaults()`."
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 10, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 294 | "  'max_labels'): 0,\n", 295 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", 296 | " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", 297 | " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", 298 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", 299 | " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", 300 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" 301 | ] 302 | }, 303 | "execution_count": 10, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "defaults = tunable.get_defaults()\n", 310 | "defaults" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 11, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "tuner.record(defaults, default_score)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Start the Tuning loop\n", 327 | "\n", 328 | "Once we have the tuner ready we can start the tuning loop.\n", 329 | "\n", 330 | "During this loop we will:\n", 331 | "\n", 332 | "1. Ask the tuner for a new hyperparameter proposal.\n", 333 | "2. Run the `cross_validate` function to evaluate these hyperparameters.\n", 334 | "3. Record the obtained score back to the tuner.\n", 335 | "4. If the obtained score is better than the previous one, store the proposal." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 12, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "scoring pipeline 1\n", 348 | "New best found: 0.871994161365419\n", 349 | "scoring pipeline 2\n", 350 | "New best found: 0.8723319756253888\n", 351 | "scoring pipeline 3\n", 352 | "scoring pipeline 4\n", 353 | "scoring pipeline 5\n", 354 | "scoring pipeline 6\n", 355 | "scoring pipeline 7\n", 356 | "scoring pipeline 8\n", 357 | "scoring pipeline 9\n", 358 | "scoring pipeline 10\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "best_score = default_score\n", 364 | "best_proposal = defaults\n", 365 | "\n", 366 | "for iteration in range(10):\n", 367 | "    print(\"scoring pipeline {}\".format(iteration + 1))\n", 368 | "    \n", 369 | "    proposal = tuner.propose()\n", 370 | "    score = cross_validate(proposal)\n", 371 | "    \n", 372 | "    tuner.record(proposal, score)\n", 373 | "    \n", 374 | "    if score > best_score:\n", 375 | "        print(\"New best found: {}\".format(score))\n", 376 | "        best_score = score\n", 377 | "        best_proposal = proposal" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "After the loop has finished, the best proposal will be stored in the `best_proposal` variable,\n", 385 | "which can be used to generate a new pipeline instance."
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 13, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", 397 | " 'max_labels'): 60,\n", 398 | " ('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", 399 | " ('xgboost.XGBClassifier#1', 'n_estimators'): 190,\n", 400 | " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", 401 | " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.13575511242790694,\n", 402 | " ('xgboost.XGBClassifier#1', 'gamma'): 0.6326488945712287,\n", 403 | " ('xgboost.XGBClassifier#1', 'min_child_weight'): 8}" 404 | ] 405 | }, 406 | "execution_count": 13, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "best_proposal" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 14, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "best_pipeline = MLPipeline(template.to_dict())" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 15, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "best_pipeline.set_hyperparameters(best_proposal)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 16, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "best_pipeline.fit(dataset.data, dataset.target)" 440 | ] 441 | } 442 | ], 443 | "metadata": { 444 | "kernelspec": { 445 | "display_name": "Python 3 (ipykernel)", 446 | "language": "python", 447 | "name": "python3" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.10.15" 460 | } 461 | }, 462 | "nbformat": 4, 463 | "nbformat_minor": 4 464 | } 465 | -------------------------------------------------------------------------------- /examples/tutorials/utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import pandas as pd 5 | from sklearn.metrics import accuracy_score 6 | from mlprimitives.datasets import Dataset 7 | 8 | DATA_PATH = os.path.join( 9 | os.path.dirname(__file__), 10 | 'data' 11 | ) 12 | 13 | DATA_URL = 'http://mlblocks.s3.amazonaws.com/{}.csv' 14 | 15 | def _download(dataset_name, dataset_path): 16 | url = DATA_URL.format(dataset_name) 17 | 18 | data = pd.read_csv(url) 19 | data.to_csv(dataset_path, index=False) 20 | 21 | def _load(dataset_name): 22 | if not os.path.exists(DATA_PATH): 23 | os.makedirs(DATA_PATH) 24 | 25 | dataset_path = os.path.join(DATA_PATH, dataset_name + '.csv') 26 | if not os.path.exists(dataset_path): 27 | _download(dataset_name, dataset_path) 28 | 29 | return dataset_path 30 | 31 | def load_census(): 32 | """Adult Census dataset. 33 | 34 | Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset. 35 | 36 | Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean 37 | records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && 38 | (AFNLWGT>1)&& (HRSWK>0)) 39 | 40 | Prediction task is to determine whether a person makes over 50K a year. 
41 | 42 | source: "UCI" 43 | sourceURI: "https://archive.ics.uci.edu/ml/datasets/census+income" 44 | """ 45 | 46 | dataset_path = _load('census_train') 47 | 48 | X = pd.read_csv(dataset_path) 49 | y = X.pop('label').values 50 | 51 | return Dataset(load_census.__doc__, X, y, accuracy_score, 'single_table', 52 | 'classification', 'binary', stratify=True) -------------------------------------------------------------------------------- /mlblocks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | MLBlocks top module. 5 | 6 | MLBlocks is a simple framework for composing end-to-end tunable Machine Learning Pipelines by 7 | seamlessly combining tools from any Python library with a simple, common and uniform interface. 8 | 9 | * Free software: MIT license 10 | * Documentation: https://MLBazaar.github.io/MLBlocks 11 | """ 12 | 13 | from mlblocks.discovery import ( 14 | add_pipelines_path, add_primitives_path, find_pipelines, find_primitives, get_pipelines_paths, 15 | get_primitives_paths, load_pipeline, load_primitive) 16 | from mlblocks.mlblock import MLBlock 17 | from mlblocks.mlpipeline import MLPipeline 18 | 19 | __author__ = 'MIT Data To AI Lab' 20 | __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' 21 | __email__ = 'dailabmit@gmail.com' 22 | __license__ = 'MIT' 23 | __version__ = '0.6.3.dev0' 24 | 25 | __all__ = [ 26 | 'MLBlock', 27 | 'MLPipeline', 28 | 'add_pipelines_path', 29 | 'add_primitives_path', 30 | 'find_pipelines', 31 | 'find_primitives', 32 | 'get_pipelines_paths', 33 | 'get_primitives_paths', 34 | 'load_pipeline', 35 | 'load_primitive' 36 | ] 37 | -------------------------------------------------------------------------------- /mlblocks/discovery.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Primitives and Pipelines discovery module. 5 | 6 | This module contains functions to load primitive and pipeline 7 | annotations, as well as to configure how MLBlocks finds the 8 | primitives and pipelines. 9 | """ 10 | 11 | import json 12 | import logging 13 | import os 14 | import re 15 | import sys 16 | 17 | import pkg_resources 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | _PRIMITIVES_PATHS = [ 22 | os.path.join(os.getcwd(), 'mlprimitives'), 23 | os.path.join(sys.prefix, 'mlprimitives'), 24 | os.path.join(os.getcwd(), 'mlblocks_primitives'), # legacy 25 | os.path.join(sys.prefix, 'mlblocks_primitives'), # legacy 26 | ] 27 | 28 | _PIPELINES_PATHS = [ 29 | os.path.join(os.getcwd(), 'mlpipelines'), 30 | ] 31 | 32 | 33 | def _add_lookup_path(path, paths): 34 | """Add a new path to the lookup list. 35 | 36 | The new path will be inserted in the first place of the list, 37 | so any element found in this new folder will take precedence 38 | over any other element with the same name that existed in the 39 | system before. 40 | 41 | Args: 42 | path (str): 43 | path to add 44 | paths (list): 45 | list where the new path will be added. 46 | 47 | Raises: 48 | ValueError: 49 | A ``ValueError`` will be raised if the path is not valid. 50 | 51 | Returns: 52 | bool: 53 | Whether the new path was added or not. 54 | """ 55 | if path not in paths: 56 | if not os.path.isdir(path): 57 | raise ValueError('Invalid path: {}'.format(path)) 58 | 59 | paths.insert(0, os.path.abspath(path)) 60 | return True 61 | 62 | return False 63 | 64 | 65 | def add_primitives_path(path): 66 | """Add a new path to look for primitives.
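
    For instance, assuming ``path/to/my/primitives`` is a folder that contains
    primitive JSON annotations::

        from mlblocks import add_primitives_path

        add_primitives_path('path/to/my/primitives')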
67 | 68 | The new path will be inserted in the first place of the list, 69 | so any primitive found in this new folder will take precedence 70 | over any other primitive with the same name that existed in the 71 | system before. 72 | 73 | Args: 74 | path (str): 75 | path to add 76 | 77 | Raises: 78 | ValueError: 79 | A ``ValueError`` will be raised if the path is not valid. 80 | """ 81 | added = _add_lookup_path(path, _PRIMITIVES_PATHS) 82 | if added: 83 | LOGGER.debug('New primitives path added: %s', path) 84 | 85 | 86 | def add_pipelines_path(path): 87 | """Add a new path to look for pipelines. 88 | 89 | The new path will be inserted in the first place of the list, 90 | so any pipeline found in this new folder will take precedence 91 | over any other pipeline with the same name that existed in the 92 | system before. 93 | 94 | Args: 95 | path (str): 96 | path to add 97 | 98 | Raises: 99 | ValueError: 100 | A ``ValueError`` will be raised if the path is not valid. 101 | """ 102 | added = _add_lookup_path(path, _PIPELINES_PATHS) 103 | if added: 104 | LOGGER.debug('New pipelines path added: %s', path) 105 | 106 | 107 | def _load_entry_points(entry_point_name, entry_point_group='mlblocks'): 108 | """Get a list of folders from entry points. 109 | 110 | This list will include the value of any entry point named after the given 111 | ``entry_point_name`` published under the given ``entry_point_group``. 112 | 113 | An example of such an entry point would be:: 114 | 115 | entry_points = { 116 | 'mlblocks': [ 117 | 'primitives=some_module:SOME_VARIABLE' 118 | ] 119 | } 120 | 121 | where the module ``some_module`` contains a variable such as:: 122 | 123 | SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') 124 | 125 | Args: 126 | entry_point_name (str): 127 | The name of the entry point to look for. 128 | 129 | Returns: 130 | list: 131 | The list of folders. 132 | """ 133 | lookup_paths = list() 134 | entry_points = pkg_resources.iter_entry_points(entry_point_group) 135 | for entry_point in entry_points: 136 | if entry_point.name == entry_point_name: 137 | paths = entry_point.load() 138 | if isinstance(paths, str): 139 | lookup_paths.append(paths) 140 | elif isinstance(paths, (list, tuple)): 141 | lookup_paths.extend(paths) 142 | 143 | return lookup_paths 144 | 145 | 146 | def get_primitives_paths(): 147 | """Get the list of folders where primitives will be looked for. 148 | 149 | This list will include the values of all the entry points named ``primitives`` 150 | published under the entry point group ``mlblocks``. 151 | 152 | Also, for backwards compatibility reasons, the paths from the entry points 153 | named ``jsons_path`` published under the ``mlprimitives`` group will also 154 | be included. 155 | 156 | An example of such an entry point would be:: 157 | 158 | entry_points = { 159 | 'mlblocks': [ 160 | 'primitives=some_module:SOME_VARIABLE' 161 | ] 162 | } 163 | 164 | where the module ``some_module`` contains a variable such as:: 165 | 166 | SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') 167 | 168 | Returns: 169 | list: 170 | The list of folders. 171 | """ 172 | paths = _load_entry_points('primitives') + _load_entry_points('jsons_path', 'mlprimitives') 173 | return _PRIMITIVES_PATHS + list(set(paths)) 174 | 175 | 176 | def get_pipelines_paths(): 177 | """Get the list of folders where pipelines will be looked for. 178 | 179 | This list will include the values of all the entry points named ``pipelines`` 180 | published under the entry point group ``mlblocks``.
181 | 182 | An example of such an entry point would be:: 183 | 184 | entry_points = { 185 | 'mlblocks': [ 186 | 'pipelines=some_module:SOME_VARIABLE' 187 | ] 188 | } 189 | 190 | where the module ``some_module`` contains a variable such as:: 191 | 192 | SOME_VARIABLE = os.path.join(os.path.dirname(__file__), 'jsons') 193 | 194 | Returns: 195 | list: 196 | The list of folders. 197 | """ 198 | return _PIPELINES_PATHS + _load_entry_points('pipelines') 199 | 200 | 201 | def _load_json(json_path): 202 | with open(json_path, 'r') as json_file: 203 | LOGGER.debug('Loading %s', json_path) 204 | return json.load(json_file) 205 | 206 | 207 | def _load(name, paths): 208 | """Locate and load the JSON annotation in any of the given paths. 209 | 210 | All the given paths will be scanned to find a JSON file with the given name, 211 | and as soon as a JSON with the given name is found it is returned. 212 | 213 | Args: 214 | name (str): 215 | Path to a JSON file or name of the JSON to look for without the ``.json`` extension. 216 | paths (list): 217 | list of paths where the primitives will be looked for. 218 | 219 | Returns: 220 | dict: 221 | The content of the JSON annotation file loaded into a dict. 222 | """ 223 | if os.path.isfile(name): 224 | return _load_json(name) 225 | 226 | for base_path in paths: 227 | parts = name.split('.') 228 | number_of_parts = len(parts) 229 | 230 | for folder_parts in range(number_of_parts): 231 | folder = os.path.join(base_path, *parts[:folder_parts]) 232 | filename = '.'.join(parts[folder_parts:]) + '.json' 233 | json_path = os.path.join(folder, filename) 234 | 235 | if os.path.isfile(json_path): 236 | return _load_json(json_path) 237 | 238 | 239 | def load_primitive(name): 240 | """Locate and load the primitive JSON annotation. 241 | 242 | All the primitive paths will be scanned to find a JSON file with the given name, 243 | and as soon as a JSON with the given name is found it is returned. 244 | 245 | Args: 246 | name (str): 247 | Path to a JSON file or name of the JSON to look for without the ``.json`` extension. 248 | 249 | Returns: 250 | dict: 251 | The content of the JSON annotation file loaded into a dict. 252 | 253 | Raises: 254 | ValueError: 255 | A ``ValueError`` will be raised if the primitive cannot be found. 256 | """ 257 | primitive = _load(name, get_primitives_paths()) 258 | if primitive is None: 259 | raise ValueError("Unknown primitive: {}".format(name)) 260 | 261 | return primitive 262 | 263 | 264 | def load_pipeline(name): 265 | """Locate and load the pipeline JSON annotation. 266 | 267 | All the pipeline paths will be scanned to find a JSON file with the given name, 268 | and as soon as a JSON with the given name is found it is returned. 269 | 270 | Args: 271 | name (str): 272 | Path to a JSON file or name of the JSON to look for without the ``.json`` extension. 273 | 274 | Returns: 275 | dict: 276 | The content of the JSON annotation file loaded into a dict. 277 | 278 | Raises: 279 | ValueError: 280 | A ``ValueError`` will be raised if the pipeline cannot be found. 281 | """ 282 | pipeline = _load(name, get_pipelines_paths()) 283 | if pipeline is None: 284 | raise ValueError("Unknown pipeline: {}".format(name)) 285 | 286 | return pipeline 287 | 288 | 289 | def _search_annotations(base_path, pattern, parts=None): 290 | """Search for annotations within the given path. 291 | 292 | If the indicated path has subfolders, search recursively within them. 293 | 294 | If a pattern is given, return only the annotations whose name 295 | matches the pattern.
296 | 
297 | Args: 
298 | base_path (str): 
299 | path to the folder to be searched for annotations. 
300 | pattern (str): 
301 | Regular expression to search in the annotation names. 
302 | parts (list): 
303 | Optional. List containing the parent folders that are also part 
304 | of the annotation name. Used during recursion to be able to 
305 | build the final annotation name before returning it. 
306 | 
307 | Returns: 
308 | dict: 
309 | dictionary containing paths as keys and annotation names as 
310 | values. 
311 | """ 
312 | pattern = re.compile(pattern) 
313 | annotations = dict() 
314 | parts = parts or list() 
315 | if os.path.exists(base_path): 
316 | for name in os.listdir(base_path): 
317 | path = os.path.abspath(os.path.join(base_path, name)) 
318 | if os.path.isdir(path): 
319 | annotations.update(_search_annotations(path, pattern, parts + [name])) 
320 | elif path not in annotations: 
321 | name = '.'.join(parts + [name]) 
322 | if pattern.search(name) and name.endswith('.json'): 
323 | annotations[path] = name[:-5] 
324 | 
325 | return annotations 
326 | 
327 | 
328 | def _match(annotation, key, values): 
329 | """Check if the annotation has the key and it matches any of the values. 
330 | 
331 | If the given key is not found but it contains dots, split by the dots 
332 | and consider each part a sublevel in the annotation. 
333 | 
334 | If the key value within the annotation is a list or a dict, check 
335 | whether any of the given values is contained within it instead of 
336 | checking for equality. 
337 | 
338 | Args: 
339 | annotation (dict): 
340 | Dictionary annotation. 
341 | key (str): 
342 | Key to search within the annotation. It can contain dots to 
343 | separate nested subdictionary levels within the annotation. 
344 | values (object or list): 
345 | Value or list of values to search for. 
346 | 
347 | Returns: 
348 | bool: 
349 | whether there is a match or not. 
350 | """ 
351 | if not isinstance(values, list): 
352 | values = [values] 
353 | 
354 | if key not in annotation: 
355 | if '.' in key: 
356 | name, key = key.split('.', 1) 
357 | part = annotation.get(name) or dict() 
358 | return _match(part, key, values) 
359 | else: 
360 | return False 
361 | 
362 | annotation_value = annotation[key] 
363 | 
364 | for value in values: 
365 | if isinstance(annotation_value, (list, dict)) and value in annotation_value: 
366 | return True 
367 | elif annotation_value == value: 
368 | return True 
369 | 
370 | return False 
371 | 
372 | 
373 | def _find_annotations(paths, loader, pattern, filters): 
374 | """Find matching annotations within the given paths. 
375 | 
376 | Match annotations by both name pattern and filters. 
377 | 
378 | Args: 
379 | paths (list): 
380 | List of paths to search annotations in. 
381 | loader (callable): 
382 | Function to use to load the annotation contents. 
383 | pattern (str): 
384 | Pattern to match against the annotation name. 
385 | filters (dict): 
386 | Dictionary containing key/value filters. 
387 | 
388 | Returns: 
389 | list: 
390 | names of the matching annotations. 
391 | """ 
392 | annotations = dict() 
393 | for base_path in paths: 
394 | annotations.update(_search_annotations(base_path, pattern)) 
395 | 
396 | matching = list() 
397 | for name in sorted(annotations.values()): 
398 | annotation = loader(name) 
399 | for key, value in filters.items(): 
400 | if not _match(annotation, key, value): 
401 | break 
402 | 
403 | else: 
404 | matching.append(name) 
405 | 
406 | return matching 
407 | 
408 | 
409 | def find_primitives(pattern='', filters=None): 
410 | """Find primitives by name and filters. 
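
A minimal usage sketch, following the ``classifiers`` metadata convention
used by this project's tests (the primitive names shown are hypothetical and
actual results depend on the annotations installed)::

    >>> from mlblocks.discovery import find_primitives
    >>> find_primitives('primitive', {'classifiers.subtype': 'regressor'})  # doctest: +SKIP
    ['regressor.primitive']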
411 | 
412 | If a pattern is given, only the primitives whose name matches 
413 | the pattern will be returned. 
414 | 
415 | If filters are given, they should be a dictionary containing key/value 
416 | filters that will have to be matched within the primitive annotation 
417 | for it to be included in the results. 
418 | 
419 | If the given key is not found but it contains dots, split by the dots 
420 | and consider each part a sublevel in the annotation. 
421 | 
422 | If the key value within the annotation is a list or a dict, check 
423 | whether any of the given values is contained within it instead of 
424 | checking for equality. 
425 | 
426 | Args: 
427 | pattern (str): 
428 | Regular expression to match against the primitive names. 
429 | filters (dict): 
430 | Dictionary containing the filters to apply over the matching 
431 | primitives. 
432 | 
433 | Returns: 
434 | list: 
435 | Names of the matching primitives. 
436 | """ 
437 | filters = filters or dict() 
438 | return _find_annotations(get_primitives_paths(), load_primitive, pattern, filters) 
439 | 
440 | 
441 | def find_pipelines(pattern='', filters=None): 
442 | """Find pipelines by name and filters. 
443 | 
444 | If a pattern is given, only the pipelines whose name matches 
445 | the pattern will be returned. 
446 | 
447 | If filters are given, they should be a dictionary containing key/value 
448 | filters that will have to be matched within the pipeline annotation 
449 | for it to be included in the results. 
450 | 
451 | If the given key is not found but it contains dots, split by the dots 
452 | and consider each part a sublevel in the annotation. 
453 | 
454 | If the key value within the annotation is a list or a dict, check 
455 | whether any of the given values is contained within it instead of 
456 | checking for equality. 
457 | 
458 | Args: 
459 | pattern (str): 
460 | Regular expression to match against the pipeline names. 
461 | filters (dict): 
462 | Dictionary containing the filters to apply over the matching 
463 | pipelines. 
464 | 
465 | Returns: 
466 | list: 
467 | Names of the matching pipelines. 
468 | """ 
469 | filters = filters or dict() 
470 | return _find_annotations(get_pipelines_paths(), load_pipeline, pattern, filters) 
471 | 
-------------------------------------------------------------------------------- /mlblocks/mlblock.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 
2 | 
3 | """Package where the MLBlock class is defined.""" 
4 | 
5 | import importlib 
6 | import logging 
7 | from copy import deepcopy 
8 | 
9 | from mlblocks.discovery import load_primitive 
10 | 
11 | LOGGER = logging.getLogger(__name__) 
12 | 
13 | 
14 | def import_object(object_name): 
15 | """Import an object from its Fully Qualified Name.""" 
16 | 
17 | if isinstance(object_name, str): 
18 | parent_name, attribute = object_name.rsplit('.', 1) 
19 | try: 
20 | parent = importlib.import_module(parent_name) 
21 | except ImportError: 
22 | grand_parent_name, parent_name = parent_name.rsplit('.', 1) 
23 | grand_parent = importlib.import_module(grand_parent_name) 
24 | parent = getattr(grand_parent, parent_name) 
25 | 
26 | return getattr(parent, attribute) 
27 | 
28 | return object_name 
29 | 
30 | 
31 | class MLBlock(): 
32 | """MLBlock Class. 
33 | 
34 | The MLBlock class represents a single step within an MLPipeline. 
35 | 
36 | It is responsible for loading and interpreting JSON primitives, as well 
37 | as wrapping them and providing a common interface to run them. 
38 | 
39 | Attributes: 
40 | name (str): 
41 | Primitive name. 
42 | metadata (dict): 
43 | Additional information about this primitive. 
44 | primitive (object): 
45 | the actual function or instance which this MLBlock wraps. 
46 | fit_args (dict): 
47 | specification of the arguments expected by the ``fit`` method. 
48 | fit_method (str): 
49 | name of the primitive method to call on ``fit``. ``None`` if the 
50 | primitive is a function. 
51 | produce_args (dict): 
52 | specification of the arguments expected by the ``produce`` method. 
53 | produce_output (dict): 
54 | specification of the outputs of the ``produce`` method. 
55 | produce_method (str): 
56 | name of the primitive method to call on ``produce``. ``None`` if the primitive is a 
57 | function. 
58 | 
59 | Args: 
60 | primitive (str or dict): 
61 | primitive name or primitive dictionary. 
62 | **kwargs: 
63 | Any additional arguments that will be used as hyperparameters or passed to the 
64 | ``fit`` or ``produce`` methods. 
65 | 
66 | Raises: 
67 | TypeError: 
68 | A ``TypeError`` is raised if a required argument is not found within the ``kwargs`` 
69 | or if an unexpected argument has been given. 
70 | """ # pylint: disable=too-many-instance-attributes 
71 | 
72 | def _extract_params(self, kwargs, hyperparameters): 
73 | """Extract init, fit and produce params from kwargs. 
74 | 
75 | The ``init_params``, ``fit_params`` and ``produce_params`` are extracted 
76 | from the passed ``kwargs`` taking the metadata hyperparameters as a 
77 | reference. 
78 | 
79 | During this extraction, make sure that all the required hyperparameters 
80 | have been given and that nothing unexpected exists in the input. 
81 | 
82 | Args: 
83 | kwargs (dict): 
84 | dict containing the keyword arguments that have been passed to the ``__init__`` 
85 | method upon initialization. 
86 | hyperparameters (dict): 
87 | hyperparameters dictionary, as found in the JSON annotation. 
88 | 
89 | Raises: 
90 | TypeError: 
91 | A ``TypeError`` is raised if a required argument is not found in the 
92 | ``kwargs`` dict, or if an unexpected argument has been given. 
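
Example: 
An illustrative sketch (hypothetical annotation): given 
``kwargs = {'max_depth': 3, 'X': data}``, a ``tunable`` hyperparameter 
named ``max_depth`` and a produce argument named ``X``, the method 
returns ``({'max_depth': 3}, {}, {'X': data})``. 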
93 | """ 94 | init_params = dict() 95 | fit_params = dict() 96 | produce_params = dict() 97 | 98 | for name, param in hyperparameters.get('fixed', dict()).items(): 99 | if name in kwargs: 100 | value = kwargs.pop(name) 101 | 102 | elif 'default' in param: 103 | value = param['default'] 104 | 105 | else: 106 | raise TypeError("{} required argument '{}' not found".format(self.name, name)) 107 | 108 | init_params[name] = value 109 | 110 | for name, param in hyperparameters.get('tunable', dict()).items(): 111 | if name in kwargs: 112 | init_params[name] = kwargs.pop(name) 113 | 114 | if not isinstance(self.fit_args, str): 115 | fit_args = [arg['name'] for arg in self.fit_args] 116 | else: 117 | fit_args = [] 118 | 119 | if not isinstance(self.produce_args, str): 120 | produce_args = [arg['name'] for arg in self.produce_args] 121 | else: 122 | produce_args = [] 123 | 124 | for name in list(kwargs.keys()): 125 | if name in fit_args: 126 | fit_params[name] = kwargs.pop(name) 127 | 128 | elif name in produce_args: 129 | produce_params[name] = kwargs.pop(name) 130 | 131 | if kwargs: 132 | error = "Unexpected hyperparameters '{}'".format(', '.join(kwargs.keys())) 133 | raise TypeError(error) 134 | 135 | return init_params, fit_params, produce_params 136 | 137 | @staticmethod 138 | def _filter_conditional(conditional, init_params): 139 | condition = conditional['condition'] 140 | default = conditional.get('default') 141 | 142 | if condition not in init_params: 143 | return default 144 | 145 | condition_value = init_params[condition] 146 | values = conditional['values'] 147 | return values.get(condition_value, default) 148 | 149 | @classmethod 150 | def _get_tunable(cls, hyperparameters, init_params): 151 | tunable = dict() 152 | for name, param in hyperparameters.get('tunable', dict()).items(): 153 | if name not in init_params: 154 | if param['type'] == 'conditional': 155 | param = cls._filter_conditional(param, init_params) 156 | if param is not None: 157 | tunable[name] = param 158 | 159 | else: 160 | tunable[name] = param 161 | 162 | return tunable 163 | 164 | def __init__(self, primitive, **kwargs): 165 | if isinstance(primitive, str): 166 | primitive = load_primitive(primitive) 167 | 168 | self.metadata = primitive 169 | self.name = primitive['name'] 170 | 171 | self.primitive = import_object(self.metadata['primitive']) 172 | 173 | self._fit = self.metadata.get('fit', dict()) 174 | self.fit_args = self._fit.get('args', []) 175 | self.fit_method = self._fit.get('method') 176 | 177 | self._produce = self.metadata['produce'] 178 | self.produce_args = self._produce['args'] 179 | self.produce_output = self._produce['output'] 180 | self.produce_method = self._produce.get('method') 181 | 182 | self._class = bool(self.produce_method) 183 | 184 | hyperparameters = self.metadata.get('hyperparameters', dict()) 185 | init_params, fit_params, produce_params = self._extract_params(kwargs, hyperparameters) 186 | 187 | self._hyperparameters = init_params 188 | self._fit_params = fit_params 189 | self._produce_params = produce_params 190 | 191 | self._tunable = self._get_tunable(hyperparameters, init_params) 192 | 193 | default = { 194 | name: param['default'] 195 | for name, param in self._tunable.items() 196 | # TODO: support undefined defaults 197 | } 198 | 199 | self.set_hyperparameters(default) 200 | 201 | def __str__(self): 202 | """Return a string that represents this block.""" 203 | return 'MLBlock - {}'.format(self.name) 204 | 205 | def get_tunable_hyperparameters(self): 206 | """Get the hyperparameters 
that can be tuned for this MLBlock. 
207 | 
208 | The list of hyperparameters is taken from the JSON annotation, 
209 | filtering out any hyperparameter for which a value has been given 
210 | during the initialization. 
211 | 
212 | Returns: 
213 | dict: 
214 | the dictionary containing the hyperparameters that can be 
215 | tuned, their types and, if applicable, the accepted 
216 | ranges or values. 
217 | """ 
218 | return deepcopy(self._tunable) 
219 | 
220 | def get_hyperparameters(self): 
221 | """Get the hyperparameter values that the current MLBlock is using. 
222 | 
223 | Returns: 
224 | dict: 
225 | the dictionary containing the hyperparameter values that the 
226 | MLBlock is currently using. 
227 | """ 
228 | return deepcopy(self._hyperparameters) 
229 | 
230 | def set_hyperparameters(self, hyperparameters): 
231 | """Set new hyperparameters. 
232 | 
233 | Only the specified hyperparameters are modified, so any other 
234 | hyperparameter keeps the value that had been previously given. 
235 | 
236 | If necessary, a new instance of the primitive is created. 
237 | 
238 | Args: 
239 | hyperparameters (dict): 
240 | Dictionary containing as keys the name of the hyperparameters and as 
241 | values the values to be used. 
242 | """ 
243 | self._hyperparameters.update(hyperparameters) 
244 | 
245 | if self._class: 
246 | LOGGER.debug('Creating a new primitive instance for %s', self.name) 
247 | self.instance = self.primitive(**self.get_hyperparameters()) 
248 | 
249 | def _get_method_kwargs(self, kwargs, method_args): 
250 | """Prepare the kwargs for the method. 
251 | 
252 | The kwargs dict will be altered according to the ``method_args`` 
253 | specification to make them ready for the primitive method to 
254 | accept them. 
255 | 
256 | Args: 
257 | kwargs (dict): 
258 | keyword arguments that have been passed to the block method. 
259 | method_args (list): 
260 | method arguments as specified in the JSON annotation. 
261 | 
262 | Returns: 
263 | dict: 
264 | A dictionary containing the argument names and values to pass 
265 | to the primitive method. 
266 | """ 
267 | if isinstance(method_args, str): 
268 | method_args = getattr(self.instance, method_args)() 
269 | 
270 | method_kwargs = dict() 
271 | for arg in method_args: 
272 | name = arg['name'] 
273 | keyword = arg.get('keyword', name) 
274 | 
275 | if name in kwargs or 'default' in arg: 
276 | value = kwargs.get(name, arg.get('default')) 
277 | elif arg.get('required', True): 
278 | raise TypeError("missing expected argument '{}'".format(name)) 
279 | else: 
280 | continue  # optional argument that was not given and has no default: skip it 
281 | 
282 | method_kwargs[keyword] = value 
283 | 
284 | return method_kwargs 
285 | 
286 | def fit(self, **kwargs): 
287 | """Call the fit method of the primitive. 
288 | 
289 | The given keyword arguments will be passed directly to the ``fit`` 
290 | method of the primitive instance specified in the JSON annotation. 
291 | 
292 | If any of the arguments expected by the fit method had been 
293 | given during the MLBlock initialization, they will be passed as well. 
294 | 
295 | If the fit method was not specified in the JSON annotation, or if 
296 | the primitive is a simple function, this will be a noop. 
297 | 
298 | Args: 
299 | **kwargs: 
300 | Any given keyword argument will be directly passed to the primitive fit method. 
301 | 
302 | Raises: 
303 | TypeError: 
304 | A ``TypeError`` might be raised if any argument not expected by the primitive fit 
305 | method is given. 
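
Example: 
A minimal usage sketch, assuming the MLPrimitives annotation for 
``sklearn.preprocessing.StandardScaler`` is available and that 
``training_data`` is a hypothetical feature matrix:: 

>>> block = MLBlock('sklearn.preprocessing.StandardScaler')   # doctest: +SKIP 
>>> block.fit(X=training_data)                                # doctest: +SKIP 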
306 | """ 
307 | if self.fit_method is not None: 
308 | fit_kwargs = self._fit_params.copy() 
309 | fit_kwargs.update(kwargs) 
310 | fit_kwargs = self._get_method_kwargs(fit_kwargs, self.fit_args) 
311 | getattr(self.instance, self.fit_method)(**fit_kwargs) 
312 | 
313 | def produce(self, **kwargs): 
314 | """Call the primitive function, or the produce method of the primitive. 
315 | 
316 | The given keyword arguments will be passed directly to the primitive, 
317 | if it is a simple function, or to the ``produce`` method of the 
318 | primitive instance specified in the JSON annotation, if it is a class. 
319 | 
320 | If any of the arguments expected by the produce method had been given 
321 | during the MLBlock initialization, they will be passed as well. 
322 | 
323 | Returns: 
324 | The output of the call to the primitive function or primitive 
325 | produce method. 
326 | """ 
327 | produce_kwargs = self._produce_params.copy() 
328 | produce_kwargs.update(kwargs) 
329 | produce_kwargs = self._get_method_kwargs(produce_kwargs, self.produce_args) 
330 | if self._class: 
331 | return getattr(self.instance, self.produce_method)(**produce_kwargs) 
332 | 
333 | produce_kwargs.update(self.get_hyperparameters()) 
334 | return self.primitive(**produce_kwargs) 
335 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 
1 | # Requirements for development and mybinder environment 
2 | -e .[dev] 
3 | docutils<0.16,>=0.10 # Fix dependency conflict on mybinder 
4 | 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 
1 | [bumpversion] 
2 | current_version = 0.6.3.dev0 
3 | commit = True 
4 | tag = True 
5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? 
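# The named groups in 'parse' above must match the placeholders used in 'serialize' below. 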
6 | serialize = 7 | {major}.{minor}.{patch}.{release}{candidate} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = release 12 | first_value = dev 13 | values = 14 | dev 15 | release 16 | 17 | [bumpversion:part:candidate] 18 | 19 | [bumpversion:file:setup.py] 20 | search = version='{current_version}' 21 | replace = version='{new_version}' 22 | 23 | [bumpversion:file:mlblocks/__init__.py] 24 | search = __version__ = '{current_version}' 25 | replace = __version__ = '{new_version}' 26 | 27 | [bdist_wheel] 28 | universal = 1 29 | 30 | [flake8] 31 | max-line-length = 99 32 | exclude = .tox, .git, __pycache__, .ipynb_checkpoints 33 | ignore = # Keep empty to prevent default ignores 34 | 35 | [isort] 36 | line_length = 99 37 | lines_between_types = 0 38 | multi_line_output = 4 39 | use_parentheses = True 40 | not_skip = __init__.py 41 | skip_glob = *.bak 42 | 43 | [metadata] 44 | description-file = README.md 45 | 46 | [aliases] 47 | test = pytest 48 | 49 | [tool:pytest] 50 | collect_ignore = ['setup.py'] 51 | 52 | [tool:pylint] 53 | good-names = X,y 54 | 55 | [doc8] 56 | max-line-length = 99 57 | 58 | [pydocstyle] 59 | add-ignore = D403,D413,D105,D107 60 | 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import find_packages, setup 7 | 8 | with open('README.md', encoding='utf-8') as readme_file: 9 | readme = readme_file.read() 10 | 11 | with open('HISTORY.md', encoding='utf-8') as history_file: 12 | history = history_file.read() 13 | 14 | 15 | install_requires = [ 16 | 'graphviz>=0.9,<1', 17 | 'numpy>=1.17.1,<3', 18 | 'psutil>=5,<7', 19 | ] 20 | 21 | 22 | mlprimitives_requires = [ 23 | 'mlprimitives>=0.4.0,<0.5', 24 | 'h5py<4,>=2.10.0', # <- tensorflow 2.3.2 conflict 25 | 'matplotlib<4,>=2.2.2', # <- copulas 0.3.3 26 | 'protobuf<4', # <- importlib 27 | ] 28 | 29 | examples_require = mlprimitives_requires + [ 30 | 'jupyter==1.0.0', 31 | 'baytune>=0.5.0,<0.6', 32 | 'copulas<0.12', 33 | ] 34 | 35 | 36 | tests_require = [ 37 | 'pytest>=3.4.2', 38 | 'pytest-cov>=2.6.0', 39 | 'setuptools>=41.0.0', 40 | 'rundoc>=0.4.3', 41 | 'prompt-toolkit>=2.0,<3.0', 42 | ] 43 | 44 | 45 | setup_requires = [ 46 | 'pytest-runner>=2.11.1', 47 | ] 48 | 49 | 50 | development_requires = [ 51 | # general 52 | 'bumpversion>=0.5.3,<0.6', 53 | 'pip>=9.0.1', 54 | 'watchdog>=0.8.3,<5', 55 | 56 | # docs 57 | 'm2r>=0.2.0,<0.3', 58 | 'Sphinx>=1.7.1,<3', 59 | 'sphinx_rtd_theme>=0.2.4,<0.5', 60 | 'docutils>=0.12,<0.18', 61 | 'ipython>=6.5.0', 62 | 'autodocsumm>=0.1.10', 63 | 'Jinja2>=2,<3', # >=3 makes sphinx theme fail 64 | 'markupsafe<2.1.0', 65 | 66 | # fails on Sphinx < v3.4 67 | 'alabaster<=0.7.12', 68 | # fails on Sphins < v5.0 69 | 'sphinxcontrib-applehelp<1.0.8', 70 | 'sphinxcontrib-devhelp<1.0.6', 71 | 'sphinxcontrib-htmlhelp<2.0.5', 72 | 'sphinxcontrib-serializinghtml<1.1.10', 73 | 'sphinxcontrib-qthelp<1.0.7', 74 | 75 | # style check 76 | 'flake8>=3.7.7,<4', 77 | 'isort>=4.3.4,<5', 78 | 79 | # fix style issues 80 | 'autoflake>=1.1,<2', 81 | 'autopep8>=1.4.3,<2', 82 | 83 | # distribute on PyPI 84 | 'twine>=1.10.0,<4', 85 | 'wheel>=0.30.0', 86 | 87 | # Advanced testing 88 | 'coverage>=4.5.1,<6', 89 | 'tox>=2.9.1,<4', 90 | 91 | # Documentation style 92 | 'doc8>=0.8.0', 93 | 'pydocstyle>=3.0.0', 94 | ] 95 | 96 | 97 | setup( 98 | author='MIT Data To AI Lab', 
99 | author_email='dailabmit@gmail.com', 100 | classifiers=[ 101 | 'Development Status :: 2 - Pre-Alpha', 102 | 'Intended Audience :: Developers', 103 | 'License :: OSI Approved :: MIT License', 104 | 'Natural Language :: English', 105 | 'Programming Language :: Python :: 3', 106 | 'Programming Language :: Python :: 3.8', 107 | 'Programming Language :: Python :: 3.9', 108 | 'Programming Language :: Python :: 3.10', 109 | 'Programming Language :: Python :: 3.11', 110 | 'Programming Language :: Python :: 3.12', 111 | 'Programming Language :: Python :: 3.13', 112 | ], 113 | description='Pipelines and primitives for machine learning and data science.', 114 | extras_require={ 115 | 'dev': development_requires + tests_require + examples_require, 116 | 'unit': tests_require, 117 | 'test': tests_require + examples_require, 118 | 'examples': examples_require, 119 | 'mlprimitives': mlprimitives_requires, 120 | }, 121 | include_package_data=True, 122 | install_requires=install_requires, 123 | keywords='auto machine learning classification regression data science pipeline', 124 | license='MIT license', 125 | long_description=readme + '\n\n' + history, 126 | long_description_content_type='text/markdown', 127 | name='mlblocks', 128 | packages=find_packages(include=['mlblocks', 'mlblocks.*']), 129 | python_requires='>=3.8,<3.14', 130 | setup_requires=setup_requires, 131 | test_suite='tests', 132 | tests_require=tests_require, 133 | url='https://github.com/MLBazaar/MLBlocks', 134 | version='0.6.3.dev0', 135 | zip_safe=False, 136 | ) 137 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLBazaar/MLBlocks/db5ff4b925358ef568492b45058dddded05be873/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/diagrams/diagram_fit.txt: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [splines=ortho] 3 | tooltip=" " 4 | node [penwidth=0 shape=box] 5 | subgraph cluster_outputs { 6 | tooltip="Output variables" 7 | graph [bgcolor=azure3 penwidth=0 rank=source] 8 | node [fontsize=20 penwidth=0] 9 | edge [arrowhead=none penwidth=0] 10 | Output [label=Output fontsize=14 tooltip="Output variables"] 11 | output_variable_output [label=output_variable] 12 | output_variable_output -> Output 13 | { 14 | rank=same 15 | rankdir=LR 16 | } 17 | } 18 | "a_primitive#1" [label=a_primitive penwidth=1] 19 | "a_primitive#1 output_variable" [label=output_variable] 20 | "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] 21 | "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] 22 | input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] 23 | subgraph cluster_inputs { 24 | tooltip="Input variables" 25 | graph [bgcolor=azure3 penwidth=0 rank=source] 26 | node [fontsize=20 penwidth=0] 27 | edge [arrowhead=none penwidth=0] 28 | Input [label=Input fontsize=14 tooltip="Input variables"] 29 | input_variable_input [label=input_variable] 30 | Input -> input_variable_input 31 | { 32 | rank=same 33 | } 34 | } 35 | { 36 | graph [penwidth=0] 37 | node [penwidth=0] 38 | edge [len=1 minlen=1 penwidth=1] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/data/diagrams/diagram_multiple_blocks.txt: 
-------------------------------------------------------------------------------- 1 | digraph { 2 | graph [splines=ortho] 3 | tooltip=" " 4 | node [penwidth=0 shape=box] 5 | subgraph cluster_outputs { 6 | tooltip="Output variables" 7 | graph [bgcolor=azure3 penwidth=0 rank=source] 8 | node [fontsize=20 penwidth=0] 9 | edge [arrowhead=none penwidth=0] 10 | Output [label=Output fontsize=14 tooltip="Output variables"] 11 | output_variable_b_output [label=output_variable_b] 12 | output_variable_b_output -> Output 13 | { 14 | rank=same 15 | rankdir=LR 16 | } 17 | } 18 | "b_primitive#1" [label=b_primitive penwidth=1] 19 | "b_primitive#1 output_variable_b" [label=output_variable_b] 20 | "b_primitive#1" -> "b_primitive#1 output_variable_b" [arrowhead=none] 21 | "b_primitive#1 output_variable_b" -> output_variable_b_output [arrowhead=normal] 22 | "a_primitive#1" [label=a_primitive penwidth=1] 23 | "a_primitive#1 output_variable_a" [label=output_variable_a] 24 | "a_primitive#1" -> "a_primitive#1 output_variable_a" [arrowhead=none] 25 | "a_primitive#1 output_variable_a" -> "b_primitive#1" [arrowhead=normal] 26 | input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] 27 | subgraph cluster_inputs { 28 | tooltip="Input variables" 29 | graph [bgcolor=azure3 penwidth=0 rank=source] 30 | node [fontsize=20 penwidth=0] 31 | edge [arrowhead=none penwidth=0] 32 | Input [label=Input fontsize=14 tooltip="Input variables"] 33 | input_variable_input [label=input_variable] 34 | Input -> input_variable_input 35 | { 36 | rank=same 37 | } 38 | } 39 | { 40 | graph [penwidth=0] 41 | node [penwidth=0] 42 | edge [len=1 minlen=1 penwidth=1] 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/data/diagrams/diagram_simple.txt: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [splines=ortho] 3 | tooltip=" " 4 | node [penwidth=0 shape=box] 5 | subgraph cluster_outputs { 6 | tooltip="Output variables" 7 | graph [bgcolor=azure3 penwidth=0 rank=source] 8 | node [fontsize=20 penwidth=0] 9 | edge [arrowhead=none penwidth=0] 10 | Output [label=Output fontsize=14 tooltip="Output variables"] 11 | output_variable_output [label=output_variable] 12 | output_variable_output -> Output 13 | { 14 | rank=same 15 | rankdir=LR 16 | } 17 | } 18 | "a_primitive#1" [label=a_primitive penwidth=1] 19 | "a_primitive#1 output_variable" [label=output_variable] 20 | "a_primitive#1" -> "a_primitive#1 output_variable" [arrowhead=none] 21 | "a_primitive#1 output_variable" -> output_variable_output [arrowhead=normal] 22 | input_variable_input -> "a_primitive#1" [arrowhead=normal pendwith=1] 23 | subgraph cluster_inputs { 24 | tooltip="Input variables" 25 | graph [bgcolor=azure3 penwidth=0 rank=source] 26 | node [fontsize=20 penwidth=0] 27 | edge [arrowhead=none penwidth=0] 28 | Input [label=Input fontsize=14 tooltip="Input variables"] 29 | input_variable_input [label=input_variable] 30 | Input -> input_variable_input 31 | { 32 | rank=same 33 | } 34 | } 35 | { 36 | graph [penwidth=0] 37 | node [penwidth=0] 38 | edge [len=1 minlen=1 penwidth=1] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tests/features/test_fit_predicr_args.py: -------------------------------------------------------------------------------- 1 | from mlblocks.mlpipeline import MLPipeline 2 | 3 | 4 | def test_fit_predict_args_in_init(): 5 | 6 | def add(a, b): 7 | return a + b 8 | 9 | primitive = { 10 | 'name': 'add', 
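# 'primitive' below is passed as a callable; MLBlock also accepts a fully qualified name string. 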
11 | 'primitive': add, 12 | 'produce': { 13 | 'args': [ 14 | { 15 | 'name': 'a', 16 | 'type': 'float', 17 | }, 18 | { 19 | 'name': 'b', 20 | 'type': 'float', 21 | }, 22 | ], 23 | 'output': [ 24 | { 25 | 'type': 'float', 26 | 'name': 'out' 27 | } 28 | ] 29 | } 30 | } 31 | 32 | primitives = [primitive] 33 | init_params = { 34 | 'add': { 35 | 'b': 10 36 | } 37 | } 38 | pipeline = MLPipeline(primitives, init_params=init_params) 39 | 40 | out = pipeline.predict(a=3) 41 | 42 | assert out == 13 43 | -------------------------------------------------------------------------------- /tests/features/test_partial_outputs.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from unittest.mock import Mock 3 | 4 | import numpy as np 5 | 6 | from mlblocks.mlpipeline import MLPipeline 7 | 8 | 9 | def almost_equal(obj1, obj2): 10 | if isinstance(obj1, dict): 11 | if not isinstance(obj2, dict): 12 | raise AssertionError("{} is not equal to {}".format(type(obj2), dict)) 13 | 14 | for key, value in obj1.items(): 15 | if key not in obj2: 16 | raise AssertionError("{} not in {}".format(key, obj2)) 17 | 18 | almost_equal(value, obj2[key]) 19 | 20 | else: 21 | np.testing.assert_almost_equal(obj1, obj2) 22 | 23 | 24 | class TestPartialOutputs(TestCase): 25 | def setUp(self): 26 | self.X = np.array([ 27 | [1, 0, 0, 0, 0], 28 | [0, 1, 0, 0, 0], 29 | [0, 0, 1, 0, 0], 30 | [0, 0, 0, 1, 0], 31 | [0, 0, 0, 0, 1], 32 | ]) 33 | self.y = np.array([0, 0, 0, 0, 1]) 34 | 35 | def test_fit_output(self): 36 | 37 | # Setup variables 38 | primitives = [ 39 | 'sklearn.preprocessing.StandardScaler', 40 | 'sklearn.linear_model.LogisticRegression' 41 | ] 42 | pipeline = MLPipeline(primitives) 43 | 44 | named = 'default' 45 | list_ = ['default', 0] 46 | int_block = 0 47 | invalid_int = 10 48 | str_block = 'sklearn.preprocessing.StandardScaler#1' 49 | invalid_block = 'InvalidBlockName' 50 | str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X' 51 | invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid' 52 | 53 | # Run 54 | named_out = pipeline.fit(self.X, self.y, output_=named) 55 | list_out = pipeline.fit(self.X, self.y, output_=list_) 56 | int_out = pipeline.fit(self.X, self.y, output_=int_block) 57 | str_out = pipeline.fit(self.X, self.y, output_=str_block) 58 | str_out_variable = pipeline.fit(self.X, self.y, 59 | output_=str_block_variable) 60 | no_output = pipeline.fit(self.X, self.y) 61 | 62 | # Assert successful calls 63 | X = np.array([ 64 | [2., -0.5, -0.5, -0.5, -0.5], 65 | [-0.5, 2., -0.5, -0.5, -0.5], 66 | [-0.5, -0.5, 2., -0.5, -0.5], 67 | [-0.5, -0.5, -0.5, 2., -0.5], 68 | [-0.5, -0.5, -0.5, -0.5, 2.], 69 | ]) 70 | y = np.array([ 71 | 0, 0, 0, 0, 1 72 | ]) 73 | context = {'X': X, 'y': y} 74 | 75 | almost_equal(named_out, y) 76 | assert len(list_out) == 2 77 | almost_equal(list_out[0], y) 78 | almost_equal(list_out[1], context) 79 | almost_equal(context, int_out) 80 | almost_equal(context, str_out) 81 | almost_equal(X, str_out_variable) 82 | assert no_output is None 83 | 84 | # Run asserting exceptions 85 | with self.assertRaises(IndexError): 86 | pipeline.fit(self.X, self.y, output_=invalid_int) 87 | 88 | with self.assertRaises(ValueError): 89 | pipeline.fit(self.X, self.y, output_=invalid_block) 90 | 91 | with self.assertRaises(ValueError): 92 | pipeline.fit(self.X, self.y, output_=invalid_variable) 93 | 94 | def test_fit_start(self): 95 | # Setup variables 96 | primitives = [ 97 | 'sklearn.preprocessing.StandardScaler', 98 | 
'sklearn.linear_model.LogisticRegression' 99 | ] 100 | pipeline = MLPipeline(primitives) 101 | 102 | # Mock the first block 103 | block_mock = Mock() 104 | pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock 105 | 106 | # Run first block 107 | context = { 108 | 'X': self.X, 109 | 'y': self.y 110 | } 111 | int_start = 1 112 | str_start = 'sklearn.linear_model.LogisticRegression#1' 113 | 114 | pipeline.fit(start_=int_start, **context) 115 | pipeline.fit(start_=str_start, **context) 116 | 117 | # Assert that mock has not been called 118 | block_mock.fit.assert_not_called() 119 | 120 | def test_predict_start(self): 121 | # Setup variables 122 | primitives = [ 123 | 'sklearn.preprocessing.StandardScaler', 124 | 'sklearn.linear_model.LogisticRegression' 125 | ] 126 | pipeline = MLPipeline(primitives) 127 | pipeline.fit(self.X, self.y) 128 | 129 | # Mock the first block 130 | block_mock = Mock() 131 | pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock 132 | 133 | # Run first block 134 | context = { 135 | 'X': self.X, 136 | } 137 | int_start = 1 138 | str_start = 'sklearn.linear_model.LogisticRegression#1' 139 | 140 | pipeline.predict(start_=int_start, **context) 141 | pipeline.predict(start_=str_start, **context) 142 | 143 | # Assert that mock has not been called 144 | block_mock.predict.assert_not_called() 145 | -------------------------------------------------------------------------------- /tests/features/test_pipeline_loading.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from mlblocks import MLPipeline 4 | 5 | 6 | class TestMLPipeline(TestCase): 7 | 8 | def test_dict(self): 9 | pipeline_dict = { 10 | 'primitives': [ 11 | 'sklearn.ensemble.RandomForestClassifier' 12 | ], 13 | 'init_params': { 14 | 'sklearn.ensemble.RandomForest#1': { 15 | 'n_estimators': 500 16 | } 17 | }, 18 | 'input_names': { 19 | 'sklearn.ensemble.RandomForest#1': { 20 | 'X': 'X1' 21 | } 22 | }, 23 | 'output_names': { 24 | 'sklearn.ensemble.RandomForest#1': { 25 | 'y': 'y1' 26 | } 27 | } 28 | } 29 | 30 | pipeline = MLPipeline(pipeline_dict) 31 | 32 | assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] 33 | assert pipeline.init_params == { 34 | 'sklearn.ensemble.RandomForest#1': { 35 | 'n_estimators': 500 36 | } 37 | } 38 | assert pipeline.input_names == { 39 | 'sklearn.ensemble.RandomForest#1': { 40 | 'X': 'X1' 41 | } 42 | } 43 | assert pipeline.output_names == { 44 | 'sklearn.ensemble.RandomForest#1': { 45 | 'y': 'y1' 46 | } 47 | } 48 | 49 | def test_list(self): 50 | primitives = [ 51 | 'sklearn.ensemble.RandomForestClassifier' 52 | ] 53 | init_params = { 54 | 'sklearn.ensemble.RandomForest#1': { 55 | 'n_estimators': 500 56 | } 57 | } 58 | 59 | pipeline = MLPipeline(primitives, init_params=init_params) 60 | 61 | assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] 62 | assert pipeline.init_params == { 63 | 'sklearn.ensemble.RandomForest#1': { 64 | 'n_estimators': 500 65 | } 66 | } 67 | 68 | def test_none(self): 69 | primitives = [ 70 | 'sklearn.ensemble.RandomForestClassifier' 71 | ] 72 | init_params = { 73 | 'sklearn.ensemble.RandomForest#1': { 74 | 'n_estimators': 500 75 | } 76 | } 77 | 78 | pipeline = MLPipeline(primitives=primitives, init_params=init_params) 79 | 80 | assert pipeline.primitives == ['sklearn.ensemble.RandomForestClassifier'] 81 | assert pipeline.init_params == { 82 | 'sklearn.ensemble.RandomForest#1': { 83 | 'n_estimators': 500 84 | } 85 | } 
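# MLPipeline can also be built from an existing MLPipeline instance, as the following test shows. 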
86 | 
87 | def test_mlpipeline(self): 
88 | primitives = [ 
89 | 'sklearn.ensemble.RandomForestClassifier' 
90 | ] 
91 | init_params = { 
92 | 'sklearn.ensemble.RandomForest#1': { 
93 | 'n_estimators': 500 
94 | } 
95 | } 
96 | 
97 | pipeline = MLPipeline(primitives=primitives, init_params=init_params) 
98 | pipeline2 = MLPipeline(pipeline) 
99 | 
100 | assert pipeline2.primitives == ['sklearn.ensemble.RandomForestClassifier'] 
101 | assert pipeline2.init_params == { 
102 | 'sklearn.ensemble.RandomForest#1': { 
103 | 'n_estimators': 500 
104 | } 
105 | } 
106 | 
-------------------------------------------------------------------------------- /tests/test_discovery.py: -------------------------------------------------------------------------------- 
1 | # -*- coding: utf-8 -*- 
2 | 
3 | import json 
4 | import os 
5 | import tempfile 
6 | import uuid 
7 | from unittest.mock import Mock, call, patch 
8 | 
9 | import pytest 
10 | from pkg_resources import Distribution, EntryPoint 
11 | 
12 | from mlblocks import discovery 
13 | 
14 | FAKE_PRIMITIVES_PATH = 'this/is/a/fake' 
15 | FAKE_PRIMITIVES_PATHS = [ 
16 | 'this/is/another/fake', 
17 | 'this/is/yet/another/fake', 
18 | ] 
19 | 
20 | 
21 | def test__add_lookup_path_do_nothing(): 
22 | paths = ['a', 'b'] 
23 | discovery._add_lookup_path('a', paths) 
24 | 
25 | assert paths == ['a', 'b'] 
26 | 
27 | 
28 | def test__add_lookup_path_exception(): 
29 | paths = ['a', 'b'] 
30 | invalid_path = str(uuid.uuid4()) 
31 | 
32 | with pytest.raises(ValueError): 
33 | discovery._add_lookup_path(invalid_path, paths) 
34 | 
35 | 
36 | def test__add_lookup_path(): 
37 | paths = ['a', 'b'] 
38 | discovery._add_lookup_path('tests', paths) 
39 | 
40 | expected_path = os.path.abspath('tests') 
41 | 
42 | assert paths == [expected_path, 'a', 'b'] 
43 | 
44 | 
45 | @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) 
46 | def test_add_primitives_path(): 
47 | discovery.add_primitives_path(os.path.abspath('tests')) 
48 | 
49 | expected_path = os.path.abspath('tests') 
50 | assert discovery._PRIMITIVES_PATHS == [expected_path, 'a', 'b'] 
51 | 
52 | 
53 | @patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) 
54 | def test_add_pipelines_path(): 
55 | discovery.add_pipelines_path('tests') 
56 | 
57 | expected_path = os.path.abspath('tests') 
58 | assert discovery._PIPELINES_PATHS == [expected_path, 'a', 'b'] 
59 | 
60 | 
61 | @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) 
62 | @patch('mlblocks.discovery.pkg_resources.iter_entry_points') 
63 | def test__load_entry_points_no_entry_points(iep_mock): 
64 | # setup 
65 | iep_mock.return_value = [] 
66 | 
67 | # run 
68 | paths = discovery._load_entry_points('jsons_path', 'mlprimitives') 
69 | 
70 | # assert 
71 | assert paths == [] 
72 | expected_calls = [ 
73 | call('mlprimitives'), 
74 | ] 
75 | assert iep_mock.call_args_list == expected_calls 
76 | 
77 | 
78 | @patch('mlblocks.discovery.pkg_resources.iter_entry_points') 
79 | def test__load_entry_points_entry_points(iep_mock): 
80 | # setup 
81 | something_else_ep = EntryPoint('something_else', 'mlblocks.__version__') 
82 | primitives_ep = EntryPoint( 
83 | 'primitives', 
84 | 'tests.test_discovery', 
85 | attrs=['FAKE_PRIMITIVES_PATH'], 
86 | dist=Distribution() 
87 | ) 
88 | another_primitives_ep = EntryPoint( 
89 | 'primitives', 
90 | 'tests.test_discovery', 
91 | attrs=['FAKE_PRIMITIVES_PATHS'], 
92 | dist=Distribution() 
93 | ) 
94 | iep_mock.return_value = [ 
95 | something_else_ep, 
96 | primitives_ep, 
97 | another_primitives_ep 
98 | ] 
99 | 
100 | # run 
101 | paths = discovery._load_entry_points('primitives') 
102 | 
103 | # assert 
104 | 
expected = [ 105 | 'this/is/a/fake', 106 | 'this/is/another/fake', 107 | 'this/is/yet/another/fake', 108 | ] 109 | assert paths == expected 110 | 111 | expected_calls = [ 112 | call('mlblocks'), 113 | ] 114 | assert iep_mock.call_args_list == expected_calls 115 | 116 | 117 | @patch('mlblocks.discovery._PRIMITIVES_PATHS', new=['a', 'b']) 118 | @patch('mlblocks.discovery._load_entry_points') 119 | def test_get_primitives_paths(lep_mock): 120 | lep_mock.side_effect = [['c'], []] 121 | 122 | paths = discovery.get_primitives_paths() 123 | 124 | assert paths == ['a', 'b', 'c'] 125 | expected_calls = [ 126 | call('primitives'), 127 | call('jsons_path', 'mlprimitives'), 128 | ] 129 | assert lep_mock.call_args_list == expected_calls 130 | 131 | 132 | @patch('mlblocks.discovery._PIPELINES_PATHS', new=['a', 'b']) 133 | @patch('mlblocks.discovery._load_entry_points') 134 | def test_get_pipelines_paths(lep_mock): 135 | lep_mock.return_value = ['c'] 136 | 137 | paths = discovery.get_pipelines_paths() 138 | 139 | assert paths == ['a', 'b', 'c'] 140 | lep_mock.assert_called_once_with('pipelines') 141 | 142 | 143 | def test__load_value_error(): 144 | primitive = discovery._load('invalid.primitive', ['a', 'b']) 145 | 146 | assert primitive is None 147 | 148 | 149 | def test__load_success(): 150 | primitive = { 151 | 'name': 'temp.primitive', 152 | 'primitive': 'temp.primitive' 153 | } 154 | 155 | with tempfile.TemporaryDirectory() as tempdir: 156 | paths = [tempdir] 157 | primitive_path = os.path.join(tempdir, 'temp.primitive.json') 158 | with open(primitive_path, 'w') as primitive_file: 159 | json.dump(primitive, primitive_file, indent=4) 160 | 161 | loaded = discovery._load('temp.primitive', paths) 162 | 163 | assert primitive == loaded 164 | 165 | 166 | def test__load_json_path(): 167 | primitive = { 168 | 'name': 'temp.primitive', 169 | 'primitive': 'temp.primitive' 170 | } 171 | 172 | with tempfile.TemporaryDirectory() as tempdir: 173 | paths = [tempdir] 174 | primitive_path = os.path.join(tempdir, 'temp.primitive.json') 175 | with open(primitive_path, 'w') as primitive_file: 176 | json.dump(primitive, primitive_file, indent=4) 177 | 178 | loaded = discovery._load(primitive_path, paths) 179 | 180 | assert primitive == loaded 181 | 182 | 183 | @patch('mlblocks.discovery.get_primitives_paths') 184 | @patch('mlblocks.discovery._load') 185 | def test__load_primitive_value_error(load_mock, gpp_mock): 186 | load_mock.return_value = None 187 | gpp_mock.return_value = ['a', 'b'] 188 | 189 | with pytest.raises(ValueError): 190 | discovery.load_primitive('invalid.primitive') 191 | 192 | load_mock.assert_called_once_with('invalid.primitive', ['a', 'b']) 193 | 194 | 195 | @patch('mlblocks.discovery.get_primitives_paths') 196 | @patch('mlblocks.discovery._load') 197 | def test__load_primitive_success(load_mock, gpp_mock): 198 | gpp_mock.return_value = ['a', 'b'] 199 | 200 | primitive = discovery.load_primitive('valid.primitive') 201 | 202 | load_mock.assert_called_once_with('valid.primitive', ['a', 'b']) 203 | 204 | assert primitive == load_mock.return_value 205 | 206 | 207 | @patch('mlblocks.discovery.get_pipelines_paths') 208 | @patch('mlblocks.discovery._load') 209 | def test__load_pipeline_value_error(load_mock, gpp_mock): 210 | load_mock.return_value = None 211 | gpp_mock.return_value = ['a', 'b'] 212 | 213 | with pytest.raises(ValueError): 214 | discovery.load_pipeline('invalid.pipeline') 215 | 216 | load_mock.assert_called_once_with('invalid.pipeline', ['a', 'b']) 217 | 218 | 219 | 
@patch('mlblocks.discovery.get_pipelines_paths') 
220 | @patch('mlblocks.discovery._load') 
221 | def test__load_pipeline_success(load_mock, gpp_mock): 
222 | gpp_mock.return_value = ['a', 'b'] 
223 | 
224 | pipeline = discovery.load_pipeline('valid.pipeline') 
225 | 
226 | load_mock.assert_called_once_with('valid.pipeline', ['a', 'b']) 
227 | 
228 | assert pipeline == load_mock.return_value 
229 | 
230 | 
231 | @patch('mlblocks.discovery.os') 
232 | def test__search_annotations(os_mock): 
233 | os_mock.path.abspath = os.path.abspath 
234 | os_mock.path.join = os.path.join 
235 | os_mock.path.exists.return_value = True 
236 | os_mock.listdir.side_effect = [ 
237 | [ 
238 | 'a.primitive.json', 
239 | 'another.primitive.json', 
240 | 'some', 
241 | ], 
242 | [ 
243 | 'other', 
244 | ], 
245 | [ 
246 | 'primitive.json' 
247 | ] 
248 | ] 
249 | os_mock.path.isdir.return_value = False 
250 | os_mock.path.isdir.side_effect = [ 
251 | False, 
252 | False, 
253 | True, 
254 | True, 
255 | False 
256 | ] 
257 | 
258 | annotations = discovery._search_annotations('/path/to', 'other') 
259 | 
260 | assert annotations == { 
261 | '/path/to/another.primitive.json': 'another.primitive', 
262 | '/path/to/some/other/primitive.json': 'some.other.primitive' 
263 | } 
264 | 
265 | 
266 | def test__match_no_match(): 
267 | annotation = { 
268 | 'name': 'a.primitive', 
269 | } 
270 | 
271 | matches = discovery._match(annotation, 'key', 'value') 
272 | 
273 | assert not matches 
274 | 
275 | 
276 | def test__match_root(): 
277 | annotation = { 
278 | 'name': 'a.primitive', 
279 | 'key': 'value' 
280 | } 
281 | 
282 | matches = discovery._match(annotation, 'key', 'value') 
283 | 
284 | assert matches 
285 | 
286 | 
287 | def test__match_sublevel(): 
288 | annotation = { 
289 | 'name': 'a.primitive', 
290 | 'some': { 
291 | 'sublevel': { 
292 | 'key': 'value' 
293 | } 
294 | } 
295 | } 
296 | 
297 | matches = discovery._match(annotation, 'some.sublevel.key', 'value') 
298 | 
299 | assert matches 
300 | 
301 | 
302 | def test__match_list_no_match(): 
303 | annotation = { 
304 | 'name': 'a.primitive', 
305 | 'key': [ 
306 | 'another_value', 
307 | 'yet_another_value' 
308 | ] 
309 | } 
310 | 
311 | matches = discovery._match(annotation, 'key', 'value') 
312 | 
313 | assert not matches 
314 | 
315 | 
316 | def test__match_list(): 
317 | annotation = { 
318 | 'name': 'a.primitive', 
319 | 'key': [ 
320 | 'value', 
321 | 'another_value' 
322 | ] 
323 | } 
324 | 
325 | matches = discovery._match(annotation, 'key', 'value') 
326 | 
327 | assert matches 
328 | 
329 | 
330 | def test__match_dict(): 
331 | annotation = { 
332 | 'name': 'a.primitive', 
333 | 'key': { 
334 | 'value': 'subvalue', 
335 | 'another_value': 'another_subvalue' 
336 | } 
337 | } 
338 | 
339 | matches = discovery._match(annotation, 'key', 'value') 
340 | 
341 | assert matches 
342 | 
343 | 
344 | def test__match_multiple_keys(): 
345 | annotation = { 
346 | 'name': 'a.primitive', 
347 | 'key': 'value' 
348 | } 
349 | 
350 | matches = discovery._match(annotation, 'key', ['value', 'another_value']) 
351 | 
352 | assert matches 
353 | 
354 | 
355 | @patch('mlblocks.discovery._search_annotations') 
356 | def test__find_annotations(search_annotations_mock): 
357 | search_annotations_mock.return_value = { 
358 | '/path/to/a/classifier.primitive.json': 'classifier.primitive', 
359 | '/path/to/a/regressor.primitive.json': 'regressor.primitive', 
360 | } 
361 | 
362 | loader = Mock() 
363 | loader.side_effect = [ 
364 | { 
365 | 'name': 'classifier.primitive', 
366 | 'classifiers': { 
367 | 'type': 'estimator', 
368 | 'subtype': 'classifier', 
369 | } 
370 | }, 
371 | { 
372 | 'name': 'regressor.primitive', 
373 | 'classifiers': { 
374 | 'type': 'estimator', 
375 | 'subtype': 'regressor', 
376 | } 
377 | } 
378 | ] 
379 | 
380 | filters = { 
381 | 'classifiers.subtype': 'regressor' 
382 | } 
383 | annotations = discovery._find_annotations(['/a/path'], loader, 'pattern', filters) 
384 | 
385 | assert annotations == ['regressor.primitive'] 
386 | search_annotations_mock.assert_called_once_with('/a/path', 'pattern') 
387 | 
388 | 
389 | @patch('mlblocks.discovery._find_annotations') 
390 | @patch('mlblocks.discovery.get_primitives_paths') 
391 | def test_find_primitives(gpp_mock, fa_mock): 
392 | primitives = discovery.find_primitives('pattern') 
393 | 
394 | fa_mock.assert_called_once_with( 
395 | gpp_mock.return_value, discovery.load_primitive, 'pattern', dict()) 
396 | 
397 | assert primitives == fa_mock.return_value 
398 | 
399 | 
400 | @patch('mlblocks.discovery._find_annotations') 
401 | @patch('mlblocks.discovery.get_pipelines_paths') 
402 | def test_find_pipelines(gpp_mock, fa_mock): 
403 | primitives = discovery.find_pipelines('pattern', {'a': 'filter'}) 
404 | 
405 | fa_mock.assert_called_once_with( 
406 | gpp_mock.return_value, discovery.load_pipeline, 'pattern', {'a': 'filter'}) 
407 | 
408 | assert primitives == fa_mock.return_value 
409 | 
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 
1 | [tox] 
2 | envlist = py3{8,9,10,11,12,13}, test-devel 
3 | 
4 | [travis] 
5 | python = 
6 | 3.13: py313 
7 | 3.12: py312 
8 | 3.11: py311 
9 | 3.10: py310 
10 | 3.9: py39 
11 | 3.8: py38, test-devel 
12 | 
13 | [testenv] 
14 | passenv = CI TRAVIS TRAVIS_* 
15 | allowlist_externals = rm 
16 | skipsdist = false 
17 | skip_install = false 
18 | extras = test 
19 | commands = 
20 | /usr/bin/env make test 
21 | rm -r {envdir} 
22 | 
23 | [testenv:test-devel] 
24 | extras = dev 
25 | commands = 
26 | /usr/bin/env make test-devel 
27 | rm -r {envdir} 
28 | 
--------------------------------------------------------------------------------