├── .github └── ISSUE_TEMPLATE │ ├── bug.md │ └── everything-else.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEFAULT_LOGIC.md ├── LICENSE ├── PULL_REQUEST_TEMPLATE.md ├── README.md ├── docs ├── Makefile ├── conf.py ├── generated │ ├── modules.rst │ └── steppy.rst └── index.rst ├── requirements.txt ├── setup.cfg ├── setup.py ├── steppy ├── __init__.py ├── adapter.py ├── base.py └── utils.py └── tests ├── __init__.py ├── conftest.py ├── steppy_test_utils.py ├── test_adapter.py └── test_base.py /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: bug 3 | about: Create bug report 4 | 5 | --- 6 | 7 | There are two things that will make the processing of your issue faster: 8 | 1. Make sure that you are using the latest version of the code, 9 | 1. In case of bug issue, it would be nice to provide more technical details such like execution command, error message or script that reproduces your bug. 10 | # 11 | 12 | Thanks! 13 | 14 | Kamil & Jakub, 15 | 16 | *core contributors to the [minerva.ml](https://minerva.ml)* 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/everything-else.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: everything else 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .pytest_cache 6 | tests/.cache 7 | 8 | # C extensions 9 | *.so 10 | 11 | # neptune, pycharm 12 | .cache 13 | .cache/ 14 | .idea/ 15 | .idea_modules/ 16 | out/ 17 | output 18 | output/ 19 | *.log 20 | target/ 21 | devbook.ipynb 22 | devbook_local.ipynb 23 | 24 | # Distribution / packaging 25 | .Python 26 | env/ 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | local_settings.py 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # Jupyter Notebook 81 | Untitled*.ipynb 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # dotenv 94 | .env 95 | 96 | # virtualenv 97 | .venv 98 | venv/ 99 | ENV/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | # Working directories 115 | examples/cache/ 116 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ml-team@neptune.ml. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to the Steppy core library 2 | 3 | Here, at [minerva.ml](https://minerva.ml) we are creating Steppy - lightweight, open-source, Python library for fast and reproducible experimentation. 4 | 5 | ### Get involved 6 | You are welcome to contribute to the Steppy library. To get started: 7 | 1. Check [our kanban board](https://github.com/minerva-ml/steppy/projects/1) to see what we are working on right now. 8 | 1. Express your interest in a paticular [issue](https://github.com/minerva-ml/steppy/issues) by submitting a comment or, 9 | * submit your own [issue](https://github.com/minerva-ml/steppy/issues). 10 | 1. We will get back to you in order to start working together. 11 | 12 | ### Code contributions 13 | Major - and most appreciated - contribution is [pull request](https://github.com/minerva-ml/steppy/pulls) with feature or bug fix. 14 | 15 | ### Remarks 16 | In case of custom ideas, please contact core contributors directly at ml-team@neptune.ml. 17 | # 18 | 19 | Thanks! 20 | 21 | Jakub & Kamil, 22 | 23 | *core contributors to the [minerva.ml](https://minerva.ml)* 24 | -------------------------------------------------------------------------------- /DEFAULT_LOGIC.md: -------------------------------------------------------------------------------- 1 | Default logic of the `fit_transform()`: 2 | 1. execute it on the last Step. 3 | 1. Step is fitted and transformed. Any data or models will be overridden (default setup). 4 | 1. If `force_fitting` is not obligatory, then look for cache: 5 | 1. if output is cached, then use it. In such situation `fit_tranform()` was just taking output from cache. 6 | 1. If output is not cached -> steppy looks for persisted (saved to disk) output. If exist, `fit_tranform()` was just loading output from the project directory. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 neptune.ml 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Pull Request template 2 | 3 | ### Code contributions 4 | Major - and most appreciated - contribution is pull request with feature or bug fix. Each pull request initiates discussion about your code contribution. 5 | 6 | Each pull request should be provided with minimal description about its contents. 7 | # 8 | 9 | Thanks! 10 | 11 | Jakub & Kamil, 12 | 13 | _core contributors to the minerva.ml_ 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Steppy 2 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/minerva-ml/steppy/blob/master/LICENSE) 3 | 4 | ### What is Steppy? 5 | 1. Steppy is a lightweight, open-source, Python 3 library for fast and reproducible experimentation. 6 | 1. Steppy lets data scientist focus on data science, not on software development issues. 7 | 1. Steppy's minimal interface does not impose constraints, however, enables clean machine learning pipeline design. 8 | 9 | ### What problem steppy solves? 10 | #### Problems 11 | In the course of the project, data scientist faces two problems: 12 | 1. Difficulties with reproducibility in data science / machine learning projects. 13 | 1. Lack of the ability to prepare or extend experiments quickly. 14 | 15 | #### Solution 16 | Steppy address both problems by introducing two simple abstractions: `Step` and `Tranformer`. We consider it minimal interface for building machine learning pipelines. 17 | 1. `Step` is a wrapper over the transformer and handles multiple aspects of the execution of the pipeline, such as saving intermediate results (if needed), checkpointing the model during training and much more. 18 | 1. `Tranformer` in turn, is purely computational, data scientist-defined piece that takes an input data and produces some output data. Typical Transformers are neural network, machine learning algorithms and pre- or post-processing routines. 19 | 20 | # Start using steppy 21 | ### Installation 22 | Steppy requires `python3.5` or above. 23 | ```bash 24 | pip3 install steppy 25 | ``` 26 | _(you probably want to install it in your [virtualenv](https://virtualenv.pypa.io/en/stable))_ 27 | 28 | ### Resources 29 | 1. :ledger: [Documentation](https://steppy.readthedocs.io/en/latest) 30 | 1. :computer: [Source](https://github.com/minerva-ml/steppy) 31 | 1. :name_badge: [Bugs reports](https://github.com/minerva-ml/steppy/issues) 32 | 1. :rocket: [Feature requests](https://github.com/minerva-ml/steppy/issues) 33 | 1. :star2: Tutorial notebooks ([their repository](https://github.com/minerva-ml/steppy-examples)): 34 | - :arrow_forward: [Getting started](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/1-getting-started.ipynb) 35 | - :arrow_forward:[Steps with multiple inputs](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/2-multi-step.ipynb) 36 | - :arrow_forward: [Advanced adapters](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/3-adapter_advanced.ipynb) 37 | - :arrow_forward: [Caching and persistance](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/4-caching-persistence.ipynb) 38 | - :arrow_forward: [Steppy with Keras](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/5-steps-with-keras.ipynb) 39 | 40 | ### Feature Requests 41 | Please send us your ideas on how to improve steppy library! We are looking for your comments here: [Feature requests](https://github.com/minerva-ml/steppy/issues). 42 | 43 | ### Roadmap 44 | :fast_forward: At this point steppy is early-stage library heavily tested on multiple machine learning challenges ([data-science-bowl](https://github.com/minerva-ml/open-solution-data-science-bowl-2018 "Kaggle's data science bowl 2018"), [toxic-comment-classification-challenge](https://github.com/minerva-ml/open-solution-toxic-comments "Kaggle's Toxic Comment Classification Challenge"), [mapping-challenge](https://github.com/minerva-ml/open-solution-mapping-challenge "CrowdAI's Mapping Challenge")) and educational projects ([minerva-advanced-data-scientific-training](https://github.com/minerva-ml/minerva-training-materials "minerva.ml -> advanced data scientific training")). 45 | 46 | :fast_forward: We are developing steppy towards practical tool for data scientists who can run their experiments easily and change their pipelines with just few manipulations in the code. 47 | 48 | ### Related projects 49 | We are also building [steppy-toolkit](https://github.com/minerva-ml/steppy-toolkit "steppy toolkit"), a collection of high quality implementations of the top deep learning architectures -> all of them with the same, intuitive interface. 50 | 51 | ### Contributing 52 | You are welcome to contribute to the Steppy library. Please check [CONTRIBUTING](https://github.com/minerva-ml/steppy/blob/master/CONTRIBUTING.md) for more information. 53 | 54 | ### Terms of use 55 | Steppy is [MIT-licensed](https://github.com/minerva-ml/steppy/blob/master/LICENSE). 56 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = steppy 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'steppy' 23 | copyright = '2018, neptune.ml' 24 | author = 'Kamil A. Kaczmarek and Jakub Czakon' 25 | 26 | # The short X.Y version 27 | version = '0.1' 28 | # The full version, including alpha/beta/rc tags 29 | release = '0.1.16' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.doctest', 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # The suffix(es) of source filenames. 50 | # You can specify multiple suffix as a list of string: 51 | # 52 | # source_suffix = ['.rst', '.md'] 53 | source_suffix = '.rst' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | # 61 | # This is also used if you do content translation via gettext catalogs. 62 | # Usually you set "language" from the command line for these cases. 63 | language = None 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | # This pattern also affects html_static_path and html_extra_path . 68 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 69 | 70 | # The name of the Pygments (syntax highlighting) style to use. 71 | pygments_style = 'sphinx' 72 | 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | 76 | # The theme to use for HTML and HTML Help pages. See the documentation for 77 | # a list of builtin themes. 78 | # 79 | html_theme = 'sphinx_rtd_theme' 80 | 81 | # Theme options are theme-specific and customize the look and feel of a theme 82 | # further. For a list of options available for each theme, see the 83 | # documentation. 84 | # 85 | # html_theme_options = {} 86 | 87 | # Add any paths that contain custom static files (such as style sheets) here, 88 | # relative to this directory. They are copied after the builtin static files, 89 | # so a file named "default.css" will overwrite the builtin "default.css". 90 | html_static_path = ['_static'] 91 | 92 | # Custom sidebar templates, must be a dictionary that maps document names 93 | # to template names. 94 | # 95 | # The default sidebars (for documents that don't match any pattern) are 96 | # defined by theme itself. Builtin themes are using these templates by 97 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 98 | # 'searchbox.html']``. 99 | # 100 | # html_sidebars = {} 101 | 102 | 103 | # -- Options for HTMLHelp output --------------------------------------------- 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = 'steppydoc' 107 | 108 | 109 | # -- Options for LaTeX output ------------------------------------------------ 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'steppy.tex', 'steppy Documentation', 134 | 'minerva.ml', 'manual'), 135 | ] 136 | 137 | 138 | # -- Options for manual page output ------------------------------------------ 139 | 140 | # One entry per manual page. List of tuples 141 | # (source start file, name, description, authors, manual section). 142 | man_pages = [ 143 | (master_doc, 'steppy', 'steppy Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ---------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'steppy', 'steppy Documentation', 155 | author, 'steppy', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | 160 | # -- Extension configuration ------------------------------------------------- 161 | extensions = ['sphinx.ext.napoleon'] 162 | -------------------------------------------------------------------------------- /docs/generated/modules.rst: -------------------------------------------------------------------------------- 1 | steppy 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | steppy 8 | -------------------------------------------------------------------------------- /docs/generated/steppy.rst: -------------------------------------------------------------------------------- 1 | steppy package 2 | ============== 3 | 4 | steppy.adapter module 5 | --------------------- 6 | 7 | .. automodule:: steppy.adapter 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | steppy.base module 13 | ------------------ 14 | 15 | .. automodule:: steppy.base 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | steppy.utils module 21 | ------------------- 22 | 23 | .. automodule:: steppy.utils 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to steppy 2 | ================================== 3 | 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | :caption: Module contents: 8 | 9 | 10 | API documentation 11 | ~~~~~~~~~~~~~~~~~ 12 | 13 | * :ref:`genindex` 14 | * :ref:`modindex` 15 | * :ref:`search` 16 | 17 | 18 | What is Steppy? 19 | ~~~~~~~~~~~~~~~ 20 | 21 | Steppy is a lightweight, open-source, Python 3 library for fast and 22 | reproducible experimentation. It lets data scientist focus on data 23 | science, not on software development issues. Steppy’s minimal interface 24 | does not impose constraints, however, enables clean machine learning 25 | pipeline design. 26 | 27 | What problem steppy solves? 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | 30 | In the course of the project, data scientist faces multiple problems. 31 | Difficulties with reproducibility and lack of the ability to prepare 32 | experiments quickly are two particular examples. Steppy address both 33 | problems by introducing two simple abstractions: ``Step`` and 34 | ``Tranformer``. We consider it minimal interface for building machine 35 | learning pipelines. 36 | 37 | ``Step`` is a wrapper over the transformer and handles multiple aspects 38 | of the execution of the pipeline, such as saving intermediate results 39 | (if needed), checkpointing the model during training and much more. 40 | ``Tranformer`` in turn, is purely computational, data scientist-defined 41 | piece that takes an input data and produces some output data. Typical 42 | Transformers are neural network, machine learning algorithms and pre- or 43 | post-processing routines. 44 | 45 | Start using steppy 46 | ~~~~~~~~~~~~~~~~~~ 47 | 48 | Installation 49 | ^^^^^^^^^^^^ 50 | 51 | Steppy requires ``python3.5`` or above. 52 | 53 | .. code:: bash 54 | 55 | pip3 install steppy 56 | 57 | *(you probably want to install it in 58 | your* \ `virtualenv `__\ *)* 59 | 60 | Resources 61 | ~~~~~~~~~ 62 | 63 | 1. `Documentation `__ 64 | 2. `Source `__ 65 | 3. `Bugs reports `__ 66 | 4. `Feature requests `__ 67 | 5. Tutorial notebooks (`their repository `__): 68 | 69 | - `Getting started `__ 70 | - `Steps with multiple inputs `__ 71 | - `Advanced adapters `__ 72 | - `Caching and persistance `__ 73 | - `Steppy with Keras `__ 74 | 75 | Feature Requests 76 | ~~~~~~~~~~~~~~~~ 77 | 78 | Please send us your ideas on how to improve steppy library! We are 79 | looking for your comments here: `Feature 80 | requests `__. 81 | 82 | Roadmap 83 | ~~~~~~~ 84 | 85 | At this point steppy is early-stage library heavily 86 | tested on multiple machine learning challenges 87 | (`data-science-bowl `__, 88 | `toxic-comment-classification-challenge `__, 89 | `mapping-challenge `__) 90 | and educational projects 91 | (`minerva-advanced-data-scientific-training `__). 92 | 93 | We are developing steppy towards practical tool for data 94 | scientists who can run their experiments easily and change their 95 | pipelines with just few manipulations in the code. 96 | 97 | Related projects 98 | ~~~~~~~~~~~~~~~~ 99 | 100 | We are also building 101 | `steppy-toolkit `__, a 102 | collection of high quality implementations of the top deep learning 103 | architectures -> all of them with the same, intuitive interface. 104 | 105 | Contributing 106 | ~~~~~~~~~~~~ 107 | 108 | You are welcome to contribute to the Steppy library. Please check 109 | `CONTRIBUTING `__ 110 | for more information. 111 | 112 | Terms of use 113 | ~~~~~~~~~~~~ 114 | 115 | Steppy is 116 | `MIT-licensed `__. 117 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipython>=6.4.0 2 | numpy>=1.14.0 3 | pydot_ng>=1.0.0 4 | pytest>=3.6.0 5 | scikit_learn>=0.19.0 6 | scipy>=1.0.0 7 | setuptools>=39.2.0 8 | typing>=3.6.4 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | long_description = ''' 4 | Steppy is lightweight, Python library for fast and reproducible experimentation. 5 | The goal of this package is to provide data scientist with minimal interface 6 | that allows her to build complex, yet elegant machine learning pipelines. 7 | 8 | Steppy is designed for data scientists who run a lot of experiments. 9 | 10 | Steppy is compatible with Python>=3.5 11 | and is distributed under the MIT license. 12 | ''' 13 | 14 | setup(name='steppy', 15 | packages=['steppy'], 16 | version='0.1.16', 17 | description='A lightweight, open-source, Python library for fast and reproducible experimentation', 18 | long_description=long_description, 19 | url='https://github.com/minerva-ml/steppy', 20 | download_url='https://github.com/minerva-ml/steppy/archive/0.1.16.tar.gz', 21 | author='Kamil A. Kaczmarek, Jakub Czakon', 22 | author_email='kamil.kaczmarek@neptune.ml, jakub.czakon@neptune.ml', 23 | keywords=['machine-learning', 'reproducibility', 'pipeline', 'data-science'], 24 | license='MIT', 25 | install_requires=[ 26 | 'ipython>=6.4.0', 27 | 'numpy>=1.14.0', 28 | 'pydot_ng>=1.0.0', 29 | 'pytest>=3.6.0', 30 | 'scikit_learn>=0.19.0', 31 | 'scipy>=1.0.0', 32 | 'setuptools>=39.2.0', 33 | 'typing>=3.6.4'], 34 | zip_safe=False, 35 | classifiers=[]) 36 | -------------------------------------------------------------------------------- /steppy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minerva-ml/steppy/856b95f1f5189e1d2ca122b891bc670adac9692b/steppy/__init__.py -------------------------------------------------------------------------------- /steppy/adapter.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Dict, Any, NamedTuple 2 | 3 | E = NamedTuple('E', [('input_name', str), 4 | ('key', str)] 5 | ) 6 | 7 | AdaptingRecipe = Any 8 | DataPacket = Dict[str, Any] 9 | AllOutputs = Dict[str, DataPacket] 10 | 11 | 12 | class AdapterError(Exception): 13 | pass 14 | 15 | 16 | class Adapter: 17 | """Translates outputs from parent steps to inputs to the current step. 18 | 19 | Attributes: 20 | adapting_recipes: The recipes that the adapter was initialized with. 21 | 22 | Example: 23 | Normally Adapter is used with a Step. In the following example 24 | `RandomForestTransformer` follows sklearn convention of calling arguments `X` and `y`, 25 | however names passed to the Step are different. We use Adapter to map recieved names 26 | to the expected names. 27 | 28 | .. code-block:: python 29 | 30 | from sklearn.datasets import load_iris 31 | from sklearn.ensemble import RandomForestClassifier 32 | from sklearn.metrics import log_loss 33 | from steppy.base import BaseTransformer, Step 34 | from steppy.adapter import Adapter, E 35 | 36 | iris = load_iris() 37 | 38 | pipeline_input = { 39 | 'train_data': { 40 | 'target': iris.target, 41 | 'data': iris.data 42 | } 43 | } 44 | 45 | class RandomForestTransformer(BaseTransformer): 46 | def __init__(self, random_state=None): 47 | self.estimator = RandomForestClassifier(random_state=random_state) 48 | 49 | def fit(self, X, y): 50 | self.estimator.fit(X, y) 51 | return self 52 | 53 | def transform(self, X, **kwargs): 54 | y_proba = self.estimator.predict_proba(X) 55 | return {'y_proba': y_proba} 56 | 57 | random_forest = Step( 58 | name="random_forest", 59 | transformer=RandomForestTransformer(), 60 | input_data=['train_data'], 61 | adapter=Adapter({ 62 | 'X': E('train_data', 'data'), 63 | 'y': E('train_data', 'target') 64 | }), 65 | experiment_directory='./working_dir' 66 | ) 67 | 68 | result = random_forest.fit_transform(pipeline_input) 69 | print(log_loss(y_true=iris.target, y_pred=result['y_proba'])) 70 | """ 71 | 72 | def __init__(self, adapting_recipes: Dict[str, AdaptingRecipe]): 73 | """Adapter constructor. 74 | 75 | Note: 76 | You have to import the extractor 'E' from this module to construct 77 | adapters. 78 | 79 | Args: 80 | adapting_recipes: Recipes used to control the input translation. 81 | An adapting recipe may be any Python data structure. If this structure 82 | contains placeholders denoted by `E`, then values extracted from parent 83 | steps' outputs will be substituted in their place. 84 | `adapting_recipes` is a dict where the keys match the arguments 85 | expected by the transformer. The values in this dictionary may be for example 86 | one of the following: 87 | 88 | 1. `E('input_name', 'key')` will query the parent step 89 | 'input_name' for the output 'key' 90 | 91 | 2. List of `E('input_name', 'key')` will apply the extractors 92 | to the parent steps and combine the results into a list 93 | 94 | 3. Tuple of `E('input_name', 'key')` will apply the extractors 95 | to the parent steps and combine the results into a tuple 96 | 97 | 4. Dict like `{k: E('input_name', 'key')}` will apply the 98 | extractors to the parent steps and combine the results 99 | into a dict with the same keys 100 | 101 | 5. Anything else: the value itself will be used as the argument 102 | to the transformer 103 | """ 104 | self.adapting_recipes = adapting_recipes 105 | 106 | def adapt(self, all_ouputs: AllOutputs) -> DataPacket: 107 | """Adapt inputs for the transformer included in the step. 108 | 109 | Args: 110 | all_ouputs: Dict of outputs from parent steps. The keys should 111 | match the names of these steps and the values should be their 112 | respective outputs. 113 | 114 | Returns: 115 | Dictionary with the same keys as `adapting_recipes` and values 116 | constructed according to the respective recipes. 117 | 118 | """ 119 | adapted = {} 120 | for name, recipe in self.adapting_recipes.items(): 121 | adapted[name] = self._construct(all_ouputs, recipe) 122 | return adapted 123 | 124 | def _construct(self, all_ouputs: AllOutputs, recipe: AdaptingRecipe) -> Any: 125 | return { 126 | E: self._construct_element, 127 | tuple: self._construct_tuple, 128 | list: self._construct_list, 129 | dict: self._construct_dict, 130 | }.get(recipe.__class__, self._construct_constant)(all_ouputs, recipe) 131 | 132 | def _construct_constant(self, _: AllOutputs, constant) -> Any: 133 | return constant 134 | 135 | def _construct_element(self, all_ouputs: AllOutputs, element: E): 136 | input_name = element.input_name 137 | key = element.key 138 | try: 139 | input_results = all_ouputs[input_name] 140 | try: 141 | return input_results[key] 142 | except KeyError: 143 | msg = "Input '{}' didn't have '{}' in its result.".format(input_name, key) 144 | raise AdapterError(msg) 145 | except KeyError: 146 | msg = "No such input: '{}'".format(input_name) 147 | raise AdapterError(msg) 148 | 149 | def _construct_list(self, all_ouputs: AllOutputs, lst: List[AdaptingRecipe]): 150 | return [self._construct(all_ouputs, recipe) for recipe in lst] 151 | 152 | def _construct_tuple(self, all_ouputs: AllOutputs, tup: Tuple): 153 | return tuple(self._construct(all_ouputs, recipe) for recipe in tup) 154 | 155 | def _construct_dict(self, all_ouputs: AllOutputs, dic: Dict[AdaptingRecipe, AdaptingRecipe]): 156 | return {self._construct(all_ouputs, k): self._construct(all_ouputs, v) 157 | for k, v in dic.items()} 158 | -------------------------------------------------------------------------------- /steppy/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | from collections import defaultdict 4 | 5 | from sklearn.externals import joblib 6 | 7 | from steppy.adapter import Adapter, AdapterError 8 | from steppy.utils import display_upstream_structure, persist_as_png, get_logger, initialize_logger 9 | 10 | initialize_logger() 11 | logger = get_logger() 12 | 13 | DEFAULT_TRAINING_SETUP = { 14 | 'is_fittable': True, 15 | 'force_fitting': True, 16 | 'persist_output': False, 17 | 'cache_output': False, 18 | 'load_persisted_output': False 19 | } 20 | 21 | 22 | class Step: 23 | """Step is a building block of steppy pipelines. 24 | 25 | It is an execution wrapper over the transformer (see :class:`~steppy.base.BaseTransformer`), 26 | which realizes single operation on data. With Step you can: 27 | 28 | 1. design multiple input/output data flows and connections between Steps. 29 | 2. handle persistence and caching of transformer and intermediate results. 30 | 31 | Step executes `fit_transform` method inspired by the sklearn on every step recursively 32 | starting from the very last Step and making its way forward through the `input_steps`. 33 | One can easily debug the data flow by plotting the pipeline graph 34 | (see: :func:`~steppy.utils.persist_as_png`) or return step in a jupyter notebook cell. 35 | 36 | Attributes: 37 | transformer (obj): object that inherits from BaseTransformer or Step instance. 38 | When Step instance is passed, transformer from that Step will be copied and used to 39 | perform transformations. It is useful when both train and valid data are passed in 40 | one pipeline (common situation in deep learning). 41 | 42 | name (str): Step name. 43 | Each step in a pipeline must have a unique name. It is name of the persisted 44 | transformer and output of this Step. 45 | Default is transformer's class name. 46 | 47 | experiment_directory (str): path to the directory where all execution artifacts will be 48 | stored. 49 | Default is ``~/.steppy``. 50 | The following directories will be created under ``~/.steppy``, if they were not created by 51 | preceding Steps: 52 | 53 | * transformers: transformer objects are persisted in this folder 54 | * output: step output dictionaries are persisted in this folder 55 | (if ``persist_output=True``) 56 | 57 | input_data (list): Elements of this list are keys in the data dictionary that is passed 58 | to the Step's `fit_transform` and `transform` methods. 59 | List of str, default is empty list. 60 | 61 | Example: 62 | 63 | .. code-block:: python 64 | 65 | data_train = {'input': {'images': X_train, 66 | 'labels': y_train} 67 | } 68 | 69 | my_step = Step(name='random_forest', 70 | transformer=RandomForestTransformer(), 71 | input_data=['input'] 72 | ) 73 | 74 | my_step.fit_transform(data_train) 75 | 76 | `data_train` is dictionary where: 77 | 78 | * keys are names of data packets, 79 | * values are data packets, that is dictionaries that describes dataset. 80 | In this example keys in the data packet are `images` and `labels` and values 81 | are actual data of any type. 82 | 83 | `Step.input_data` takes the key from `data_train` (values must match!) and extracts 84 | actual data that will be passed to the `fit_transform` and `transform` method of 85 | the `self.transformer`. 86 | 87 | input_steps (list): List of input Steps that the current Step uses as its input. 88 | list of Step instances, default is empty list. 89 | Current Step will combine output from `input_steps` and `input_data` using `adapter`. 90 | Then pass it to the transformer methods `fit_transform` and `transform`. 91 | 92 | Example: 93 | 94 | .. code-block:: python 95 | 96 | self.input_steps=[cnn_step, rf_step, ensemble_step, guesses_step] 97 | 98 | Each element of the list is Step instance. 99 | 100 | adapter (obj): It renames and arranges inputs that are passed to the Transformer 101 | (see :class:`~steppy.base.BaseTransformer`). 102 | Default is ``None``. 103 | If ``not None``, then must be an instance of the :class:`~steppy.adapter.Adapter` class. 104 | 105 | Example: 106 | .. code-block:: python 107 | 108 | self.adapter=Adapter({'X': E('input', 'images'), 109 | 'y': E('input', 'labels')} 110 | ) 111 | 112 | Adapter simplifies the renaming and combining of inputs from multiple steps. 113 | In this example, after the adaptation: 114 | 115 | * `X` is key to the data stored under the `images` key 116 | * `y` is key to the data stored under the `labels` key 117 | 118 | where both `images` and `labels` keys comes from `input` 119 | (see :attr:`~steppy.base.Step.input_data`) 120 | 121 | cache_output (bool): If True, Step output dictionary will be cached under 122 | ``self.output``, when transform method of the Step transformer 123 | is completed. If the same Step is used multiple times, transform method is invoked 124 | only once. Further invokes simply use cached output. 125 | Default ``False``: do not cache output 126 | 127 | Warning: 128 | One should always run `step.clean_cache_upstream()` before executing 129 | `step.fit_transform(data)` or `step.transform(data)` 130 | When working with large datasets, cache might be very large. 131 | 132 | persist_output (bool): If True, persist Step output to disk under the 133 | ``/output/`` directory. 134 | Default ``False``: do not persist any files to disk. 135 | If True then Step output dictionary will be persisted to the 136 | ``/output/`` directory, after transform method of the Step 137 | transformer is completed. Step persists to disk the output after every run of the 138 | transformer's transform method. It means that Step overrides files. See also 139 | `load_persisted_output` parameter. 140 | 141 | Warning: 142 | When working with large datasets, cache might be very large. 143 | 144 | load_persisted_output (bool): If True, Step output dictionary already persisted to the 145 | ``/output/`` will be loaded when Step is called. 146 | Default ``False``: do not load persisted output. 147 | Useful when debugging and working with ensemble models or time consuming feature 148 | extraction. One can easily persist already computed pieces of the pipeline and save 149 | time by loading them instead of calculating. 150 | 151 | Warning: 152 | Re-running the same step on new data with `load_persisted_output` set ``True`` 153 | may lead to errors when output from old data are loaded while user would expect 154 | the pipeline to use new data instead. 155 | 156 | force_fitting (bool): If True, Step transformer will be fitted (via `fit_transform`) 157 | even if ``/transformers/`` exists. 158 | Default ``True``: fit transformer each time `fit_transform()` is called. 159 | Helpful when one wants to use ``persist_output=True`` and load ``persist_output=True`` 160 | on a previous Step and fit current Step multiple times. This is a typical scenario 161 | for tuning hyperparameters for an ensemble model trained on the output from first 162 | level models or a model build on features that are time consuming to compute. 163 | """ 164 | 165 | def __init__(self, 166 | transformer, 167 | name=None, 168 | experiment_directory=None, 169 | output_directory=None, 170 | input_data=None, 171 | input_steps=None, 172 | adapter=None, 173 | 174 | is_fittable=True, 175 | force_fitting=True, 176 | 177 | persist_output=False, 178 | cache_output=False, 179 | load_persisted_output=False): 180 | 181 | self.name = self._format_step_name(name, transformer) 182 | 183 | if experiment_directory is not None: 184 | assert isinstance(experiment_directory, str),\ 185 | 'Step {} error, experiment_directory must ' \ 186 | 'be str, got {} instead.'.format(self.name, type(experiment_directory)) 187 | else: 188 | experiment_directory = os.path.join(os.path.expanduser("~"), '.steppy') 189 | logger.info('Using default experiment directory: {}'.format(experiment_directory)) 190 | 191 | if output_directory is not None: 192 | assert isinstance(output_directory, str),\ 193 | 'Step {}, output_directory must be str, got {} instead'.format(self.name, type(output_directory)) 194 | 195 | if input_data is not None: 196 | assert isinstance(input_data, list), 'Step {} error, input_data must be list, ' \ 197 | 'got {} instead.'.format(self.name, type(input_data)) 198 | if input_steps is not None: 199 | assert isinstance(input_steps, list), 'Step {} error, input_steps must be list, ' \ 200 | 'got {} instead.'.format(self.name, type(input_steps)) 201 | if adapter is not None: 202 | assert isinstance(adapter, Adapter), 'Step {} error, adapter must be an instance ' \ 203 | 'of {}'.format(self.name, str(Adapter)) 204 | 205 | assert isinstance(cache_output, bool), 'Step {} error, cache_output must be bool, ' \ 206 | 'got {} instead.'.format(self.name, type(cache_output)) 207 | assert isinstance(persist_output, bool), 'Step {} error, persist_output must be bool, ' \ 208 | 'got {} instead.'.format(self.name, type(persist_output)) 209 | assert isinstance(load_persisted_output, bool),\ 210 | 'Step {} error, load_persisted_output ' \ 211 | 'must be bool, got {} instead.'.format(self.name, type(load_persisted_output)) 212 | assert isinstance(force_fitting, bool), 'Step {} error, force_fitting must be bool, ' \ 213 | 'got {} instead.'.format(self.name, type(force_fitting)) 214 | 215 | logger.info('Initializing Step {}'.format(self.name)) 216 | 217 | self.transformer = transformer 218 | self.output_directory = output_directory 219 | self.input_steps = input_steps or [] 220 | self.input_data = input_data or [] 221 | self.adapter = adapter 222 | self.is_fittable = is_fittable 223 | self.cache_output = cache_output 224 | self.persist_output = persist_output 225 | self.load_persisted_output = load_persisted_output 226 | self.force_fitting = force_fitting 227 | 228 | self.output = None 229 | self.experiment_directory = os.path.join(experiment_directory) 230 | self._prepare_experiment_directories() 231 | self._mode = 'train' 232 | 233 | self._validate_upstream_names() 234 | logger.info('Step {} initialized'.format(self.name)) 235 | 236 | @property 237 | def experiment_directory_transformers_step(self): 238 | directory = os.path.join(self.experiment_directory, 'transformers') 239 | os.makedirs(directory, exist_ok=True) 240 | return os.path.join(directory, self.name) 241 | 242 | @property 243 | def experiment_directory_output_step(self): 244 | directory = os.path.join(self.experiment_directory, 'output') 245 | if self.output_directory is not None: 246 | os.makedirs(os.path.join(directory, self.output_directory), exist_ok=True) 247 | return os.path.join(directory, self.output_directory, self.name) 248 | 249 | if self._mode == 'train': 250 | os.makedirs(os.path.join(directory, 'train'), exist_ok=True) 251 | return os.path.join(directory, 'train', self.name) 252 | 253 | if self._mode == 'inference': 254 | os.makedirs(os.path.join(directory, 'inference'), exist_ok=True) 255 | return os.path.join(directory, 'inference', self.name) 256 | 257 | @property 258 | def upstream_structure(self): 259 | """Build dictionary with entire upstream pipeline structure 260 | (with regard to the current Step). 261 | 262 | Returns: 263 | dict: dictionary describing the upstream pipeline structure. It has two keys: 264 | ``'edges'`` and ``'nodes'``, where: 265 | 266 | - value of ``'edges'`` is set of tuples ``(input_step.name, self.name)`` 267 | - value of ``'nodes'`` is set of all step names upstream to this Step 268 | """ 269 | structure_dict = {'edges': set(), 270 | 'nodes': set()} 271 | structure_dict = self._build_structure_dict(structure_dict) 272 | return structure_dict 273 | 274 | @property 275 | def all_upstream_steps(self): 276 | """Build dictionary with all Step instances that are upstream to `self`. 277 | 278 | Returns: 279 | all_upstream_steps (dict): dictionary where keys are Step names (str) and values are Step 280 | instances (obj) 281 | """ 282 | all_steps_ = {} 283 | all_steps_ = self._get_steps(all_steps_) 284 | return all_steps_ 285 | 286 | @property 287 | def transformer_is_persisted(self): 288 | """(bool): True if transformer exists under the directory 289 | ``/transformers/`` 290 | """ 291 | return os.path.exists(self.experiment_directory_transformers_step) 292 | 293 | @property 294 | def output_is_cached(self): 295 | """(bool): True if step output exists under the ``self.output``. 296 | See `cache_output`. 297 | """ 298 | if self.output is not None: 299 | return True 300 | else: 301 | return False 302 | 303 | @property 304 | def output_is_persisted(self): 305 | """(bool): True if step output exists under the ``/output//``. 306 | See :attr:`~steppy.base.Step.persist_output`. 307 | """ 308 | return os.path.exists(self.experiment_directory_output_step) 309 | 310 | def fit_transform(self, data): 311 | """Fit the model and transform data or load already processed data. 312 | 313 | Loads cached or persisted output or adapts data for the current transformer and 314 | executes ``transformer.fit_transform``. 315 | 316 | Args: 317 | data (dict): data dictionary with keys as input names and values as dictionaries of 318 | key-value pairs that can be passed to the ``self.transformer.fit_transform`` method. 319 | Example: 320 | 321 | .. code-block:: python 322 | 323 | data = {'input_1': {'X': X, 324 | 'y': y}, 325 | 'input_2': {'X': X, 326 | 'y': y} 327 | } 328 | 329 | Returns: 330 | dict: Step output from the ``self.transformer.fit_transform`` method 331 | """ 332 | if data: 333 | assert isinstance(data, dict), 'Step {}, "data" argument in the "fit_transform()" method must be dict, ' \ 334 | 'got {} instead.'.format(self.name, type(data)) 335 | logger.info('Step {}, working in "{}" mode'.format(self.name, self._mode)) 336 | 337 | if self._mode == 'inference': 338 | ValueError('Step {}, you are in "{}" mode, where you cannot run "fit".' 339 | 'Please change mode to "train" to enable fitting.' 340 | 'Use: "step.set_mode_train()" then "step.fit_transform()"'.format(self.name, self._mode)) 341 | 342 | if self.output_is_cached and not self.force_fitting: 343 | logger.info('Step {} using cached output'.format(self.name)) 344 | step_output_data = self.output 345 | elif self.output_is_persisted and self.load_persisted_output and not self.force_fitting: 346 | logger.info('Step {} loading persisted output from {}'.format(self.name, 347 | self.experiment_directory_output_step)) 348 | step_output_data = self._load_output(self.experiment_directory_output_step) 349 | else: 350 | step_inputs = {} 351 | if self.input_data is not None: 352 | for input_data_part in self.input_data: 353 | step_inputs[input_data_part] = data[input_data_part] 354 | 355 | for input_step in self.input_steps: 356 | step_inputs[input_step.name] = input_step.fit_transform(data) 357 | 358 | if self.adapter: 359 | step_inputs = self._adapt(step_inputs) 360 | else: 361 | step_inputs = self._unpack(step_inputs) 362 | step_output_data = self._fit_transform_operation(step_inputs) 363 | logger.info('Step {}, fit and transform completed'.format(self.name)) 364 | return step_output_data 365 | 366 | def transform(self, data): 367 | """Transforms data or loads already processed data. 368 | 369 | Loads cached persisted output or adapts data for the current transformer and executes 370 | its `transform` method. 371 | 372 | Args: 373 | data (dict): data dictionary with keys as input names and values as dictionaries of 374 | key:value pairs that can be passed to the ``step.transformer.fit_transform`` method 375 | 376 | Example: 377 | 378 | .. code-block:: python 379 | 380 | data = {'input_1':{'X':X, 381 | 'y':y 382 | }, 383 | 'input_2': {'X':X, 384 | 'y':y 385 | } 386 | } 387 | 388 | Returns: 389 | dict: step output from the transformer.transform method 390 | """ 391 | if data: 392 | assert isinstance(data, dict), 'Step {}, "data" argument in the "transform()" method must be dict, ' \ 393 | 'got {} instead.'.format(self.name, type(data)) 394 | logger.info('Step {}, working in "{}" mode'.format(self.name, self._mode)) 395 | 396 | if self.output_is_cached: 397 | logger.info('Step {} using cached output'.format(self.name)) 398 | step_output_data = self.output 399 | elif self.output_is_persisted and self.load_persisted_output: 400 | logger.info('Step {} loading persisted output from {}'.format(self.name, 401 | self.experiment_directory_output_step)) 402 | step_output_data = self._load_output(self.experiment_directory_output_step) 403 | else: 404 | step_inputs = {} 405 | if self.input_data is not None: 406 | for input_data_part in self.input_data: 407 | step_inputs[input_data_part] = data[input_data_part] 408 | 409 | for input_step in self.input_steps: 410 | step_inputs[input_step.name] = input_step.transform(data) 411 | 412 | if self.adapter: 413 | step_inputs = self._adapt(step_inputs) 414 | else: 415 | step_inputs = self._unpack(step_inputs) 416 | step_output_data = self._transform_operation(step_inputs) 417 | logger.info('Step {}, transform completed'.format(self.name)) 418 | return step_output_data 419 | 420 | def set_mode_train(self): 421 | """Applies 'train' mode to all upstream Steps including this Step 422 | and cleans cache for all upstream Steps including this Step. 423 | """ 424 | self._set_mode('train') 425 | return self 426 | 427 | def set_mode_inference(self): 428 | """Applies 'inference' mode to all upstream Steps including this Step 429 | and cleans cache for all upstream Steps including this Step. 430 | """ 431 | self._set_mode('inference') 432 | return self 433 | 434 | def reset(self): 435 | """Reset all upstream Steps to the default training parameters and 436 | cleans cache for all upstream Steps including this Step. 437 | Defaults are: 438 | 'mode': 'train', 439 | 'is_fittable': True, 440 | 'force_fitting': True, 441 | 'persist_output': False, 442 | 'cache_output': False, 443 | 'load_persisted_output': False 444 | """ 445 | self.clean_cache_upstream() 446 | self.set_mode_train() 447 | for step_obj in self.all_upstream_steps.values(): 448 | step_obj.is_fittable = DEFAULT_TRAINING_SETUP['is_fittable'] 449 | step_obj.force_fitting = DEFAULT_TRAINING_SETUP['force_fitting'] 450 | step_obj.persist_output = DEFAULT_TRAINING_SETUP['persist_output'] 451 | step_obj.cache_output = DEFAULT_TRAINING_SETUP['cache_output'] 452 | step_obj.load_persisted_output = DEFAULT_TRAINING_SETUP['load_persisted_output'] 453 | logger.info('Step {}, reset all upstream Steps to default training parameters, ' 454 | 'including this Step'.format(self.name)) 455 | return self 456 | 457 | def set_parameters_upstream(self, parameters): 458 | """Set parameters to all upstream Steps including this Step. 459 | Parameters is dict() where key is Step attribute, and value is new value to set. 460 | """ 461 | assert isinstance(parameters, dict), 'parameters must be dict, got {} instead'.format(type(parameters)) 462 | for step_obj in self.all_upstream_steps.values(): 463 | for key in step_obj.__dict__.keys(): 464 | if key in list(parameters.keys()): 465 | step_obj.__dict__[key] = parameters[key] 466 | if key == 'experiment_directory': 467 | step_obj._prepare_experiment_directories() 468 | logger.info('set new values to all upstream Steps including this Step.') 469 | return self 470 | 471 | def clean_cache_step(self): 472 | """Clean cache for current step. 473 | """ 474 | logger.info('Step {}, cleaning cache'.format(self.name)) 475 | self.output = None 476 | return self 477 | 478 | def clean_cache_upstream(self): 479 | """Clean cache for all steps that are upstream to `self`. 480 | """ 481 | logger.info('Cleaning cache for the entire upstream pipeline') 482 | for step in self.all_upstream_steps.values(): 483 | logger.info('Step {}, cleaning cache'.format(step.name)) 484 | step.output = None 485 | return self 486 | 487 | def get_step_by_name(self, name): 488 | """Extracts step by name from the pipeline. 489 | 490 | Extracted Step is a fully functional pipeline as well. 491 | All upstream Steps are already defined. 492 | 493 | Args: 494 | name (str): name of the step to be fetched 495 | Returns: 496 | Step (obj): extracted step 497 | """ 498 | self._validate_step_name(name) 499 | name = str(name) 500 | try: 501 | return self.all_upstream_steps[name] 502 | except KeyError as e: 503 | msg = 'No Step with name "{}" found. ' \ 504 | 'You have following Steps: {}'.format(name, list(self.all_upstream_steps.keys())) 505 | raise StepError(msg) from e 506 | 507 | def persist_upstream_structure(self): 508 | """Persist json file with the upstream steps structure, that is step names and their connections.""" 509 | persist_dir = os.path.join(self.experiment_directory, '{}_upstream_structure.json'.format(self.name)) 510 | logger.info('Step {}, saving upstream pipeline structure to {}'.format(self.name, persist_dir)) 511 | joblib.dump(self.upstream_structure, persist_dir) 512 | 513 | def persist_upstream_diagram(self, filepath): 514 | """Creates upstream steps diagram and persists it to disk as png file. 515 | 516 | Pydot graph is created and persisted to disk as png file under the filepath directory. 517 | 518 | Args: 519 | filepath (str): filepath to which the png with steps visualization should 520 | be persisted 521 | """ 522 | assert isinstance(filepath, str),\ 523 | 'Step {} error, filepath must be str. Got {} instead'.format(self.name, type(filepath)) 524 | persist_as_png(self.upstream_structure, filepath) 525 | 526 | def _fit_transform_operation(self, step_inputs): 527 | if self.is_fittable: 528 | if self.transformer_is_persisted and not self.force_fitting: 529 | logger.info('Step {}, loading transformer from the {}' 530 | .format(self.name, self.experiment_directory_transformers_step)) 531 | self.transformer.load(self.experiment_directory_transformers_step) 532 | logger.info('Step {}, transforming...'.format(self.name)) 533 | 534 | try: 535 | step_output_data = self.transformer.transform(**step_inputs) 536 | except Exception as e: 537 | msg = 'Step {}, Transformer "{}" error ' \ 538 | 'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__) 539 | raise StepError(msg) from e 540 | 541 | logger.info('Step {}, transforming completed'.format(self.name)) 542 | else: 543 | logger.info('Step {}, fitting and transforming...'.format(self.name)) 544 | 545 | try: 546 | step_output_data = self.transformer.fit_transform(**step_inputs) 547 | except Exception as e: 548 | msg = 'Step {}, Transformer "{}" error ' \ 549 | 'during "fit_transform()" operation.'.format(self.name, self.transformer.__class__.__name__) 550 | raise StepError(msg) from e 551 | 552 | logger.info('Step {}, fitting and transforming completed'.format(self.name)) 553 | logger.info('Step {}, persisting transformer to the {}' 554 | .format(self.name, self.experiment_directory_transformers_step)) 555 | self.transformer.persist(self.experiment_directory_transformers_step) 556 | else: 557 | logger.info('Step {}, is not fittable, transforming...'.format(self.name)) 558 | 559 | try: 560 | step_output_data = self.transformer.transform(**step_inputs) 561 | except Exception as e: 562 | msg = 'Step {}, Transformer "{}" error ' \ 563 | 'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__) 564 | raise StepError(msg) from e 565 | 566 | logger.info('Step {}, transforming completed'.format(self.name)) 567 | 568 | assert isinstance(step_output_data, dict), 'Step {}, Transformer "{}", error. ' \ 569 | 'Output from transformer must be dict, got {} instead'.format(self.name, 570 | self.transformer.__class__.__name__, 571 | type(step_output_data)) 572 | 573 | if self.cache_output: 574 | logger.info('Step {}, caching output'.format(self.name)) 575 | self.output = step_output_data 576 | if self.persist_output: 577 | logger.info('Step {}, persisting output to the {}' 578 | .format(self.name, self.experiment_directory_output_step)) 579 | self._persist_output(step_output_data, self.experiment_directory_output_step) 580 | return step_output_data 581 | 582 | def _transform_operation(self, step_inputs): 583 | if self.is_fittable: 584 | if self.transformer_is_persisted: 585 | logger.info('Step {}, loading transformer from the {}' 586 | .format(self.name, self.experiment_directory_transformers_step)) 587 | self.transformer.load(self.experiment_directory_transformers_step) 588 | logger.info('Step {}, transforming...'.format(self.name)) 589 | 590 | try: 591 | step_output_data = self.transformer.transform(**step_inputs) 592 | except Exception as e: 593 | msg = 'Step {}, Transformer "{}" error ' \ 594 | 'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__) 595 | raise StepError(msg) from e 596 | 597 | logger.info('Step {}, transforming completed'.format(self.name)) 598 | else: 599 | raise ValueError('No transformer persisted with name: {}. ' 600 | 'Make sure that you have this transformer under the directory: {}' 601 | .format(self.name, self.experiment_directory_transformers_step)) 602 | else: 603 | logger.info('Step {}, transforming...'.format(self.name)) 604 | 605 | try: 606 | step_output_data = self.transformer.transform(**step_inputs) 607 | except Exception as e: 608 | msg = 'Step {}, Transformer "{}" error ' \ 609 | 'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__) 610 | raise StepError(msg) from e 611 | 612 | logger.info('Step {}, transforming completed'.format(self.name)) 613 | 614 | assert isinstance(step_output_data, dict), 'Step {}, Transformer "{}", error. ' \ 615 | 'Output from transformer must be dict, got {} instead'.format(self.name, 616 | self.transformer.__class__.__name__, 617 | type(step_output_data)) 618 | 619 | if self.cache_output: 620 | logger.info('Step {}, caching output'.format(self.name)) 621 | self.output = step_output_data 622 | if self.persist_output: 623 | logger.info('Step {}, persisting output to the {}' 624 | .format(self.name, self.experiment_directory_output_step)) 625 | self._persist_output(step_output_data, self.experiment_directory_output_step) 626 | return step_output_data 627 | 628 | def _load_output(self, filepath): 629 | logger.info('Step {}, loading output from {}'.format(self.name, filepath)) 630 | return joblib.load(filepath) 631 | 632 | def _persist_output(self, output_data, filepath): 633 | joblib.dump(output_data, filepath) 634 | 635 | def _adapt(self, step_inputs): 636 | logger.info('Step {}, adapting inputs'.format(self.name)) 637 | try: 638 | return self.adapter.adapt(step_inputs) 639 | except AdapterError as e: 640 | msg = "Error while adapting step '{}'. Check Step inputs".format(self.name) 641 | raise StepError(msg) from e 642 | 643 | def _unpack(self, step_inputs): 644 | logger.info('Step {}, unpacking inputs'.format(self.name)) 645 | unpacked_steps = {} 646 | key_to_step_names = defaultdict(list) 647 | for step_name, step_dict in step_inputs.items(): 648 | unpacked_steps.update(step_dict) 649 | for key in step_dict.keys(): 650 | key_to_step_names[key].append(step_name) 651 | 652 | repeated_keys = [(key, step_names) for key, step_names in key_to_step_names.items() 653 | if len(step_names) > 1] 654 | if len(repeated_keys) == 0: 655 | return unpacked_steps 656 | else: 657 | msg = "Could not unpack inputs. Following keys are present in multiple input steps:\n " \ 658 | "\n".join([" '{}' present in steps {}".format(key, step_names) 659 | for key, step_names in repeated_keys]) 660 | raise StepError(msg) 661 | 662 | def _prepare_experiment_directories(self): 663 | if not os.path.exists(os.path.join(self.experiment_directory, 'transformers')): 664 | logger.info('initializing experiment directories under {}'.format(self.experiment_directory)) 665 | for dir_name in ['transformers', 'output']: 666 | os.makedirs(os.path.join(self.experiment_directory, dir_name), exist_ok=True) 667 | 668 | def _get_steps(self, all_steps): 669 | self._check_name_uniqueness(all_steps=all_steps) 670 | for input_step in self.input_steps: 671 | all_steps = input_step._get_steps(all_steps) 672 | all_steps[self.name] = self 673 | return all_steps 674 | 675 | def _format_step_name(self, name, transformer): 676 | self._validate_step_name(name=name) 677 | if name is not None: 678 | name_ = str(name) 679 | else: 680 | name_ = transformer.__class__.__name__ 681 | return name_ 682 | 683 | def _validate_step_name(self, name): 684 | if name is not None: 685 | assert isinstance(name, str) or isinstance(name, float) or isinstance(name, int),\ 686 | 'Step name must be str, float or int. Got {} instead.'.format(type(name)) 687 | 688 | def _check_name_uniqueness(self, all_steps): 689 | if self.name in all_steps.keys(): 690 | logger.info('STEPPY WARNING: Step with name "{}", already exist. ' 691 | 'Make sure that all Steps have unique name.'.format(self.name)) 692 | 693 | def _validate_upstream_names(self): 694 | try: 695 | _ = self.all_upstream_steps.keys() 696 | except ValueError as e: 697 | msg = 'Incorrect Step names' 698 | raise StepError(msg) from e 699 | 700 | def _build_structure_dict(self, structure_dict): 701 | for input_step in self.input_steps: 702 | structure_dict = input_step._build_structure_dict(structure_dict) 703 | structure_dict['edges'].add((input_step.name, self.name)) 704 | structure_dict['nodes'].add(self.name) 705 | for input_data in self.input_data: 706 | structure_dict['nodes'].add(input_data) 707 | structure_dict['edges'].add((input_data, self.name)) 708 | return structure_dict 709 | 710 | def _set_mode(self, mode): 711 | self.clean_cache_upstream() 712 | for name, step_obj in self.all_upstream_steps.items(): 713 | step_obj._mode = mode 714 | logger.info('Step {}, applied "{}" mode to all upstream Steps, including this Step'.format(self.name, mode)) 715 | 716 | def _repr_html_(self): 717 | return display_upstream_structure(self.upstream_structure) 718 | 719 | def __str__(self): 720 | return pprint.pformat(self.upstream_structure) 721 | 722 | 723 | class BaseTransformer: 724 | """Abstraction on ``fit`` and ``transform`` execution. 725 | 726 | Base transformer is an abstraction strongly inspired by the ``sklearn.Transformer`` and 727 | ``sklearn.Estimator``. Two main concepts are: 728 | 729 | 1. Every action that can be performed on data (transformation, model training) can be 730 | performed in two steps: fitting (where trainable parameters are estimated) and transforming 731 | (where previously estimated parameters are used to transform the data into desired state). 732 | 733 | 2. Every transformer knows how it should be persisted and loaded (especially useful when 734 | working with Keras/Pytorch or scikit-learn) in one pipeline. 735 | """ 736 | 737 | def __init__(self): 738 | self.estimator = None 739 | 740 | def fit(self, *args, **kwargs): 741 | """Performs estimation of trainable parameters. 742 | 743 | All model estimations with scikit-learn, keras, pytorch models as well as some preprocessing 744 | techniques (normalization) estimate parameters based on data (training data). 745 | Those parameters are trained during fit execution and are persisted for the future. 746 | Only the estimation logic, nothing else. 747 | 748 | Args: 749 | args: positional arguments (can be anything) 750 | kwargs: keyword arguments (can be anything) 751 | 752 | Returns: 753 | BaseTransformer: self object 754 | """ 755 | return self 756 | 757 | def transform(self, *args, **kwargs): 758 | """Performs transformation of data. 759 | 760 | All data transformation including prediction with deep learning/machine learning models 761 | can be performed here. No parameters should be estimated in this method nor stored as 762 | class attributes. Only the transformation logic, nothing else. 763 | 764 | Args: 765 | args: positional arguments (can be anything) 766 | kwargs: keyword arguments (can be anything) 767 | 768 | Returns: 769 | dict: output 770 | """ 771 | raise NotImplementedError 772 | 773 | def fit_transform(self, *args, **kwargs): 774 | """Performs fit followed by transform. 775 | 776 | This method simply combines fit and transform. 777 | 778 | Args: 779 | args: positional arguments (can be anything) 780 | kwargs: keyword arguments (can be anything) 781 | 782 | Returns: 783 | dict: output 784 | """ 785 | self.fit(*args, **kwargs) 786 | return self.transform(*args, **kwargs) 787 | 788 | def load(self, filepath): 789 | """Loads the trainable parameters of the transformer. 790 | 791 | Specific implementation of loading persisted model parameters should be implemented here. 792 | In case of transformer that do not learn any parameters one can leave this method as is. 793 | 794 | Args: 795 | filepath (str): filepath from which the transformer should be loaded 796 | Returns: 797 | BaseTransformer: self instance 798 | """ 799 | _ = filepath 800 | return self 801 | 802 | def persist(self, filepath): 803 | """Saves the trainable parameters of the transformer 804 | 805 | Specific implementation of model parameter persistence should be implemented here. 806 | In case of transformer that do not learn any parameters one can leave this method as is. 807 | 808 | Args: 809 | filepath (str): filepath where the transformer parameters should be persisted 810 | """ 811 | joblib.dump('hello-steppy', filepath) 812 | 813 | 814 | class StepError(Exception): 815 | pass 816 | 817 | 818 | def make_transformer(func): 819 | class StaticTransformer(BaseTransformer): 820 | def fit(self): 821 | logger.info('StaticTransformer "{}" is not fittable.' 822 | 'By running "fit_transform()", you simply "transform()".'.format(self.__class__.__name__)) 823 | return self 824 | 825 | def transform(self, *args, **kwargs): 826 | return func(*args, **kwargs) 827 | 828 | def persist(self, filepath): 829 | logger.info('StaticTransformer "{}" is not persistable.'.format(self.__class__.__name__)) 830 | 831 | _transformer = StaticTransformer() 832 | _transformer.__class__.__name__ = func.__name__ 833 | return _transformer 834 | 835 | 836 | class IdentityOperation(BaseTransformer): 837 | """Transformer that performs identity operation, f(x)=x.""" 838 | 839 | def transform(self, **kwargs): 840 | return kwargs 841 | 842 | def persist(self, filepath): 843 | logger.info('"IdentityOperation" is not persistable.') 844 | pass 845 | -------------------------------------------------------------------------------- /steppy/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | import pydot_ng as pydot 5 | from IPython.display import Image, display 6 | 7 | 8 | def initialize_logger(): 9 | """Initialize steppy logger. 10 | 11 | This logger is used throughout the steppy library to report computation progress. 12 | 13 | Example: 14 | 15 | Simple use of steppy logger: 16 | 17 | .. code-block:: python 18 | 19 | initialize_logger() 20 | logger = get_logger() 21 | logger.info('My message inside pipeline') 22 | 23 | result looks like this: 24 | 25 | .. code:: 26 | 27 | 2018-06-02 12:33:48 steppy >>> My message inside pipeline 28 | 29 | Returns: 30 | logging.Logger: logger object formatted in the steppy style 31 | """ 32 | logger = logging.getLogger('steppy') 33 | logger.setLevel(logging.INFO) 34 | message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s', 35 | datefmt='%Y-%m-%d %H:%M:%S') 36 | 37 | # console handler 38 | console_handler = logging.StreamHandler(sys.stdout) 39 | console_handler.setLevel(logging.INFO) 40 | console_handler.setFormatter(fmt=message_format) 41 | 42 | # add the handlers to the logger 43 | logger.addHandler(console_handler) 44 | 45 | return logger 46 | 47 | 48 | def get_logger(): 49 | """Fetch existing steppy logger. 50 | 51 | Example: 52 | 53 | .. code-block:: python 54 | 55 | initialize_logger() 56 | logger = get_logger() 57 | logger.info('My message goes here') 58 | 59 | result looks like this: 60 | 61 | .. code:: 62 | 63 | 2018-06-02 12:33:48 steppy >>> My message inside pipeline 64 | 65 | Returns: 66 | logging.Logger: logger object formatted in the steppy style 67 | """ 68 | return logging.getLogger('steppy') 69 | 70 | 71 | def display_upstream_structure(structure_dict): 72 | """Displays pipeline structure in the jupyter notebook. 73 | 74 | Args: 75 | structure_dict (dict): dict returned by 76 | :func:`~steppy.base.Step.upstream_structure`. 77 | """ 78 | graph = _create_graph(structure_dict) 79 | plt = Image(graph.create_png()) 80 | display(plt) 81 | 82 | 83 | def persist_as_png(structure_dict, filepath): 84 | """Saves pipeline diagram to disk as png file. 85 | 86 | Args: 87 | structure_dict (dict): dict returned by 88 | :func:`~steppy.base.Step.upstream_structure` 89 | filepath (str): filepath to which the png with pipeline visualization should be persisted 90 | """ 91 | graph = _create_graph(structure_dict) 92 | graph.write(filepath, format='png') 93 | 94 | 95 | def _create_graph(structure_dict): 96 | """Creates pydot graph from the pipeline structure dict. 97 | 98 | Args: 99 | structure_dict (dict): dict returned by step.upstream_structure 100 | 101 | Returns: 102 | graph (pydot.Dot): object representing upstream pipeline structure (with regard to the current Step). 103 | """ 104 | graph = pydot.Dot() 105 | for node in structure_dict['nodes']: 106 | graph.add_node(pydot.Node(node)) 107 | for node1, node2 in structure_dict['edges']: 108 | graph.add_edge(pydot.Edge(node1, node2)) 109 | return graph 110 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minerva-ml/steppy/856b95f1f5189e1d2ca122b891bc670adac9692b/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import steppy.base # To make sure logger is initialized before running prepare_steps_logger 2 | 3 | from .steppy_test_utils import prepare_steps_logger 4 | 5 | 6 | def pytest_sessionstart(session): 7 | prepare_steps_logger() 8 | 9 | 10 | def pytest_runtest_setup(item): 11 | pass 12 | 13 | 14 | def pytest_runtest_teardown(item): 15 | pass 16 | -------------------------------------------------------------------------------- /tests/steppy_test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from pathlib import Path 5 | 6 | LOGS_PATH = 'steps_tests.log' 7 | 8 | 9 | def remove_logs(): 10 | if Path(LOGS_PATH).exists(): 11 | os.remove(LOGS_PATH) 12 | 13 | 14 | def prepare_steps_logger(): 15 | print("Redirecting logging to {}.".format(LOGS_PATH)) 16 | remove_logs() 17 | logger = logging.getLogger('steps') 18 | for h in logger.handlers: 19 | logger.removeHandler(h) 20 | message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S') 22 | fh = logging.FileHandler(LOGS_PATH) 23 | fh.setLevel(logging.INFO) 24 | fh.setFormatter(fmt=message_format) 25 | logger.addHandler(fh) 26 | -------------------------------------------------------------------------------- /tests/test_adapter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from steppy.adapter import Adapter, E 5 | 6 | 7 | @pytest.fixture 8 | def data(): 9 | return { 10 | 'input_1': { 11 | 'features': np.array([ 12 | [1, 6], 13 | [2, 5], 14 | [3, 4] 15 | ]), 16 | 'labels': np.array([2, 5, 3]) 17 | }, 18 | 'input_2': { 19 | 'extra_features': np.array([ 20 | [5, 7, 3], 21 | [67, 4, 5], 22 | [6, 13, 14] 23 | ]) 24 | }, 25 | 'input_3': { 26 | 'images': np.array([ 27 | [[0, 255], [255, 0]], 28 | [[255, 0], [0, 255]], 29 | [[255, 255], [0, 0]], 30 | ]), 31 | 'labels': np.array([1, 1, 0]) 32 | } 33 | } 34 | 35 | 36 | def test_adapter_creates_defined_keys(data): 37 | adapter = Adapter({ 38 | 'X': [E('input_1', 'features')], 39 | 'Y': [E('input_2', 'extra_features')] 40 | }) 41 | res = adapter.adapt(data) 42 | 43 | assert {'X', 'Y'} == set(res.keys()) 44 | 45 | 46 | def test_recipe_with_single_item(data): 47 | adapter = Adapter({ 48 | 'X': E('input_1', 'labels'), 49 | 'Y': E('input_3', 'labels'), 50 | }) 51 | res = adapter.adapt(data) 52 | 53 | assert np.array_equal(res['X'], data['input_1']['labels']) 54 | assert np.array_equal(res['Y'], data['input_3']['labels']) 55 | 56 | 57 | def test_recipe_with_list(data): 58 | adapter = Adapter({ 59 | 'X': [], 60 | 'Y': [E('input_1', 'features')], 61 | 'Z': [E('input_1', 'features'), 62 | E('input_2', 'extra_features')] 63 | }) 64 | res = adapter.adapt(data) 65 | for i, key in enumerate(('X', 'Y', 'Z')): 66 | assert isinstance(res[key], list) 67 | assert len(res[key]) == i 68 | 69 | assert res['X'] == [] 70 | assert np.array_equal(res['Y'][0], data['input_1']['features']) 71 | assert np.array_equal(res['Z'][0], data['input_1']['features']) 72 | assert np.array_equal(res['Z'][1], data['input_2']['extra_features']) 73 | 74 | 75 | def test_recipe_with_tuple(data): 76 | adapter = Adapter({ 77 | 'X': (), 78 | 'Y': (E('input_1', 'features'),), 79 | 'Z': (E('input_1', 'features'), E('input_2', 'extra_features')) 80 | }) 81 | res = adapter.adapt(data) 82 | 83 | for i, key in enumerate(('X', 'Y', 'Z')): 84 | assert isinstance(res[key], tuple) 85 | assert len(res[key]) == i 86 | 87 | assert res['X'] == () 88 | assert np.array_equal(res['Y'][0], data['input_1']['features']) 89 | assert np.array_equal(res['Z'][0], data['input_1']['features']) 90 | assert np.array_equal(res['Z'][1], data['input_2']['extra_features']) 91 | 92 | 93 | def test_recipe_with_dictionary(data): 94 | adapter = Adapter({ 95 | 'X': {}, 96 | 'Y': {'a': E('input_1', 'features')}, 97 | 'Z': {'a': E('input_1', 'features'), 98 | 'b': E('input_2', 'extra_features')} 99 | }) 100 | res = adapter.adapt(data) 101 | 102 | for i, key in enumerate(('X', 'Y', 'Z')): 103 | assert isinstance(res[key], dict) 104 | assert len(res[key]) == i 105 | 106 | assert res['X'] == {} 107 | assert np.array_equal(res['Y']['a'], data['input_1']['features']) 108 | assert np.array_equal(res['Z']['a'], data['input_1']['features']) 109 | assert np.array_equal(res['Z']['b'], data['input_2']['extra_features']) 110 | 111 | 112 | def test_recipe_with_constants(data): 113 | adapter = Adapter({ 114 | 'A': 112358, 115 | 'B': 3.14, 116 | 'C': "lorem ipsum", 117 | 'D': ('input_1', 'features'), 118 | 'E': {112358: 112358, 'a': 'a', 3.14: 3.14}, 119 | 'F': [112358, 3.14, "lorem ipsum", ('input_1', 'features')] 120 | }) 121 | res = adapter.adapt(data) 122 | 123 | assert res['A'] == 112358 124 | assert res['B'] == 3.14 125 | assert res['C'] == "lorem ipsum" 126 | assert res['D'] == ('input_1', 'features') 127 | assert res['E'] == {112358: 112358, 'a': 'a', 3.14: 3.14} 128 | assert res['F'] == [112358, 3.14, "lorem ipsum", ('input_1', 'features')] 129 | 130 | 131 | def test_nested_recipes(data): 132 | adapter = Adapter({ 133 | 'X': [{'a': [E('input_1', 'features')]}], 134 | 'Y': {'a': [{'b': E('input_2', 'extra_features')}]} 135 | }) 136 | res = adapter.adapt(data) 137 | 138 | assert res['X'] == [{'a': [data['input_1']['features']]}] 139 | assert res['Y'] == {'a': [{'b': data['input_2']['extra_features']}]} 140 | -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from steppy.base import Step, StepError, make_transformer, IdentityOperation 5 | from steppy.adapter import Adapter, E 6 | 7 | 8 | @pytest.fixture 9 | def data(): 10 | return { 11 | 'input_1': { 12 | 'features': np.array([ 13 | [1, 6], 14 | [2, 5], 15 | [3, 4] 16 | ]), 17 | 'labels': np.array([2, 5, 3]) 18 | }, 19 | 'input_2': { 20 | 'extra_features': np.array([ 21 | [5, 7, 3], 22 | [67, 4, 5], 23 | [6, 13, 14] 24 | ]) 25 | }, 26 | 'input_3': { 27 | 'images': np.array([ 28 | [[0, 255], [255, 0]], 29 | [[255, 0], [0, 255]], 30 | [[255, 255], [0, 0]], 31 | ]), 32 | 'labels': np.array([1, 1, 0]) 33 | } 34 | } 35 | 36 | 37 | @pytest.mark.parametrize("mode", [0, 1]) 38 | def test_make_transformer(mode): 39 | def fun(x, y, m=0): 40 | return x + y if m == 0 else x - y 41 | tr = make_transformer(fun) 42 | 43 | tr.fit() 44 | res = tr.transform(7, 3, m=mode) 45 | assert res == (10 if mode == 0 else 4) 46 | 47 | 48 | def test_inputs_without_conflicting_names_do_not_require_adapter(data): 49 | step = Step( 50 | name='test_inputs_without_conflicting_names_do_not_require_adapter_1', 51 | transformer=IdentityOperation(), 52 | input_data=['input_1'] 53 | ) 54 | output = step.fit_transform(data) 55 | assert output == data['input_1'] 56 | 57 | step = Step( 58 | name='test_inputs_without_conflicting_names_do_not_require_adapter_2', 59 | transformer=IdentityOperation(), 60 | input_data=['input_1', 'input_2'] 61 | ) 62 | output = step.fit_transform(data) 63 | assert output == {**data['input_1'], **data['input_2']} 64 | 65 | 66 | def test_inputs_with_conflicting_names_require_adapter(data): 67 | step = Step( 68 | name='test_inputs_with_conflicting_names_require_adapter', 69 | transformer=IdentityOperation(), 70 | input_data=['input_1', 'input_3'] 71 | ) 72 | with pytest.raises(StepError): 73 | step.fit_transform(data) 74 | 75 | 76 | def test_step_with_adapted_inputs(data): 77 | step = Step( 78 | name='test_step_wit_adapted_inputs', 79 | transformer=IdentityOperation(), 80 | input_data=['input_1', 'input_3'], 81 | adapter=Adapter({ 82 | 'img': E('input_3', 'images'), 83 | 'fea': E('input_1', 'features'), 84 | 'l1': E('input_3', 'labels'), 85 | 'l2': E('input_1', 'labels'), 86 | }) 87 | ) 88 | output = step.fit_transform(data) 89 | expected = { 90 | 'img': data['input_3']['images'], 91 | 'fea': data['input_1']['features'], 92 | 'l1': data['input_3']['labels'], 93 | 'l2': data['input_1']['labels'], 94 | } 95 | assert output == expected 96 | --------------------------------------------------------------------------------