├── .github
    └── ISSUE_TEMPLATE
    │   ├── bug.md
    │   └── everything-else.md
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── DEFAULT_LOGIC.md
├── LICENSE
├── PULL_REQUEST_TEMPLATE.md
├── README.md
├── docs
    ├── Makefile
    ├── conf.py
    ├── generated
    │   ├── modules.rst
    │   └── steppy.rst
    └── index.rst
├── requirements.txt
├── setup.cfg
├── setup.py
├── steppy
    ├── __init__.py
    ├── adapter.py
    ├── base.py
    └── utils.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── steppy_test_utils.py
    ├── test_adapter.py
    └── test_base.py


/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: bug
 3 | about: Create bug report
 4 | 
 5 | ---
 6 | 
 7 | There are two things that will make the processing of your issue faster:
 8 | 1. Make sure that you are using the latest version of the code,
 9 | 1. In case of bug issue, it would be nice to provide more technical details such like execution command, error message or script that reproduces your bug.
10 | #
11 | 
12 | Thanks!
13 | 
14 | Kamil & Jakub,
15 | 
16 | *core contributors to the [minerva.ml](https://minerva.ml)*
17 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/everything-else.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: everything else
3 | about: Suggest an idea for this project
4 | 
5 | ---
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | .pytest_cache
  6 | tests/.cache
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # neptune, pycharm
 12 | .cache
 13 | .cache/
 14 | .idea/
 15 | .idea_modules/
 16 | out/
 17 | output
 18 | output/
 19 | *.log
 20 | target/
 21 | devbook.ipynb
 22 | devbook_local.ipynb
 23 | 
 24 | # Distribution / packaging
 25 | .Python
 26 | env/
 27 | build/
 28 | develop-eggs/
 29 | dist/
 30 | downloads/
 31 | eggs/
 32 | .eggs/
 33 | lib/
 34 | lib64/
 35 | parts/
 36 | sdist/
 37 | var/
 38 | wheels/
 39 | *.egg-info/
 40 | .installed.cfg
 41 | *.egg
 42 | 
 43 | # PyInstaller
 44 | #  Usually these files are written by a python script from a template
 45 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 46 | *.manifest
 47 | *.spec
 48 | 
 49 | # Installer logs
 50 | pip-log.txt
 51 | pip-delete-this-directory.txt
 52 | 
 53 | # Unit test / coverage reports
 54 | htmlcov/
 55 | .tox/
 56 | .coverage
 57 | .coverage.*
 58 | nosetests.xml
 59 | coverage.xml
 60 | *.cover
 61 | .hypothesis/
 62 | 
 63 | # Translations
 64 | *.mo
 65 | *.pot
 66 | 
 67 | # Django stuff:
 68 | local_settings.py
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # Jupyter Notebook
 81 | Untitled*.ipynb
 82 | .ipynb_checkpoints
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # celery beat schedule file
 88 | celerybeat-schedule
 89 | 
 90 | # SageMath parsed files
 91 | *.sage.py
 92 | 
 93 | # dotenv
 94 | .env
 95 | 
 96 | # virtualenv
 97 | .venv
 98 | venv/
 99 | ENV/
100 | 
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 | 
105 | # Rope project settings
106 | .ropeproject
107 | 
108 | # mkdocs documentation
109 | /site
110 | 
111 | # mypy
112 | .mypy_cache/
113 | 
114 | # Working directories
115 | examples/cache/
116 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
 6 | 
 7 | ## Our Standards
 8 | 
 9 | Examples of behavior that contributes to creating a positive environment include:
10 | 
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 | 
17 | Examples of unacceptable behavior by participants include:
18 | 
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 | 
25 | ## Our Responsibilities
26 | 
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 | 
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 | 
31 | ## Scope
32 | 
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 | 
35 | ## Enforcement
36 | 
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ml-team@neptune.ml. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 | 
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 | 
41 | ## Attribution
42 | 
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 | 
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to the Steppy core library
 2 | 
 3 | Here, at [minerva.ml](https://minerva.ml) we are creating Steppy - lightweight, open-source, Python library for fast and reproducible experimentation.
 4 | 
 5 | ### Get involved
 6 | You are welcome to contribute to the Steppy library. To get started:
 7 | 1. Check [our kanban board](https://github.com/minerva-ml/steppy/projects/1) to see what we are working on right now.
 8 | 1. Express your interest in a paticular [issue](https://github.com/minerva-ml/steppy/issues) by submitting a comment or,
 9 |     * submit your own [issue](https://github.com/minerva-ml/steppy/issues).
10 | 1. We will get back to you in order to start working together.
11 | 
12 | ### Code contributions
13 | Major - and most appreciated - contribution is [pull request](https://github.com/minerva-ml/steppy/pulls) with feature or bug fix.
14 | 
15 | ### Remarks
16 | In case of custom ideas, please contact core contributors directly at ml-team@neptune.ml.
17 | #
18 | 
19 | Thanks!
20 | 
21 | Jakub & Kamil,
22 | 
23 | *core contributors to the [minerva.ml](https://minerva.ml)*
24 | 


--------------------------------------------------------------------------------
/DEFAULT_LOGIC.md:
--------------------------------------------------------------------------------
1 | Default logic of the `fit_transform()`:
2 | 1. execute it on the last Step.
3 | 1. Step is fitted and transformed. Any data or models will be overridden (default setup).
4 | 1. If `force_fitting` is not obligatory, then look for cache:
5 |     1. if output is cached, then use it. In such situation `fit_tranform()` was just taking output from cache.
6 |     1. If output is not cached -> steppy looks for persisted (saved to disk) output. If exist, `fit_tranform()` was just loading output from the project directory.
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 neptune.ml
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Pull Request template
 2 | 
 3 | ### Code contributions
 4 | Major - and most appreciated - contribution is pull request with feature or bug fix. Each pull request initiates discussion about your code contribution.
 5 | 
 6 | Each pull request should be provided with minimal description about its contents.
 7 | #
 8 | 
 9 | Thanks!
10 | 
11 | Jakub & Kamil,
12 | 
13 | _core contributors to the minerva.ml_
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Steppy
 2 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/minerva-ml/steppy/blob/master/LICENSE)
 3 | 
 4 | ### What is Steppy?
 5 | 1. Steppy is a lightweight, open-source, Python 3 library for fast and reproducible experimentation.
 6 | 1. Steppy lets data scientist focus on data science, not on software development issues.
 7 | 1. Steppy's minimal interface does not impose constraints, however, enables clean machine learning pipeline design.
 8 | 
 9 | ### What problem steppy solves?
10 | #### Problems
11 | In the course of the project, data scientist faces two problems:
12 | 1. Difficulties with reproducibility in data science / machine learning projects.
13 | 1. Lack of the ability to prepare or extend experiments quickly.
14 | 
15 | #### Solution
16 | Steppy address both problems by introducing two simple abstractions: `Step` and `Tranformer`. We consider it minimal interface for building machine learning pipelines.
17 | 1. `Step` is a wrapper over the transformer and handles multiple aspects of the execution of the pipeline, such as saving intermediate results (if needed), checkpointing the model during training and much more.
18 | 1. `Tranformer` in turn, is purely computational, data scientist-defined piece that takes an input data and produces some output data. Typical Transformers are neural network, machine learning algorithms and pre- or post-processing routines.
19 | 
20 | # Start using steppy
21 | ### Installation
22 | Steppy requires `python3.5` or above.
23 | ```bash
24 | pip3 install steppy
25 | ```
26 | _(you probably want to install it in your [virtualenv](https://virtualenv.pypa.io/en/stable))_
27 | 
28 | ### Resources
29 | 1. :ledger: [Documentation](https://steppy.readthedocs.io/en/latest)
30 | 1. :computer: [Source](https://github.com/minerva-ml/steppy)
31 | 1. :name_badge: [Bugs reports](https://github.com/minerva-ml/steppy/issues)
32 | 1. :rocket: [Feature requests](https://github.com/minerva-ml/steppy/issues)
33 | 1. :star2: Tutorial notebooks ([their repository](https://github.com/minerva-ml/steppy-examples)):
34 |     - :arrow_forward: [Getting started](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/1-getting-started.ipynb)
35 |     -  :arrow_forward:[Steps with multiple inputs](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/2-multi-step.ipynb)
36 |     - :arrow_forward: [Advanced adapters](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/3-adapter_advanced.ipynb)
37 |     - :arrow_forward: [Caching and persistance](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/4-caching-persistence.ipynb)
38 |     - :arrow_forward: [Steppy with Keras](https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/5-steps-with-keras.ipynb)
39 | 
40 | ### Feature Requests
41 | Please send us your ideas on how to improve steppy library! We are looking for your comments here: [Feature requests](https://github.com/minerva-ml/steppy/issues).
42 | 
43 | ### Roadmap
44 | :fast_forward: At this point steppy is early-stage library heavily tested on multiple machine learning challenges ([data-science-bowl](https://github.com/minerva-ml/open-solution-data-science-bowl-2018 "Kaggle's data science bowl 2018"), [toxic-comment-classification-challenge](https://github.com/minerva-ml/open-solution-toxic-comments "Kaggle's Toxic Comment Classification Challenge"), [mapping-challenge](https://github.com/minerva-ml/open-solution-mapping-challenge "CrowdAI's Mapping Challenge")) and educational projects ([minerva-advanced-data-scientific-training](https://github.com/minerva-ml/minerva-training-materials "minerva.ml -> advanced data scientific training")).
45 | 
46 | :fast_forward: We are developing steppy towards practical tool for data scientists who can run their experiments easily and change their pipelines with just few manipulations in the code.
47 | 
48 | ### Related projects
49 | We are also building [steppy-toolkit](https://github.com/minerva-ml/steppy-toolkit "steppy toolkit"), a collection of high quality implementations of the top deep learning architectures -> all of them with the same, intuitive interface.
50 | 
51 | ### Contributing
52 | You are welcome to contribute to the Steppy library. Please check [CONTRIBUTING](https://github.com/minerva-ml/steppy/blob/master/CONTRIBUTING.md) for more information.
53 | 
54 | ### Terms of use
55 | Steppy is [MIT-licensed](https://github.com/minerva-ml/steppy/blob/master/LICENSE).
56 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = steppy
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # Configuration file for the Sphinx documentation builder.
  4 | #
  5 | # This file does only contain a selection of the most common options. For a
  6 | # full list see the documentation:
  7 | # http://www.sphinx-doc.org/en/master/config
  8 | 
  9 | # -- Path setup --------------------------------------------------------------
 10 | 
 11 | # If extensions (or modules to document with autodoc) are in another directory,
 12 | # add these directories to sys.path here. If the directory is relative to the
 13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 14 | #
 15 | # import os
 16 | # import sys
 17 | # sys.path.insert(0, os.path.abspath('.'))
 18 | 
 19 | 
 20 | # -- Project information -----------------------------------------------------
 21 | 
 22 | project = 'steppy'
 23 | copyright = '2018, neptune.ml'
 24 | author = 'Kamil A. Kaczmarek and Jakub Czakon'
 25 | 
 26 | # The short X.Y version
 27 | version = '0.1'
 28 | # The full version, including alpha/beta/rc tags
 29 | release = '0.1.16'
 30 | 
 31 | 
 32 | # -- General configuration ---------------------------------------------------
 33 | 
 34 | # If your documentation needs a minimal Sphinx version, state it here.
 35 | #
 36 | # needs_sphinx = '1.0'
 37 | 
 38 | # Add any Sphinx extension module names here, as strings. They can be
 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 40 | # ones.
 41 | extensions = [
 42 |     'sphinx.ext.autodoc',
 43 |     'sphinx.ext.doctest',
 44 | ]
 45 | 
 46 | # Add any paths that contain templates here, relative to this directory.
 47 | templates_path = ['_templates']
 48 | 
 49 | # The suffix(es) of source filenames.
 50 | # You can specify multiple suffix as a list of string:
 51 | #
 52 | # source_suffix = ['.rst', '.md']
 53 | source_suffix = '.rst'
 54 | 
 55 | # The master toctree document.
 56 | master_doc = 'index'
 57 | 
 58 | # The language for content autogenerated by Sphinx. Refer to documentation
 59 | # for a list of supported languages.
 60 | #
 61 | # This is also used if you do content translation via gettext catalogs.
 62 | # Usually you set "language" from the command line for these cases.
 63 | language = None
 64 | 
 65 | # List of patterns, relative to source directory, that match files and
 66 | # directories to ignore when looking for source files.
 67 | # This pattern also affects html_static_path and html_extra_path .
 68 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 69 | 
 70 | # The name of the Pygments (syntax highlighting) style to use.
 71 | pygments_style = 'sphinx'
 72 | 
 73 | 
 74 | # -- Options for HTML output -------------------------------------------------
 75 | 
 76 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 77 | # a list of builtin themes.
 78 | #
 79 | html_theme = 'sphinx_rtd_theme'
 80 | 
 81 | # Theme options are theme-specific and customize the look and feel of a theme
 82 | # further.  For a list of options available for each theme, see the
 83 | # documentation.
 84 | #
 85 | # html_theme_options = {}
 86 | 
 87 | # Add any paths that contain custom static files (such as style sheets) here,
 88 | # relative to this directory. They are copied after the builtin static files,
 89 | # so a file named "default.css" will overwrite the builtin "default.css".
 90 | html_static_path = ['_static']
 91 | 
 92 | # Custom sidebar templates, must be a dictionary that maps document names
 93 | # to template names.
 94 | #
 95 | # The default sidebars (for documents that don't match any pattern) are
 96 | # defined by theme itself.  Builtin themes are using these templates by
 97 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
 98 | # 'searchbox.html']``.
 99 | #
100 | # html_sidebars = {}
101 | 
102 | 
103 | # -- Options for HTMLHelp output ---------------------------------------------
104 | 
105 | # Output file base name for HTML help builder.
106 | htmlhelp_basename = 'steppydoc'
107 | 
108 | 
109 | # -- Options for LaTeX output ------------------------------------------------
110 | 
111 | latex_elements = {
112 |     # The paper size ('letterpaper' or 'a4paper').
113 |     #
114 |     # 'papersize': 'letterpaper',
115 | 
116 |     # The font size ('10pt', '11pt' or '12pt').
117 |     #
118 |     # 'pointsize': '10pt',
119 | 
120 |     # Additional stuff for the LaTeX preamble.
121 |     #
122 |     # 'preamble': '',
123 | 
124 |     # Latex figure (float) alignment
125 |     #
126 |     # 'figure_align': 'htbp',
127 | }
128 | 
129 | # Grouping the document tree into LaTeX files. List of tuples
130 | # (source start file, target name, title,
131 | #  author, documentclass [howto, manual, or own class]).
132 | latex_documents = [
133 |     (master_doc, 'steppy.tex', 'steppy Documentation',
134 |      'minerva.ml', 'manual'),
135 | ]
136 | 
137 | 
138 | # -- Options for manual page output ------------------------------------------
139 | 
140 | # One entry per manual page. List of tuples
141 | # (source start file, name, description, authors, manual section).
142 | man_pages = [
143 |     (master_doc, 'steppy', 'steppy Documentation',
144 |      [author], 1)
145 | ]
146 | 
147 | 
148 | # -- Options for Texinfo output ----------------------------------------------
149 | 
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | #  dir menu entry, description, category)
153 | texinfo_documents = [
154 |     (master_doc, 'steppy', 'steppy Documentation',
155 |      author, 'steppy', 'One line description of project.',
156 |      'Miscellaneous'),
157 | ]
158 | 
159 | 
160 | # -- Extension configuration -------------------------------------------------
161 | extensions = ['sphinx.ext.napoleon']
162 | 


--------------------------------------------------------------------------------
/docs/generated/modules.rst:
--------------------------------------------------------------------------------
1 | steppy
2 | ======
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    steppy
8 | 


--------------------------------------------------------------------------------
/docs/generated/steppy.rst:
--------------------------------------------------------------------------------
 1 | steppy package
 2 | ==============
 3 | 
 4 | steppy.adapter module
 5 | ---------------------
 6 | 
 7 | .. automodule:: steppy.adapter
 8 |     :members:
 9 |     :undoc-members:
10 |     :show-inheritance:
11 | 
12 | steppy.base module
13 | ------------------
14 | 
15 | .. automodule:: steppy.base
16 |     :members:
17 |     :undoc-members:
18 |     :show-inheritance:
19 | 
20 | steppy.utils module
21 | -------------------
22 | 
23 | .. automodule:: steppy.utils
24 |     :members:
25 |     :undoc-members:
26 |     :show-inheritance:
27 | 
28 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
  1 | Welcome to steppy
  2 | ==================================
  3 | 
  4 | 
  5 | .. toctree::
  6 |    :maxdepth: 2
  7 |    :caption: Module contents:
  8 |       
  9 | 
 10 | API documentation
 11 | ~~~~~~~~~~~~~~~~~
 12 | 
 13 | * :ref:`genindex`
 14 | * :ref:`modindex`
 15 | * :ref:`search`
 16 | 
 17 | 
 18 | What is Steppy?
 19 | ~~~~~~~~~~~~~~~
 20 | 
 21 | Steppy is a lightweight, open-source, Python 3 library for fast and
 22 | reproducible experimentation. It lets data scientist focus on data
 23 | science, not on software development issues. Steppy’s minimal interface
 24 | does not impose constraints, however, enables clean machine learning
 25 | pipeline design.
 26 | 
 27 | What problem steppy solves?
 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 29 | 
 30 | In the course of the project, data scientist faces multiple problems.
 31 | Difficulties with reproducibility and lack of the ability to prepare
 32 | experiments quickly are two particular examples. Steppy address both
 33 | problems by introducing two simple abstractions: ``Step`` and
 34 | ``Tranformer``. We consider it minimal interface for building machine
 35 | learning pipelines.
 36 | 
 37 | ``Step`` is a wrapper over the transformer and handles multiple aspects
 38 | of the execution of the pipeline, such as saving intermediate results
 39 | (if needed), checkpointing the model during training and much more.
 40 | ``Tranformer`` in turn, is purely computational, data scientist-defined
 41 | piece that takes an input data and produces some output data. Typical
 42 | Transformers are neural network, machine learning algorithms and pre- or
 43 | post-processing routines.
 44 | 
 45 | Start using steppy
 46 | ~~~~~~~~~~~~~~~~~~
 47 | 
 48 | Installation
 49 | ^^^^^^^^^^^^
 50 | 
 51 | Steppy requires ``python3.5`` or above.
 52 | 
 53 | .. code:: bash
 54 | 
 55 |    pip3 install steppy
 56 | 
 57 | *(you probably want to install it in
 58 | your* \ `virtualenv <https://virtualenv.pypa.io/en/stable>`__\ *)*
 59 | 
 60 | Resources
 61 | ~~~~~~~~~
 62 | 
 63 | 1. `Documentation <https://steppy.readthedocs.io/en/latest>`__
 64 | 2. `Source <https://github.com/minerva-ml/steppy>`__
 65 | 3. `Bugs reports <https://github.com/minerva-ml/steppy/issues>`__
 66 | 4. `Feature requests <https://github.com/minerva-ml/steppy/issues>`__
 67 | 5. Tutorial notebooks (`their repository <https://github.com/minerva-ml/steppy-examples>`__):
 68 | 
 69 |    -  `Getting started <https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/1-getting-started.ipynb>`__
 70 |    -  `Steps with multiple inputs <https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/2-multi-step.ipynb>`__
 71 |    -  `Advanced adapters <https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/3-adapter_advanced.ipynb>`__
 72 |    -  `Caching and persistance <https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/4-caching-persistence.ipynb>`__
 73 |    -  `Steppy with Keras <https://github.com/minerva-ml/steppy-examples/blob/master/tutorials/5-steps-with-keras.ipynb>`__
 74 | 
 75 | Feature Requests
 76 | ~~~~~~~~~~~~~~~~
 77 | 
 78 | Please send us your ideas on how to improve steppy library! We are
 79 | looking for your comments here: `Feature
 80 | requests <https://github.com/minerva-ml/steppy/issues>`__.
 81 | 
 82 | Roadmap
 83 | ~~~~~~~
 84 | 
 85 | At this point steppy is early-stage library heavily
 86 | tested on multiple machine learning challenges
 87 | (`data-science-bowl <https://github.com/minerva-ml/open-solution-data-science-bowl-2018>`__,
 88 | `toxic-comment-classification-challenge <https://github.com/minerva-ml/open-solution-toxic-comments>`__,
 89 | `mapping-challenge <https://github.com/minerva-ml/open-solution-mapping-challenge>`__)
 90 | and educational projects
 91 | (`minerva-advanced-data-scientific-training <https://github.com/minerva-ml/minerva-training-materials>`__).
 92 | 
 93 | We are developing steppy towards practical tool for data
 94 | scientists who can run their experiments easily and change their
 95 | pipelines with just few manipulations in the code.
 96 | 
 97 | Related projects
 98 | ~~~~~~~~~~~~~~~~
 99 | 
100 | We are also building
101 | `steppy-toolkit <https://github.com/minerva-ml/steppy-toolkit>`__, a
102 | collection of high quality implementations of the top deep learning
103 | architectures -> all of them with the same, intuitive interface.
104 | 
105 | Contributing
106 | ~~~~~~~~~~~~
107 | 
108 | You are welcome to contribute to the Steppy library. Please check
109 | `CONTRIBUTING <https://github.com/minerva-ml/steppy/blob/master/CONTRIBUTING.md>`__
110 | for more information.
111 | 
112 | Terms of use
113 | ~~~~~~~~~~~~
114 | 
115 | Steppy is
116 | `MIT-licensed <https://github.com/minerva-ml/steppy/blob/master/LICENSE>`__.
117 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ipython>=6.4.0
2 | numpy>=1.14.0
3 | pydot_ng>=1.0.0
4 | pytest>=3.6.0
5 | scikit_learn>=0.19.0
6 | scipy>=1.0.0
7 | setuptools>=39.2.0
8 | typing>=3.6.4
9 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | long_description = '''
 4 | Steppy is lightweight, Python library for fast and reproducible experimentation.
 5 | The goal of this package is to provide data scientist with minimal interface
 6 | that allows her to build complex, yet elegant machine learning pipelines.
 7 | 
 8 | Steppy is designed for data scientists who run a lot of experiments.
 9 | 
10 | Steppy is compatible with Python>=3.5
11 | and is distributed under the MIT license.
12 | '''
13 | 
14 | setup(name='steppy',
15 |       packages=['steppy'],
16 |       version='0.1.16',
17 |       description='A lightweight, open-source, Python library for fast and reproducible experimentation',
18 |       long_description=long_description,
19 |       url='https://github.com/minerva-ml/steppy',
20 |       download_url='https://github.com/minerva-ml/steppy/archive/0.1.16.tar.gz',
21 |       author='Kamil A. Kaczmarek, Jakub Czakon',
22 |       author_email='kamil.kaczmarek@neptune.ml, jakub.czakon@neptune.ml',
23 |       keywords=['machine-learning', 'reproducibility', 'pipeline', 'data-science'],
24 |       license='MIT',
25 |       install_requires=[
26 |           'ipython>=6.4.0',
27 |           'numpy>=1.14.0',
28 |           'pydot_ng>=1.0.0',
29 |           'pytest>=3.6.0',
30 |           'scikit_learn>=0.19.0',
31 |           'scipy>=1.0.0',
32 |           'setuptools>=39.2.0',
33 |           'typing>=3.6.4'],
34 |       zip_safe=False,
35 |       classifiers=[])
36 | 


--------------------------------------------------------------------------------
/steppy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minerva-ml/steppy/856b95f1f5189e1d2ca122b891bc670adac9692b/steppy/__init__.py


--------------------------------------------------------------------------------
/steppy/adapter.py:
--------------------------------------------------------------------------------
  1 | from typing import Tuple, List, Dict, Any, NamedTuple
  2 | 
  3 | E = NamedTuple('E', [('input_name', str),
  4 |                      ('key', str)]
  5 |                )
  6 | 
  7 | AdaptingRecipe = Any
  8 | DataPacket = Dict[str, Any]
  9 | AllOutputs = Dict[str, DataPacket]
 10 | 
 11 | 
 12 | class AdapterError(Exception):
 13 |     pass
 14 | 
 15 | 
 16 | class Adapter:
 17 |     """Translates outputs from parent steps to inputs to the current step.
 18 | 
 19 |     Attributes:
 20 |         adapting_recipes: The recipes that the adapter was initialized with.
 21 | 
 22 |     Example:
 23 |         Normally Adapter is used with a Step. In the following example
 24 |         `RandomForestTransformer` follows sklearn convention of calling arguments `X` and `y`,
 25 |         however names passed to the Step are different. We use Adapter to map recieved names
 26 |         to the expected names.
 27 | 
 28 |         .. code-block:: python
 29 | 
 30 |             from sklearn.datasets import load_iris
 31 |             from sklearn.ensemble import RandomForestClassifier
 32 |             from sklearn.metrics import log_loss
 33 |             from steppy.base import BaseTransformer, Step
 34 |             from steppy.adapter import Adapter, E
 35 | 
 36 |             iris = load_iris()
 37 | 
 38 |             pipeline_input = {
 39 |                 'train_data': {
 40 |                     'target': iris.target,
 41 |                     'data': iris.data
 42 |                 }
 43 |             }
 44 | 
 45 |             class RandomForestTransformer(BaseTransformer):
 46 |                 def __init__(self, random_state=None):
 47 |                     self.estimator = RandomForestClassifier(random_state=random_state)
 48 | 
 49 |                 def fit(self, X, y):
 50 |                     self.estimator.fit(X, y)
 51 |                     return self
 52 | 
 53 |                 def transform(self, X, **kwargs):
 54 |                     y_proba  = self.estimator.predict_proba(X)
 55 |                     return {'y_proba': y_proba}
 56 | 
 57 |             random_forest = Step(
 58 |                 name="random_forest",
 59 |                 transformer=RandomForestTransformer(),
 60 |                 input_data=['train_data'],
 61 |                 adapter=Adapter({
 62 |                     'X': E('train_data', 'data'),
 63 |                     'y': E('train_data', 'target')
 64 |                 }),
 65 |                 experiment_directory='./working_dir'
 66 |             )
 67 | 
 68 |             result = random_forest.fit_transform(pipeline_input)
 69 |             print(log_loss(y_true=iris.target, y_pred=result['y_proba']))
 70 |     """
 71 | 
 72 |     def __init__(self, adapting_recipes: Dict[str, AdaptingRecipe]):
 73 |         """Adapter constructor.
 74 | 
 75 |         Note:
 76 |             You have to import the extractor 'E' from this module to construct
 77 |             adapters.
 78 | 
 79 |         Args:
 80 |             adapting_recipes: Recipes used to control the input translation.
 81 |                 An adapting recipe may be any Python data structure. If this structure
 82 |                 contains placeholders denoted by `E`, then values extracted from parent
 83 |                 steps' outputs will be substituted in their place.
 84 |                 `adapting_recipes` is a dict where the keys match the arguments
 85 |                 expected by the transformer. The values in this dictionary may be for example
 86 |                 one of the following:
 87 | 
 88 |                 1. `E('input_name', 'key')` will query the parent step
 89 |                     'input_name' for the output 'key'
 90 | 
 91 |                 2. List of `E('input_name', 'key')` will apply the extractors
 92 |                     to the parent steps and combine the results into a list
 93 | 
 94 |                 3. Tuple of `E('input_name', 'key')` will apply the extractors
 95 |                     to the parent steps and combine the results into a tuple
 96 | 
 97 |                 4. Dict like `{k: E('input_name', 'key')}` will apply the
 98 |                     extractors to the parent steps and combine the results
 99 |                     into a dict with the same keys
100 | 
101 |                 5. Anything else: the value itself will be used as the argument
102 |                     to the transformer
103 |         """
104 |         self.adapting_recipes = adapting_recipes
105 | 
106 |     def adapt(self, all_ouputs: AllOutputs) -> DataPacket:
107 |         """Adapt inputs for the transformer included in the step.
108 | 
109 |         Args:
110 |             all_ouputs: Dict of outputs from parent steps. The keys should
111 |                 match the names of these steps and the values should be their
112 |                 respective outputs.
113 | 
114 |         Returns:
115 |             Dictionary with the same keys as `adapting_recipes` and values
116 |             constructed according to the respective recipes.
117 | 
118 |         """
119 |         adapted = {}
120 |         for name, recipe in self.adapting_recipes.items():
121 |             adapted[name] = self._construct(all_ouputs, recipe)
122 |         return adapted
123 | 
124 |     def _construct(self, all_ouputs: AllOutputs, recipe: AdaptingRecipe) -> Any:
125 |         return {
126 |             E: self._construct_element,
127 |             tuple: self._construct_tuple,
128 |             list: self._construct_list,
129 |             dict: self._construct_dict,
130 |         }.get(recipe.__class__, self._construct_constant)(all_ouputs, recipe)
131 | 
132 |     def _construct_constant(self, _: AllOutputs, constant) -> Any:
133 |         return constant
134 | 
135 |     def _construct_element(self, all_ouputs: AllOutputs, element: E):
136 |         input_name = element.input_name
137 |         key = element.key
138 |         try:
139 |             input_results = all_ouputs[input_name]
140 |             try:
141 |                 return input_results[key]
142 |             except KeyError:
143 |                 msg = "Input '{}' didn't have '{}' in its result.".format(input_name, key)
144 |                 raise AdapterError(msg)
145 |         except KeyError:
146 |             msg = "No such input: '{}'".format(input_name)
147 |             raise AdapterError(msg)
148 | 
149 |     def _construct_list(self, all_ouputs: AllOutputs, lst: List[AdaptingRecipe]):
150 |         return [self._construct(all_ouputs, recipe) for recipe in lst]
151 | 
152 |     def _construct_tuple(self, all_ouputs: AllOutputs, tup: Tuple):
153 |         return tuple(self._construct(all_ouputs, recipe) for recipe in tup)
154 | 
155 |     def _construct_dict(self, all_ouputs: AllOutputs, dic: Dict[AdaptingRecipe, AdaptingRecipe]):
156 |         return {self._construct(all_ouputs, k): self._construct(all_ouputs, v)
157 |                 for k, v in dic.items()}
158 | 


--------------------------------------------------------------------------------
/steppy/base.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pprint
  3 | from collections import defaultdict
  4 | 
  5 | from sklearn.externals import joblib
  6 | 
  7 | from steppy.adapter import Adapter, AdapterError
  8 | from steppy.utils import display_upstream_structure, persist_as_png, get_logger, initialize_logger
  9 | 
 10 | initialize_logger()
 11 | logger = get_logger()
 12 | 
 13 | DEFAULT_TRAINING_SETUP = {
 14 |     'is_fittable': True,
 15 |     'force_fitting': True,
 16 |     'persist_output': False,
 17 |     'cache_output': False,
 18 |     'load_persisted_output': False
 19 | }
 20 | 
 21 | 
 22 | class Step:
 23 |     """Step is a building block of steppy pipelines.
 24 | 
 25 |     It is an execution wrapper over the transformer (see :class:`~steppy.base.BaseTransformer`),
 26 |     which realizes single operation on data. With Step you can:
 27 | 
 28 |     1. design multiple input/output data flows and connections between Steps.
 29 |     2. handle persistence and caching of transformer and intermediate results.
 30 | 
 31 |     Step executes `fit_transform` method inspired by the sklearn on every step recursively
 32 |     starting from the very last Step and making its way forward through the `input_steps`.
 33 |     One can easily debug the data flow by plotting the pipeline graph
 34 |     (see: :func:`~steppy.utils.persist_as_png`) or return step in a jupyter notebook cell.
 35 | 
 36 |     Attributes:
 37 |         transformer (obj): object that inherits from BaseTransformer or Step instance.
 38 |             When Step instance is passed, transformer from that Step will be copied and used to
 39 |             perform transformations. It is useful when both train and valid data are passed in
 40 |             one pipeline (common situation in deep learning).
 41 | 
 42 |         name (str): Step name.
 43 |             Each step in a pipeline must have a unique name. It is name of the persisted
 44 |             transformer and output of this Step.
 45 |             Default is transformer's class name.
 46 | 
 47 |         experiment_directory (str): path to the directory where all execution artifacts will be
 48 |             stored.
 49 |             Default is ``~/.steppy``.
 50 |             The following directories will be created under ``~/.steppy``, if they were not created by
 51 |             preceding Steps:
 52 | 
 53 |             * transformers: transformer objects are persisted in this folder
 54 |             * output:      step output dictionaries are persisted in this folder
 55 |               (if ``persist_output=True``)
 56 | 
 57 |         input_data (list): Elements of this list are keys in the data dictionary that is passed
 58 |             to the Step's `fit_transform` and `transform` methods.
 59 |             List of str, default is empty list.
 60 | 
 61 |             Example:
 62 | 
 63 |                 .. code-block:: python
 64 | 
 65 |                     data_train = {'input': {'images': X_train,
 66 |                                             'labels': y_train}
 67 |                                  }
 68 | 
 69 |                     my_step = Step(name='random_forest',
 70 |                                    transformer=RandomForestTransformer(),
 71 |                                    input_data=['input']
 72 |                                    )
 73 | 
 74 |                     my_step.fit_transform(data_train)
 75 | 
 76 |                 `data_train` is dictionary where:
 77 | 
 78 |                  * keys are names of data packets,
 79 |                  * values are data packets, that is dictionaries that describes dataset.
 80 |                    In this example keys in the data packet are `images` and `labels` and values
 81 |                    are actual data of any type.
 82 | 
 83 |                 `Step.input_data` takes the key from `data_train` (values must match!) and extracts
 84 |                 actual data that will be passed to the `fit_transform` and `transform` method of
 85 |                 the `self.transformer`.
 86 | 
 87 |         input_steps (list): List of input Steps that the current Step uses as its input.
 88 |             list of Step instances, default is empty list.
 89 |             Current Step will combine output from `input_steps` and `input_data` using `adapter`.
 90 |             Then pass it to the transformer methods `fit_transform` and `transform`.
 91 | 
 92 |             Example:
 93 | 
 94 |                 .. code-block:: python
 95 | 
 96 |                     self.input_steps=[cnn_step, rf_step, ensemble_step, guesses_step]
 97 | 
 98 |                 Each element of the list is Step instance.
 99 | 
100 |         adapter (obj): It renames and arranges inputs that are passed to the Transformer
101 |             (see :class:`~steppy.base.BaseTransformer`).
102 |             Default is ``None``.
103 |             If ``not None``, then must be an instance of the :class:`~steppy.adapter.Adapter` class.
104 | 
105 |             Example:
106 |                 .. code-block:: python
107 | 
108 |                     self.adapter=Adapter({'X': E('input', 'images'),
109 |                                           'y': E('input', 'labels')}
110 |                                          )
111 | 
112 |             Adapter simplifies the renaming and combining of inputs from multiple steps.
113 |             In this example, after the adaptation:
114 | 
115 |             * `X` is key to the data stored under the `images` key
116 |             * `y` is key to the data stored under the `labels` key
117 | 
118 |                 where both `images` and `labels` keys comes from `input`
119 |                 (see :attr:`~steppy.base.Step.input_data`)
120 | 
121 |         cache_output (bool): If True, Step output dictionary will be cached under
122 |             ``self.output``, when transform method of the Step transformer
123 |             is completed. If the same Step is used multiple times, transform method is invoked
124 |             only once. Further invokes simply use cached output.
125 |             Default ``False``: do not cache output
126 | 
127 |             Warning:
128 |                 One should always run `step.clean_cache_upstream()` before executing
129 |                 `step.fit_transform(data)` or `step.transform(data)`
130 |                 When working with large datasets, cache might be very large.
131 | 
132 |         persist_output (bool): If True, persist Step output to disk under the
133 |             ``<experiment_directory>/output/<name>`` directory.
134 |             Default ``False``: do not persist any files to disk.
135 |             If True then Step output dictionary will be persisted to the
136 |             ``<experiment_directory>/output/<name>`` directory, after transform method of the Step
137 |             transformer is completed. Step persists to disk the output after every run of the
138 |             transformer's transform method. It means that Step overrides files. See also
139 |             `load_persisted_output` parameter.
140 | 
141 |             Warning:
142 |                 When working with large datasets, cache might be very large.
143 | 
144 |         load_persisted_output (bool): If True, Step output dictionary already persisted to the
145 |             ``<experiment_directory>/output/<name>`` will be loaded when Step is called.
146 |             Default ``False``: do not load persisted output.
147 |             Useful when debugging and working with ensemble models or time consuming feature
148 |             extraction. One can easily persist already computed pieces of the pipeline and save
149 |             time by loading them instead of calculating.
150 | 
151 |             Warning:
152 |                 Re-running the same step on new data with `load_persisted_output` set ``True``
153 |                 may lead to errors when output from old data are loaded while user would expect
154 |                 the pipeline to use new data instead.
155 | 
156 |         force_fitting (bool): If True, Step transformer will be fitted (via `fit_transform`)
157 |             even if ``<experiment_directory>/transformers/<step_name>`` exists.
158 |             Default ``True``: fit transformer each time `fit_transform()` is called.
159 |             Helpful when one wants to use ``persist_output=True`` and load ``persist_output=True``
160 |             on a previous Step and fit current Step multiple times. This is a typical scenario
161 |             for tuning hyperparameters for an ensemble model trained on the output from first
162 |             level models or a model build on features that are time consuming to compute.
163 |     """
164 | 
165 |     def __init__(self,
166 |                  transformer,
167 |                  name=None,
168 |                  experiment_directory=None,
169 |                  output_directory=None,
170 |                  input_data=None,
171 |                  input_steps=None,
172 |                  adapter=None,
173 | 
174 |                  is_fittable=True,
175 |                  force_fitting=True,
176 | 
177 |                  persist_output=False,
178 |                  cache_output=False,
179 |                  load_persisted_output=False):
180 | 
181 |         self.name = self._format_step_name(name, transformer)
182 | 
183 |         if experiment_directory is not None:
184 |             assert isinstance(experiment_directory, str),\
185 |                 'Step {} error, experiment_directory must ' \
186 |                 'be str, got {} instead.'.format(self.name, type(experiment_directory))
187 |         else:
188 |             experiment_directory = os.path.join(os.path.expanduser("~"), '.steppy')
189 |             logger.info('Using default experiment directory: {}'.format(experiment_directory))
190 | 
191 |         if output_directory is not None:
192 |             assert isinstance(output_directory, str),\
193 |                 'Step {}, output_directory must be str, got {} instead'.format(self.name, type(output_directory))
194 | 
195 |         if input_data is not None:
196 |             assert isinstance(input_data, list), 'Step {} error, input_data must be list, ' \
197 |                                                  'got {} instead.'.format(self.name, type(input_data))
198 |         if input_steps is not None:
199 |             assert isinstance(input_steps, list), 'Step {} error, input_steps must be list, ' \
200 |                                                   'got {} instead.'.format(self.name, type(input_steps))
201 |         if adapter is not None:
202 |             assert isinstance(adapter, Adapter), 'Step {} error, adapter must be an instance ' \
203 |                                                  'of {}'.format(self.name, str(Adapter))
204 | 
205 |         assert isinstance(cache_output, bool), 'Step {} error, cache_output must be bool, ' \
206 |                                                'got {} instead.'.format(self.name, type(cache_output))
207 |         assert isinstance(persist_output, bool), 'Step {} error, persist_output must be bool, ' \
208 |                                                  'got {} instead.'.format(self.name, type(persist_output))
209 |         assert isinstance(load_persisted_output, bool),\
210 |             'Step {} error, load_persisted_output ' \
211 |             'must be bool, got {} instead.'.format(self.name, type(load_persisted_output))
212 |         assert isinstance(force_fitting, bool), 'Step {} error, force_fitting must be bool, ' \
213 |                                                 'got {} instead.'.format(self.name, type(force_fitting))
214 | 
215 |         logger.info('Initializing Step {}'.format(self.name))
216 | 
217 |         self.transformer = transformer
218 |         self.output_directory = output_directory
219 |         self.input_steps = input_steps or []
220 |         self.input_data = input_data or []
221 |         self.adapter = adapter
222 |         self.is_fittable = is_fittable
223 |         self.cache_output = cache_output
224 |         self.persist_output = persist_output
225 |         self.load_persisted_output = load_persisted_output
226 |         self.force_fitting = force_fitting
227 | 
228 |         self.output = None
229 |         self.experiment_directory = os.path.join(experiment_directory)
230 |         self._prepare_experiment_directories()
231 |         self._mode = 'train'
232 | 
233 |         self._validate_upstream_names()
234 |         logger.info('Step {} initialized'.format(self.name))
235 | 
236 |     @property
237 |     def experiment_directory_transformers_step(self):
238 |         directory = os.path.join(self.experiment_directory, 'transformers')
239 |         os.makedirs(directory, exist_ok=True)
240 |         return os.path.join(directory, self.name)
241 | 
242 |     @property
243 |     def experiment_directory_output_step(self):
244 |         directory = os.path.join(self.experiment_directory, 'output')
245 |         if self.output_directory is not None:
246 |             os.makedirs(os.path.join(directory, self.output_directory), exist_ok=True)
247 |             return os.path.join(directory, self.output_directory, self.name)
248 | 
249 |         if self._mode == 'train':
250 |             os.makedirs(os.path.join(directory, 'train'), exist_ok=True)
251 |             return os.path.join(directory, 'train', self.name)
252 | 
253 |         if self._mode == 'inference':
254 |             os.makedirs(os.path.join(directory, 'inference'), exist_ok=True)
255 |             return os.path.join(directory, 'inference', self.name)
256 | 
257 |     @property
258 |     def upstream_structure(self):
259 |         """Build dictionary with entire upstream pipeline structure
260 |         (with regard to the current Step).
261 | 
262 |         Returns:
263 |             dict: dictionary describing the upstream pipeline structure. It has two keys:
264 |             ``'edges'`` and ``'nodes'``, where:
265 | 
266 |             - value of ``'edges'`` is set of tuples ``(input_step.name, self.name)``
267 |             - value of ``'nodes'`` is set of all step names upstream to this Step
268 |         """
269 |         structure_dict = {'edges': set(),
270 |                           'nodes': set()}
271 |         structure_dict = self._build_structure_dict(structure_dict)
272 |         return structure_dict
273 | 
274 |     @property
275 |     def all_upstream_steps(self):
276 |         """Build dictionary with all Step instances that are upstream to `self`.
277 | 
278 |         Returns:
279 |             all_upstream_steps (dict): dictionary where keys are Step names (str) and values are Step
280 |             instances (obj)
281 |         """
282 |         all_steps_ = {}
283 |         all_steps_ = self._get_steps(all_steps_)
284 |         return all_steps_
285 | 
286 |     @property
287 |     def transformer_is_persisted(self):
288 |         """(bool): True if transformer exists under the directory
289 |         ``<experiment_directory>/transformers/<step_name>``
290 |         """
291 |         return os.path.exists(self.experiment_directory_transformers_step)
292 | 
293 |     @property
294 |     def output_is_cached(self):
295 |         """(bool): True if step output exists under the ``self.output``.
296 |             See `cache_output`.
297 |         """
298 |         if self.output is not None:
299 |             return True
300 |         else:
301 |             return False
302 | 
303 |     @property
304 |     def output_is_persisted(self):
305 |         """(bool): True if step output exists under the ``<experiment_directory>/output/<mode>/<name>``.
306 |             See :attr:`~steppy.base.Step.persist_output`.
307 |         """
308 |         return os.path.exists(self.experiment_directory_output_step)
309 | 
310 |     def fit_transform(self, data):
311 |         """Fit the model and transform data or load already processed data.
312 | 
313 |         Loads cached or persisted output or adapts data for the current transformer and
314 |         executes ``transformer.fit_transform``.
315 | 
316 |         Args:
317 |             data (dict): data dictionary with keys as input names and values as dictionaries of
318 |                 key-value pairs that can be passed to the ``self.transformer.fit_transform`` method.
319 |                 Example:
320 | 
321 |                 .. code-block:: python
322 | 
323 |                     data = {'input_1': {'X': X,
324 |                                         'y': y},
325 |                             'input_2': {'X': X,
326 |                                         'y': y}
327 |                             }
328 | 
329 |         Returns:
330 |             dict: Step output from the ``self.transformer.fit_transform`` method
331 |         """
332 |         if data:
333 |             assert isinstance(data, dict), 'Step {}, "data" argument in the "fit_transform()" method must be dict, ' \
334 |                                            'got {} instead.'.format(self.name, type(data))
335 |         logger.info('Step {}, working in "{}" mode'.format(self.name, self._mode))
336 | 
337 |         if self._mode == 'inference':
338 |             ValueError('Step {}, you are in "{}" mode, where you cannot run "fit".'
339 |                        'Please change mode to "train" to enable fitting.'
340 |                        'Use: "step.set_mode_train()" then "step.fit_transform()"'.format(self.name, self._mode))
341 | 
342 |         if self.output_is_cached and not self.force_fitting:
343 |             logger.info('Step {} using cached output'.format(self.name))
344 |             step_output_data = self.output
345 |         elif self.output_is_persisted and self.load_persisted_output and not self.force_fitting:
346 |             logger.info('Step {} loading persisted output from {}'.format(self.name,
347 |                                                                           self.experiment_directory_output_step))
348 |             step_output_data = self._load_output(self.experiment_directory_output_step)
349 |         else:
350 |             step_inputs = {}
351 |             if self.input_data is not None:
352 |                 for input_data_part in self.input_data:
353 |                     step_inputs[input_data_part] = data[input_data_part]
354 | 
355 |             for input_step in self.input_steps:
356 |                 step_inputs[input_step.name] = input_step.fit_transform(data)
357 | 
358 |             if self.adapter:
359 |                 step_inputs = self._adapt(step_inputs)
360 |             else:
361 |                 step_inputs = self._unpack(step_inputs)
362 |             step_output_data = self._fit_transform_operation(step_inputs)
363 |         logger.info('Step {}, fit and transform completed'.format(self.name))
364 |         return step_output_data
365 | 
366 |     def transform(self, data):
367 |         """Transforms data or loads already processed data.
368 | 
369 |         Loads cached persisted output or adapts data for the current transformer and executes
370 |         its `transform` method.
371 | 
372 |         Args:
373 |             data (dict): data dictionary with keys as input names and values as dictionaries of
374 |                 key:value pairs that can be passed to the ``step.transformer.fit_transform`` method
375 | 
376 |                 Example:
377 | 
378 |                     .. code-block:: python
379 | 
380 |                         data = {'input_1':{'X':X,
381 |                                            'y':y
382 |                                            },
383 |                                 'input_2': {'X':X,
384 |                                             'y':y
385 |                                            }
386 |                                }
387 | 
388 |         Returns:
389 |             dict: step output from the transformer.transform method
390 |         """
391 |         if data:
392 |             assert isinstance(data, dict), 'Step {}, "data" argument in the "transform()" method must be dict, ' \
393 |                                            'got {} instead.'.format(self.name, type(data))
394 |         logger.info('Step {}, working in "{}" mode'.format(self.name, self._mode))
395 | 
396 |         if self.output_is_cached:
397 |             logger.info('Step {} using cached output'.format(self.name))
398 |             step_output_data = self.output
399 |         elif self.output_is_persisted and self.load_persisted_output:
400 |             logger.info('Step {} loading persisted output from {}'.format(self.name,
401 |                                                                           self.experiment_directory_output_step))
402 |             step_output_data = self._load_output(self.experiment_directory_output_step)
403 |         else:
404 |             step_inputs = {}
405 |             if self.input_data is not None:
406 |                 for input_data_part in self.input_data:
407 |                     step_inputs[input_data_part] = data[input_data_part]
408 | 
409 |             for input_step in self.input_steps:
410 |                 step_inputs[input_step.name] = input_step.transform(data)
411 | 
412 |             if self.adapter:
413 |                 step_inputs = self._adapt(step_inputs)
414 |             else:
415 |                 step_inputs = self._unpack(step_inputs)
416 |             step_output_data = self._transform_operation(step_inputs)
417 |         logger.info('Step {}, transform completed'.format(self.name))
418 |         return step_output_data
419 | 
420 |     def set_mode_train(self):
421 |         """Applies 'train' mode to all upstream Steps including this Step
422 |         and cleans cache for all upstream Steps including this Step.
423 |         """
424 |         self._set_mode('train')
425 |         return self
426 | 
427 |     def set_mode_inference(self):
428 |         """Applies 'inference' mode to all upstream Steps including this Step
429 |         and cleans cache for all upstream Steps including this Step.
430 |         """
431 |         self._set_mode('inference')
432 |         return self
433 | 
434 |     def reset(self):
435 |         """Reset all upstream Steps to the default training parameters and
436 |         cleans cache for all upstream Steps including this Step.
437 |         Defaults are:
438 |             'mode': 'train',
439 |             'is_fittable': True,
440 |             'force_fitting': True,
441 |             'persist_output': False,
442 |             'cache_output': False,
443 |             'load_persisted_output': False
444 |         """
445 |         self.clean_cache_upstream()
446 |         self.set_mode_train()
447 |         for step_obj in self.all_upstream_steps.values():
448 |             step_obj.is_fittable = DEFAULT_TRAINING_SETUP['is_fittable']
449 |             step_obj.force_fitting = DEFAULT_TRAINING_SETUP['force_fitting']
450 |             step_obj.persist_output = DEFAULT_TRAINING_SETUP['persist_output']
451 |             step_obj.cache_output = DEFAULT_TRAINING_SETUP['cache_output']
452 |             step_obj.load_persisted_output = DEFAULT_TRAINING_SETUP['load_persisted_output']
453 |         logger.info('Step {}, reset all upstream Steps to default training parameters, '
454 |                     'including this Step'.format(self.name))
455 |         return self
456 | 
457 |     def set_parameters_upstream(self, parameters):
458 |         """Set parameters to all upstream Steps including this Step.
459 |         Parameters is dict() where key is Step attribute, and value is new value to set.
460 |         """
461 |         assert isinstance(parameters, dict), 'parameters must be dict, got {} instead'.format(type(parameters))
462 |         for step_obj in self.all_upstream_steps.values():
463 |             for key in step_obj.__dict__.keys():
464 |                 if key in list(parameters.keys()):
465 |                     step_obj.__dict__[key] = parameters[key]
466 |                     if key == 'experiment_directory':
467 |                         step_obj._prepare_experiment_directories()
468 |         logger.info('set new values to all upstream Steps including this Step.')
469 |         return self
470 | 
471 |     def clean_cache_step(self):
472 |         """Clean cache for current step.
473 |         """
474 |         logger.info('Step {}, cleaning cache'.format(self.name))
475 |         self.output = None
476 |         return self
477 | 
478 |     def clean_cache_upstream(self):
479 |         """Clean cache for all steps that are upstream to `self`.
480 |         """
481 |         logger.info('Cleaning cache for the entire upstream pipeline')
482 |         for step in self.all_upstream_steps.values():
483 |             logger.info('Step {}, cleaning cache'.format(step.name))
484 |             step.output = None
485 |         return self
486 | 
487 |     def get_step_by_name(self, name):
488 |         """Extracts step by name from the pipeline.
489 | 
490 |         Extracted Step is a fully functional pipeline as well.
491 |         All upstream Steps are already defined.
492 | 
493 |         Args:
494 |             name (str): name of the step to be fetched
495 |         Returns:
496 |             Step (obj): extracted step
497 |         """
498 |         self._validate_step_name(name)
499 |         name = str(name)
500 |         try:
501 |             return self.all_upstream_steps[name]
502 |         except KeyError as e:
503 |             msg = 'No Step with name "{}" found. ' \
504 |                   'You have following Steps: {}'.format(name, list(self.all_upstream_steps.keys()))
505 |             raise StepError(msg) from e
506 | 
507 |     def persist_upstream_structure(self):
508 |         """Persist json file with the upstream steps structure, that is step names and their connections."""
509 |         persist_dir = os.path.join(self.experiment_directory, '{}_upstream_structure.json'.format(self.name))
510 |         logger.info('Step {}, saving upstream pipeline structure to {}'.format(self.name, persist_dir))
511 |         joblib.dump(self.upstream_structure, persist_dir)
512 | 
513 |     def persist_upstream_diagram(self, filepath):
514 |         """Creates upstream steps diagram and persists it to disk as png file.
515 | 
516 |         Pydot graph is created and persisted to disk as png file under the filepath directory.
517 | 
518 |         Args:
519 |             filepath (str): filepath to which the png with steps visualization should
520 |                 be persisted
521 |         """
522 |         assert isinstance(filepath, str),\
523 |             'Step {} error, filepath must be str. Got {} instead'.format(self.name, type(filepath))
524 |         persist_as_png(self.upstream_structure, filepath)
525 | 
526 |     def _fit_transform_operation(self, step_inputs):
527 |         if self.is_fittable:
528 |             if self.transformer_is_persisted and not self.force_fitting:
529 |                 logger.info('Step {}, loading transformer from the {}'
530 |                             .format(self.name, self.experiment_directory_transformers_step))
531 |                 self.transformer.load(self.experiment_directory_transformers_step)
532 |                 logger.info('Step {}, transforming...'.format(self.name))
533 | 
534 |                 try:
535 |                     step_output_data = self.transformer.transform(**step_inputs)
536 |                 except Exception as e:
537 |                     msg = 'Step {}, Transformer "{}" error ' \
538 |                           'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__)
539 |                     raise StepError(msg) from e
540 | 
541 |                 logger.info('Step {}, transforming completed'.format(self.name))
542 |             else:
543 |                 logger.info('Step {}, fitting and transforming...'.format(self.name))
544 | 
545 |                 try:
546 |                     step_output_data = self.transformer.fit_transform(**step_inputs)
547 |                 except Exception as e:
548 |                     msg = 'Step {}, Transformer "{}" error ' \
549 |                           'during "fit_transform()" operation.'.format(self.name, self.transformer.__class__.__name__)
550 |                     raise StepError(msg) from e
551 | 
552 |                 logger.info('Step {}, fitting and transforming completed'.format(self.name))
553 |                 logger.info('Step {}, persisting transformer to the {}'
554 |                             .format(self.name, self.experiment_directory_transformers_step))
555 |                 self.transformer.persist(self.experiment_directory_transformers_step)
556 |         else:
557 |             logger.info('Step {}, is not fittable, transforming...'.format(self.name))
558 | 
559 |             try:
560 |                 step_output_data = self.transformer.transform(**step_inputs)
561 |             except Exception as e:
562 |                 msg = 'Step {}, Transformer "{}" error ' \
563 |                       'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__)
564 |                 raise StepError(msg) from e
565 | 
566 |             logger.info('Step {}, transforming completed'.format(self.name))
567 | 
568 |         assert isinstance(step_output_data, dict), 'Step {}, Transformer "{}", error. ' \
569 |             'Output from transformer must be dict, got {} instead'.format(self.name,
570 |                                                                           self.transformer.__class__.__name__,
571 |                                                                           type(step_output_data))
572 | 
573 |         if self.cache_output:
574 |             logger.info('Step {}, caching output'.format(self.name))
575 |             self.output = step_output_data
576 |         if self.persist_output:
577 |             logger.info('Step {}, persisting output to the {}'
578 |                         .format(self.name, self.experiment_directory_output_step))
579 |             self._persist_output(step_output_data, self.experiment_directory_output_step)
580 |         return step_output_data
581 | 
582 |     def _transform_operation(self, step_inputs):
583 |         if self.is_fittable:
584 |             if self.transformer_is_persisted:
585 |                 logger.info('Step {}, loading transformer from the {}'
586 |                             .format(self.name, self.experiment_directory_transformers_step))
587 |                 self.transformer.load(self.experiment_directory_transformers_step)
588 |                 logger.info('Step {}, transforming...'.format(self.name))
589 | 
590 |                 try:
591 |                     step_output_data = self.transformer.transform(**step_inputs)
592 |                 except Exception as e:
593 |                     msg = 'Step {}, Transformer "{}" error ' \
594 |                           'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__)
595 |                     raise StepError(msg) from e
596 | 
597 |                 logger.info('Step {}, transforming completed'.format(self.name))
598 |             else:
599 |                 raise ValueError('No transformer persisted with name: {}. '
600 |                                  'Make sure that you have this transformer under the directory: {}'
601 |                                  .format(self.name, self.experiment_directory_transformers_step))
602 |         else:
603 |             logger.info('Step {}, transforming...'.format(self.name))
604 | 
605 |             try:
606 |                 step_output_data = self.transformer.transform(**step_inputs)
607 |             except Exception as e:
608 |                 msg = 'Step {}, Transformer "{}" error ' \
609 |                       'during "transform()" operation.'.format(self.name, self.transformer.__class__.__name__)
610 |                 raise StepError(msg) from e
611 | 
612 |             logger.info('Step {}, transforming completed'.format(self.name))
613 | 
614 |         assert isinstance(step_output_data, dict), 'Step {}, Transformer "{}", error. ' \
615 |             'Output from transformer must be dict, got {} instead'.format(self.name,
616 |                                                                           self.transformer.__class__.__name__,
617 |                                                                           type(step_output_data))
618 | 
619 |         if self.cache_output:
620 |             logger.info('Step {}, caching output'.format(self.name))
621 |             self.output = step_output_data
622 |         if self.persist_output:
623 |             logger.info('Step {}, persisting output to the {}'
624 |                         .format(self.name, self.experiment_directory_output_step))
625 |             self._persist_output(step_output_data, self.experiment_directory_output_step)
626 |         return step_output_data
627 | 
628 |     def _load_output(self, filepath):
629 |         logger.info('Step {}, loading output from {}'.format(self.name, filepath))
630 |         return joblib.load(filepath)
631 | 
632 |     def _persist_output(self, output_data, filepath):
633 |         joblib.dump(output_data, filepath)
634 | 
635 |     def _adapt(self, step_inputs):
636 |         logger.info('Step {}, adapting inputs'.format(self.name))
637 |         try:
638 |             return self.adapter.adapt(step_inputs)
639 |         except AdapterError as e:
640 |             msg = "Error while adapting step '{}'. Check Step inputs".format(self.name)
641 |             raise StepError(msg) from e
642 | 
643 |     def _unpack(self, step_inputs):
644 |         logger.info('Step {}, unpacking inputs'.format(self.name))
645 |         unpacked_steps = {}
646 |         key_to_step_names = defaultdict(list)
647 |         for step_name, step_dict in step_inputs.items():
648 |             unpacked_steps.update(step_dict)
649 |             for key in step_dict.keys():
650 |                 key_to_step_names[key].append(step_name)
651 | 
652 |         repeated_keys = [(key, step_names) for key, step_names in key_to_step_names.items()
653 |                          if len(step_names) > 1]
654 |         if len(repeated_keys) == 0:
655 |             return unpacked_steps
656 |         else:
657 |             msg = "Could not unpack inputs. Following keys are present in multiple input steps:\n " \
658 |                   "\n".join(["  '{}' present in steps {}".format(key, step_names)
659 |                              for key, step_names in repeated_keys])
660 |             raise StepError(msg)
661 | 
662 |     def _prepare_experiment_directories(self):
663 |         if not os.path.exists(os.path.join(self.experiment_directory, 'transformers')):
664 |             logger.info('initializing experiment directories under {}'.format(self.experiment_directory))
665 |             for dir_name in ['transformers', 'output']:
666 |                 os.makedirs(os.path.join(self.experiment_directory, dir_name), exist_ok=True)
667 | 
668 |     def _get_steps(self, all_steps):
669 |         self._check_name_uniqueness(all_steps=all_steps)
670 |         for input_step in self.input_steps:
671 |             all_steps = input_step._get_steps(all_steps)
672 |         all_steps[self.name] = self
673 |         return all_steps
674 | 
675 |     def _format_step_name(self, name, transformer):
676 |         self._validate_step_name(name=name)
677 |         if name is not None:
678 |             name_ = str(name)
679 |         else:
680 |             name_ = transformer.__class__.__name__
681 |         return name_
682 | 
683 |     def _validate_step_name(self, name):
684 |         if name is not None:
685 |             assert isinstance(name, str) or isinstance(name, float) or isinstance(name, int),\
686 |                 'Step name must be str, float or int. Got {} instead.'.format(type(name))
687 | 
688 |     def _check_name_uniqueness(self, all_steps):
689 |         if self.name in all_steps.keys():
690 |             logger.info('STEPPY WARNING: Step with name "{}", already exist. '
691 |                         'Make sure that all Steps have unique name.'.format(self.name))
692 | 
693 |     def _validate_upstream_names(self):
694 |         try:
695 |             _ = self.all_upstream_steps.keys()
696 |         except ValueError as e:
697 |             msg = 'Incorrect Step names'
698 |             raise StepError(msg) from e
699 | 
700 |     def _build_structure_dict(self, structure_dict):
701 |         for input_step in self.input_steps:
702 |             structure_dict = input_step._build_structure_dict(structure_dict)
703 |             structure_dict['edges'].add((input_step.name, self.name))
704 |         structure_dict['nodes'].add(self.name)
705 |         for input_data in self.input_data:
706 |             structure_dict['nodes'].add(input_data)
707 |             structure_dict['edges'].add((input_data, self.name))
708 |         return structure_dict
709 | 
710 |     def _set_mode(self, mode):
711 |         self.clean_cache_upstream()
712 |         for name, step_obj in self.all_upstream_steps.items():
713 |             step_obj._mode = mode
714 |         logger.info('Step {}, applied "{}" mode to all upstream Steps, including this Step'.format(self.name, mode))
715 | 
716 |     def _repr_html_(self):
717 |         return display_upstream_structure(self.upstream_structure)
718 | 
719 |     def __str__(self):
720 |         return pprint.pformat(self.upstream_structure)
721 | 
722 | 
723 | class BaseTransformer:
724 |     """Abstraction on ``fit`` and ``transform`` execution.
725 | 
726 |     Base transformer is an abstraction strongly inspired by the ``sklearn.Transformer`` and
727 |     ``sklearn.Estimator``. Two main concepts are:
728 | 
729 |         1. Every action that can be performed on data (transformation, model training) can be
730 |         performed in two steps: fitting (where trainable parameters are estimated) and transforming
731 |         (where previously estimated parameters are used to transform the data into desired state).
732 | 
733 |         2. Every transformer knows how it should be persisted and loaded (especially useful when
734 |         working with Keras/Pytorch or scikit-learn) in one pipeline.
735 |     """
736 | 
737 |     def __init__(self):
738 |         self.estimator = None
739 | 
740 |     def fit(self, *args, **kwargs):
741 |         """Performs estimation of trainable parameters.
742 | 
743 |         All model estimations with scikit-learn, keras, pytorch models as well as some preprocessing
744 |         techniques (normalization) estimate parameters based on data (training data).
745 |         Those parameters are trained during fit execution and are persisted for the future.
746 |         Only the estimation logic, nothing else.
747 | 
748 |         Args:
749 |             args: positional arguments (can be anything)
750 |             kwargs: keyword arguments (can be anything)
751 | 
752 |         Returns:
753 |             BaseTransformer: self object
754 |         """
755 |         return self
756 | 
757 |     def transform(self, *args, **kwargs):
758 |         """Performs transformation of data.
759 | 
760 |         All data transformation including prediction with deep learning/machine learning models
761 |         can be performed here. No parameters should be estimated in this method nor stored as
762 |         class attributes. Only the transformation logic, nothing else.
763 | 
764 |         Args:
765 |             args: positional arguments (can be anything)
766 |             kwargs: keyword arguments (can be anything)
767 | 
768 |         Returns:
769 |             dict: output
770 |         """
771 |         raise NotImplementedError
772 | 
773 |     def fit_transform(self, *args, **kwargs):
774 |         """Performs fit followed by transform.
775 | 
776 |         This method simply combines fit and transform.
777 | 
778 |         Args:
779 |             args: positional arguments (can be anything)
780 |             kwargs: keyword arguments (can be anything)
781 | 
782 |         Returns:
783 |             dict: output
784 |         """
785 |         self.fit(*args, **kwargs)
786 |         return self.transform(*args, **kwargs)
787 | 
788 |     def load(self, filepath):
789 |         """Loads the trainable parameters of the transformer.
790 | 
791 |         Specific implementation of loading persisted model parameters should be implemented here.
792 |         In case of transformer that do not learn any parameters one can leave this method as is.
793 | 
794 |         Args:
795 |             filepath (str): filepath from which the transformer should be loaded
796 |         Returns:
797 |             BaseTransformer: self instance
798 |         """
799 |         _ = filepath
800 |         return self
801 | 
802 |     def persist(self, filepath):
803 |         """Saves the trainable parameters of the transformer
804 | 
805 |         Specific implementation of model parameter persistence should be implemented here.
806 |         In case of transformer that do not learn any parameters one can leave this method as is.
807 | 
808 |         Args:
809 |             filepath (str): filepath where the transformer parameters should be persisted
810 |         """
811 |         joblib.dump('hello-steppy', filepath)
812 | 
813 | 
814 | class StepError(Exception):
815 |     pass
816 | 
817 | 
818 | def make_transformer(func):
819 |     class StaticTransformer(BaseTransformer):
820 |         def fit(self):
821 |             logger.info('StaticTransformer "{}" is not fittable.'
822 |                         'By running "fit_transform()", you simply "transform()".'.format(self.__class__.__name__))
823 |             return self
824 | 
825 |         def transform(self, *args, **kwargs):
826 |             return func(*args, **kwargs)
827 | 
828 |         def persist(self, filepath):
829 |             logger.info('StaticTransformer "{}" is not persistable.'.format(self.__class__.__name__))
830 | 
831 |     _transformer = StaticTransformer()
832 |     _transformer.__class__.__name__ = func.__name__
833 |     return _transformer
834 | 
835 | 
836 | class IdentityOperation(BaseTransformer):
837 |     """Transformer that performs identity operation, f(x)=x."""
838 | 
839 |     def transform(self, **kwargs):
840 |         return kwargs
841 | 
842 |     def persist(self, filepath):
843 |         logger.info('"IdentityOperation" is not persistable.')
844 |         pass
845 | 


--------------------------------------------------------------------------------
/steppy/utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import sys
  3 | 
  4 | import pydot_ng as pydot
  5 | from IPython.display import Image, display
  6 | 
  7 | 
  8 | def initialize_logger():
  9 |     """Initialize steppy logger.
 10 | 
 11 |     This logger is used throughout the steppy library to report computation progress.
 12 | 
 13 |     Example:
 14 |     
 15 |         Simple use of steppy logger:
 16 | 
 17 |         .. code-block:: python
 18 |         
 19 |             initialize_logger()
 20 |             logger = get_logger()
 21 |             logger.info('My message inside pipeline')
 22 |             
 23 |         result looks like this:
 24 |         
 25 |         .. code::
 26 |         
 27 |             2018-06-02 12:33:48 steppy >>> My message inside pipeline
 28 | 
 29 |     Returns:
 30 |         logging.Logger: logger object formatted in the steppy style
 31 |     """
 32 |     logger = logging.getLogger('steppy')
 33 |     logger.setLevel(logging.INFO)
 34 |     message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s',
 35 |                                        datefmt='%Y-%m-%d %H:%M:%S')
 36 | 
 37 |     # console handler
 38 |     console_handler = logging.StreamHandler(sys.stdout)
 39 |     console_handler.setLevel(logging.INFO)
 40 |     console_handler.setFormatter(fmt=message_format)
 41 | 
 42 |     # add the handlers to the logger
 43 |     logger.addHandler(console_handler)
 44 | 
 45 |     return logger
 46 | 
 47 | 
 48 | def get_logger():
 49 |     """Fetch existing steppy logger.
 50 | 
 51 |     Example:
 52 |     
 53 |         .. code-block:: python
 54 |         
 55 |             initialize_logger()
 56 |             logger = get_logger()
 57 |             logger.info('My message goes here')
 58 |             
 59 |         result looks like this:
 60 |         
 61 |         .. code::
 62 |         
 63 |             2018-06-02 12:33:48 steppy >>> My message inside pipeline
 64 | 
 65 |     Returns:
 66 |         logging.Logger: logger object formatted in the steppy style
 67 |     """
 68 |     return logging.getLogger('steppy')
 69 | 
 70 | 
 71 | def display_upstream_structure(structure_dict):
 72 |     """Displays pipeline structure in the jupyter notebook.
 73 | 
 74 |     Args:
 75 |         structure_dict (dict): dict returned by
 76 |             :func:`~steppy.base.Step.upstream_structure`.
 77 |     """
 78 |     graph = _create_graph(structure_dict)
 79 |     plt = Image(graph.create_png())
 80 |     display(plt)
 81 | 
 82 | 
 83 | def persist_as_png(structure_dict, filepath):
 84 |     """Saves pipeline diagram to disk as png file.
 85 | 
 86 |     Args:
 87 |         structure_dict (dict): dict returned by
 88 |             :func:`~steppy.base.Step.upstream_structure`
 89 |         filepath (str): filepath to which the png with pipeline visualization should be persisted
 90 |     """
 91 |     graph = _create_graph(structure_dict)
 92 |     graph.write(filepath, format='png')
 93 | 
 94 | 
 95 | def _create_graph(structure_dict):
 96 |     """Creates pydot graph from the pipeline structure dict.
 97 | 
 98 |     Args:
 99 |         structure_dict (dict): dict returned by step.upstream_structure
100 | 
101 |     Returns:
102 |         graph (pydot.Dot): object representing upstream pipeline structure (with regard to the current Step).
103 |     """
104 |     graph = pydot.Dot()
105 |     for node in structure_dict['nodes']:
106 |         graph.add_node(pydot.Node(node))
107 |     for node1, node2 in structure_dict['edges']:
108 |         graph.add_edge(pydot.Edge(node1, node2))
109 |     return graph
110 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minerva-ml/steppy/856b95f1f5189e1d2ca122b891bc670adac9692b/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import steppy.base  # To make sure logger is initialized before running prepare_steps_logger
 2 | 
 3 | from .steppy_test_utils import prepare_steps_logger
 4 | 
 5 | 
 6 | def pytest_sessionstart(session):
 7 |     prepare_steps_logger()
 8 | 
 9 | 
10 | def pytest_runtest_setup(item):
11 |     pass
12 | 
13 | 
14 | def pytest_runtest_teardown(item):
15 |     pass
16 | 


--------------------------------------------------------------------------------
/tests/steppy_test_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | 
 4 | from pathlib import Path
 5 | 
 6 | LOGS_PATH = 'steps_tests.log'
 7 | 
 8 | 
 9 | def remove_logs():
10 |     if Path(LOGS_PATH).exists():
11 |         os.remove(LOGS_PATH)
12 | 
13 | 
14 | def prepare_steps_logger():
15 |     print("Redirecting logging to {}.".format(LOGS_PATH))
16 |     remove_logs()
17 |     logger = logging.getLogger('steps')
18 |     for h in logger.handlers:
19 |         logger.removeHandler(h)
20 |     message_format = logging.Formatter(fmt='%(asctime)s %(name)s >>> %(message)s',
21 |                                        datefmt='%Y-%m-%d %H:%M:%S')
22 |     fh = logging.FileHandler(LOGS_PATH)
23 |     fh.setLevel(logging.INFO)
24 |     fh.setFormatter(fmt=message_format)
25 |     logger.addHandler(fh)
26 | 


--------------------------------------------------------------------------------
/tests/test_adapter.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pytest
  3 | 
  4 | from steppy.adapter import Adapter, E
  5 | 
  6 | 
  7 | @pytest.fixture
  8 | def data():
  9 |     return {
 10 |         'input_1': {
 11 |             'features': np.array([
 12 |                 [1, 6],
 13 |                 [2, 5],
 14 |                 [3, 4]
 15 |             ]),
 16 |             'labels': np.array([2, 5, 3])
 17 |         },
 18 |         'input_2': {
 19 |             'extra_features': np.array([
 20 |                 [5, 7, 3],
 21 |                 [67, 4, 5],
 22 |                 [6, 13, 14]
 23 |             ])
 24 |         },
 25 |         'input_3': {
 26 |             'images': np.array([
 27 |                 [[0, 255], [255, 0]],
 28 |                 [[255, 0], [0, 255]],
 29 |                 [[255, 255], [0, 0]],
 30 |             ]),
 31 |             'labels': np.array([1, 1, 0])
 32 |         }
 33 |     }
 34 | 
 35 | 
 36 | def test_adapter_creates_defined_keys(data):
 37 |     adapter = Adapter({
 38 |         'X': [E('input_1', 'features')],
 39 |         'Y': [E('input_2', 'extra_features')]
 40 |     })
 41 |     res = adapter.adapt(data)
 42 | 
 43 |     assert {'X', 'Y'} == set(res.keys())
 44 | 
 45 | 
 46 | def test_recipe_with_single_item(data):
 47 |     adapter = Adapter({
 48 |         'X': E('input_1', 'labels'),
 49 |         'Y': E('input_3', 'labels'),
 50 |     })
 51 |     res = adapter.adapt(data)
 52 | 
 53 |     assert np.array_equal(res['X'], data['input_1']['labels'])
 54 |     assert np.array_equal(res['Y'], data['input_3']['labels'])
 55 | 
 56 | 
 57 | def test_recipe_with_list(data):
 58 |     adapter = Adapter({
 59 |         'X': [],
 60 |         'Y': [E('input_1', 'features')],
 61 |         'Z': [E('input_1', 'features'),
 62 |               E('input_2', 'extra_features')]
 63 |     })
 64 |     res = adapter.adapt(data)
 65 |     for i, key in enumerate(('X', 'Y', 'Z')):
 66 |         assert isinstance(res[key], list)
 67 |         assert len(res[key]) == i
 68 | 
 69 |     assert res['X'] == []
 70 |     assert np.array_equal(res['Y'][0], data['input_1']['features'])
 71 |     assert np.array_equal(res['Z'][0], data['input_1']['features'])
 72 |     assert np.array_equal(res['Z'][1], data['input_2']['extra_features'])
 73 | 
 74 | 
 75 | def test_recipe_with_tuple(data):
 76 |     adapter = Adapter({
 77 |         'X': (),
 78 |         'Y': (E('input_1', 'features'),),
 79 |         'Z': (E('input_1', 'features'), E('input_2', 'extra_features'))
 80 |     })
 81 |     res = adapter.adapt(data)
 82 | 
 83 |     for i, key in enumerate(('X', 'Y', 'Z')):
 84 |         assert isinstance(res[key], tuple)
 85 |         assert len(res[key]) == i
 86 | 
 87 |     assert res['X'] == ()
 88 |     assert np.array_equal(res['Y'][0], data['input_1']['features'])
 89 |     assert np.array_equal(res['Z'][0], data['input_1']['features'])
 90 |     assert np.array_equal(res['Z'][1], data['input_2']['extra_features'])
 91 | 
 92 | 
 93 | def test_recipe_with_dictionary(data):
 94 |     adapter = Adapter({
 95 |         'X': {},
 96 |         'Y': {'a': E('input_1', 'features')},
 97 |         'Z': {'a': E('input_1', 'features'),
 98 |               'b': E('input_2', 'extra_features')}
 99 |     })
100 |     res = adapter.adapt(data)
101 | 
102 |     for i, key in enumerate(('X', 'Y', 'Z')):
103 |         assert isinstance(res[key], dict)
104 |         assert len(res[key]) == i
105 | 
106 |     assert res['X'] == {}
107 |     assert np.array_equal(res['Y']['a'], data['input_1']['features'])
108 |     assert np.array_equal(res['Z']['a'], data['input_1']['features'])
109 |     assert np.array_equal(res['Z']['b'], data['input_2']['extra_features'])
110 | 
111 | 
112 | def test_recipe_with_constants(data):
113 |     adapter = Adapter({
114 |         'A': 112358,
115 |         'B': 3.14,
116 |         'C': "lorem ipsum",
117 |         'D': ('input_1', 'features'),
118 |         'E': {112358: 112358, 'a': 'a', 3.14: 3.14},
119 |         'F': [112358, 3.14, "lorem ipsum", ('input_1', 'features')]
120 |     })
121 |     res = adapter.adapt(data)
122 | 
123 |     assert res['A'] == 112358
124 |     assert res['B'] == 3.14
125 |     assert res['C'] == "lorem ipsum"
126 |     assert res['D'] == ('input_1', 'features')
127 |     assert res['E'] == {112358: 112358, 'a': 'a', 3.14: 3.14}
128 |     assert res['F'] == [112358, 3.14, "lorem ipsum", ('input_1', 'features')]
129 | 
130 | 
131 | def test_nested_recipes(data):
132 |     adapter = Adapter({
133 |         'X': [{'a': [E('input_1', 'features')]}],
134 |         'Y': {'a': [{'b': E('input_2', 'extra_features')}]}
135 |     })
136 |     res = adapter.adapt(data)
137 | 
138 |     assert res['X'] == [{'a': [data['input_1']['features']]}]
139 |     assert res['Y'] == {'a': [{'b': data['input_2']['extra_features']}]}
140 | 


--------------------------------------------------------------------------------
/tests/test_base.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pytest
 3 | 
 4 | from steppy.base import Step, StepError, make_transformer, IdentityOperation
 5 | from steppy.adapter import Adapter, E
 6 | 
 7 | 
 8 | @pytest.fixture
 9 | def data():
10 |     return {
11 |         'input_1': {
12 |             'features': np.array([
13 |                 [1, 6],
14 |                 [2, 5],
15 |                 [3, 4]
16 |             ]),
17 |             'labels': np.array([2, 5, 3])
18 |         },
19 |         'input_2': {
20 |             'extra_features': np.array([
21 |                 [5, 7, 3],
22 |                 [67, 4, 5],
23 |                 [6, 13, 14]
24 |             ])
25 |         },
26 |         'input_3': {
27 |             'images': np.array([
28 |                 [[0, 255], [255, 0]],
29 |                 [[255, 0], [0, 255]],
30 |                 [[255, 255], [0, 0]],
31 |             ]),
32 |             'labels': np.array([1, 1, 0])
33 |         }
34 |     }
35 | 
36 | 
37 | @pytest.mark.parametrize("mode", [0, 1])
38 | def test_make_transformer(mode):
39 |     def fun(x, y, m=0):
40 |         return x + y if m == 0 else x - y
41 |     tr = make_transformer(fun)
42 | 
43 |     tr.fit()
44 |     res = tr.transform(7, 3, m=mode)
45 |     assert res == (10 if mode == 0 else 4)
46 | 
47 | 
48 | def test_inputs_without_conflicting_names_do_not_require_adapter(data):
49 |     step = Step(
50 |         name='test_inputs_without_conflicting_names_do_not_require_adapter_1',
51 |         transformer=IdentityOperation(),
52 |         input_data=['input_1']
53 |     )
54 |     output = step.fit_transform(data)
55 |     assert output == data['input_1']
56 | 
57 |     step = Step(
58 |         name='test_inputs_without_conflicting_names_do_not_require_adapter_2',
59 |         transformer=IdentityOperation(),
60 |         input_data=['input_1', 'input_2']
61 |     )
62 |     output = step.fit_transform(data)
63 |     assert output == {**data['input_1'], **data['input_2']}
64 | 
65 | 
66 | def test_inputs_with_conflicting_names_require_adapter(data):
67 |     step = Step(
68 |         name='test_inputs_with_conflicting_names_require_adapter',
69 |         transformer=IdentityOperation(),
70 |         input_data=['input_1', 'input_3']
71 |     )
72 |     with pytest.raises(StepError):
73 |         step.fit_transform(data)
74 | 
75 | 
76 | def test_step_with_adapted_inputs(data):
77 |     step = Step(
78 |         name='test_step_wit_adapted_inputs',
79 |         transformer=IdentityOperation(),
80 |         input_data=['input_1', 'input_3'],
81 |         adapter=Adapter({
82 |             'img': E('input_3', 'images'),
83 |             'fea': E('input_1', 'features'),
84 |             'l1': E('input_3', 'labels'),
85 |             'l2': E('input_1', 'labels'),
86 |         })
87 |     )
88 |     output = step.fit_transform(data)
89 |     expected = {
90 |         'img': data['input_3']['images'],
91 |         'fea': data['input_1']['features'],
92 |         'l1': data['input_3']['labels'],
93 |         'l2': data['input_1']['labels'],
94 |     }
95 |     assert output == expected
96 | 


--------------------------------------------------------------------------------