├── .gitattributes ├── .github └── workflows │ └── linting.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .travis.yml ├── .yapfignore ├── CONTRIBUTING.rst ├── Dockerfile ├── HISTORY.rst ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── make.bat ├── readthedocs-environment.yml └── source │ ├── API.rst │ ├── conf.py │ ├── contributing.rst │ ├── hashing-and-mutation.ipynb │ ├── history.rst │ ├── images │ ├── keep_calm_pipeline_on.png │ └── lineage_example.png │ ├── index.rst │ ├── intro-guide.ipynb │ └── ml-pipeline.ipynb ├── environment.yml ├── examples ├── basic │ ├── README.md │ ├── basic_example.py │ └── environment.yml ├── experiment │ ├── README.md │ ├── config.yaml │ ├── environment.yml │ └── experiment_example.py └── sftp │ ├── README.md │ ├── environment.yml │ └── sftp_example.py ├── provenance ├── __init__.py ├── _commonstore.py ├── _config.py ├── _dependencies.py ├── _version.py ├── alembic.ini ├── artifact_hasher.py ├── blobstores.py ├── core.py ├── google_storage.py ├── hashing.py ├── migrations │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ └── e0317ab07ba4_initial_schema.py ├── models.py ├── repos.py ├── serializers.py ├── sftp │ └── __init__.py ├── test_serializers.py ├── utils.py └── vis │ ├── __init__.py │ └── utils.py ├── readthedocs.yml ├── release-procedure.md ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_requirements.txt ├── tests └── provenance │ ├── conftest.py │ ├── strategies.py │ ├── test_blobstores.py │ ├── test_config.py │ ├── test_core.py │ ├── test_hashing.py │ ├── test_pytorch.py │ ├── test_repos.py │ └── test_utils.py └── versioneer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | provenance/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/linting.yaml: -------------------------------------------------------------------------------- 1 | name: code-style 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: trunk 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v2 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | - name: Lint via pre-commit checks 20 | uses: pre-commit/action@v2.0.0 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | *.egg-info 4 | .benchmarks 5 | docs/build 6 | build/ 7 | dist/ 8 | .idea/ 9 | log.* 10 | log 11 | .coverage 12 | .DS_Store 13 | *.swp 14 | *.swo 15 | scratch/ 16 | .hypothesis 17 | .cache 18 | .eggs 19 | .artifacts 20 | examples/**/artifacts/ 21 | examples/**/blobstore/ 22 | .ipynb_checkpoints 23 | docs/build 24 | .pytest_cache/ 25 | .vscode/ 26 | .mypy_cache/ 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v3.3.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: end-of-file-fixer 8 | - id: check-docstring-first 9 | - id: check-json 10 | - id: check-yaml 11 | - id: double-quote-string-fixer 12 | 13 | - repo: https://github.com/pre-commit/mirrors-yapf 14 | rev: v0.30.0 15 | hooks: 16 | - id: yapf 17 | args: 
['--parallel', '--in-place'] 18 | 19 | - repo: https://github.com/asottile/seed-isort-config 20 | rev: v2.2.0 21 | hooks: 22 | - id: seed-isort-config 23 | - repo: https://github.com/pre-commit/mirrors-isort 24 | rev: v5.6.4 25 | hooks: 26 | - id: isort 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | dist: xenial 4 | addons: 5 | postgresql: "9.5" 6 | services: 7 | - postgresql 8 | before_script: 9 | - psql -c 'create database test_provenance;' -U postgres 10 | env: 11 | global: 12 | - DB=postgresql://postgres@localhost/test_provenance 13 | 14 | python: 15 | # We don't actually use the Travis Python, but this keeps it organized. For now only python 3.5 is supported. 16 | # - "2.7" 17 | # - "3.3" 18 | # - "3.4" 19 | - "3.5" 20 | 21 | install: 22 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 23 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 24 | else 25 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 26 | fi 27 | - bash miniconda.sh -b -p $HOME/miniconda 28 | - export PATH="$HOME/miniconda/bin:$PATH" 29 | - hash -r 30 | - conda config --set always_yes yes --set changeps1 no 31 | - conda update -q conda 32 | # Useful for debugging any issues with conda. 33 | - conda info -a 34 | - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION 35 | - source activate test-environment 36 | - conda install numpy 37 | - conda install -c conda-forge pyarrow 38 | - pip install -r test_requirements.txt 39 | # Due to [this issue](https://github.com/boto/botocore/issues/1872), we have 40 | # to explicitly install a specific version of dateutil. Note, this is not 41 | # being added to the requirements file as this does not affect local builds, 42 | # only the travis environment which is using boto. 43 | - pip install python-dateutil==2.8.0 44 | - python setup.py install 45 | 46 | script: pytest -v 47 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | versioneer.py 2 | provenance/_version.py 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/bmabey/provenance/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 
35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | provenance could always use more documentation, whether as part of the 40 | official provenance docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/bmabey/provenance/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `provenance` for local development. 59 | 60 | 1. Fork the `provenance` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/provenance.git 64 | 65 | 3. Setup your development environment. Assuming you have conda installed, the following commands can be used to create a development environment:: 66 | 67 | Initial environment creation 68 | 69 | .. code:: bash 70 | 71 | conda env create 72 | source activate provenance-dev 73 | pip install -r requirements.txt 74 | pip install -r test_requirements.txt 75 | 76 | Reactivating the environment after it has been created 77 | 78 | .. code:: bash 79 | 80 | source activate provenance-dev 81 | 82 | 4. Create a branch for local development:: 83 | 84 | $ git checkout -b name-of-your-bugfix-or-feature 85 | 86 | Now you can make your changes locally. 87 | 88 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 89 | 90 | $ flake8 provenance tests 91 | $ python setup.py test 92 | 93 | 6. Commit your changes and push your branch to GitHub:: 94 | 95 | $ git add . 96 | $ git commit -m "Your detailed description of your changes." 97 | $ git push origin name-of-your-bugfix-or-feature 98 | 99 | 7. Submit a pull request through the GitHub website. 100 | 101 | Pull Request Guidelines 102 | ----------------------- 103 | 104 | Before you submit a pull request, check that it meets these guidelines: 105 | 106 | 1. The pull request should include tests. 107 | 2. If the pull request adds functionality, the docs should be updated. Put 108 | your new functionality into a function with a docstring. Consider updating 109 | a guide or other documentation as well. 110 | 3. The pull request should pass the all the TravisCI builds. 
111 | https://travis-ci.org/bmabey/provenance/pull_requests
112 | -------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM andrewosh/binder-base
2 |
3 | MAINTAINER Ben Mabey
4 |
5 | USER root
6 |
7 | RUN apt-get update -y && \
8 | apt-get install -y postgresql postgresql-contrib && \
9 | service postgresql start
10 |
11 | USER main
12 |
13 | ADD environment.yml /home/main/environment.yml
14 | RUN /home/main/anaconda/bin/conda install nb_conda_kernels && \
15 | cd /home/main && /home/main/anaconda/bin/conda env create && \
16 | /bin/bash -c "source /home/main/anaconda/bin/activate provenance-dev && pip install git+https://github.com/bmabey/provenance"
17 |
18 |
19 | CMD ["/bin/bash"]
20 | -------------------------------------------------------------------------------- /HISTORY.rst: --------------------------------------------------------------------------------
1 | .. :changelog:
2 |
3 | History
4 | =======
5 |
6 |
7 | 0.14.0 (2020-10-22)
8 | -------------------
9 |
10 | Thanks to Anderson Banihirwe, @andersy005, for this release!
11 |
12 | * Updates joblib pin "joblib>=0.15.0" and related code.
13 | * Tests and code formatting improvements!
14 |
15 | 0.13.0 (2019-12-02)
16 | -------------------
17 |
18 | Thanks to Dan Maljovec, @dmaljovec, for these fixes and additions!
19 |
20 | * Updates ``wrapt`` dependency and makes Artifact proxies compatible.
21 | * Adds optional PyTorch model serialization.
22 | * Adds a helpful error message when a user does not set a default repo.
23 |
24 | 0.12.0 (2018-10-08)
25 | -------------------
26 | * Changes the default hashing algorithm from SHA1 to MD5 for performance reasons.
27 | * Extends serialization so the serializer used is inferred from the value's type.
28 | * Makes Parquet the default serializer for Pandas DataFrames and Series.
29 | * (breaking change!) Removes names from ArtifactSets; a JSONB of labels is used instead.
30 | * Doc tweaks.
31 |
32 | 0.11.0 (2018-08-23)
33 | -------------------
34 | * Optional Google Storage support.
35 | * Adds `persistent_connections` option to the Postgres repo so NullPool can be used when appropriate.
36 | * Doc tweaks.
37 |
38 |
39 | 0.10.0 (2016-04-30)
40 | -------------------
41 |
42 | * Changes the default artifact name from the function name to the fully qualified module and function name.
43 | This will invalidate previously cached artifacts unless the names are migrated or explicitly set.
44 | * Documentation! A start at least; more docstrings and guides will be added soon.
45 | * Adds ``use_cache`` parameter and config option for when you only want to track provenance but not look for cache hits.
46 | * Adds ``check_mutations`` option to prevent ``Artifact`` value mutations.
47 | * Adds ``tags`` parameter to the ``provenance`` decorator for tagging the resulting artifacts.
48 | * Adds experimental (alpha!) ``keras`` support.
49 | * Adds a visualization module, pretty basic and mostly for docs and to illustrate what is possible.
50 | * Adds ``ensure_proxies`` decorator to guard against non-``ArtifactProxy`` arguments being sent to functions.
51 |
52 | 0.9.4.2 (2016-03-23)
53 | ---------------------
54 |
55 | * Improved error reporting when paramiko is not present for the SFTP store.
56 |
57 | 0.9.4.1 (2016-03-22) (0.9.4 was a bad release)
58 | ----------------------------------------------
59 |
60 | * Adds the ability for a database and/or schema to be created when it doesn't exist.
61 | * Adds SFTP blobstore as separate package provenance[sftp]. 62 | * Adds examples to illustrate how the library is used. 63 | 64 | 0.9.3 (2016-02-17) 65 | --------------------- 66 | 67 | * Patch release to fix packaging problems in 0.9.2. 68 | 69 | 0.9.2 (2016-02-17) 70 | --------------------- 71 | 72 | * Adds archive_file feature. 73 | 74 | 0.9.1 (2015-10-05) 75 | --------------------- 76 | 77 | * Python versions now supported: 2.7, 3.3, 3.4, 3.5 78 | 79 | 0.9.0 (2015-10-05) 80 | --------------------- 81 | 82 | * First release on PyPI. Basic functionality but lacking in docs. 83 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Savvysherpa and contributors 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include provenance *.py 2 | recursive-include docs *.rst 3 | include provenance/migrations/* 4 | include provenance/vis/* 5 | include provenance/vis 6 | include provenance/migrations/versions/* 7 | include provenance/alembic.ini 8 | 9 | include setup.py 10 | include requirements.txt 11 | include test_requirements.txt 12 | include README.rst 13 | include LICENSE.txt 14 | include HISTORY.rst 15 | include MANIFEST.in 16 | 17 | prune docs/build 18 | include versioneer.py 19 | include provenance/_version.py 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | # @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | # @echo "coverage - check code coverage quickly with the default Python" # 12 | # @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | @echo "install - install the package to the active Python's site-packages" 16 | 17 | clean: clean-build clean-pyc clean-test 18 | 19 | clean-build: 20 | rm -rf build/ 21 | rm -rf dist/ 22 | rm -rf .eggs/ 23 | find . -name '*.egg-info' -exec rm -rf {} + 24 | find . -name '*.egg' -exec rm -rf {} + 25 | 26 | clean-pyc: 27 | find . -name '*.pyc' -exec rm -f {} + 28 | find . -name '*.pyo' -exec rm -f {} + 29 | find . -name '*~' -exec rm -f {} + 30 | find . -name '__pycache__' -exec rm -rf {} + 31 | 32 | clean-test: 33 | rm -rf .tox/ 34 | rm -f .coverage 35 | rm -rf htmlcov/ 36 | 37 | # lint: 38 | # flake8 provenance tests 39 | 40 | test: 41 | python setup.py test 42 | 43 | test-all: 44 | tox 45 | 46 | # coverage: 47 | # coverage run --source provenance setup.py test 48 | # coverage report -m 49 | # coverage html 50 | # open htmlcov/index.html 51 | 52 | 53 | release: dist 54 | twine upload dist/* 55 | 56 | dist: clean 57 | python setup.py sdist bdist_wheel 58 | ls -l dist 59 | 60 | install: clean 61 | python setup.py install 62 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | provenance 3 | ========== 4 | 5 | |version status| |conda-version status| |build status| |docs| 6 | 7 | 8 | .. |version status| image:: https://img.shields.io/pypi/v/provenance.svg 9 | :target: https://pypi.python.org/pypi/provenance 10 | :alt: Version Status 11 | .. |conda-version status| image:: https://img.shields.io/conda/vn/conda-forge/provenance 12 | :target: https://anaconda.org/conda-forge/provenance 13 | :alt: Conda version Status 14 | .. |build status| image:: https://travis-ci.org/bmabey/provenance.png?branch=trunk 15 | :target: https://travis-ci.org/bmabey/provenance 16 | :alt: Build Status 17 | .. 
|docs| image:: https://readthedocs.org/projects/provenance/badge/?version=latest 18 | :target: https://provenance.readthedocs.org 19 | :alt: Documentation Status 20 | 21 | ``provenance`` is a Python library for function-level caching and provenance that aids in 22 | creating Parsimonious Pythonic |Pipelines|. By wrapping functions in the ``provenance`` 23 | decorator computed results are cached across various tiered stores (disk, S3, SFTP) and 24 | `provenance `_ (i.e. lineage) information is tracked 25 | and stored in an artifact repository. A central artifact repository can be used to enable 26 | production pipelines, team collaboration, and reproducible results. The library is general 27 | purpose but was built with machine learning pipelines in mind. By leveraging the fantastic 28 | `joblib`_ library object serialization is optimized for ``numpy`` and other PyData libraries. 29 | 30 | What that means in practice is that you can easily keep track of how artifacts (models, 31 | features, or any object or file) are created, where they are used, and have a central place 32 | to store and share these artifacts. This basic plumbing is required (or at least desired!) 33 | in any machine learning pipeline and project. ``provenance`` can be used standalone along with 34 | a build server to run pipelines or in conjunction with more advanced workflow systems 35 | (e.g. `Airflow`_, `Luigi`_). 36 | 37 | .. |Pipelines| unicode:: Pipelines U+2122 38 | .. _joblib: https://pythonhosted.org/joblib/ 39 | .. _Airflow: http://airbnb.io/projects/airflow/ 40 | .. _Luigi: https://github.com/spotify/luigi 41 | 42 | Example 43 | ======= 44 | 45 | For an explanation of this example please see the `Introductory Guide`_. 46 | 47 | .. code-block:: python 48 | 49 | import provenance as p 50 | 51 | p.load_config(...) 52 | 53 | import time 54 | 55 | @p.provenance() 56 | def expensive_add(a, b): 57 | time.sleep(2) 58 | return a + b 59 | 60 | 61 | @p.provenance() 62 | def expensive_mult(a, b): 63 | time.sleep(2) 64 | return a * b 65 | 66 | 67 | a1 = expensive_add(4, 3) 68 | a2 = expensive_add(1, 1) 69 | 70 | result = expensive_mult(a1, a2) 71 | 72 | vis.visualize_lineage(result) 73 | 74 | 75 | .. image:: https://raw.githubusercontent.com/bmabey/provenance/trunk/docs/source/images/lineage_example.png 76 | 77 | 78 | .. _Introductory Guide: http://provenance.readthedocs.io/en/latest/intro-guide.html 79 | 80 | Installation 81 | ============ 82 | 83 | For the base functionality: 84 | 85 | .. code:: bash 86 | 87 | pip install provenance 88 | 89 | 90 | For the visualization module (which requires ``graphviz`` to be installed): 91 | 92 | .. code:: bash 93 | 94 | pip install provenance[vis] 95 | 96 | For the SFTP store: 97 | 98 | .. code:: bash 99 | 100 | pip install provenance[sftp] 101 | 102 | For everything all at once: 103 | 104 | 105 | .. code:: bash 106 | 107 | pip install provenance[all] 108 | 109 | provenance is also available from conda-forge for conda installations: 110 | 111 | .. code:: bash 112 | 113 | conda install -c conda-forge provenance 114 | 115 | 116 | 117 | Compatibility 118 | ============= 119 | 120 | ``provenance`` is currently only compatible with Python 3.5 and higher. Updating it to work with Python 2.7x 121 | should be easy, follow this `ticket`_ if you are interested in that. 122 | 123 | 124 | .. 
_ticket: https://github.com/bmabey/provenance/issues/32 125 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = provenance 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | 23 | clean-provenance: 24 | rm -rf /tmp/provenance-intro-artifacts 25 | rm -rf /tmp/provenance-ml-artifacts 26 | dropdb provenance-intro 27 | dropdb provenance-ml-guide 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=provenance 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/readthedocs-environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python==3.5 5 | - numpy 6 | - pandas 7 | - nbconvert 8 | - ipykernel 9 | - alembic 10 | - numpydoc 11 | - sphinx 12 | - pandoc 13 | # for ML examples 14 | - scikit-learn 15 | - pip: 16 | - nbsphinx 17 | - yamlmagic 18 | - sphinxcontrib-websupport 19 | - s3fs>=0.0.8 20 | - boltons>=16.5.1 21 | - joblib>=0.10.2 22 | - toolz>=0.8.2 23 | - cloudpickle>=0.2.1 24 | - psutil>=5.0.0 25 | - ordered-set>=2.0.1 26 | - sqlalchemy>=1.1.3 27 | - sqlalchemy-utils>=0.32.12 28 | - memoized-property>=1.0.2 29 | - wrapt>=1.10.8 30 | - psycopg2 31 | - numpy 32 | -------------------------------------------------------------------------------- /docs/source/API.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. currentmodule:: provenance 5 | 6 | Primary API 7 | ~~~~~~~~~~~~~~~~ 8 | 9 | .. 
autosummary:: 10 | provenance 11 | load_artifact 12 | load_proxy 13 | ensure_proxies 14 | promote 15 | provenance_set 16 | capture_set 17 | create_set 18 | load_set_by_id 19 | load_set_by_name 20 | archive_file 21 | 22 | Configuration 23 | ~~~~~~~~~~~~~ 24 | 25 | .. autosummary:: 26 | from_config 27 | load_config 28 | load_yaml_config 29 | current_config 30 | get_repo_by_name 31 | set_default_repo 32 | get_default_repo 33 | set_check_mutations 34 | get_check_mutations 35 | set_run_info_fn 36 | get_use_cache 37 | set_use_cache 38 | using_repo 39 | 40 | 41 | Utils 42 | ~~~~~ 43 | 44 | .. autosummary:: 45 | is_proxy 46 | lazy_dict 47 | lazy_proxy_dict 48 | 49 | Visualization 50 | ~~~~~~~~~~~~~ 51 | 52 | .. currentmodule:: provenance.vis 53 | 54 | .. autosummary:: 55 | visualize_lineage 56 | 57 | 58 | Detailed Docs 59 | ~~~~~~~~~~~~~ 60 | 61 | .. currentmodule:: provenance 62 | 63 | 64 | Primary API 65 | 66 | .. autofunction:: provenance 67 | .. autofunction:: load_artifact 68 | .. autofunction:: load_proxy 69 | .. autofunction:: ensure_proxies 70 | .. autofunction:: promote 71 | .. autofunction:: provenance_set 72 | .. autofunction:: capture_set 73 | .. autofunction:: create_set 74 | .. autofunction:: load_set_by_id 75 | .. autofunction:: load_set_by_name 76 | .. autofunction:: archive_file 77 | 78 | 79 | Configuration 80 | 81 | .. autofunction:: from_config 82 | .. autofunction:: load_config 83 | .. autofunction:: load_yaml_config 84 | .. autofunction:: current_config 85 | .. autofunction:: get_repo_by_name 86 | .. autofunction:: set_default_repo 87 | .. autofunction:: get_default_repo 88 | .. autofunction:: set_check_mutations 89 | .. autofunction:: get_check_mutations 90 | .. autofunction:: set_run_info_fn 91 | .. autofunction:: get_use_cache 92 | .. autofunction:: set_use_cache 93 | .. autofunction:: using_repo 94 | 95 | 96 | Utils 97 | 98 | .. autofunction:: is_proxy 99 | .. autofunction:: lazy_dict 100 | .. autofunction:: lazy_proxy_dict 101 | 102 | Visualization (beta) 103 | 104 | .. currentmodule:: provenance.vis 105 | 106 | .. autofunction:: visualize_lineage 107 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # provenance documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Apr 29 08:47:13 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | # The version info for the project you're documenting, acts as replacement for 24 | # |version| and |release|, also used in various other places throughout the 25 | # built documents. 
26 | # 27 | from provenance import __version__ as version 28 | 29 | sys.path.insert(0, os.path.abspath('../../')) 30 | 31 | # -- General configuration ------------------------------------------------ 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | # 35 | # needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | 41 | extensions = [ 42 | 'nbsphinx', 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.doctest', 46 | 'sphinx.ext.coverage', 47 | 'sphinx.ext.viewcode', 48 | 'numpydoc', 49 | ] 50 | 51 | numpydoc_show_class_members = False 52 | 53 | exclude_patterns = ['_build', '**.ipynb_checkpoints'] 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path = ['_templates'] 57 | 58 | # The suffix(es) of source filenames. 59 | # You can specify multiple suffix as a list of string: 60 | # 61 | # source_suffix = ['.rst', '.md'] 62 | source_suffix = ['.rst', '.ipynb'] 63 | 64 | # The master toctree document. 65 | master_doc = 'index' 66 | 67 | # General information about the project. 68 | project = 'provenance' 69 | copyright = '2017, Ben Mabey' 70 | author = 'Ben Mabey' 71 | 72 | release = version 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | # This patterns also effect to html_static_path and html_extra_path 84 | exclude_patterns = [] 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # If true, `todo` and `todoList` produce output, else they produce nothing. 90 | todo_include_todos = False 91 | 92 | # -- Options for HTML output ---------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | 98 | # Taken from docs.readthedocs.io: 99 | # on_rtd is whether we are on readthedocs.io 100 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 101 | 102 | if not on_rtd: # only import and set the theme if we're building docs locally 103 | import sphinx_rtd_theme 104 | 105 | html_theme = 'sphinx_rtd_theme' 106 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 111 | # 112 | # html_theme_options = {} 113 | 114 | # Add any paths that contain custom static files (such as style sheets) here, 115 | # relative to this directory. They are copied after the builtin static files, 116 | # so a file named "default.css" will overwrite the builtin "default.css". 117 | html_static_path = ['_static'] 118 | 119 | # -- Options for HTMLHelp output ------------------------------------------ 120 | 121 | # Output file base name for HTML help builder. 
122 | htmlhelp_basename = 'provenancedoc' 123 | 124 | # -- Options for LaTeX output --------------------------------------------- 125 | 126 | latex_elements = { 127 | # The paper size ('letterpaper' or 'a4paper'). 128 | # 129 | # 'papersize': 'letterpaper', 130 | # The font size ('10pt', '11pt' or '12pt'). 131 | # 132 | # 'pointsize': '10pt', 133 | # Additional stuff for the LaTeX preamble. 134 | # 135 | # 'preamble': '', 136 | # Latex figure (float) alignment 137 | # 138 | # 'figure_align': 'htbp', 139 | } 140 | 141 | # Grouping the document tree into LaTeX files. List of tuples 142 | # (source start file, target name, title, 143 | # author, documentclass [howto, manual, or own class]). 144 | latex_documents = [ 145 | (master_doc, 'provenance.tex', 'provenance Documentation', 'Ben Mabey', 'manual'), 146 | ] 147 | 148 | # -- Options for manual page output --------------------------------------- 149 | 150 | # One entry per manual page. List of tuples 151 | # (source start file, name, description, authors, manual section). 152 | man_pages = [(master_doc, 'provenance', 'provenance Documentation', [author], 1)] 153 | 154 | # -- Options for Texinfo output ------------------------------------------- 155 | 156 | # Grouping the document tree into Texinfo files. List of tuples 157 | # (source start file, target name, title, author, 158 | # dir menu entry, description, category) 159 | texinfo_documents = [ 160 | ( 161 | master_doc, 162 | 'provenance', 163 | 'provenance Documentation', 164 | author, 165 | 'provenance', 166 | 'Provenance and caching library for functions, built for creating lightweight machine learning pipelines.', 167 | 'Miscellaneous', 168 | ), 169 | ] 170 | 171 | extlinks = { 172 | 'issue': ('https://github.com/bmabey/provenance/issues/%s', 'GH#'), 173 | 'pr': ('https://github.com/bmabey/provenance/pull/%s', 'GH#'), 174 | } 175 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | ../../CONTRIBUTING.rst -------------------------------------------------------------------------------- /docs/source/history.rst: -------------------------------------------------------------------------------- 1 | ../../HISTORY.rst -------------------------------------------------------------------------------- /docs/source/images/keep_calm_pipeline_on.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmabey/provenance/d29ad2ffc39fbc389600df092da9e7df4f920100/docs/source/images/keep_calm_pipeline_on.png -------------------------------------------------------------------------------- /docs/source/images/lineage_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmabey/provenance/d29ad2ffc39fbc389600df092da9e7df4f920100/docs/source/images/lineage_example.png -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../README.rst 2 | 3 | 4 | 5 | Index 6 | ----- 7 | 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Guides 12 | 13 | intro-guide.ipynb 14 | ml-pipeline.ipynb 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Main Concepts 19 | 20 | hashing-and-mutation.ipynb 21 | API.rst 22 | 23 | .. 
toctree:: 24 | :maxdepth: 1 25 | :caption: Project Information 26 | 27 | contributing.rst 28 | history.rst 29 | -------------------------------------------------------------------------------- /docs/source/ml-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "ein.tags": [ 7 | "worksheet-0" 8 | ], 9 | "slideshow": { 10 | "slide_type": "-" 11 | } 12 | }, 13 | "source": [ 14 | "# Machine Learning Pipeline\n", 15 | "\n", 16 | "** WORK IN PROGRESS ** This guide isn't complete but the code examples may be useful as is.\n", 17 | "\n", 18 | "This guide assumes you are familiar with all the content in the [Introductory Guide](intro-guide.ipynb).\n", 19 | "\n", 20 | "A typical machine learning pipeline consists of loading data, extracting features, training models and storing the models for later use in a production system or further analysis. In some cases the feature extraction process is quick and the features are transitory without any need of saving them independently of the finished trained model. Other times the features are a representation of the data that you wish to reuse in different settings, e.g. in a dashboard explaining predictions, ad-hoc analysis, further model development. \n", 21 | "\n", 22 | "In the end a good deal of plumbing is required to wire up an app/service with the latest models and features in such a way that API calls can be traced back to the originating model, features, and even data sources. `provenance` abstracts much of this plumbing so you can focus on writing parsimonious pythonic pipelines™ with plain old functions." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "autoscroll": "json-false", 30 | "ein.tags": [ 31 | "worksheet-0" 32 | ], 33 | "slideshow": { 34 | "slide_type": "-" 35 | } 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "%load_ext yamlmagic" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "autoscroll": "json-false", 47 | "ein.tags": [ 48 | "worksheet-0" 49 | ], 50 | "slideshow": { 51 | "slide_type": "-" 52 | } 53 | }, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "application/javascript": [ 58 | "\n", 59 | " require(\n", 60 | " [\n", 61 | " \"notebook/js/codecell\",\n", 62 | " \"codemirror/mode/yaml/yaml\"\n", 63 | " ],\n", 64 | " function(cc){\n", 65 | " cc.CodeCell.options_default.highlight_modes.magic_yaml = {\n", 66 | " reg: [\"^%%yaml\"]\n", 67 | " }\n", 68 | " }\n", 69 | " );\n", 70 | " " 71 | ], 72 | "text/plain": [ 73 | "" 74 | ] 75 | }, 76 | "metadata": {}, 77 | "output_type": "display_data" 78 | } 79 | ], 80 | "source": [ 81 | "%%yaml basic_config\n", 82 | "blobstores:\n", 83 | " disk:\n", 84 | " type: disk\n", 85 | " cachedir: /tmp/provenance-ml-artifacts\n", 86 | " read: True\n", 87 | " write: True\n", 88 | " delete: True\n", 89 | "artifact_repos:\n", 90 | " local:\n", 91 | " type: postgres\n", 92 | " db: postgresql://localhost/provenance-ml-guide\n", 93 | " store: 'disk'\n", 94 | " read: True\n", 95 | " write: True\n", 96 | " delete: True\n", 97 | " # this option will create the database if it doesn't exist\n", 98 | " create_db: True\n", 99 | "default_repo: local" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": { 106 | "autoscroll": "json-false", 107 | "ein.tags": [ 108 | "worksheet-0" 109 | ], 110 | "slideshow": { 111 | "slide_type": "-" 112 | } 113 | }, 114 | 
"outputs": [ 115 | { 116 | "name": "stderr", 117 | "output_type": "stream", 118 | "text": [ 119 | "INFO [alembic.runtime.migration] Context impl PostgresqlImpl.\n", 120 | "INFO [alembic.runtime.migration] Will assume transactional DDL.\n", 121 | "INFO [alembic.runtime.migration] Running stamp_revision -> e0317ab07ba4\n" 122 | ] 123 | }, 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "" 128 | ] 129 | }, 130 | "execution_count": 3, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "import provenance as p\n", 137 | "\n", 138 | "\n", 139 | "p.load_config(basic_config)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": { 146 | "autoscroll": "json-false", 147 | "collapsed": true, 148 | "ein.tags": [ 149 | "worksheet-0" 150 | ], 151 | "slideshow": { 152 | "slide_type": "skip" 153 | } 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "import numpy as np\n", 158 | "import pandas as pd\n", 159 | "import time\n", 160 | "from sklearn.utils import check_random_state\n", 161 | "import toolz as t" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": { 168 | "autoscroll": "json-false", 169 | "collapsed": true, 170 | "ein.tags": [ 171 | "worksheet-0" 172 | ], 173 | "slideshow": { 174 | "slide_type": "-" 175 | } 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "@p.provenance()\n", 180 | "def load_data(query):\n", 181 | " # fetch something from the DB in real life...\n", 182 | " random_state = check_random_state(abs(hash(query)) // (10**10))\n", 183 | " return random_state.uniform(0, 10, 10)\n", 184 | "\n", 185 | "\n", 186 | "@p.provenance()\n", 187 | "def extract_features_a(data, hyperparam_a=5):\n", 188 | " time.sleep(2)\n", 189 | " rs = check_random_state(hyperparam_a)\n", 190 | " return data[0:5] + 1 + rs.rand(5)\n", 191 | "\n", 192 | "\n", 193 | "@p.provenance()\n", 194 | "def extract_features_b(data, hyperparam_x=10):\n", 195 | " time.sleep(2)\n", 196 | " rs = check_random_state(hyperparam_x)\n", 197 | " return data[5:] + 1 + rs.rand(5)\n", 198 | "\n", 199 | "\n", 200 | "@p.provenance()\n", 201 | "def build_model(features_a, features_b, num_trees=100):\n", 202 | " return {'whatever': 'special model with {} trees'.format(num_trees)}\n", 203 | "\n", 204 | "\n", 205 | "@p.provenance()\n", 206 | "def evaluate(model, data):\n", 207 | " return {'some_metric': 0.5, 'another_metric': 0.4}\n", 208 | "\n", 209 | "\n", 210 | "def pipeline(train_query='some query', valid_query=\"another query\", hyperparam_a=5, hyperparam_x=10):\n", 211 | " data = load_data(\"some query\")\n", 212 | " features_a = extract_features_a(data, hyperparam_a)\n", 213 | " features_b = extract_features_b(data, hyperparam_x)\n", 214 | " model = build_model(data, features_a, features_b)\n", 215 | "\n", 216 | " validation_data = load_data(\"another query\")\n", 217 | " evaluation = evaluate(model, validation_data)\n", 218 | "\n", 219 | " return {'features_a': features_a, 'features_b': features_b,\n", 220 | " 'model': model, 'evaluation': evaluation}\n", 221 | "\n", 222 | "\n", 223 | "@p.provenance()\n", 224 | "def make_decision(model, request):\n", 225 | " # make some sort of prediction, classification, with the model\n", 226 | " # to help make a 'decision' and return it as the result\n", 227 | " return {'prediction': 0.5, 'model': model.artifact.id}" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "**TODO** explain everything.. 
including the concept of artifact sets and how they simpify the building and deployment of models." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "metadata": { 241 | "autoscroll": "json-false", 242 | "collapsed": true, 243 | "ein.tags": [ 244 | "worksheet-0" 245 | ], 246 | "slideshow": { 247 | "slide_type": "-" 248 | } 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "def run_production_pipeline():\n", 253 | " with p.capture_set('production'):\n", 254 | " return pipeline()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 7, 260 | "metadata": { 261 | "autoscroll": "json-false", 262 | "collapsed": true, 263 | "ein.tags": [ 264 | "worksheet-0" 265 | ], 266 | "slideshow": { 267 | "slide_type": "-" 268 | } 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "res = run_production_pipeline()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 8, 278 | "metadata": { 279 | "autoscroll": "json-false", 280 | "collapsed": true, 281 | "ein.tags": [ 282 | "worksheet-0" 283 | ], 284 | "slideshow": { 285 | "slide_type": "-" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "res = p.load_set_by_name('production')" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 9, 296 | "metadata": { 297 | "autoscroll": "json-false", 298 | "ein.tags": [ 299 | "worksheet-0" 300 | ], 301 | "slideshow": { 302 | "slide_type": "-" 303 | } 304 | }, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "ArtifactSet(id='08f3c7c6a84132faa155ca9996a26c4df92bd798', artifact_ids=frozenset({'2411521185b4267706a24f85b16c46e3a24b4e66', '96c47ddbeff008e2b3a27913611c9648c3e74aa2', 'd3bb8e7625b7093b079bdc8b7d50c6eaaa62f835', '46268ac8c40932b63033b387aa0217974c82c717', 'd3c930d243d6ec4d7be481ddd1f4c3e9277d5f09', '3fdafd792f113c669d55b416bed9b5091f954029'}), created_at=datetime.datetime(2017, 5, 1, 0, 1, 9, 119196), name='production')" 310 | ] 311 | }, 312 | "execution_count": 9, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "res" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 10, 324 | "metadata": { 325 | "autoscroll": "json-false", 326 | "collapsed": true, 327 | "ein.tags": [ 328 | "worksheet-0" 329 | ], 330 | "slideshow": { 331 | "slide_type": "-" 332 | } 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "build_artifacts = res.proxy_dict(group_artifacts_of_same_name=True)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 11, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "dict_keys(['__main__.load_data', '__main__.build_model', '__main__.extract_features_b', '__main__.evaluate', '__main__.extract_features_a'])" 348 | ] 349 | }, 350 | "execution_count": 11, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "build_artifacts.keys()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "model = build_artifacts['__main__.build_model']" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 13, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "execution_count": 13, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | 
"model" 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "kernelspec": { 393 | "display_name": "Python 3", 394 | "language": "python", 395 | "name": "python3" 396 | }, 397 | "language_info": { 398 | "codemirror_mode": { 399 | "name": "ipython", 400 | "version": 3 401 | }, 402 | "file_extension": ".py", 403 | "mimetype": "text/x-python", 404 | "name": "python", 405 | "nbconvert_exporter": "python", 406 | "pygments_lexer": "ipython3", 407 | "version": "3.7.0" 408 | }, 409 | "name": "Introduction Guide.ipynb" 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 1 413 | } 414 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-dev 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python==3.5 6 | - ipython 7 | - numpy 8 | - pandas 9 | - alembic 10 | - numpydoc 11 | - sphinx 12 | - pandoc 13 | - ipykernel 14 | - pyarrow 15 | # for the docs 16 | - scikit-learn 17 | - pip: 18 | - versioneer 19 | - twine 20 | - nbsphinx 21 | - yamlmagic 22 | - sphinxcontrib-websupport 23 | - sphinx-autobuild 24 | - sphinx_rtd_theme 25 | -------------------------------------------------------------------------------- /examples/basic/README.md: -------------------------------------------------------------------------------- 1 | # provenance-basic-example 2 | ## Step 1: Run some stuff 3 | 4 | All you do is `conda env create`, `source activate provenance-basic-example`, 5 | and `./basic_example.py`. 6 | 7 | Then you can explore how the artifacts and blobs were saved in `./artifacts` and 8 | in `psql provenance-basic-example`. 9 | 10 | ## Step 2: Learn some stuff 11 | 12 | ### The gist 13 | In `basic_example.py` you'll see the decorator `@p.provenance()` above the 14 | function `my_add`. Because of this, Provenance will keep track of inputs and 15 | outputs to this function. Then if you call the function again, it won't compute 16 | the sum, rather it will say "I've already seen these inputs!" and simply look up 17 | the answer based on the inputs. It's safe to say that this is a gross 18 | simplification but it lays the ground work for going forward. 19 | 20 | ### Terminology 21 | #### Artifact 22 | An artifact is the mechanism by which Provenance stores the inputs and outputs 23 | to our function `my_add`. It actually stores more than that but we'll get there. 24 | An artifact exists as an entry in a database table. It's probably best described 25 | by looking at the columns in the artifact table. There are 21 columns but we'll 26 | start by only looking at 2 of them: `id` and `value_id`. The `id` is just that, 27 | the id of the artifact. But it's actually more then that, it's also a hash of 28 | the inputs (as well as other things like the function name). In the blobstore 29 | (see [below](#blobs-and-blobstore)) there is a blob 30 | (see [below](#blobs-and-blobstore)), the name of that blob is this same as `id` 31 | and the blob contains a pickled version of the inputs. Next is `value_id`, this 32 | is a hash of the output and similarly shares a name with a blob which contains a 33 | pickled version of the output. We won't go over the other columns in the 34 | artifact table now. 35 | 36 | #### Blobs and Blobstore 37 | A blob is a Binary Large OBject. Although in this case we don't require them to 38 | be large. A blob is simply a file, what type of file? Doesn't really matter. The 39 | blobstore is simply the place where the blobs are kept. 
In this example it is
40 | the `artifacts` directory. To be a bit more technical, we can see the blobstore
41 | defined [here](basic_example.py#L5). The `cachedir` part of the blobstore is the
42 | `artifacts` directory, but since that's really the heart of the blobstore, we'll
43 | just think of them as synonymous for now until we go into more detail about the
44 | config [below](#the-config).
45 |
46 | #### Repo (or artifact repo)
47 | A repo is the place where the artifacts are stored. You can see it
48 | defined [here](basic_example.py#L12). In this case it's just a postgres database,
49 | as you can see in the `db` part of the repo's definition. Again, there is more to
50 | a repo, but the db is its heart, so for now they are synonymous.
51 |
52 | ### Recap
53 | The first time we run `basic_example.py` we print the result of calling `my_add`
54 | with 1 and 4. We see 5 printed, along with the string 'Executed' that lets us
55 | know that the function was actually executed. The blobstore (artifacts
56 | directory) now contains two blobs (just files). An artifact (entry in a db
57 | table) is created in the repo (postgres db). The artifact has an `id` which is
58 | the hash of the inputs and some other things. One of the blobs (files) has this
59 | as its name. In that blob (file) are the pickled inputs 1 and 4. The other blob
60 | shares its name with the `value_id` of the artifact, and that blob contains a
61 | pickled 5. Now if we run the same call to my_add with 1 and 4, we won't see
62 | 'Executed' printed, but 5 will still be returned. This is evidence that the
63 | function was not run; rather, the answer was looked up. If we call my_add with
64 | different inputs, the function is executed and more artifacts and blobs are
65 | created.
66 |
67 | ### The Config
68 | The config is a map (see [here](basic_example.py#L5)). At the top level we have
69 | `blobstores`, `artifact_repos`, and `default_repo`. We won't get into the reason
70 | the first two are plural here; it will be addressed in a more advanced example.
71 | So for now we have to define one blobstore and one artifact_repo. (In the case of
72 | the plural artifact\_repos we also have a default_repo, which for us is just our
73 | only repo.) Our blobstore is called 'disk'; the name is totally up to you. It
74 | is of type 'disk', meaning on your drive. The possible types are disk, memory,
75 | s3, and chained (chained gets into the plural thing, so we'll hold off on that
76 | explanation). The cachedir is defined as discussed earlier. We'll come back to
77 | read, write, read\_through\_write, and delete. We then define our artifact\_repo,
78 | which is called 'local'; again, the name is up to you. It is of type 'postgres'. The
79 | possible types are postgres, memory, and chained. Again, our db is defined as
80 | discussed earlier. The read, write, read\_through\_write, and delete fields in
81 | the config of both the blobstore and artifact\_repo are boolean permissions: am I
82 | allowed to read, write, or delete from this blobstore or artifact\_repo? The
83 | read\_through\_write is concerned with chained blobstores and artifact_repos, and
84 | we'll continue to hold off discussing that.
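
### A quick sketch in code

To make the recap concrete, here is a minimal sketch of what the caching looks like from a Python
session. It assumes the config from `basic_example.py` has already been loaded, and it assumes that
the object returned by `p.load_artifact` exposes the stored value as `.value` (an inference from the
`id`/`value_id` description above, not something this example demonstrates). Treat it as an
illustration rather than part of the example itself.

```python
import provenance as p

# Assumes p.load_config(...) has already been run with the config from
# basic_example.py (a 'disk' blobstore plus the 'local' postgres repo).

@p.provenance()
def my_add(x, y):
    print('Executed')
    return x + y

result = my_add(1, 4)       # first run with these inputs: prints 'Executed'
print(result)               # 5 -- the returned proxy behaves like the value it wraps
print(result.artifact.id)   # the hash that also names one of the blobs on disk

cached = my_add(1, 4)       # same inputs: no 'Executed', the 5 is looked up
print(cached)               # 5

# The artifact can also be pulled back later by its id, e.g. in another session.
artifact = p.load_artifact(result.artifact.id)
print(artifact.value)       # 5 again, read from the blobstore
```

If you look up the id printed by `result.artifact.id` in the `artifacts` directory, the blob with
that name should be the pickled inputs described in the recap, and the blob named after the
artifact's `value_id` should be the pickled 5.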
85 | -------------------------------------------------------------------------------- /examples/basic/basic_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import provenance as p 4 | 5 | p.load_config( 6 | { 7 | 'blobstores': 8 | { 9 | 'disk': 10 | { 11 | 'type': 'disk', 12 | 'cachedir': 'artifacts', 13 | 'read': True, 14 | 'write': True, 15 | 'read_through_write': False, 16 | 'delete': True, 17 | } 18 | }, 19 | 'artifact_repos': 20 | { 21 | 'local': 22 | { 23 | 'type': 'postgres', 24 | 'db': 'postgresql://localhost/provenance-basic-example', 25 | 'store': 'disk', 26 | 'read': True, 27 | 'write': True, 28 | 'create_db': True, 29 | 'read_through_write': False, 30 | 'delete': True, 31 | } 32 | }, 33 | 'default_repo': 'local', 34 | } 35 | ) 36 | 37 | 38 | @p.provenance() 39 | def my_add(x, y): 40 | print('Executed') 41 | return x + y 42 | 43 | 44 | print(my_add(1, 4)) 45 | -------------------------------------------------------------------------------- /examples/basic/environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-basic-example 2 | dependencies: 3 | - python==3.5 4 | - pip: 5 | - "git+https://github.com/bmabey/provenance.git" 6 | -------------------------------------------------------------------------------- /examples/experiment/README.md: -------------------------------------------------------------------------------- 1 | # provenance-experiment-example 2 | 3 | ## Step 0: Understand sftp-example 4 | 5 | ## Step 1: Run some stuff 6 | 7 | First you run `conda env create`, `source activate 8 | provenance-experiment-example`. Now open up `config.yaml`. In the config fill in 9 | the `cachedir`, `basepath`, and `ssh_config` for the sftp blobstore. You'll find 10 | some directions acting as placeholders or you can see [here](#the-config). Now 11 | you can run `./experiment_example.py`. 12 | 13 | Then you can explore how the artifacts and blobs were saved in your specified 14 | `cachedir`,`basepath`, and in `psql provenance-experiment-example`. 15 | 16 | ## Step 2: Learn some stuff 17 | 18 | ### The gist 19 | Here we learn about archiving files, provenance\_sets and chaining blobstores. 20 | For chaining blobstores see [here](#the-config). Archiving files is really 21 | straight forward if you've understood the previous examples, you can see it in 22 | action [here](sftp\_example.py#62). Two blobs are created for each file, one is 23 | the actual file and the other is the inputs to the call to archive_file. See the 24 | comments in [sftp\_example.py](sftp\_example.py) for additional detail. 25 | 26 | At the top of the function that contains the calls to `archive_file` 27 | (see [here](sftp\_example.py#54)) you'll see we have `p.provenance_set` instead 28 | of the `p.provenance` that we've seen before. A provenance_set is simply a named 29 | set containing the id's of the artifacts in the set. In this example each entry 30 | (demographic.json and matrix.csv) are put in a set named after the entry id 31 | (0000 or 0001 etc.). Details on how to get the set back and the artifacts out 32 | have not yet been written but are coming soon. 33 | 34 | ### The Config 35 | We changed the config to be a yaml file and loaded it in. You'll notice that we 36 | define two blobstores (the same two from basic-example and sftp-example). Then 37 | there is a third. This third, called `experiment` is a chained blobstore. It 38 | chains `disk` to `sftp`. 
Remember in `sftp-example` a local blobstore was 39 | created but Provenance didn't know that it could look there when asked to 40 | retrieve an artifact. By chaining we say, first look/write in `disk`, then to 41 | `sftp`. Here's where `read_through_write` comes into play. We've set it to 42 | `True`. This means that if Provenance is trying to look up an artifact and it 43 | doesn't find it in `disk` but it does find it in `sftp` it will write it in 44 | `disk` "on the way back". Notice that we set the store for the artifact_repo to 45 | `experiment`. 46 | -------------------------------------------------------------------------------- /examples/experiment/config.yaml: -------------------------------------------------------------------------------- 1 | blobstores: 2 | disk: 3 | type: 'disk' 4 | cachedir: 'blobstore' 5 | read: True 6 | write: True 7 | read_through_write: True 8 | delete: True 9 | sftp: 10 | type: 'sftp' 11 | cachedir: | 12 | 14 | basepath: | 15 | remote machine 16 | ex. /home/me/artifacts>, you need to make sure that path directory exists.' 17 | read: True 18 | write: True 19 | read_through_write: True 20 | delete: True 21 | ssh_config: 22 | hostname: '' 23 | port: '' 24 | username: '' 25 | password: '' 26 | experiment: 27 | type: 'chained' 28 | stores: ['disk', 'sftp'] 29 | artifact_repos: 30 | local: 31 | type: 'postgres' 32 | db: 'postgresql://localhost/provenance-experiment-example' 33 | store: 'experiment' 34 | read: True 35 | write: True 36 | create_db: True 37 | read_through_write: False 38 | delete: True 39 | default_repo: 'local' 40 | -------------------------------------------------------------------------------- /examples/experiment/environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-experiment-example 2 | dependencies: 3 | - python==3.5 4 | - paramiko 5 | - pip: 6 | - "git+https://github.com/bmabey/provenance.git" 7 | - pyyaml 8 | -------------------------------------------------------------------------------- /examples/experiment/experiment_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import json 5 | import os 6 | import random 7 | import shutil 8 | 9 | import provenance as p 10 | 11 | p.load_yaml_config('config.yaml') 12 | 13 | # Suppose you are conducting an experiment to determine the correlation between 14 | # geographic location and favorite 3x3 matrix. To do this you have them sit at a 15 | # computer and enter in the information. To store the data you create a 16 | # directory structure such that each entry gets its own numbered directory in 17 | # which there are two files, info.json which has the demographic info and 18 | # data.csv which contains their favorite 3x3 matrix. Now, is this the best way 19 | # to store the data for this experiment? No. But we'll ignore that for the sake 20 | # of instruction. 21 | 22 | ################################################################################ 23 | ## Generate random data, you wouldn't actually have this code in your experiment. 
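##
## (An aside for illustration, not part of the original script: save_entry() below
## wraps each entry's file writes in p.provenance_set(set_name=id), so every entry
## is captured as a named provenance set. After simulate_experiment() has run, a
## single entry could presumably be loaded back with something along these lines,
## using load_set_by_name and ArtifactSet.proxy_dict from the provenance API docs;
## the '0000/demographic' key is only a guess based on the archive_file names used
## below:
##
##     entry_set = p.load_set_by_name('0000')
##     artifacts = entry_set.proxy_dict()
##     demographic_artifact = artifacts['0000/demographic']
##
## Treat this as a sketch rather than tested code.)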
24 | 25 | first_names = ['Eric', 'Belinda', 'Jane', 'Scott', 'Joe', 'Mike', 'Wilhelmina'] 26 | last_names = ['Thompson', 'Erikson', 'Gandalfo', 'Wesson', 'Black', 'Stephens'] 27 | 28 | 29 | def gen_name(): 30 | return random.choice(first_names) + ' ' + random.choice(last_names) 31 | 32 | 33 | def gen_age(): 34 | return random.randint(18, 100) 35 | 36 | 37 | street_names = [ 38 | 'Maple St', 39 | 'Corner Ave', 40 | 'West Helm Lp', 41 | '4th St', 42 | 'Main St', 43 | 'Center St', 44 | ] 45 | 46 | 47 | def gen_address(): 48 | return str(random.randint(1000, 10000)) + ' ' + random.choice(street_names) 49 | 50 | 51 | def gen_matrix(): 52 | return [[random.randint(0, 100) for x in range(3)] for y in range(3)] 53 | 54 | 55 | ################################################################################ 56 | ## Here's the crux. You WOULD have this code in your experiment. This function 57 | ## actually writes the data files that you want to keep track of and share with 58 | ## others. Here we introduce the provenance_set, which is basically a named set 59 | ## of artifacts. It makes sense if each entry (which includes two files) becomes 60 | ## a set. We can name the set, then use that name to retreive the latest 61 | ## version. 62 | 63 | 64 | def save_entry(id, name, age, address, matrix): 65 | directory = os.path.join('./experiment_data', id) 66 | os.mkdir(directory) 67 | demographics = {'name': name, 'age': age, 'address': address} 68 | 69 | @p.provenance_set(set_name=id) 70 | def write_entry(): 71 | with open(os.path.join(directory, 'demographic.json'), 'w') as demof: 72 | json.dump(demographics, demof) 73 | 74 | with open(os.path.join(directory, 'matrix.csv'), 'w') as matrixf: 75 | writer = csv.writer(matrixf) 76 | writer.writerows(matrix) 77 | p.archive_file( 78 | os.path.join(directory, 'demographic.json'), 79 | name=id + '/demographic', 80 | delete_original=True, 81 | ) 82 | p.archive_file( 83 | os.path.join(directory, 'matrix.csv'), 84 | name=id + '/matrix', 85 | delete_original=True, 86 | ) 87 | 88 | write_entry() 89 | 90 | 91 | ################################################################################ 92 | ## Simulate some number of participants, you wouldn't actually have this code in 93 | ## your experiment. 94 | 95 | 96 | def simulate_entry(id): 97 | name = gen_name() 98 | age = gen_age() 99 | address = gen_address() 100 | matrix = gen_matrix() 101 | save_entry(id, name, age, address, matrix) 102 | 103 | 104 | def simulate_experiment(num_participants): 105 | # I use the experiment_data as a temporary location to write the data to. 106 | # Provenance will store the files in the blobstore so... 107 | if not os.path.exists('./experiment_data'): 108 | os.mkdir('./experiment_data') 109 | 110 | for i in range(num_participants): 111 | simulate_entry(str(i).zfill(4)) 112 | 113 | # ... then I erase the folder at the end. 114 | shutil.rmtree('./experiment_data') 115 | 116 | 117 | simulate_experiment(10) 118 | -------------------------------------------------------------------------------- /examples/sftp/README.md: -------------------------------------------------------------------------------- 1 | # provenance-sftp-example 2 | 3 | ## Step 0: Understand the basic example 4 | 5 | ## Step 1: Run some stuff 6 | 7 | First you run `conda env create`, `source activate provenance-sftp-example`, and 8 | Now open up `sftp_example.py`. In the config fill in the `cachedir`, `basepath`, 9 | and `ssh_config` for the sftp blobstore. 
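For concreteness, a filled-in sftp entry ends up looking roughly like this (every value below is a placeholder for your own machine and credentials):

```python
'sftp': {
    'type': 'sftp',
    'cachedir': 'sftp-artifacts',      # local cache, created if missing
    'basepath': '/home/me/artifacts',  # must already exist on the remote host
    'read': True,
    'write': True,
    'read_through_write': False,
    'delete': True,
    'ssh_config': {
        'hostname': 'my-remote-host',
        'port': '22',
        'username': 'me',
        'password': 'secret',
    },
}
```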
You'll find some directions acting as 10 | placeholders or you can see [here](#the-config). Now you can run 11 | `./sftp_example.py`. 12 | 13 | Then you can explore how the artifacts and blobs were saved in your specified 14 | `cachedir`,`basepath`, and in `psql provenance-sftp-example`. 15 | 16 | ## Step 2: Learn some stuff 17 | 18 | ### The gist 19 | This is pretty much the same as the basic example, only the blobstore is on a 20 | remote machine. The artifacts in the postgres db are referring to the blobs in 21 | the remote blobstore. 22 | 23 | ### The Config 24 | `cachedir` here has the same meaning as it did in the basic example. This leads 25 | to the question "do I then have an implicit local blobstore?". Yes and no, yes 26 | because all the blobs will be in the `cachedir`. No because Provenance will not 27 | look for them there but will go immediately to the remote host and look in 28 | `basepath`. (You could use the local blobstore via a chained blobstore which is 29 | not covered in this example. Also this is a feature that should probably be 30 | added so it's automatic.) `basepath` is the path to the blobstore on the remote 31 | machine. While `cachedir` will be created for you if it doesn't exist, 32 | `basepath` won't be so make sure you create it. `ssh_config` is relatively 33 | straight forward as it is the standard things you need to ssh onto a machine. 34 | -------------------------------------------------------------------------------- /examples/sftp/environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-sftp-example 2 | dependencies: 3 | - python==3.5 4 | - paramiko 5 | - pip: 6 | - "git+https://github.com/bmabey/provenance.git" 7 | -------------------------------------------------------------------------------- /examples/sftp/sftp_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from joblib.disk import mkdirp 4 | 5 | import provenance as p 6 | 7 | mkdirp('./remote-machine/sftp-artifacts') 8 | 9 | p.load_config( 10 | { 11 | 'blobstores': 12 | { 13 | 'sftp': 14 | { 15 | 'type': 16 | 'sftp', 17 | 'cachedir': 18 | '', 19 | 'basepath': 20 | '< "" remote machine "" , ex. 
/home/me/artifacts>, you need to make sure that path directory exists.', 21 | 'read': 22 | True, 23 | 'write': 24 | True, 25 | 'read_through_write': 26 | False, 27 | 'delete': 28 | True, 29 | 'ssh_config': 30 | { 31 | 'hostname': '', 32 | 'port': '', 33 | 'username': '', 34 | 'password': '', 35 | }, 36 | } 37 | }, 38 | 'artifact_repos': 39 | { 40 | 'local': 41 | { 42 | 'type': 'postgres', 43 | 'db': 'postgresql://localhost/provenance-sftp-example', 44 | 'store': 'sftp', 45 | 'read': True, 46 | 'write': True, 47 | 'create_db': True, 48 | 'read_through_write': False, 49 | 'delete': True, 50 | } 51 | }, 52 | 'default_repo': 'local', 53 | } 54 | ) 55 | 56 | 57 | @p.provenance() 58 | def my_add(x, y): 59 | print('Executed') 60 | return x + y 61 | 62 | 63 | print(my_add(1, 4)) 64 | -------------------------------------------------------------------------------- /provenance/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from ._config import from_config, load_config, load_yaml_config 3 | from ._dependencies import dependencies 4 | from ._version import get_versions 5 | from .core import archive_file, ensure_proxies, promote, provenance, provenance_set 6 | from .hashing import hash, value_repr 7 | from .repos import ( 8 | capture_set, 9 | create_set, 10 | current_config, 11 | get_check_mutations, 12 | get_default_repo, 13 | get_read_only, 14 | get_repo_by_name, 15 | get_use_cache, 16 | is_proxy, 17 | lazy_dict, 18 | lazy_proxy_dict, 19 | load_artifact, 20 | load_proxy, 21 | load_set_by_id, 22 | load_set_by_labels, 23 | load_set_by_name, 24 | set_check_mutations, 25 | set_default_repo, 26 | set_read_only, 27 | set_run_info_fn, 28 | set_use_cache, 29 | using_repo, 30 | ) 31 | from .serializers import register_serializer 32 | 33 | __version__ = get_versions()['version'] 34 | del get_versions 35 | -------------------------------------------------------------------------------- /provenance/_commonstore.py: -------------------------------------------------------------------------------- 1 | import operator as op 2 | 3 | import toolz as t 4 | 5 | 6 | class PermissionError(Exception): 7 | 8 | def __init__(self, action, store, permission): 9 | message = 'A `{}` operation was attempted on {} and {} is set to `False`!'.format( 10 | action, store, permission 11 | ) 12 | self.action = action 13 | self.store = store 14 | self.permission = permission 15 | Exception.__init__(self, message) 16 | 17 | 18 | class KeyExistsError(Exception): 19 | 20 | def __init__(self, key, store): 21 | msg = 'The key {} is already present in {}, you can not overwrite it!'.format(key, store) 22 | self.key = key 23 | self.store = store 24 | Exception.__init__(self, msg) 25 | 26 | 27 | class InconsistentKeyError(Exception): 28 | 29 | def __init__(self, key, store, value): 30 | msg = 'The key {} already represents a different value in {}'.format(key, store) 31 | self.key = key 32 | self.store = store 33 | self.value = value 34 | Exception.__init__(self, msg) 35 | 36 | 37 | def find_first(pred, seq): 38 | for i in seq: 39 | if pred(i): 40 | return i 41 | 42 | 43 | def ensure_read(obj, action='get'): 44 | if not obj._read: 45 | raise PermissionError(action, obj, 'read') 46 | 47 | 48 | def ensure_write(obj, action='put'): 49 | if not obj._write: 50 | raise PermissionError(action, obj, 'write') 51 | 52 | 53 | ensure_contains = t.partial(ensure_read, action='contains') 54 | 55 | 56 | def ensure_present(obj, id): 57 | if id not in obj: 58 | raise KeyError(id, obj) 59 | 60 | 
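# The ensure_* helpers are shared guard clauses: blobstores and repos expose
# boolean flags (_read, _write, _delete, _read_through_write) and every
# operation calls the matching guard before touching data. A rough sketch of
# how a store participates (hypothetical minimal class, not part of the API):
#
#     class ReadOnlyStore:
#         _read, _write, _delete = True, False, False
#
#     ensure_read(ReadOnlyStore())   # passes
#     ensure_write(ReadOnlyStore())  # raises PermissionError('put', <store>, 'write')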
61 | def ensure_delete(obj, id=None, check_contains=True): 62 | if not obj._delete: 63 | raise PermissionError('delete', obj, 'delete') 64 | if check_contains and id is not None and id not in obj: 65 | raise KeyError(id, obj) 66 | 67 | 68 | def ensure_put(obj, id, read_through=None, check_contains=True): 69 | if read_through: 70 | if not obj._read_through_write: 71 | raise PermissionError('read_through_put', obj, 'read_through_write') 72 | elif not obj._write: 73 | raise PermissionError('put', obj, 'write') 74 | if check_contains and id in obj: 75 | raise KeyExistsError(id, obj) 76 | 77 | 78 | def chained_contains(chained, id, contains=op.contains): 79 | stores_with_read = [s for s in chained.stores if s._read] 80 | if len(stores_with_read) == 0: 81 | raise PermissionError('contains', chained, 'read') 82 | 83 | for store in stores_with_read: 84 | if store._read and contains(store, id): 85 | return True 86 | return False 87 | 88 | 89 | def chained_put(chained, id, value, put=None, overwrite=False, contains=op.contains, **kargs): 90 | stores_with_write = [s for s in chained.stores if s._write] 91 | if len(stores_with_write) == 0: 92 | raise PermissionError('put', chained, 'write') 93 | 94 | record = None 95 | putin = [] 96 | for store in stores_with_write: 97 | if overwrite or not contains(store, id): 98 | if put: 99 | record = put(store, id, value, **kargs) 100 | else: 101 | record = store.put(id, value, **kargs) 102 | putin.append(store) 103 | 104 | if len(putin) == 0 and len(stores_with_write) > 0: 105 | raise KeyExistsError(id, chained) 106 | 107 | return record 108 | 109 | 110 | def chained_get(chained, get, id, put=None): 111 | stores_with_read = [s for s in chained.stores if s._read] 112 | if len(stores_with_read) == 0: 113 | raise KeyError(id, chained) 114 | 115 | pushback = [] 116 | for store in stores_with_read: 117 | try: 118 | value = get(store, id) 119 | break 120 | except KeyError: 121 | if store._read_through_write: 122 | pushback.append(store) 123 | else: 124 | raise KeyError(id, chained) 125 | 126 | for store in pushback: 127 | if put: 128 | put(store, id, value, read_through=True) 129 | else: 130 | store.put(id, value, read_through=True) 131 | return value 132 | 133 | 134 | def chained_delete(chained, id, delete=None, contains=op.contains): 135 | stores_with_delete = [s for s in chained.stores if s._delete] 136 | if len(stores_with_delete) == 0: 137 | raise PermissionError('delete', chained, 'delete') 138 | 139 | foundin = [] 140 | for store in stores_with_delete: 141 | if contains(store, id): 142 | foundin.append(store) 143 | if delete: 144 | delete(store, id) 145 | else: 146 | store.delete(id) 147 | if len(foundin) == 0: 148 | raise KeyError(id, chained) 149 | else: 150 | return foundin 151 | 152 | 153 | def chained_filename(chained, id): 154 | if id in chained: 155 | 156 | def valid_store(s): 157 | return s._read and hasattr(s, '_filename') and id in s 158 | 159 | store = find_first(valid_store, chained.stores) 160 | 161 | if store is not None: 162 | return store._filename(id) 163 | else: 164 | raise Exception('You do not have a disk-based store setup.') 165 | -------------------------------------------------------------------------------- /provenance/_config.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | import toolz as t 5 | 6 | import provenance.blobstores as bs 7 | import provenance.repos as r 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @t.curry 13 | def 
full_config(configs, base_config): 14 | if 'type' in base_config: 15 | return base_config 16 | prototype = full_config(configs, configs[base_config['prototype']]) 17 | return t.thread_first(prototype, (t.merge, base_config), (t.dissoc, 'prototype')) 18 | 19 | 20 | def merge_prototypes(config): 21 | return t.valmap(full_config(config), config) 22 | 23 | 24 | @t.curry 25 | def atomic_item_from_config(config, type_dict, item_plural, name=None): 26 | stype = config['type'] 27 | if stype not in type_dict: 28 | raise Exception( 29 | '{} may only be created of types: {}, you had {}'.format( 30 | item_plural, tuple(type_dict.keys()), stype 31 | ) 32 | ) 33 | cls = type_dict[stype] 34 | kargs = t.dissoc(config, 'type') 35 | return cls(**kargs) 36 | 37 | 38 | BLOBSTORE_TYPES = { 39 | 'disk': bs.DiskStore, 40 | 's3': bs.S3Store, 41 | 'memory': bs.MemoryStore, 42 | 'chained': bs.ChainedStore, 43 | } 44 | 45 | try: 46 | import provenance.sftp as sftp 47 | 48 | BLOBSTORE_TYPES['sftp'] = sftp.SFTPStore 49 | 50 | except ImportError as e: 51 | 52 | class SFTPStore: 53 | _err = e 54 | 55 | def __init__(self, *args, **kargs): 56 | raise (self._err) 57 | 58 | BLOBSTORE_TYPES['sftp'] = SFTPStore 59 | 60 | try: 61 | import provenance.google_storage as gs 62 | 63 | BLOBSTORE_TYPES['gs'] = gs.GSStore 64 | 65 | except ImportError as e: 66 | 67 | class GSStore: 68 | _err = e 69 | 70 | def __init__(self, *args, **kargs): 71 | raise (self._err) 72 | 73 | BLOBSTORE_TYPES['gs'] = GSStore 74 | 75 | blobstore_from_config = atomic_item_from_config(type_dict=BLOBSTORE_TYPES, item_plural='Blobstores') 76 | 77 | REPO_TYPES = { 78 | 'postgres': r.PostgresRepo, 79 | 'memory': r.MemoryRepo, 80 | 'chained': r.ChainedRepo, 81 | } 82 | 83 | repo_from_config = atomic_item_from_config(type_dict=REPO_TYPES, item_plural='Artifact Repos') 84 | 85 | 86 | def items_from_config(config, atomic_from_config, items_name, item_type, silence_warnings): 87 | config = merge_prototypes(copy.deepcopy(config)) 88 | 89 | atomic_stores = {} 90 | for k, c in config.items(): 91 | try: 92 | if c['type'] != 'chained': 93 | store = atomic_from_config(c, name=k) 94 | if store: 95 | atomic_stores[k] = store 96 | except Exception: 97 | if not silence_warnings: 98 | logger.warning( 99 | 'Error creating %s %s from config - Skipping', 100 | item_type, 101 | k, 102 | exc_info=True, 103 | ) 104 | 105 | def create_chained(name, config): 106 | # resolve the stores 107 | chained = {n for n in config[items_name] if n in atomic_stores} 108 | if len(chained) != len(config[items_name]): 109 | missing_configs = set(config[items_name]) - chained 110 | if not silence_warnings: 111 | logger.warning( 112 | 'Skipping chained %s %s due to missing %s: %s', 113 | item_type, 114 | name, 115 | items_name, 116 | missing_configs, 117 | ) 118 | return None 119 | 120 | config[items_name] = [atomic_stores[n] for n in config[items_name]] 121 | return atomic_from_config(config, name=name) 122 | 123 | chained_stores = {} 124 | for k, c in config.items(): 125 | try: 126 | if c['type'] == 'chained': 127 | store = create_chained(k, c) 128 | if store: 129 | chained_stores[k] = store 130 | except Exception: 131 | if not silence_warnings: 132 | logger.warning( 133 | 'Error creating %s %s from config - Skipping', 134 | item_type, 135 | k, 136 | exc_info=True, 137 | ) 138 | 139 | return t.merge(chained_stores, atomic_stores) 140 | 141 | 142 | def blobstores_from_config(config, silence_warnings=False): 143 | return items_from_config(config, blobstore_from_config, 'stores', 'blobstore', 
silence_warnings) 144 | 145 | 146 | def repos_from_config(config, blobstores, silence_warnings=False): 147 | 148 | def from_config(atomic_config, name): 149 | if 'store' in atomic_config: 150 | if not atomic_config['store'] in blobstores: 151 | if not silence_warnings: 152 | logger.warning( 153 | 'Skipping %s repo due to missing store: %s', 154 | name, 155 | atomic_config['store'], 156 | ) 157 | return None 158 | 159 | atomic_config['store'] = blobstores[atomic_config['store']] 160 | return repo_from_config(atomic_config) 161 | 162 | return items_from_config(config, from_config, 'repos', 'repo', silence_warnings) 163 | 164 | 165 | def from_config(config): 166 | silence_warnings = config.get('silence_warnings', False) 167 | blobstores = blobstores_from_config(config['blobstores'], silence_warnings) 168 | repos = repos_from_config(config['artifact_repos'], blobstores, silence_warnings) 169 | return {'blobstores': blobstores, 'repos': repos} 170 | 171 | 172 | def load_config(config): 173 | objs = from_config(config) 174 | pconfig = r.Config( 175 | objs['blobstores'], 176 | objs['repos'], 177 | default_repo=config['default_repo'], 178 | run_info_fn=config.get('run_info_fn', None), 179 | use_cache=config.get('use_cache', True), 180 | read_only=config.get('read_only', False), 181 | check_mutations=config.get('check_mutations', False), 182 | ) 183 | r.Config.set_current(pconfig) 184 | return pconfig 185 | 186 | 187 | def load_yaml_config(filename): 188 | import yaml 189 | 190 | with open(filename, 'r') as f: 191 | return load_config(yaml.load(f)) 192 | -------------------------------------------------------------------------------- /provenance/_dependencies.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pickle 3 | 4 | import cloudpickle 5 | 6 | from . import repos as r 7 | 8 | Pickler = cloudpickle.CloudPickler 9 | 10 | 11 | class DependencyWalker(Pickler): 12 | 13 | def __init__(self): 14 | self.stream = io.BytesIO() 15 | self.dependents = [] 16 | self.branches = [] 17 | protocol = pickle.DEFAULT_PROTOCOL 18 | Pickler.__init__(self, self.stream, protocol=protocol) 19 | 20 | def save(self, obj): 21 | if isinstance(obj, r.Artifact): 22 | self.dependents.append(obj) 23 | elif r.is_proxy(obj): 24 | self.dependents.append(obj.artifact) 25 | else: 26 | Pickler.save(self, obj) 27 | 28 | def deps(self, artifact): 29 | self.dependents = [] 30 | self.dump(artifact) 31 | return self.dependents 32 | 33 | 34 | def _deps(val): 35 | return DependencyWalker().deps(val) 36 | 37 | 38 | def _artifact_branches(artifact): 39 | objs = _deps(artifact.inputs) + _deps(artifact.value) 40 | objs.sort(key=lambda a: a.id) 41 | return objs 42 | 43 | 44 | def dependencies(artifact_or_id): 45 | """ 46 | Returns a reversed breadth first search. This guarantees that 47 | for all artifacts in the list. All of an artifacts dependencies 48 | will come before it. 
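    For example (ids and names made up for illustration): if artifact c was
    computed from b, and b from a, then dependencies(c) returns the Artifact
    objects in the order [a, b, c], so every artifact appears after everything
    it depends on. The argument may be an Artifact object or just its id.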
49 | """ 50 | artifact = r.coerce_to_artifact(artifact_or_id) 51 | visited = [] 52 | queue = [artifact] 53 | while queue: 54 | a, *queue = queue 55 | 56 | if a in visited: 57 | continue 58 | 59 | visited.append(a) 60 | queue.extend(_artifact_branches(a)) 61 | 62 | visited.reverse() 63 | return visited 64 | -------------------------------------------------------------------------------- /provenance/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. Generated by 8 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 9 | """Git implementation of _version.py.""" 10 | 11 | import errno 12 | import os 13 | import re 14 | import subprocess 15 | import sys 16 | 17 | 18 | def get_keywords(): 19 | """Get the keywords needed to look up the version information.""" 20 | # these strings will be replaced by git during git-archive. 21 | # setup.py/versioneer.py will grep for the variable names, so they must 22 | # each be defined on a line of their own. _version.py will just call 23 | # get_keywords(). 24 | git_refnames = ' (HEAD -> trunk)' 25 | git_full = 'd29ad2ffc39fbc389600df092da9e7df4f920100' 26 | git_date = '2020-12-02 11:05:43 -0700' 27 | keywords = {'refnames': git_refnames, 'full': git_full, 'date': git_date} 28 | return keywords 29 | 30 | 31 | class VersioneerConfig: 32 | """Container for Versioneer configuration parameters.""" 33 | 34 | 35 | def get_config(): 36 | """Create, populate and return the VersioneerConfig() object.""" 37 | # these strings are filled in when 'setup.py versioneer' creates 38 | # _version.py 39 | cfg = VersioneerConfig() 40 | cfg.VCS = 'git' 41 | cfg.style = 'pep440' 42 | cfg.tag_prefix = '' 43 | cfg.parentdir_prefix = 'provenance-' 44 | cfg.versionfile_source = 'provenance/_version.py' 45 | cfg.verbose = False 46 | return cfg 47 | 48 | 49 | class NotThisMethod(Exception): 50 | """Exception raised if a method is not valid for the current scenario.""" 51 | 52 | 53 | LONG_VERSION_PY = {} 54 | HANDLERS = {} 55 | 56 | 57 | def register_vcs_handler(vcs, method): # decorator 58 | """Decorator to mark a method as the handler for a particular VCS.""" 59 | def decorate(f): 60 | """Store f in HANDLERS[vcs][method].""" 61 | if vcs not in HANDLERS: 62 | HANDLERS[vcs] = {} 63 | HANDLERS[vcs][method] = f 64 | return f 65 | 66 | return decorate 67 | 68 | 69 | def run_command(commands, 70 | args, 71 | cwd=None, 72 | verbose=False, 73 | hide_stderr=False, 74 | env=None): 75 | """Call the given command(s).""" 76 | assert isinstance(commands, list) 77 | p = None 78 | for c in commands: 79 | try: 80 | dispcmd = str([c] + args) 81 | # remember shell=False, so use git.cmd on windows, not just git 82 | p = subprocess.Popen( 83 | [c] + args, 84 | cwd=cwd, 85 | env=env, 86 | stdout=subprocess.PIPE, 87 | stderr=(subprocess.PIPE if hide_stderr else None), 88 | ) 89 | break 90 | except EnvironmentError: 91 | e = sys.exc_info()[1] 92 | if e.errno == errno.ENOENT: 93 | continue 94 | if verbose: 95 | print('unable to run %s' % dispcmd) 96 | print(e) 97 | return None, None 98 | else: 99 | if verbose: 100 | print('unable to find 
command, tried %s' % (commands, )) 101 | return None, None 102 | stdout = p.communicate()[0].strip() 103 | if sys.version_info[0] >= 3: 104 | stdout = stdout.decode() 105 | if p.returncode != 0: 106 | if verbose: 107 | print('unable to run %s (error)' % dispcmd) 108 | print('stdout was %s' % stdout) 109 | return None, p.returncode 110 | return stdout, p.returncode 111 | 112 | 113 | def versions_from_parentdir(parentdir_prefix, root, verbose): 114 | """Try to determine the version from the parent directory name. 115 | 116 | Source tarballs conventionally unpack into a directory that includes both 117 | the project name and a version string. We will also support searching up 118 | two directory levels for an appropriately named parent directory 119 | """ 120 | rootdirs = [] 121 | 122 | for i in range(3): 123 | dirname = os.path.basename(root) 124 | if dirname.startswith(parentdir_prefix): 125 | return { 126 | 'version': dirname[len(parentdir_prefix):], 127 | 'full-revisionid': None, 128 | 'dirty': False, 129 | 'error': None, 130 | 'date': None, 131 | } 132 | else: 133 | rootdirs.append(root) 134 | root = os.path.dirname(root) # up a level 135 | 136 | if verbose: 137 | print('Tried directories %s but none started with prefix %s' % 138 | (str(rootdirs), parentdir_prefix)) 139 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 140 | 141 | 142 | @register_vcs_handler('git', 'get_keywords') 143 | def git_get_keywords(versionfile_abs): 144 | """Extract version information from the given file.""" 145 | # the code embedded in _version.py can just fetch the value of these 146 | # keywords. When used from setup.py, we don't want to import _version.py, 147 | # so we do it with a regexp instead. This function is not used from 148 | # _version.py. 149 | keywords = {} 150 | try: 151 | f = open(versionfile_abs, 'r') 152 | for line in f.readlines(): 153 | if line.strip().startswith('git_refnames ='): 154 | mo = re.search(r'=\s*"(.*)"', line) 155 | if mo: 156 | keywords['refnames'] = mo.group(1) 157 | if line.strip().startswith('git_full ='): 158 | mo = re.search(r'=\s*"(.*)"', line) 159 | if mo: 160 | keywords['full'] = mo.group(1) 161 | if line.strip().startswith('git_date ='): 162 | mo = re.search(r'=\s*"(.*)"', line) 163 | if mo: 164 | keywords['date'] = mo.group(1) 165 | f.close() 166 | except EnvironmentError: 167 | pass 168 | return keywords 169 | 170 | 171 | @register_vcs_handler('git', 'keywords') 172 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 173 | """Get version information from git keywords.""" 174 | if not keywords: 175 | raise NotThisMethod('no keywords at all, weird') 176 | date = keywords.get('date') 177 | if date is not None: 178 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 179 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 180 | # -like" string, which we must then edit to make compliant), because 181 | # it's been around since git-1.5.3, and it's too difficult to 182 | # discover which version we're using, or to work around using an 183 | # older one. 
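        # e.g. '2020-12-02 11:05:43 -0700' -> '2020-12-02T11:05:43-0700'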
184 | date = date.strip().replace(' ', 'T', 1).replace(' ', '', 1) 185 | refnames = keywords['refnames'].strip() 186 | if refnames.startswith('$Format'): 187 | if verbose: 188 | print('keywords are unexpanded, not using') 189 | raise NotThisMethod('unexpanded keywords, not a git-archive tarball') 190 | refs = set([r.strip() for r in refnames.strip('()').split(',')]) 191 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 192 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 193 | TAG = 'tag: ' 194 | tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) 195 | if not tags: 196 | # Either we're using git < 1.8.3, or there really are no tags. We use 197 | # a heuristic: assume all version tags have a digit. The old git %d 198 | # expansion behaves like git log --decorate=short and strips out the 199 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 200 | # between branches and tags. By ignoring refnames without digits, we 201 | # filter out many common branch names like "release" and 202 | # "stabilization", as well as "HEAD" and "trunk". 203 | tags = set([r for r in refs if re.search(r'\d', r)]) 204 | if verbose: 205 | print("discarding '%s', no digits" % ','.join(refs - tags)) 206 | if verbose: 207 | print('likely tags: %s' % ','.join(sorted(tags))) 208 | for ref in sorted(tags): 209 | # sorting will prefer e.g. "2.0" over "2.0rc1" 210 | if ref.startswith(tag_prefix): 211 | r = ref[len(tag_prefix):] 212 | if verbose: 213 | print('picking %s' % r) 214 | return { 215 | 'version': r, 216 | 'full-revisionid': keywords['full'].strip(), 217 | 'dirty': False, 218 | 'error': None, 219 | 'date': date, 220 | } 221 | # no suitable tags, so version is "0+unknown", but full hex is still there 222 | if verbose: 223 | print('no suitable tags, using unknown + full revision id') 224 | return { 225 | 'version': '0+unknown', 226 | 'full-revisionid': keywords['full'].strip(), 227 | 'dirty': False, 228 | 'error': 'no suitable tags', 229 | 'date': None, 230 | } 231 | 232 | 233 | @register_vcs_handler('git', 'pieces_from_vcs') 234 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 235 | """Get version from 'git describe' in the root of the source tree. 236 | 237 | This only gets called if the git-archive 'subst' keywords were *not* 238 | expanded, and _version.py hasn't already been rewritten with a short 239 | version string, meaning we're inside a checked out source tree. 
240 | """ 241 | GITS = ['git'] 242 | if sys.platform == 'win32': 243 | GITS = ['git.cmd', 'git.exe'] 244 | 245 | out, rc = run_command(GITS, ['rev-parse', '--git-dir'], 246 | cwd=root, 247 | hide_stderr=True) 248 | if rc != 0: 249 | if verbose: 250 | print('Directory %s not under git control' % root) 251 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 252 | 253 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 254 | # if there isn't one, this yields HEX[-dirty] (no NUM) 255 | describe_out, rc = run_command( 256 | GITS, 257 | [ 258 | 'describe', '--tags', '--dirty', '--always', '--long', '--match', 259 | '%s*' % tag_prefix 260 | ], 261 | cwd=root, 262 | ) 263 | # --long was added in git-1.5.5 264 | if describe_out is None: 265 | raise NotThisMethod("'git describe' failed") 266 | describe_out = describe_out.strip() 267 | full_out, rc = run_command(GITS, ['rev-parse', 'HEAD'], cwd=root) 268 | if full_out is None: 269 | raise NotThisMethod("'git rev-parse' failed") 270 | full_out = full_out.strip() 271 | 272 | pieces = {} 273 | pieces['long'] = full_out 274 | pieces['short'] = full_out[:7] # maybe improved later 275 | pieces['error'] = None 276 | 277 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 278 | # TAG might have hyphens. 279 | git_describe = describe_out 280 | 281 | # look for -dirty suffix 282 | dirty = git_describe.endswith('-dirty') 283 | pieces['dirty'] = dirty 284 | if dirty: 285 | git_describe = git_describe[:git_describe.rindex('-dirty')] 286 | 287 | # now we have TAG-NUM-gHEX or HEX 288 | 289 | if '-' in git_describe: 290 | # TAG-NUM-gHEX 291 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) 292 | if not mo: 293 | # unparseable. Maybe git-describe is misbehaving? 294 | pieces[ 295 | 'error'] = "unable to parse git-describe output: '%s'" % describe_out 296 | return pieces 297 | 298 | # tag 299 | full_tag = mo.group(1) 300 | if not full_tag.startswith(tag_prefix): 301 | if verbose: 302 | fmt = "tag '%s' doesn't start with prefix '%s'" 303 | print(fmt % (full_tag, tag_prefix)) 304 | pieces['error'] = "tag '%s' doesn't start with prefix '%s'" % ( 305 | full_tag, tag_prefix) 306 | return pieces 307 | pieces['closest-tag'] = full_tag[len(tag_prefix):] 308 | 309 | # distance: number of commits since tag 310 | pieces['distance'] = int(mo.group(2)) 311 | 312 | # commit: short hex revision ID 313 | pieces['short'] = mo.group(3) 314 | 315 | else: 316 | # HEX: no tags 317 | pieces['closest-tag'] = None 318 | count_out, rc = run_command(GITS, ['rev-list', 'HEAD', '--count'], 319 | cwd=root) 320 | pieces['distance'] = int(count_out) # total number of commits 321 | 322 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 323 | date = run_command(GITS, ['show', '-s', '--format=%ci', 'HEAD'], 324 | cwd=root)[0].strip() 325 | pieces['date'] = date.strip().replace(' ', 'T', 1).replace(' ', '', 1) 326 | 327 | return pieces 328 | 329 | 330 | def plus_or_dot(pieces): 331 | """Return a + if we don't already have one, else return a .""" 332 | if '+' in pieces.get('closest-tag', ''): 333 | return '.' 334 | return '+' 335 | 336 | 337 | def render_pep440(pieces): 338 | """Build up version string, with post-release "local version identifier". 339 | 340 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 341 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 342 | 343 | Exceptions: 344 | 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] 345 | """ 346 | if pieces['closest-tag']: 347 | rendered = pieces['closest-tag'] 348 | if pieces['distance'] or pieces['dirty']: 349 | rendered += plus_or_dot(pieces) 350 | rendered += '%d.g%s' % (pieces['distance'], pieces['short']) 351 | if pieces['dirty']: 352 | rendered += '.dirty' 353 | else: 354 | # exception #1 355 | rendered = '0+untagged.%d.g%s' % (pieces['distance'], pieces['short']) 356 | if pieces['dirty']: 357 | rendered += '.dirty' 358 | return rendered 359 | 360 | 361 | def render_pep440_pre(pieces): 362 | """TAG[.post.devDISTANCE] -- No -dirty. 363 | 364 | Exceptions: 365 | 1: no tags. 0.post.devDISTANCE 366 | """ 367 | if pieces['closest-tag']: 368 | rendered = pieces['closest-tag'] 369 | if pieces['distance']: 370 | rendered += '.post.dev%d' % pieces['distance'] 371 | else: 372 | # exception #1 373 | rendered = '0.post.dev%d' % pieces['distance'] 374 | return rendered 375 | 376 | 377 | def render_pep440_post(pieces): 378 | """TAG[.postDISTANCE[.dev0]+gHEX] . 379 | 380 | The ".dev0" means dirty. Note that .dev0 sorts backwards 381 | (a dirty tree will appear "older" than the corresponding clean one), 382 | but you shouldn't be releasing software with -dirty anyways. 383 | 384 | Exceptions: 385 | 1: no tags. 0.postDISTANCE[.dev0] 386 | """ 387 | if pieces['closest-tag']: 388 | rendered = pieces['closest-tag'] 389 | if pieces['distance'] or pieces['dirty']: 390 | rendered += '.post%d' % pieces['distance'] 391 | if pieces['dirty']: 392 | rendered += '.dev0' 393 | rendered += plus_or_dot(pieces) 394 | rendered += 'g%s' % pieces['short'] 395 | else: 396 | # exception #1 397 | rendered = '0.post%d' % pieces['distance'] 398 | if pieces['dirty']: 399 | rendered += '.dev0' 400 | rendered += '+g%s' % pieces['short'] 401 | return rendered 402 | 403 | 404 | def render_pep440_old(pieces): 405 | """TAG[.postDISTANCE[.dev0]] . 406 | 407 | The ".dev0" means dirty. 408 | 409 | Eexceptions: 410 | 1: no tags. 0.postDISTANCE[.dev0] 411 | """ 412 | if pieces['closest-tag']: 413 | rendered = pieces['closest-tag'] 414 | if pieces['distance'] or pieces['dirty']: 415 | rendered += '.post%d' % pieces['distance'] 416 | if pieces['dirty']: 417 | rendered += '.dev0' 418 | else: 419 | # exception #1 420 | rendered = '0.post%d' % pieces['distance'] 421 | if pieces['dirty']: 422 | rendered += '.dev0' 423 | return rendered 424 | 425 | 426 | def render_git_describe(pieces): 427 | """TAG[-DISTANCE-gHEX][-dirty]. 428 | 429 | Like 'git describe --tags --dirty --always'. 430 | 431 | Exceptions: 432 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 433 | """ 434 | if pieces['closest-tag']: 435 | rendered = pieces['closest-tag'] 436 | if pieces['distance']: 437 | rendered += '-%d-g%s' % (pieces['distance'], pieces['short']) 438 | else: 439 | # exception #1 440 | rendered = pieces['short'] 441 | if pieces['dirty']: 442 | rendered += '-dirty' 443 | return rendered 444 | 445 | 446 | def render_git_describe_long(pieces): 447 | """TAG-DISTANCE-gHEX[-dirty]. 448 | 449 | Like 'git describe --tags --dirty --always -long'. 450 | The distance/hash is unconditional. 451 | 452 | Exceptions: 453 | 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) 454 | """ 455 | if pieces['closest-tag']: 456 | rendered = pieces['closest-tag'] 457 | rendered += '-%d-g%s' % (pieces['distance'], pieces['short']) 458 | else: 459 | # exception #1 460 | rendered = pieces['short'] 461 | if pieces['dirty']: 462 | rendered += '-dirty' 463 | return rendered 464 | 465 | 466 | def render(pieces, style): 467 | """Render the given version pieces into the requested style.""" 468 | if pieces['error']: 469 | return { 470 | 'version': 'unknown', 471 | 'full-revisionid': pieces.get('long'), 472 | 'dirty': None, 473 | 'error': pieces['error'], 474 | 'date': None, 475 | } 476 | 477 | if not style or style == 'default': 478 | style = 'pep440' # the default 479 | 480 | if style == 'pep440': 481 | rendered = render_pep440(pieces) 482 | elif style == 'pep440-pre': 483 | rendered = render_pep440_pre(pieces) 484 | elif style == 'pep440-post': 485 | rendered = render_pep440_post(pieces) 486 | elif style == 'pep440-old': 487 | rendered = render_pep440_old(pieces) 488 | elif style == 'git-describe': 489 | rendered = render_git_describe(pieces) 490 | elif style == 'git-describe-long': 491 | rendered = render_git_describe_long(pieces) 492 | else: 493 | raise ValueError("unknown style '%s'" % style) 494 | 495 | return { 496 | 'version': rendered, 497 | 'full-revisionid': pieces['long'], 498 | 'dirty': pieces['dirty'], 499 | 'error': None, 500 | 'date': pieces.get('date'), 501 | } 502 | 503 | 504 | def get_versions(): 505 | """Get version information or return default if unable to do so.""" 506 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 507 | # __file__, we can work backwards from there to the root. Some 508 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 509 | # case we can only use expanded keywords. 510 | 511 | cfg = get_config() 512 | verbose = cfg.verbose 513 | 514 | try: 515 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, 516 | verbose) 517 | except NotThisMethod: 518 | pass 519 | 520 | try: 521 | root = os.path.realpath(__file__) 522 | # versionfile_source is the relative path from the top of the source 523 | # tree (where the .git directory might live) to this file. Invert 524 | # this to find the root from __file__. 525 | for i in cfg.versionfile_source.split('/'): 526 | root = os.path.dirname(root) 527 | except NameError: 528 | return { 529 | 'version': '0+unknown', 530 | 'full-revisionid': None, 531 | 'dirty': None, 532 | 'error': 'unable to find root of source tree', 533 | 'date': None, 534 | } 535 | 536 | try: 537 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 538 | return render(pieces, cfg.style) 539 | except NotThisMethod: 540 | pass 541 | 542 | try: 543 | if cfg.parentdir_prefix: 544 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 545 | except NotThisMethod: 546 | pass 547 | 548 | return { 549 | 'version': '0+unknown', 550 | 'full-revisionid': None, 551 | 'dirty': None, 552 | 'error': 'unable to compute version', 553 | 'date': None, 554 | } 555 | -------------------------------------------------------------------------------- /provenance/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = migrations 6 | 7 | # template used to generate migration files 8 | # file_template = %%(rev)s_%%(slug)s 9 | 10 | # max length of characters to apply to the 11 | # "slug" field 12 | #truncate_slug_length = 40 13 | 14 | # set to 'true' to run the environment during 15 | # the 'revision' command, regardless of autogenerate 16 | # revision_environment = false 17 | 18 | # set to 'true' to allow .pyc and .pyo files without 19 | # a source .py file to be detected as revisions in the 20 | # versions/ directory 21 | # sourceless = false 22 | 23 | # version location specification; this defaults 24 | # to migrations/versions. When using multiple version 25 | # directories, initial revisions must be specified with --version-path 26 | # version_locations = %(here)s/bar %(here)s/bat migrations/versions 27 | 28 | # the output encoding used when revision files 29 | # are written from script.py.mako 30 | # output_encoding = utf-8 31 | 32 | #sqlalchemy.url = postgresql://localhost/test_provenance 33 | -------------------------------------------------------------------------------- /provenance/artifact_hasher.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from . import hashing as h, repos as r 4 | 5 | 6 | def _save(obj, artifacts): 7 | if isinstance(obj, r.Artifact): 8 | artifacts[obj.id] = obj 9 | if r.is_proxy(obj): 10 | artifacts[obj.artifact.id] = obj.artifact 11 | 12 | 13 | class ArtifactHasher(h.Hasher): 14 | 15 | def __init__(self, artifacts=None, hash_name='md5'): 16 | if artifacts is None: 17 | artifacts = {} 18 | 19 | self.artifacts = artifacts 20 | h.Hasher.__init__(self, hash_name=hash_name) 21 | 22 | def save(self, obj): 23 | _save(obj, self.artifacts) 24 | h.Hasher.save(self, obj) 25 | 26 | def hash(self, obj): 27 | return (h.Hasher.hash(self, obj), self.artifacts.values()) 28 | 29 | 30 | class NumpyArtifactHasher(h.NumpyHasher): 31 | 32 | def __init__(self, artifacts=None, hash_name='md5', coerce_mmap=True): 33 | if artifacts is None: 34 | artifacts = {} 35 | 36 | self.artifacts = artifacts 37 | h.NumpyHasher.__init__(self, hash_name=hash_name, coerce_mmap=coerce_mmap) 38 | 39 | def save(self, obj): 40 | _save(obj, self.artifacts) 41 | h.NumpyHasher.save(self, obj) 42 | 43 | def hash(self, obj): 44 | return (h.NumpyHasher.hash(self, obj), self.artifacts.values()) 45 | 46 | 47 | def artifact_hasher(*args, **kwargs): 48 | if 'numpy' in sys.modules: 49 | return NumpyArtifactHasher(*args, **kwargs) 50 | else: 51 | return ArtifactHasher(*args, **kwargs) 52 | -------------------------------------------------------------------------------- /provenance/blobstores.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import os.path 4 | import shutil 5 | import tempfile 6 | 7 | from joblib.disk import mkdirp 8 | from s3fs import S3FileSystem 9 | 10 | from . 
import _commonstore as cs 11 | from .serializers import DEFAULT_VALUE_SERIALIZER 12 | 13 | 14 | class BaseBlobStore: 15 | 16 | def __init__( 17 | self, 18 | read=True, 19 | write=True, 20 | read_through_write=True, 21 | delete=False, 22 | on_duplicate_key='skip', 23 | ): 24 | self._read = read 25 | self._write = write 26 | self._read_through_write = read_through_write 27 | self._delete = delete 28 | self._on_duplicate_key = on_duplicate_key 29 | 30 | valid_on_duplicate_keys = {'skip', 'overwrite', 'check_collision', 'raise'} 31 | if self._on_duplicate_key not in valid_on_duplicate_keys: 32 | msg = 'on_duplicate_key must be one of {}'.format(valid_on_duplicate_keys) 33 | raise RuntimeError(msg) 34 | 35 | def __getitem__(self, id, *args, **kargs): 36 | return self.get(id, *args, **kargs) 37 | 38 | def put(self, id, value, serializer=DEFAULT_VALUE_SERIALIZER, read_through=False): 39 | method = getattr(self, '_put_' + self._on_duplicate_key) 40 | return method(id, value, serializer, read_through) 41 | 42 | def _put_raise(self, id, value, serializer, read_through): 43 | cs.ensure_put(self, id, read_through) 44 | self._put_overwrite(id, value, serializer, read_through) 45 | 46 | def _put_skip(self, id, value, serializer, read_through): 47 | if id not in self: 48 | self._put_overwrite(id, value, serializer, read_through) 49 | 50 | def _put_check_collision(self, id, value, serializer, read_through): 51 | cs.ensure_put(self, id, read_through, check_contains=False) 52 | if id not in self: 53 | self._put_overwrite(id, value, serializer, read_through) 54 | else: 55 | self._check_collision(id, value, serializer) 56 | 57 | # TODO: Right now our only thought is that this can be 58 | # checked by using an alternate hash, this will require 59 | # deserializing the old value and running the hash algorithm 60 | # with an alternate hash 61 | def _check_collision(self, id, value, serializer): 62 | raise NotImplementedError() 63 | 64 | 65 | class MemoryStore(BaseBlobStore): 66 | 67 | def __init__( 68 | self, 69 | values=None, 70 | read=True, 71 | write=True, 72 | read_through_write=True, 73 | delete=True, 74 | on_duplicate_key='skip', 75 | ): 76 | super(MemoryStore, self).__init__( 77 | read=read, 78 | write=write, 79 | read_through_write=read_through_write, 80 | delete=delete, 81 | on_duplicate_key=on_duplicate_key, 82 | ) 83 | if values is None: 84 | self.values = {} 85 | else: 86 | self.values = values 87 | 88 | def __contains__(self, id): 89 | cs.ensure_contains(self) 90 | return id in self.values 91 | 92 | def _put_overwrite(self, id, value, serializer, read_through): 93 | cs.ensure_put(self, id, read_through, check_contains=False) 94 | self.values[id] = value 95 | 96 | def get(self, id, serialzier=None, **_kargs): 97 | cs.ensure_read(self) 98 | cs.ensure_present(self, id) 99 | return self.values[id] 100 | 101 | def delete(self, id): 102 | cs.ensure_delete(self, id) 103 | del self.values[id] 104 | 105 | 106 | @contextlib.contextmanager 107 | def _temp_filename(): 108 | try: 109 | temp = tempfile.NamedTemporaryFile('wb', delete=False) 110 | temp.close() 111 | yield temp.name 112 | finally: 113 | if os.path.isfile(temp.name): 114 | os.remove(temp.name) 115 | 116 | 117 | @contextlib.contextmanager 118 | def _atomic_write(filename): 119 | with _temp_filename() as temp: 120 | yield temp 121 | shutil.move(temp, filename) 122 | 123 | 124 | def _abspath(path): 125 | return os.path.abspath(os.path.expanduser(path)) 126 | 127 | 128 | class DiskStore(BaseBlobStore): 129 | 130 | def __init__( 131 | self, 132 | 
cachedir, 133 | read=True, 134 | write=True, 135 | read_through_write=True, 136 | delete=False, 137 | on_duplicate_key='skip', 138 | ): 139 | super(DiskStore, self).__init__( 140 | read=read, 141 | write=write, 142 | read_through_write=read_through_write, 143 | delete=delete, 144 | on_duplicate_key=on_duplicate_key, 145 | ) 146 | self.cachedir = _abspath(cachedir) 147 | mkdirp(self.cachedir) 148 | 149 | def _filename(self, id): 150 | return os.path.join(self.cachedir, id) 151 | 152 | def __contains__(self, id): 153 | cs.ensure_contains(self) 154 | return os.path.isfile(self._filename(id)) 155 | 156 | def _put_overwrite(self, id, value, serializer, read_through): 157 | cs.ensure_put(self, id, read_through, check_contains=False) 158 | with _atomic_write(self._filename(id)) as temp: 159 | serializer.dump(value, temp) 160 | 161 | def get(self, id, serializer=DEFAULT_VALUE_SERIALIZER, **_kargs): 162 | cs.ensure_read(self) 163 | cs.ensure_present(self, id) 164 | return serializer.load(self._filename(id)) 165 | 166 | def delete(self, id): 167 | cs.ensure_delete(self, id) 168 | os.remove(self._filename(id)) 169 | 170 | 171 | class RemoteStore(BaseBlobStore): 172 | 173 | def __init__( 174 | self, 175 | cachedir, 176 | basepath, 177 | read=True, 178 | write=True, 179 | read_through_write=True, 180 | delete=False, 181 | on_duplicate_key='skip', 182 | cleanup_cachedir=False, 183 | always_check_remote=False, 184 | ): 185 | """ 186 | Parameters 187 | ---------- 188 | always_check_remote : bool 189 | When True the remote store will be checked with every __contains__ call. Otherwise it will 190 | short-circuit if the blob is found in the cachedir. For performance reasons this 191 | should always be set to False. The only reason why you would want to use this 192 | is if you are using a RemoteStore and a DiskStore in a ChainedStore together for 193 | some reason. Since the RemoteStore basically doubles as a DiskStore with it's cachedir 194 | chaining the two doesn't really make sense though. 195 | """ 196 | super(RemoteStore, self).__init__( 197 | read=read, 198 | write=write, 199 | read_through_write=read_through_write, 200 | delete=delete, 201 | on_duplicate_key=on_duplicate_key, 202 | ) 203 | 204 | self.always_check = always_check_remote 205 | 206 | self.cachedir = _abspath(cachedir) 207 | self.basepath = basepath 208 | self.cleanup_cachedir = cleanup_cachedir 209 | mkdirp(self.cachedir) 210 | 211 | def __del__(self): 212 | if self.cleanup_cachedir: 213 | shutil.rmtree(self.cachedir) 214 | 215 | def _filename(self, id): 216 | return os.path.join(self.cachedir, id) 217 | 218 | def _path(self, id): 219 | return os.path.join(self.basepath, id) 220 | 221 | def _exists(self, path): 222 | raise NotImplementedError() 223 | 224 | def _delete_remote(self, path): 225 | raise NotImplementedError() 226 | 227 | def _upload_file(self, filename, path): 228 | raise NotImplementedError() 229 | 230 | def _download_file(self, path, dest_filename): 231 | raise NotImplementedError() 232 | 233 | def __contains__(self, id): 234 | cs.ensure_contains(self) 235 | path = self._path(id) 236 | if self.always_check: 237 | return self._exists(path) 238 | else: 239 | return os.path.exists(self._filename(id)) or self._exists(path) 240 | 241 | def _put_overwrite(self, id, value, serializer, read_through): 242 | cs.ensure_put(self, id, read_through, check_contains=False) 243 | filename = self._filename(id) 244 | # not already saved by DiskStore? 
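        # The cachedir doubles as a write-through cache: serialize the value to
        # the local file first (unless it is already there), then upload that
        # file to the remote basepath.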
245 | if not os.path.isfile(filename): 246 | with _atomic_write(filename) as temp: 247 | serializer.dump(value, temp) 248 | self._upload_file(filename, self._path(id)) 249 | 250 | def get(self, id, serializer=DEFAULT_VALUE_SERIALIZER, **_kargs): 251 | cs.ensure_read(self) 252 | cs.ensure_present(self, id) 253 | filename = self._filename(id) 254 | if not os.path.exists(filename): 255 | with _atomic_write(filename) as temp: 256 | self._download_file(self._path(id), temp) 257 | return serializer.load(filename) 258 | 259 | def delete(self, id): 260 | cs.ensure_delete(self, id) 261 | filename = self._filename(id) 262 | if os.path.exists(filename): 263 | os.remove(filename) 264 | self._delete_remote(self._path(id)) 265 | 266 | 267 | class S3Store(RemoteStore): 268 | 269 | def __init__( 270 | self, 271 | cachedir, 272 | basepath, 273 | s3_config=None, 274 | s3fs=None, 275 | read=True, 276 | write=True, 277 | read_through_write=True, 278 | delete=False, 279 | on_duplicate_key='skip', 280 | cleanup_cachedir=False, 281 | always_check_remote=False, 282 | ): 283 | """ 284 | Parameters 285 | ---------- 286 | always_check_remote : bool 287 | When True S3 will be checked with every __contains__ call. Otherwise it will 288 | short-circuit if the blob is found in the cachedir. For performance reasons this 289 | should always be set to False. The only reason why you would want to use this 290 | is if you are using a S3Store and a DiskStore in a ChainedStore together for 291 | some reason. Since the S3Store basically doubles as a DiskStore with it's cachedir 292 | chaining the two doesn't really make sense though. 293 | """ 294 | super(S3Store, self).__init__( 295 | always_check_remote=always_check_remote, 296 | cachedir=cachedir, 297 | basepath=basepath, 298 | cleanup_cachedir=cleanup_cachedir, 299 | read=read, 300 | write=write, 301 | read_through_write=read_through_write, 302 | delete=delete, 303 | on_duplicate_key=on_duplicate_key, 304 | ) 305 | 306 | if s3fs: 307 | self.s3fs = s3fs 308 | elif s3_config is not None: 309 | self.s3fs = S3FileSystem(**s3_config) 310 | else: 311 | raise ValueError('You must provide either s3_config or s3fs for a S3Store') 312 | 313 | def _exists(self, path): 314 | return self.s3fs.exists(path) 315 | 316 | def _delete_remote(self, path): 317 | self.s3fs.rm(path) 318 | 319 | def _upload_file(self, filename, path): 320 | self.s3fs.put(filename, path) 321 | 322 | def _download_file(self, remote_path, dest_filename): 323 | self.s3fs.get(remote_path, dest_filename) 324 | 325 | 326 | class ChainedStore(BaseBlobStore): 327 | 328 | def __init__( 329 | self, 330 | stores, 331 | read=True, 332 | write=True, 333 | read_through_write=True, 334 | delete=True, 335 | on_duplicate_key='skip', 336 | ): 337 | super(ChainedStore, self).__init__( 338 | read=read, 339 | write=write, 340 | read_through_write=read_through_write, 341 | delete=delete, 342 | on_duplicate_key=on_duplicate_key, 343 | ) 344 | self.stores = stores 345 | 346 | def __contains__(self, id): 347 | return cs.chained_contains(self, id) 348 | 349 | def _filename(self, id): 350 | return cs.chained_filename(self, id) 351 | 352 | def _put_overwrite(self, id, value, serializer, read_through): 353 | return cs.chained_put(self, id, value, overwrite=True, serializer=serializer) 354 | 355 | def get(self, id, serializer=DEFAULT_VALUE_SERIALIZER, **kargs): 356 | 357 | def get(store, id): 358 | return store.get(id, serializer=serializer, **kargs) 359 | 360 | return cs.chained_get(self, get, id) 361 | 362 | def __getitem__(self, id, 
**kargs): 363 | return self.get(id, **kargs) 364 | 365 | def delete(self, id): 366 | return cs.chained_delete(self, id) 367 | -------------------------------------------------------------------------------- /provenance/google_storage.py: -------------------------------------------------------------------------------- 1 | from boltons import funcutils as bfu 2 | from google.cloud import storage as gs 3 | from memoized_property import memoized_property 4 | 5 | from . import blobstores as bs 6 | 7 | # TODO: catch and retry w/new client on 8 | # BrokenPipeError: [Errno 32] Broken pipe 9 | # ConnectionResetError: [Errno 54] Connection reset by peer 10 | # more? 11 | 12 | 13 | def retry(f, max_attempts=2): 14 | 15 | @bfu.wraps(f) 16 | def with_retry(store, *args, **kargs): 17 | actual_attempts = 0 18 | while True: 19 | try: 20 | return f(store, *args, **kargs) 21 | except (BrokenPipeError, ConnectionError) as e: 22 | actual_attempts += 1 23 | if actual_attempts >= max_attempts: 24 | raise e 25 | else: 26 | store._setup_client() 27 | 28 | return with_retry 29 | 30 | 31 | class GSStore(bs.RemoteStore): 32 | 33 | def __init__( 34 | self, 35 | cachedir, 36 | bucket, 37 | basepath='', 38 | project=None, 39 | read=True, 40 | write=True, 41 | read_through_write=True, 42 | delete=False, 43 | on_duplicate_key='skip', 44 | cleanup_cachedir=False, 45 | always_check_remote=False, 46 | ): 47 | """ 48 | Parameters 49 | ---------- 50 | always_check_remote : bool 51 | When True GS (Google Storage) will be checked with every __contains__ call. Otherwise it will 52 | short-circuit if the blob is found in the cachedir. For performance reasons this 53 | should always be set to False. The only reason why you would want to use this 54 | is if you are using a GSStore and a DiskStore in a ChainedStore together for 55 | some reason. Since the GSStore basically doubles as a DiskStore with it's cachedir 56 | chaining the two doesn't really make sense though. 57 | """ 58 | super(GSStore, self).__init__( 59 | always_check_remote=always_check_remote, 60 | cachedir=cachedir, 61 | basepath=basepath, 62 | cleanup_cachedir=cleanup_cachedir, 63 | read=read, 64 | write=write, 65 | read_through_write=read_through_write, 66 | delete=delete, 67 | on_duplicate_key=on_duplicate_key, 68 | ) 69 | 70 | self.bucket_name = bucket 71 | self.project = project 72 | 73 | def _setup_client(self): 74 | del self._client 75 | del self._bucket 76 | # force re-memoization 77 | assert self.bucket is not None 78 | 79 | @memoized_property 80 | def client(self): 81 | return gs.Client(project=self.project) 82 | 83 | @memoized_property 84 | def bucket(self): 85 | return self.client.get_bucket(self.bucket_name) 86 | 87 | @retry 88 | def _exists(self, path): 89 | blobs = list(self.bucket.list_blobs(prefix=path)) 90 | return len(blobs) == 1 91 | 92 | @retry 93 | def _delete_remote(self, path): 94 | self.blob(path).delete() 95 | 96 | def _blob(self, path): 97 | return self._bucket.blob(path) 98 | 99 | @retry 100 | def _upload_file(self, filename, path): 101 | self._blob(path).upload_from_filename(filename) 102 | 103 | @retry 104 | def _download_file(self, remote_path, dest_filename): 105 | self._blob(remote_path).download_to_filename(dest_filename) 106 | -------------------------------------------------------------------------------- /provenance/hashing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fast cryptographic hash of Python objects, with a special case for fast 3 | hashing of numpy arrays. 
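The main entry point is the hash() function at the bottom of this module
(also exported as provenance.hash); it returns a hex digest string and aims
to give equal values the same digest even when they live in different
mutable objects.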
4 | 5 | 6 | This code was originally taken from joblib and modified. 7 | 8 | Author: Gael Varoquaux 9 | Copyright (c) 2009 Gael Varoquaux 10 | License: BSD Style, 3 clauses. 11 | """ 12 | 13 | # Author: Gael Varoquaux 14 | # Copyright (c) 2009 Gael Varoquaux 15 | # License: BSD Style, 3 clauses. 16 | 17 | import decimal 18 | import hashlib 19 | import io 20 | import pickle 21 | import struct 22 | import sys 23 | import types 24 | from functools import singledispatch 25 | 26 | import cloudpickle 27 | 28 | 29 | @singledispatch 30 | def value_repr(obj): 31 | method = getattr(obj, 'value_repr', None) 32 | if callable(method): 33 | return method() 34 | else: 35 | return obj 36 | 37 | 38 | Pickler = cloudpickle.CloudPickler 39 | 40 | 41 | class _ConsistentSet(object): 42 | """ Class used to ensure the hash of Sets is preserved 43 | whatever the order of its items. 44 | """ 45 | 46 | def __init__(self, _set): 47 | # Forces order of elements in set to ensure consistent hash. 48 | self._type = type(_set) 49 | try: 50 | # Trying first to order the set assuming the type of elements is 51 | # consistent and orderable. 52 | # This fails on python 3 when elements are unorderable 53 | # but we keep it in a try as it's faster. 54 | self._sequence = sorted(_set) 55 | except (TypeError, decimal.InvalidOperation): 56 | # If elements are unorderable, sorting them using their hash. 57 | # This is slower but works in any case. 58 | self._sequence = sorted((hash(e) for e in _set)) 59 | 60 | 61 | class _MyHash(object): 62 | """ Class used to hash objects that won't normally pickle """ 63 | 64 | def __init__(self, *args): 65 | self.args = args 66 | 67 | 68 | class Hasher(Pickler): 69 | """ A subclass of pickler, to do cryptographic hashing, rather than 70 | pickling. 71 | """ 72 | 73 | def __init__(self, hash_name='md5'): 74 | self.stream = io.BytesIO() 75 | # By default we want a pickle protocol that only changes with 76 | # the major python version and not the minor one 77 | protocol = pickle.DEFAULT_PROTOCOL 78 | Pickler.__init__(self, self.stream, protocol=protocol) 79 | # Initialise the hash obj 80 | self._hash = hashlib.new(hash_name) 81 | 82 | def hash(self, obj): 83 | try: 84 | self.dump(obj) 85 | except pickle.PicklingError as e: 86 | e.args += ('PicklingError while hashing %r: %r' % (obj, e),) 87 | raise 88 | dumps = self.stream.getvalue() 89 | self._hash.update(dumps) 90 | return self._hash.hexdigest() 91 | 92 | def save(self, obj): 93 | obj = value_repr(obj) 94 | if isinstance(obj, (types.MethodType, type({}.pop))): 95 | # the Pickler cannot pickle instance methods; here we decompose 96 | # them into components that make them uniquely identifiable 97 | if hasattr(obj, '__func__'): 98 | func_name = obj.__func__.__name__ 99 | else: 100 | func_name = obj.__name__ 101 | inst = obj.__self__ 102 | if type(inst) == type(pickle): 103 | obj = _MyHash(func_name, inst.__name__) 104 | elif inst is None: 105 | # type(None) or type(module) do not pickle 106 | obj = _MyHash(func_name, inst) 107 | else: 108 | cls = obj.__self__.__class__ 109 | obj = _MyHash(func_name, inst, cls) 110 | Pickler.save(self, obj) 111 | 112 | def memoize(self, obj): 113 | # don't memoize so that the hashes are completely value-based 114 | return 115 | 116 | # The dispatch table of the pickler is not accessible in Python 117 | # 3, as these lines are only bugware for IPython, we skip them. 
118 | def save_global(self, obj, name=None, pack=struct.pack): 119 | # We have to override this method in order to deal with objects 120 | # defined interactively in IPython that are not injected in 121 | # __main__ 122 | kwargs = dict(name=name, pack=pack) 123 | if sys.version_info >= (3, 4): 124 | del kwargs['pack'] 125 | try: 126 | Pickler.save_global(self, obj, **kwargs) 127 | except pickle.PicklingError: 128 | Pickler.save_global(self, obj, **kwargs) 129 | module = getattr(obj, '__module__', None) 130 | if module == '__main__': 131 | my_name = name 132 | if my_name is None: 133 | my_name = obj.__name__ 134 | mod = sys.modules[module] 135 | if not hasattr(mod, my_name): 136 | # IPython doesn't inject the variables define 137 | # interactively in __main__ 138 | setattr(mod, my_name, obj) 139 | 140 | dispatch = Pickler.dispatch.copy() 141 | # builtin 142 | dispatch[type(len)] = save_global 143 | # type 144 | dispatch[type(object)] = save_global 145 | # classobj 146 | dispatch[type(Pickler)] = save_global 147 | # function 148 | dispatch[type(pickle.dump)] = save_global 149 | 150 | def _batch_setitems(self, items): 151 | # forces order of keys in dict to ensure consistent hash. 152 | try: 153 | # Trying first to compare dict assuming the type of keys is 154 | # consistent and orderable. 155 | # This fails on python 3 when keys are unorderable 156 | # but we keep it in a try as it's faster. 157 | Pickler._batch_setitems(self, iter(sorted(items))) 158 | except TypeError: 159 | # If keys are unorderable, sorting them using their hash. This is 160 | # slower but works in any case. 161 | Pickler._batch_setitems(self, iter(sorted((hash(k), v) for k, v in items))) 162 | 163 | def save_set(self, set_items): 164 | # forces order of items in Set to ensure consistent hash 165 | Pickler.save(self, _ConsistentSet(set_items)) 166 | 167 | dispatch[type(set())] = save_set 168 | dispatch[type(frozenset())] = save_set 169 | 170 | 171 | class NumpyHasher(Hasher): 172 | """ Special case the hasher for when numpy is loaded. 173 | """ 174 | 175 | def __init__(self, hash_name='md5', coerce_mmap=True): 176 | """ 177 | Parameters 178 | ---------- 179 | hash_name: string 180 | The hash algorithm to be used 181 | coerce_mmap: boolean 182 | Make no difference between np.memmap and np.ndarray 183 | objects. 184 | """ 185 | self.coerce_mmap = coerce_mmap 186 | self.chunk_size = 200 * 1024 * 1024 # 200 Mb 187 | Hasher.__init__(self, hash_name=hash_name) 188 | # delayed import of numpy, to avoid tight coupling 189 | import numpy as np 190 | 191 | self.np = np 192 | 193 | def hash_array(self, a): 194 | self._hash.update(a.tobytes()) 195 | 196 | def save(self, obj): 197 | """ Subclass the save method, to hash ndarray subclass, rather 198 | than pickling them. Off course, this is a total abuse of 199 | the Pickler class. 
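        Arrays larger than ``self.chunk_size`` are flattened (copying only
        when the data is non-contiguous) and fed to ``hash_array`` chunk by
        chunk to bound memory use; smaller arrays are hashed in one call.
        Either way only a small ``(klass, ('HASHED', dtype, shape))`` tuple
        is handed to the pickler afterwards, never the array bytes themselves.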
200 | """ 201 | if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject: 202 | # Compute a hash of the object 203 | obj_bytes = obj.dtype.itemsize * obj.size 204 | if obj_bytes > self.chunk_size: 205 | # For arrays larger than `self.chunk_size` we will attempt 206 | # to change the shape of a shallow copy and then hash the data 207 | # in chunks 208 | try: 209 | copy = obj[:] 210 | copy.shape = (copy.size,) 211 | except AttributeError as e: 212 | if e.args[0] != 'incompatible shape for a non-contiguous array': 213 | raise e 214 | 215 | # TODO: I am punting here for now and do a reshape that will make 216 | # a copy, but it could be possible to get the bytes out of obj 217 | # without needing one 218 | copy = obj.reshape((obj.size,)) 219 | 220 | i = 0 221 | size = copy.size 222 | typed_chunk_size = self.chunk_size // copy.dtype.itemsize 223 | while i < size: 224 | end = min(i + typed_chunk_size, size) 225 | self.hash_array(copy[i:end]) 226 | i = end 227 | 228 | else: 229 | # Small arrays are hashed all at once 230 | self.hash_array(obj) 231 | 232 | # We store the class, to be able to distinguish between 233 | # Objects with the same binary content, but different 234 | # classes. 235 | if self.coerce_mmap and isinstance(obj, self.np.memmap): 236 | # We don't make the difference between memmap and 237 | # normal ndarrays, to be able to reload previously 238 | # computed results with memmap. 239 | klass = self.np.ndarray 240 | else: 241 | klass = obj.__class__ 242 | # We also return the dtype and the shape, to distinguish 243 | # different views on the same data with different dtypes. 244 | 245 | # The object will be pickled by the pickler hashed at the end. 246 | obj = (klass, ('HASHED', obj.dtype, obj.shape)) 247 | elif isinstance(obj, self.np.dtype): 248 | # Atomic dtype objects are interned by their default constructor: 249 | # np.dtype('f8') is np.dtype('f8') 250 | # This interning is not maintained by a 251 | # pickle.loads + pickle.dumps cycle, because __reduce__ 252 | # uses copy=True in the dtype constructor. This 253 | # non-deterministic behavior causes the internal memoizer 254 | # of the hasher to generate different hash values 255 | # depending on the history of the dtype object. 256 | # To prevent the hash from being sensitive to this, we use 257 | # .descr which is a full (and never interned) description of 258 | # the array dtype according to the numpy doc. 259 | klass = obj.__class__ 260 | obj = (klass, ('HASHED', obj.descr)) 261 | Hasher.save(self, obj) 262 | 263 | 264 | def hash(obj, hasher=None, hash_name='md5', coerce_mmap=True): 265 | """ Quick calculation of a hash to identify uniquely Python objects 266 | containing numpy arrays. The difference with this hash and joblib 267 | is that it tries to hash different mutable objects with the same 268 | values to the same hash. 269 | 270 | 271 | Parameters 272 | ----------- 273 | hash_name: 'md5' or 'sha1' 274 | Hashing algorithm used. sha1 is supposedly safer, but md5 is 275 | faster. 276 | coerce_mmap: boolean 277 | Make no difference between np.memmap and np.ndarray 278 | """ 279 | if hasher is None: 280 | if 'numpy' in sys.modules: 281 | hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap) 282 | else: 283 | hasher = Hasher(hash_name=hash_name) 284 | 285 | return hasher.hash(obj) 286 | 287 | 288 | def file_hash(filename, hash_name='md5'): 289 | """Streams the bytes of the given file through either md5 or sha1 290 | and returns the hexdigest. 
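    Parameters
    ----------
    filename : str
        Path of the file whose bytes will be hashed.
    hash_name : 'md5' or 'sha1'
        Hashing algorithm used; any other value raises a ValueError.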
291 | """ 292 | if hash_name not in set(['md5', 'sha1']): 293 | raise ValueError('hashname must be "md5" or "sha1"') 294 | 295 | hasher = hashlib.md5() if hash_name == 'md5' else hashlib.sha1() 296 | with open(filename, 'rb') as f: 297 | for chunk in iter(lambda: f.read(4096), b''): 298 | hasher.update(chunk) 299 | return hasher.hexdigest() 300 | -------------------------------------------------------------------------------- /provenance/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. 2 | -------------------------------------------------------------------------------- /provenance/migrations/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | from alembic import context 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from provenance import models 7 | 8 | # this is the Alembic Config object, which provides 9 | # access to the values within the .ini file in use. 10 | config = context.config 11 | 12 | # add your model's MetaData object here 13 | # for 'autogenerate' support 14 | # from myapp import mymodel 15 | # target_metadata = mymodel.Base.metadata 16 | target_metadata = models.Base.metadata 17 | 18 | # other values from the config, defined by the needs of env.py, 19 | # can be acquired: 20 | # my_important_option = config.get_main_option("my_important_option") 21 | # ... etc. 22 | 23 | 24 | def run_migrations_offline(): 25 | """Run migrations in 'offline' mode. 26 | 27 | This configures the context with just a URL 28 | and not an Engine, though an Engine is acceptable 29 | here as well. By skipping the Engine creation 30 | we don't even need a DBAPI to be available. 31 | 32 | Calls to context.execute() here emit the given string to the 33 | script output. 34 | 35 | """ 36 | url = config.get_main_option('sqlalchemy.url') 37 | context.configure(url=url, target_metadata=target_metadata, literal_binds=True) 38 | 39 | with context.begin_transaction(): 40 | context.run_migrations() 41 | 42 | 43 | def run_migrations_online(): 44 | connectable = config.attributes.get('connection', None) 45 | 46 | if connectable is None: 47 | # only create Engine if we don't have a Connection 48 | # from the outside 49 | connectable = engine_from_config( 50 | config.get_section(config.config_ini_section), 51 | prefix='sqlalchemy.', 52 | poolclass=pool.NullPool, 53 | ) 54 | 55 | # when connectable is already a Connection object, calling 56 | # connect() gives us a *branched connection*. 57 | 58 | with connectable.connect() as connection: 59 | context.configure(connection=connection, target_metadata=target_metadata) 60 | 61 | with context.begin_transaction(): 62 | context.run_migrations() 63 | 64 | 65 | if context.is_offline_mode(): 66 | run_migrations_offline() 67 | else: 68 | run_migrations_online() 69 | -------------------------------------------------------------------------------- /provenance/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 
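# (When Alembic renders this template the ``${...}`` placeholders below become
# plain literals, e.g. ``revision = 'e0317ab07ba4'`` and ``down_revision = None``
# in the generated migration that follows.)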
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /provenance/migrations/versions/e0317ab07ba4_initial_schema.py: -------------------------------------------------------------------------------- 1 | """initial schema 2 | 3 | Revision ID: e0317ab07ba4 4 | Revises: 5 | Create Date: 2017-03-13 13:33:59.644604 6 | 7 | """ 8 | import sqlalchemy as sa 9 | import sqlalchemy.dialects.postgresql as pg 10 | from alembic import op 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'e0317ab07ba4' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table( 21 | 'artifact_set_members', 22 | sa.Column('set_id', sa.VARCHAR(length=40), nullable=False), 23 | sa.Column('artifact_id', sa.VARCHAR(length=40), nullable=False), 24 | sa.PrimaryKeyConstraint('set_id', 'artifact_id'), 25 | ) 26 | 27 | op.create_table( 28 | 'artifact_sets', 29 | sa.Column('id', sa.INTEGER(), nullable=False), 30 | sa.Column('set_id', sa.VARCHAR(length=40), nullable=True), 31 | sa.Column('name', sa.VARCHAR(length=1000), nullable=True), 32 | sa.Column('created_at', pg.TIMESTAMP(), nullable=True), 33 | sa.PrimaryKeyConstraint('id'), 34 | ) 35 | 36 | op.create_table( 37 | 'runs', 38 | sa.Column('id', sa.VARCHAR(length=40), nullable=False), 39 | sa.Column('hostname', sa.VARCHAR(length=256), nullable=True), 40 | sa.Column('info', pg.JSONB(), nullable=True), 41 | sa.Column('created_at', pg.TIMESTAMP(), nullable=True), 42 | sa.PrimaryKeyConstraint('id'), 43 | ) 44 | 45 | op.create_table( 46 | 'artifacts', 47 | sa.Column('id', sa.VARCHAR(length=40), nullable=False), 48 | sa.Column('value_id', sa.VARCHAR(length=50), nullable=True), 49 | sa.Column('run_id', sa.VARCHAR(length=40), nullable=True), 50 | sa.Column('name', sa.VARCHAR(length=1000), nullable=True), 51 | sa.Column('version', sa.INTEGER(), nullable=True), 52 | sa.Column('fn_module', sa.VARCHAR(length=100), nullable=True), 53 | sa.Column('fn_name', sa.VARCHAR(length=100), nullable=True), 54 | sa.Column('composite', sa.BOOLEAN(), nullable=True), 55 | sa.Column('value_id_duration', sa.FLOAT(), nullable=True), 56 | sa.Column('compute_duration', sa.FLOAT(), nullable=True), 57 | sa.Column('hash_duration', sa.FLOAT(), nullable=True), 58 | sa.Column('computed_at', pg.TIMESTAMP(), nullable=True), 59 | sa.Column('added_at', pg.TIMESTAMP(), nullable=True), 60 | sa.Column('input_artifact_ids', pg.ARRAY(pg.VARCHAR(length=40)), nullable=True), 61 | sa.Column('inputs_json', pg.JSONB(), nullable=True), 62 | sa.Column('serializer', sa.VARCHAR(length=128), nullable=True), 63 | sa.Column('load_kwargs', pg.JSONB(), nullable=True), 64 | sa.Column('dump_kwargs', pg.JSONB(), nullable=True), 65 | sa.Column('custom_fields', pg.JSONB(), nullable=True), 66 | sa.ForeignKeyConstraint( 67 | ['run_id'], 68 | ['runs.id'], 69 | ), 70 | sa.PrimaryKeyConstraint('id'), 71 | ) 72 | 73 | 74 | def downgrade(): 75 | op.drop_table('artifacts') 76 | op.drop_table('runs') 77 | op.drop_table('artifact_sets') 78 | op.drop_table('artifact_set_members') 79 | -------------------------------------------------------------------------------- /provenance/models.py: 
-------------------------------------------------------------------------------- 1 | import copy 2 | from datetime import datetime 3 | 4 | import sqlalchemy as sa 5 | import sqlalchemy.dialects.postgresql as pg 6 | import sqlalchemy.ext.declarative 7 | import sqlalchemy.orm 8 | from memoized_property import memoized_property 9 | 10 | Base = sa.ext.declarative.declarative_base() 11 | 12 | SHA1_LENGTH = 40 13 | VALUE_ID_LENGTH = SHA1_LENGTH + 10 # extra 10 for optional file extension info 14 | 15 | 16 | class Run(Base): 17 | __tablename__ = 'runs' 18 | 19 | id = sa.Column(pg.VARCHAR(SHA1_LENGTH), primary_key=True) 20 | hostname = sa.Column(pg.VARCHAR(256)) 21 | info = sa.Column(pg.JSONB) 22 | created_at = sa.Column(pg.TIMESTAMP, default=datetime.utcnow) 23 | artifacts = sqlalchemy.orm.relationship('Artifact') 24 | 25 | def __init__(self, info): 26 | self.id = info['id'] 27 | self.info = info 28 | self.hostname = info['host']['nodename'] 29 | self.created_at = info['created_at'] 30 | 31 | @memoized_property 32 | def info_with_datetimes(self): 33 | result = copy.copy(self.info) 34 | result['created_at'] = self.created_at 35 | return result 36 | 37 | 38 | class Artifact(Base): 39 | __tablename__ = 'artifacts' 40 | 41 | id = sa.Column(pg.VARCHAR(SHA1_LENGTH), primary_key=True) 42 | value_id = sa.Column(pg.VARCHAR(VALUE_ID_LENGTH)) 43 | run_id = sa.Column(pg.VARCHAR(SHA1_LENGTH), sa.ForeignKey('runs.id')) 44 | 45 | name = sa.Column(pg.VARCHAR(1000)) 46 | version = sa.Column(pg.INTEGER) 47 | fn_module = sa.Column(pg.VARCHAR(100)) 48 | fn_name = sa.Column(pg.VARCHAR(100)) 49 | 50 | composite = sa.Column(pg.BOOLEAN) 51 | 52 | value_id_duration = sa.Column(pg.FLOAT) 53 | compute_duration = sa.Column(pg.FLOAT) 54 | hash_duration = sa.Column(pg.FLOAT) 55 | 56 | computed_at = sa.Column(pg.TIMESTAMP) 57 | added_at = sa.Column(pg.TIMESTAMP, default=datetime.utcnow) 58 | 59 | input_artifact_ids = sa.Column(pg.ARRAY(pg.VARCHAR(SHA1_LENGTH))) 60 | inputs_json = sa.orm.deferred(sa.Column(pg.JSONB)) 61 | serializer = sa.Column(pg.VARCHAR(128), default='joblib') 62 | load_kwargs = sa.Column(pg.JSONB) 63 | dump_kwargs = sa.Column(pg.JSONB) 64 | custom_fields = sa.Column(pg.JSONB) 65 | 66 | def __init__(self, artifact, inputs_json, run): 67 | self.id = artifact.id 68 | self.run = run 69 | self.run_id = run.id 70 | self.value_id = artifact.value_id 71 | self.name = artifact.name 72 | self.version = artifact.version 73 | self.fn_module = artifact.fn_module 74 | self.fn_name = artifact.fn_name 75 | self.composite = artifact.composite 76 | self.value_id_duration = artifact.value_id_duration 77 | self.compute_duration = artifact.compute_duration 78 | self.hash_duration = artifact.hash_duration 79 | self.input_artifact_ids = artifact.input_artifact_ids 80 | self.inputs_json = inputs_json 81 | self.custom_fields = artifact.custom_fields 82 | self.computed_at = artifact.computed_at 83 | self.serializer = artifact.serializer 84 | self.load_kwargs = artifact.load_kwargs 85 | self.dump_kwargs = artifact.dump_kwargs 86 | 87 | @memoized_property 88 | def props(self): 89 | return { 90 | 'id': self.id, 91 | 'value_id': self.value_id, 92 | 'name': self.name, 93 | 'version': self.version, 94 | 'fn_module': self.fn_module, 95 | 'fn_name': self.fn_name, 96 | 'composite': self.composite, 97 | 'value_id_duration': self.value_id_duration, 98 | 'compute_duration': self.compute_duration, 99 | 'hash_duration': self.hash_duration, 100 | 'input_artifact_ids': self.input_artifact_ids, 101 | 'serializer': self.serializer, 102 | 
'load_kwargs': self.load_kwargs, 103 | 'dump_kwargs': self.dump_kwargs, 104 | 'custom_fields': self.custom_fields, 105 | 'computed_at': self.computed_at, 106 | } 107 | 108 | def __repr__(self): 109 | return '' % self.id 110 | 111 | 112 | class ArtifactSet(Base): 113 | __tablename__ = 'artifact_sets' 114 | 115 | id = sa.Column(pg.INTEGER, primary_key=True) 116 | set_id = sa.Column(pg.VARCHAR(SHA1_LENGTH)) 117 | labels = sa.Column(pg.JSONB) 118 | created_at = sa.Column(pg.TIMESTAMP, default=datetime.utcnow) 119 | 120 | def __init__(self, artifact_set): 121 | self.set_id = artifact_set.id 122 | labels = artifact_set.labels 123 | if isinstance(artifact_set.labels, str): 124 | labels = {'name': artifact_set.labels} 125 | self.labels = labels 126 | self.created_at = artifact_set.created_at 127 | 128 | @memoized_property 129 | def props(self): 130 | return {'id': self.set_id, 'labels': self.labels, 'created_at': self.created_at} 131 | 132 | def __repr__(self): 133 | return '' % (self.set_id, self.labels) 134 | 135 | 136 | class ArtifactSetMember(Base): 137 | __tablename__ = 'artifact_set_members' 138 | 139 | set_id = sa.Column( 140 | pg.VARCHAR(SHA1_LENGTH), 141 | primary_key=True, # sa.ForeignKey("artifact_sets.set_id"), 142 | ) 143 | artifact_id = sa.Column( 144 | pg.VARCHAR(SHA1_LENGTH), 145 | primary_key=True # sa.ForeignKey("artifacts.id"), 146 | ) 147 | -------------------------------------------------------------------------------- /provenance/serializers.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from functools import singledispatch 3 | 4 | import cloudpickle 5 | import joblib 6 | import toolz as t 7 | 8 | from .hashing import hash 9 | 10 | 11 | def cloudpickle_dump(obj, filename, **kwargs): 12 | with open(filename, 'wb') as f: 13 | return cloudpickle.dump(obj, f, **kwargs) 14 | 15 | 16 | def cloudpickle_load(filename, **kwargs): 17 | with open(filename, 'rb') as f: 18 | return cloudpickle.load(f, **kwargs) 19 | 20 | 21 | Serializer = namedtuple( 22 | 'Serializer', 23 | 'name, dump, load, content_type, content_encoding, content_disposition', 24 | ) 25 | 26 | 27 | def joblib_dump(obj, filename, compress=2, **kwargs): 28 | joblib.dump(obj, filename, compress=compress, **kwargs) 29 | 30 | 31 | serializers = {} 32 | 33 | 34 | @singledispatch 35 | def object_serializer(obj): 36 | """ 37 | Takes an object and returns the appropirate serializer name, dump, and load arguments. 
38 | 39 | Parameters 40 | ---------- 41 | obj : any python object or primitive 42 | 43 | Returns 44 | ------- 45 | tuple of serializer name (str), dump args (dictionary), load args (dictionary) 46 | """ 47 | return DEFAULT_VALUE_SERIALIZER.name 48 | 49 | 50 | def register_serializer( 51 | name, 52 | dump, 53 | load, 54 | content_type=None, 55 | content_encoding=None, 56 | content_disposition=None, 57 | classes=None, 58 | ): 59 | serializers[name] = Serializer( 60 | name, dump, load, content_type, content_encoding, content_disposition 61 | ) 62 | if classes is None: 63 | return 64 | for cls in classes: 65 | object_serializer.register(cls, lambda _: name) 66 | 67 | 68 | register_serializer('joblib', joblib_dump, joblib.load) 69 | register_serializer('cloudpickle', cloudpickle_dump, cloudpickle_load) 70 | 71 | 72 | def _pandas_and_parquet_present(): 73 | try: 74 | import pandas 75 | except ImportError: 76 | return False 77 | try: 78 | import pyarrow 79 | except: 80 | try: 81 | import fastparquet 82 | except ImportError: 83 | return False 84 | return True 85 | 86 | 87 | if _pandas_and_parquet_present(): 88 | import pandas as pd 89 | 90 | def pd_df_parquet_dump(df, filename, **kwargs): 91 | return df.to_parquet(filename, **kwargs) 92 | 93 | def pd_df_parquet_load(filename, **kwargs): 94 | return pd.read_parquet(filename, **kwargs) 95 | 96 | register_serializer( 97 | 'pd_df_parquet', pd_df_parquet_dump, pd_df_parquet_load, classes=[pd.DataFrame] 98 | ) 99 | 100 | def pd_series_parquet_dump(series, filename, **kwargs): 101 | if series.name is None: 102 | # pyarrow requires the column names be strings 103 | series = pd.Series(series, name='_series') 104 | return pd.DataFrame(series).to_parquet(filename, **kwargs) 105 | 106 | def pd_series_parquet_load(filename, **kwargs): 107 | series = pd.read_parquet(filename, **kwargs).ix[:, 0] 108 | if series.name == '_series': 109 | series.name = None 110 | return series 111 | 112 | register_serializer( 113 | 'pd_series_parquet', 114 | pd_series_parquet_dump, 115 | pd_series_parquet_load, 116 | classes=[pd.Series], 117 | ) 118 | 119 | 120 | def _pytorch_present(): 121 | try: 122 | import torch 123 | except: 124 | return False 125 | return True 126 | 127 | 128 | if _pytorch_present(): 129 | import torch 130 | 131 | def pytorch_model_dump(model, filename, **kwargs): 132 | return torch.save(model, filename) 133 | 134 | def pytorch_model_load(filename, **kwargs): 135 | return torch.load(filename) 136 | 137 | register_serializer( 138 | 'pytorch_model', 139 | pytorch_model_dump, 140 | pytorch_model_load, 141 | classes=[torch.nn.Module], 142 | ) 143 | 144 | 145 | @t.memoize(key=lambda *args: hash(args)) 146 | def partial_serializer(serializer_name, dump_kwargs, load_kwargs): 147 | s = serializers[serializer_name] 148 | return Serializer( 149 | s.name, 150 | t.partial(s.dump, **dump_kwargs) if dump_kwargs else s.dump, 151 | t.partial(s.load, **load_kwargs) if load_kwargs else s.load, 152 | s.content_type, 153 | s.content_encoding, 154 | s.content_disposition, 155 | ) 156 | 157 | 158 | def serializer(artifact): 159 | return partial_serializer(artifact.serializer, artifact.dump_kwargs, artifact.load_kwargs) 160 | 161 | 162 | DEFAULT_VALUE_SERIALIZER = serializers['joblib'] 163 | DEFAULT_INPUT_SERIALIZER = serializers['joblib'] 164 | -------------------------------------------------------------------------------- /provenance/sftp/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import paramiko 4 | 5 | 
from .. import blobstores as bs 6 | 7 | 8 | def _ssh_client(ssh_config): 9 | client = paramiko.SSHClient() 10 | client.load_host_keys(os.path.expanduser('~/.ssh/known_hosts')) 11 | # There still seem to be problems with some types of keys. 12 | # See https://github.com/paramiko/paramiko/issues/243 13 | # So you might try uncommenting the next line if you are using an ecdsa-sha2-nistp256 key. 14 | # client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 15 | client.connect(**ssh_config) 16 | return client 17 | 18 | 19 | class SFTPStore(bs.RemoteStore): 20 | 21 | def __init__( 22 | self, 23 | cachedir, 24 | basepath, 25 | ssh_config=None, 26 | ssh_client=None, 27 | sftp_client=None, 28 | read=True, 29 | write=True, 30 | read_through_write=True, 31 | delete=False, 32 | on_duplicate_key='skip', 33 | cleanup_cachedir=False, 34 | always_check_remote=False, 35 | ): 36 | """ 37 | Parameters 38 | ---------- 39 | always_check_remote : bool 40 | When True the SFTP server will be checked with every __contains__ call. Otherwise the check 41 | will short-circuit if the blob is found in the cachedir. For performance reasons this 42 | should normally be left False. The only reason to enable it is if you are using an 43 | SFTPStore and a DiskStore together in a ChainedStore for 44 | some reason. Since the SFTPStore already doubles as a DiskStore via its cachedir, 45 | chaining the two rarely makes sense anyway. 46 | """ 47 | super(SFTPStore, self).__init__( 48 | always_check_remote=always_check_remote, 49 | cachedir=cachedir, 50 | basepath=basepath, 51 | cleanup_cachedir=cleanup_cachedir, 52 | read=read, 53 | write=write, 54 | read_through_write=read_through_write, 55 | delete=delete, 56 | on_duplicate_key=on_duplicate_key, 57 | ) 58 | 59 | self.ssh_client = None 60 | if ssh_config is not None: 61 | self.ssh_client = _ssh_client(ssh_config) 62 | if self.ssh_client is not None: 63 | sftp_client = paramiko.SFTPClient.from_transport(self.ssh_client._transport) 64 | if sftp_client is not None: 65 | self.sftp_client = sftp_client 66 | else: 67 | # This is to allow testing the importing/subpackage aspect without 68 | # having to actually test the class by mocking an ssh connection.
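            # In normal use one of ``sftp_client``, ``ssh_client`` or
            # ``ssh_config`` must be supplied (hence the ValueError below).
            # ``ssh_config`` is passed straight through to
            # ``paramiko.SSHClient.connect``, so a minimal, purely
            # illustrative construction might look like:
            #
            #   SFTPStore('/tmp/sftp_cache', '/remote/blobs',
            #             ssh_config={'hostname': 'example.com', 'username': 'me'})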
69 | if cachedir is None and basepath is None: 70 | return 71 | raise ValueError( 72 | 'You must specify a SFTP client by passing in one of: sftp_client, ssh_config, ssh_client' 73 | ) 74 | 75 | def _exists(self, path): 76 | try: 77 | self.sftp_client.stat(path) 78 | return True 79 | except FileNotFoundError: 80 | return False 81 | 82 | def _delete_remote(self, path): 83 | self.sftp_client.remove(path) 84 | 85 | def _upload_file(self, filename, path): 86 | self.sftp_client.put(filename, path) 87 | 88 | def _download_file(self, remote_path, dest_filename): 89 | self.sftp_client.get(remote_path, dest_filename) 90 | -------------------------------------------------------------------------------- /provenance/test_serializers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import provenance.serializers as s 4 | 5 | 6 | def test_default_object_serializers(): 7 | assert s.object_serializer('foo') == 'joblib' 8 | assert s.object_serializer((1, 2, 3)) == 'joblib' 9 | assert s.object_serializer({'foo': 42}) == 'joblib' 10 | 11 | df = pd.DataFrame([{'foo': 42}, {'foo': 55}]) 12 | assert s.object_serializer(df) == 'pd_df_parquet' 13 | 14 | series = df.foo 15 | assert s.object_serializer(series) == 'pd_series_parquet' 16 | -------------------------------------------------------------------------------- /provenance/utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from collections import OrderedDict, Sequence 3 | 4 | import toolz as t 5 | import toolz.curried as tc 6 | from boltons import funcutils as bfu 7 | 8 | UNSPECIFIED_ARG = '::unspecified::' 9 | 10 | 11 | def args_extractor(f, merge_defaults=False): 12 | """ 13 | Takes a function, inspects it's parameter lists, and returns a 14 | function that will return all of the named and key arguments 15 | back as a dictionary. The varargs are also returned which don't 16 | have a names. 17 | 18 | """ 19 | spec = inspect.getfullargspec(f) 20 | if spec.defaults: 21 | param_defaults = dict(zip(spec.args[-len(spec.defaults):], spec.defaults)) 22 | else: 23 | param_defaults = {} 24 | named_param_defaults = spec.kwonlydefaults or {} 25 | default_dicts = {} 26 | num_named_args = len(spec.args) 27 | 28 | if merge_defaults is True and hasattr(f, '__merge_defaults__'): 29 | merge_defaults = f.__merge_defaults__ 30 | 31 | if merge_defaults: 32 | default_dicts = t.pipe( 33 | t.merge(named_param_defaults, param_defaults), 34 | tc.valfilter(lambda v: isinstance(v, dict)), 35 | ) 36 | 37 | if isinstance(merge_defaults, Sequence): 38 | default_dicts = {k: default_dicts[k] for k in merge_defaults} 39 | 40 | def _args_dict(args, kargs): 41 | unnamed_args = dict(zip(spec.args, args[0:num_named_args])) 42 | varargs = args[num_named_args:] 43 | kargs = t.merge(kargs, unnamed_args) 44 | for k, d in default_dicts.items(): 45 | kargs[k] = t.merge(d, kargs.get(k) or {}) 46 | return varargs, kargs 47 | 48 | else: 49 | 50 | def _args_dict(args, kargs): 51 | unnamed_args = dict(zip(spec.args, args[0:num_named_args])) 52 | varargs = args[num_named_args:] 53 | kargs = t.merge(kargs, unnamed_args) 54 | return varargs, kargs 55 | 56 | return _args_dict 57 | 58 | 59 | def with_merged_defaults(*kwargs_to_default): 60 | """ 61 | Introspects the argspec of the function being decorated to see what 62 | keyword arguments take dictionaries. 
If a dictionary is passed in when 63 | then function is called then it is merged with the dictionary defined 64 | in the parameter list. 65 | """ 66 | merge_defaults = True 67 | if len(kwargs_to_default) > 0: 68 | merge_defaults = kwargs_to_default 69 | 70 | def _with_merged_defaults(f): 71 | extract_kargs = args_extractor(f, merge_defaults) 72 | 73 | @bfu.wraps(f) 74 | def _merge_defaults(*args, **kargs): 75 | vargs, kargs = extract_kargs(args, kargs) 76 | return f(*vargs, **kargs) 77 | 78 | _merge_defaults.__merge_defaults__ = merge_defaults 79 | 80 | return _merge_defaults 81 | 82 | return _with_merged_defaults 83 | 84 | 85 | def is_curry_func(f): 86 | """ 87 | Checks if f is a toolz or cytoolz function by inspecting the available attributes. 88 | Avoids explicit type checking to accommodate all versions of the curry fn. 89 | """ 90 | return hasattr(f, 'func') and hasattr(f, 'args') and hasattr(f, 'keywords') 91 | 92 | 93 | def _func_param_info(argspec): 94 | params = argspec.args 95 | defaults = argspec.defaults or [] 96 | start_default_ix = -max(len(defaults), 1) - 1 97 | values = [UNSPECIFIED_ARG] * (len(params) - len(defaults)) + list(defaults[start_default_ix:]) 98 | return OrderedDict(zip(params, values)) 99 | 100 | 101 | def param_info(f): 102 | if is_curry_func(f): 103 | argspec = inspect.getfullargspec(f.func) 104 | num_args = len(f.args) 105 | args_to_remove = argspec.args[0:num_args] + list(f.keywords.keys()) 106 | base = _func_param_info(argspec) 107 | return t.dissoc(base, *args_to_remove) 108 | return _func_param_info(inspect.getfullargspec(f)) 109 | 110 | 111 | def inner_function(partial_fn): 112 | """Returns the wrapped function of either a partial or curried function.""" 113 | fn = partial_fn.func 114 | if '__module__' not in dir(fn): 115 | # for some reason the curry decorator nests the actual function 116 | # metadata one level deeper 117 | fn = fn.func 118 | return fn 119 | 120 | 121 | def partial_fn_info(partial_fn): 122 | fn = inner_function(partial_fn) 123 | varargs, kargs = args_extractor(fn)(partial_fn.args, partial_fn.keywords) 124 | return { 125 | 'varargs': varargs, 126 | 'kargs': kargs, 127 | 'module': fn.__module__, 128 | 'name': fn.__name__, 129 | } 130 | 131 | 132 | # TODO: consider using the functions in joblib.func_inspect, e.g. 
for the fn name 133 | def fn_info(fn): 134 | if 'func' in dir(fn): 135 | return partial_fn_info(fn) 136 | return {'name': fn.__name__, 'module': fn.__module__, 'varargs': (), 'kargs': {}} 137 | 138 | 139 | def when_type(type): 140 | 141 | def _decorator(f): 142 | 143 | @bfu.wraps(f) 144 | def _when_type(val): 145 | if isinstance(val, type): 146 | return f(val) 147 | else: 148 | return val 149 | 150 | return _when_type 151 | 152 | return _decorator 153 | -------------------------------------------------------------------------------- /provenance/vis/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import lineage_dot 2 | 3 | visualize_lineage = lineage_dot 4 | -------------------------------------------------------------------------------- /provenance/vis/utils.py: -------------------------------------------------------------------------------- 1 | import graphviz 2 | from frozendict import frozendict as fd 3 | 4 | from ..repos import is_proxy 5 | 6 | 7 | def elide(obj, length=30): 8 | table = str.maketrans({'{': r'\{', '}': r'\}', '<': r'\<', '>': r'\>'}) 9 | s = str(obj).translate(table) 10 | return (s[:length] + '..') if len(s) > length else s 11 | 12 | 13 | def artifact_id(artifact, length=7): 14 | return artifact.id[0:7] 15 | 16 | 17 | def artifact_record(artifact, elide_len=30): 18 | return '|'.join(['' + artifact_id(artifact), '' + elide(artifact.value, elide_len)]) 19 | 20 | 21 | def param_node_id(child_artifact, name, val): 22 | if is_proxy(val): 23 | artifact = val.artifact 24 | return artifact.id 25 | # hmmm... we could share the inputs to other functions if we wanted to remove the child_artifact.id... 26 | return '|'.join([child_artifact.id, name]) 27 | 28 | 29 | def node(name, label=None, **attrs): 30 | attrs['type'] = 'node' 31 | attrs['name'] = name 32 | attrs['label'] = label 33 | return fd(attrs) 34 | 35 | 36 | def edge(tail_name, head_name, **attrs): 37 | attrs['type'] = 'edge' 38 | attrs['tail_name'] = tail_name 39 | attrs['head_name'] = head_name 40 | return fd(attrs) 41 | 42 | 43 | def dicts_to_digraph(dicts): 44 | g = graphviz.Digraph() 45 | for d in dicts: 46 | d = dict(d) 47 | t = d['type'] 48 | del d['type'] 49 | if t == 'node': 50 | g.node(**d) 51 | elif t == 'edge': 52 | g.edge(**d) 53 | return g 54 | 55 | 56 | class DigraphDicts: 57 | 58 | def __init__(self): 59 | self.set = set() 60 | 61 | def node(self, name, label=None, **attrs): 62 | self.set.add(node(name, label, **attrs)) 63 | return self 64 | 65 | def edge(self, tail_name, head_name, **attrs): 66 | self.set.add(edge(tail_name, head_name, **attrs)) 67 | return self 68 | 69 | def to_dot(self): 70 | return dicts_to_digraph(self.set) 71 | 72 | def _repr_svg_(self): 73 | return self.to_dot()._repr_svg_() 74 | 75 | 76 | def _viz_artifact(artifact, g): 77 | function_id = 'fn_' + artifact.id 78 | fn_qalified_name = '.'.join([artifact.fn_module, artifact.fn_name]) 79 | fn_name = artifact.fn_name 80 | fn_params = '{fn}({params})'.format( 81 | fn=fn_qalified_name, params=','.join(artifact.inputs['kargs'].keys()) 82 | ) 83 | 84 | g.node(function_id, fn_name, shape='circle', tooltip=fn_params) 85 | g.edge(function_id, artifact.id) 86 | g.node( 87 | artifact.id, 88 | label=artifact_record(artifact, elide_len=15), 89 | shape='record', 90 | tooltip=elide(artifact.value, 50), 91 | color='red', 92 | ) 93 | 94 | # ignore varargs for now... 
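    # Each keyword input is drawn either as an edge from the artifact that
    # produced it (recursing into that artifact's own lineage) or, for plain
    # values, as a box node holding the elided literal.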
95 | for name, val in artifact.inputs['kargs'].items(): 96 | arg_node_id = param_node_id(artifact, name, val) 97 | if is_proxy(val): 98 | _viz_artifact(val.artifact, g) 99 | g.edge(val.artifact.id, function_id, label=name) 100 | else: 101 | g.node(arg_node_id, label=elide(val), shape='box') 102 | g.edge(arg_node_id, function_id, label=name) 103 | 104 | 105 | def lineage_dot(artifact): 106 | """Walks the lineage of an artifact returning a DigraphDicts object 107 | that can be turned into a graphviz.Digraph and is automatically rendered 108 | as SVG in an IPython notebook. 109 | """ 110 | g = DigraphDicts() 111 | if is_proxy(artifact): 112 | artifact = artifact.artifact 113 | _viz_artifact(artifact, g) 114 | return g 115 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/readthedocs-environment.yml 3 | python: 4 | setup_py_install: false 5 | -------------------------------------------------------------------------------- /release-procedure.md: -------------------------------------------------------------------------------- 1 | 1. Verify tests pass. 2 | 3 | 2. Tag the commit 4 | 5 | git tag 1.2.3 6 | 7 | 3. Push new version bump commit and tag to github 8 | 9 | git push trunk --tags 10 | 11 | 4. Build source and wheel packages 12 | 13 | make dist 14 | 15 | 6. Upload packages to PyPI 16 | 17 | make release 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | s3fs>=0.0.9 2 | boltons>=16.5.1 3 | joblib>=0.15.0 4 | toolz>=0.8.2 5 | cloudpickle>=0.2.1 6 | psutil>=5.0.0 7 | ordered-set>=2.0.1 8 | sqlalchemy>=1.1.3 9 | alembic>=0.9.1 10 | sqlalchemy-utils>=0.32.12 11 | memoized-property>=1.0.2 12 | wrapt>=1.10.8 13 | psycopg2 14 | numpy 15 | pyarrow 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --verbose 6 | python_files = tests/*/*.py 7 | 8 | [versioneer] 9 | VCS = git 10 | style = pep440 11 | versionfile_source = provenance/_version.py 12 | versionfile_build = provenance/_version.py 13 | tag_prefix = 14 | parentdir_prefix = provenance- 15 | 16 | 17 | [flake8] 18 | exclude = docs 19 | ignore = E203,E266,E501,W503,E722,E402,C901,E731,F401 20 | max-line-length = 100 21 | max-complexity = 18 22 | select = B,C,E,F,W,T4,B9 23 | 24 | [yapf] 25 | based_on_style = google 26 | column_limit=100 27 | dedent_closing_brackets = true 28 | join_multiple_lines = false 29 | spaces_before_comment = 4 30 | split_arguments_when_comma_terminated = true 31 | split_before_first_argument = true 32 | split_before_logical_operator = true 33 | split_before_arithmetic_operator=true 34 | split_before_named_assigns = true 35 | 36 | 37 | [isort] 38 | known_first_party=provenance 39 | known_third_party=alembic,boltons,cloudpickle,conftest,frozendict,google,graphviz,hypothesis,joblib,memoized_property,numpy,pandas,paramiko,psutil,pytest,s3fs,setuptools,sqlalchemy,sqlalchemy_utils,strategies,toolz,wrapt 40 | multi_line_output=3 41 | include_trailing_comma=True 42 | force_grid_wrap=0 43 | combine_as_imports=True 44 | line_length=100 45 | skip= 46 | docs/source/conf.py 47 | setup.py 48 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os.path import exists 4 | 5 | from setuptools import setup 6 | 7 | import versioneer 8 | 9 | subpackages = { 10 | 'sftp': ['paramiko'], 11 | 'google_storage': ['google-cloud'], 12 | 'vis': ['graphviz', 'frozendict'], 13 | } 14 | 15 | DESCRIPTION = 'Provenance and caching library for functions, built for creating lightweight machine learning pipelines.' 16 | 17 | setup( 18 | name='provenance', 19 | version=versioneer.get_version(), 20 | cmdclass=versioneer.get_cmdclass(), 21 | packages=['provenance', 'provenance.sftp', 'provenance.vis'], 22 | install_requires=[open('requirements.txt').read().strip().split('\n')], 23 | extras_require=subpackages, 24 | include_package_data=True, 25 | description=DESCRIPTION, 26 | long_description=(open('README.rst').read() if exists('README.rst') else ''), 27 | author='Ben Mabey', 28 | author_email='ben@benmabey.com', 29 | url='http://github.com/bmabey/provenance', 30 | license='MIT', 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Developers', 34 | 'Intended Audience :: Science/Research', 35 | 'License :: OSI Approved :: MIT License', 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | moto>=0.4.30 2 | pytest==3.6 3 | pytest-runner 4 | pytest-pythonpath>=0.7.1 5 | # remove hypothesis if we don't end up writing property tests with it 6 | hypothesis>=3.6.0 7 | h5py 8 | pandas 9 | torch 10 | -------------------------------------------------------------------------------- /tests/provenance/conftest.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import shutil 4 | import tempfile 5 | 6 | import hypothesis.strategies as st 7 | import pytest 8 | import sqlalchemy_utils.functions as sql_utils 9 | import toolz as t 10 | from sqlalchemy import create_engine, event 11 | from sqlalchemy.orm import sessionmaker 12 | 13 | import provenance as p 14 | import provenance.blobstores as bs 15 | import provenance.core as pc 16 | import provenance.repos as r 17 | from provenance.models import Base 18 | 19 | 20 | @pytest.fixture(scope='session') 21 | def s3fs(): 22 | import moto 23 | 24 | m = moto.mock_s3() 25 | m.start() 26 | import boto3 27 | import s3fs 28 | 29 | client = boto3.client('s3') 30 | client.create_bucket(Bucket='bucket') 31 | fs = s3fs.S3FileSystem(anon=False) 32 | return fs 33 | 34 | 35 | @pytest.fixture(scope='session') 36 | def db_conn_str(): 37 | env_conn_str = os.environ.get('DB', None) 38 | return env_conn_str or 'postgresql://localhost/test_provenance' 39 | 40 | 41 | ### This should be the SQLAlchemy db_conn 42 | @pytest.fixture(scope='session') 43 | def db_engine(db_conn_str): 44 | if sql_utils.database_exists(db_conn_str): 45 | sql_utils.drop_database(db_conn_str) 46 | 47 | sql_utils.create_database(db_conn_str) 48 | engine = create_engine(db_conn_str, json_serializer=r.Encoder().encode) 49 | Base.metadata.create_all(engine) 50 | 51 | return engine 52 | 53 | 54 | @pytest.fixture() 55 | def db_session(db_engine): 56 | connection = db_engine.connect() 57 | transaction = connection.begin() 58 | session = sessionmaker()(bind=connection) 59 | 60 | session.begin_nested() 61 | 62 | 
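    # Classic SQLAlchemy test-isolation pattern: the whole test runs inside an
    # outer transaction that is rolled back at teardown, and the listener below
    # re-opens a SAVEPOINT whenever a nested transaction ends so that commits
    # made by the code under test never escape the fixture.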
@event.listens_for(session, 'after_transaction_end') 63 | def restart_savepoint(sess, trans): 64 | if trans.nested and not trans._parent.nested: 65 | sess.expire_all() 66 | sess.begin_nested() 67 | 68 | yield session 69 | 70 | session.close() 71 | transaction.rollback() 72 | connection.close() 73 | 74 | 75 | @contextlib.contextmanager 76 | def cd(newdir, cleanup=lambda: True): 77 | prevdir = os.getcwd() 78 | os.chdir(os.path.expanduser(newdir)) 79 | try: 80 | yield 81 | finally: 82 | os.chdir(prevdir) 83 | cleanup() 84 | 85 | 86 | @contextlib.contextmanager 87 | def tempdir(): 88 | dirpath = tempfile.mkdtemp() 89 | 90 | def cleanup(): 91 | shutil.rmtree(dirpath) 92 | 93 | with cd(dirpath, cleanup): 94 | yield dirpath 95 | 96 | 97 | @pytest.fixture(scope='function') 98 | def disk_store(): 99 | with tempdir() as dirname: 100 | yield bs.DiskStore(cachedir=dirname, delete=True) 101 | 102 | 103 | @pytest.fixture(scope='function') 104 | def memory_store(): 105 | return bs.MemoryStore() 106 | 107 | 108 | @pytest.fixture(scope='function') 109 | def memory_repo(): 110 | repo = r.MemoryRepo(read=True, write=True, delete=True) 111 | p.set_default_repo(repo) 112 | yield repo 113 | p.set_default_repo(None) 114 | 115 | 116 | @pytest.fixture(scope='function', params=['memory_store', 'disk_store']) 117 | def blobstore(request, memory_store, disk_store): 118 | if request.param == 'memory_store': 119 | store = memory_store 120 | else: 121 | store = disk_store 122 | return store 123 | 124 | 125 | # there must be a better way, but this is so I can get get two db_session fixtures 126 | db_session_ = db_session 127 | 128 | 129 | @pytest.fixture( 130 | scope='function', 131 | # params=['memoryrepo']) 132 | params=[ 133 | 'memoryrepo', 134 | 'dbrepo-diskstore', 135 | 'dbrepo-memorystore', 136 | 'chained-memmem', 137 | # 'chained-repo' 138 | ], 139 | ) 140 | def repo(request, db_session): 141 | # clean old config settings 142 | r.Config.set_current(r.Config({}, {}, None)) 143 | disk_store_gen = None 144 | disk_store_gen2 = None 145 | repo2 = None 146 | prevdir = os.getcwd() 147 | if request.param == 'memoryrepo': 148 | repo = r.MemoryRepo(read=True, write=True, delete=True) 149 | elif request.param == 'dbrepo-diskstore': 150 | disk_store_gen = disk_store() 151 | repo = r.DbRepo(db_session, next(disk_store_gen), read=True, write=True, delete=True) 152 | elif request.param == 'chained-memmem': 153 | repo = r.ChainedRepo( 154 | [ 155 | r.MemoryRepo(read=True, write=True, delete=True), 156 | r.MemoryRepo(read=True, write=True, delete=True), 157 | ] 158 | ) 159 | elif request.param == 'chained-repo': 160 | disk_store_gen = disk_store() 161 | disk_store_gen2 = disk_store() 162 | repo1 = r.DbRepo(db_session, next(disk_store_gen), read=True, write=True, delete=True) 163 | os.chdir(prevdir) 164 | repo2 = r.DbRepo( 165 | 'postgresql://localhost/test_provenance', 166 | next(disk_store_gen2), 167 | read=True, 168 | write=True, 169 | delete=True, 170 | schema='second_repo', 171 | ) 172 | repo = r.ChainedRepo([repo1, repo2]) 173 | else: 174 | repo = r.DbRepo(db_session, memory_store(), read=True, write=True, delete=True) 175 | 176 | p.set_default_repo(repo) 177 | yield repo 178 | p.set_default_repo(None) 179 | if repo2 is not None: 180 | repo2._db_engine.execute('drop schema second_repo cascade;') 181 | 182 | if disk_store_gen: 183 | next(disk_store_gen, 'ignore') 184 | if disk_store_gen2: 185 | next(disk_store_gen2, 'ignore') 186 | 187 | 188 | @pytest.fixture(scope='function', params=['dbrepo-diskstore']) 189 | def 
dbdiskrepo(request, db_session): 190 | repo_gen = repo(request, db_session) 191 | yield next(repo_gen) 192 | next(repo_gen, 'ignore') 193 | 194 | 195 | another_dbdiskrepo = dbdiskrepo 196 | 197 | 198 | @pytest.fixture(scope='function', params=['memoryrepo' 'dbrepo-diskstore', 'dbrepo-memorystore']) 199 | def atomic_repo(request, db_session): 200 | repo_gen = repo(request, db_session) 201 | yield next(repo_gen) 202 | next(repo_gen, 'ignore') 203 | 204 | 205 | md5 = st.text('0123456789abcdef', min_size=32, max_size=32) 206 | _artifact_record_st = st.fixed_dictionaries({'id': md5, 'value_id': md5}) 207 | 208 | 209 | def artifact_record(**kargs): 210 | artifact_props = t.merge( 211 | {k: None for k in pc.artifact_properties}, 212 | _artifact_record_st.example(), 213 | { 214 | 'inputs': { 215 | 'varargs': [1, 2, 3], 216 | 'kargs': {} 217 | }, 218 | 'fn_module': 'foo', 219 | 'fn_name': 'bar', 220 | 'value': 55, 221 | 'name': 'bar', 222 | 'version': 0, 223 | 'serializer': 'joblib', 224 | 'run_info': pc.run_info(), 225 | }, 226 | kargs, 227 | ) 228 | return pc.ArtifactRecord(**artifact_props) 229 | 230 | 231 | @pytest.fixture() 232 | def with_check_mutations(): 233 | p.set_check_mutations(True) 234 | yield True 235 | p.set_check_mutations(False) 236 | -------------------------------------------------------------------------------- /tests/provenance/strategies.py: -------------------------------------------------------------------------------- 1 | import hypothesis.strategies as st 2 | import numpy as np 3 | 4 | primitive_data = ( 5 | st.floats(allow_nan=False) | st.booleans() | st.text() | st.none() | st.fractions() | 6 | st.integers() | st.characters() 7 | ) 8 | # | st.complex_numbers() \ nanj is annoying to deal with 9 | # | st.decimals() can add back in once a new version of joblib is released with bug fix 10 | 11 | hashable_data = primitive_data | st.tuples(primitive_data) 12 | sets = st.sets(hashable_data) 13 | builtin_data = st.recursive( 14 | primitive_data | sets, 15 | lambda children: st.lists(children) | st.dictionaries(st.text(), children) | st. 16 | tuples(children), 17 | ) 18 | 19 | 20 | def rand_nparray(seed, w=3, h=3): 21 | rnd = np.random.RandomState(seed) 22 | return rnd.random_sample((w, h)) 23 | 24 | 25 | np_random_states = st.integers(0, 4294967295).map(np.random.RandomState) 26 | fixed_numpy_arrays = st.integers(0, 4294967295).map(rand_nparray) 27 | numpy_data = fixed_numpy_arrays 28 | data = st.recursive( 29 | primitive_data | sets | fixed_numpy_arrays, 30 | lambda children: st.lists(children) | st.dictionaries(st.text(), children) | st. 
31 | tuples(children), 32 | ) 33 | -------------------------------------------------------------------------------- /tests/provenance/test_blobstores.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import hypothesis.strategies as st 4 | import pytest 5 | from hypothesis import given 6 | from strategies import builtin_data 7 | 8 | import provenance._commonstore as cs 9 | import provenance.blobstores as bs 10 | 11 | 12 | def assert_store_basic_ops(store, key, data): 13 | assert key not in store 14 | store.put(key, data) 15 | assert key in store 16 | 17 | if store._on_duplicate_key == 'raise': 18 | with pytest.raises(cs.KeyExistsError): 19 | store.put(key, 'new value') 20 | 21 | assert store.get(key) == data 22 | assert store[key] == data 23 | 24 | store.delete(key) 25 | assert key not in store 26 | 27 | with pytest.raises(KeyError): 28 | store.delete(key) 29 | 30 | with pytest.raises(KeyError): 31 | store.get(key) 32 | 33 | 34 | hex_alphabet = tuple(map(str, range(0, 10))) + tuple('abcdefABCDEF') 35 | sha1 = st.text(alphabet=hex_alphabet, min_size=40, max_size=40) 36 | 37 | 38 | @given(sha1, builtin_data) 39 | def test_memory_blobstore(key, obj): 40 | store = bs.MemoryStore(read=True, write=True, delete=True) 41 | assert_store_basic_ops(store, key, obj) 42 | 43 | 44 | @given(sha1, builtin_data) 45 | def test_memory_blobstore_raises(key, obj): 46 | store = bs.MemoryStore(read=True, write=True, delete=True, on_duplicate_key='raise') 47 | assert_store_basic_ops(store, key, obj) 48 | 49 | 50 | @given(sha1, builtin_data) 51 | def test_disk_blobstore(key, obj): 52 | tmp_dir = '/tmp/prov_diskstore' 53 | shutil.rmtree(tmp_dir, ignore_errors=True) 54 | store = bs.DiskStore(tmp_dir, read=True, write=True, delete=True) 55 | assert_store_basic_ops(store, key, obj) 56 | 57 | 58 | def test_permissions(): 59 | store = bs.MemoryStore(read=True, write=True, delete=True) 60 | store.put('a', 1) 61 | assert store.get('a') == 1 62 | store.delete('a') 63 | 64 | store = bs.MemoryStore(read=False, write=False, delete=False) 65 | with pytest.raises(cs.PermissionError): 66 | store.put('a', 1) 67 | 68 | with pytest.raises(cs.PermissionError): 69 | store.get('a') 70 | 71 | with pytest.raises(cs.PermissionError): 72 | store.delete('a') 73 | 74 | 75 | def test_s3store(s3fs): 76 | tmp_dir = '/tmp/prov_s3store' 77 | shutil.rmtree(tmp_dir, ignore_errors=True) 78 | basepath = 'bucket/prov_test' 79 | store = bs.S3Store(tmp_dir, basepath, s3fs=s3fs, delete=True) 80 | key = sha1.example() 81 | obj = builtin_data.example() 82 | 83 | assert_store_basic_ops(store, key, obj) 84 | 85 | 86 | def test_sftpstore_import(): 87 | import provenance._config as c 88 | 89 | try: 90 | import paramiko 91 | 92 | _paramiko = True 93 | except ImportError: 94 | _paramiko = False 95 | try: 96 | _ = c.BLOBSTORE_TYPES['sftp'](cachedir=None, basepath=None) 97 | assert _paramiko is True 98 | except ImportError: 99 | assert _paramiko is False 100 | 101 | 102 | def test_chained_storage_with_disk_and_s3_sharing_cachedir(s3fs): 103 | tmp_dir = '/tmp/prov_shared_store' 104 | shutil.rmtree(tmp_dir, ignore_errors=True) 105 | mem_store = bs.MemoryStore(read=True, write=True, delete=True) 106 | disk_store = bs.DiskStore(tmp_dir, read=True, write=True, delete=True) 107 | s3_store = bs.S3Store( 108 | tmp_dir, 109 | s3fs=s3fs, 110 | basepath='bucket/prov_test', 111 | read=True, 112 | write=True, 113 | delete=True, 114 | always_check_remote=True, 115 | ) 116 | stores = [mem_store, disk_store, s3_store] 
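    # A single put() on the ChainedStore below is expected to propagate the
    # blob to every writable store in the chain (memory, disk and S3), which
    # is what the assertions that follow verify.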
117 | 118 | chained_store = bs.ChainedStore(stores) 119 | 120 | key = 'foobar' 121 | data = {'a': 1, 'b': 2} 122 | 123 | for store in stores: 124 | assert key not in store 125 | assert key not in store 126 | 127 | chained_store.put(key, data) 128 | assert key in store 129 | for store in stores: 130 | assert key in store 131 | 132 | assert store.get(key) == data 133 | assert store[key] == data 134 | 135 | store.delete(key) 136 | assert key not in store 137 | 138 | with pytest.raises(KeyError): 139 | store.delete(key) 140 | 141 | with pytest.raises(KeyError): 142 | store.get(key) 143 | 144 | 145 | def test_chained_with_readonly(): 146 | read_store = bs.MemoryStore({'foo': 42}, read=True, write=False, delete=False) 147 | write_store = bs.MemoryStore(read=True, write=True, delete=False) 148 | stores = [read_store, write_store] 149 | chained_store = bs.ChainedStore(stores) 150 | 151 | # verify we read from the read-only store 152 | assert chained_store['foo'] == 42 153 | 154 | # but that it is not written to 155 | chained_store.put('bar', 55) 156 | assert 'bar' in chained_store 157 | assert 'bar' in write_store 158 | assert 'bar' not in read_store 159 | 160 | 161 | def test_chained_read_through_write(): 162 | read_store = bs.MemoryStore({'foo': 42}, read=True, write=False) 163 | store_ahead = bs.MemoryStore(read=True, write=True, read_through_write=True) 164 | read_through_write_store = bs.MemoryStore(read=True, write=True, read_through_write=True) 165 | no_read_through_write_store = bs.MemoryStore(read=True, write=True, read_through_write=False) 166 | stores = [ 167 | no_read_through_write_store, 168 | read_through_write_store, 169 | read_store, 170 | store_ahead, 171 | ] 172 | chained_store = bs.ChainedStore(stores) 173 | 174 | assert 'foo' not in read_through_write_store 175 | assert 'foo' not in no_read_through_write_store 176 | assert 'foo' not in store_ahead 177 | # verify we read from the read-only store 178 | assert chained_store['foo'] == 42 179 | 180 | assert 'foo' in read_through_write_store 181 | assert 'foo' not in store_ahead 182 | assert 'foo' not in no_read_through_write_store 183 | 184 | 185 | def test_chained_writes_may_be_allowed_on_read_throughs_only(): 186 | read_store = bs.MemoryStore({'foo': 42}, read=True, write=False) 187 | read_through_write_only_store = bs.MemoryStore(read=True, write=False, read_through_write=True) 188 | write_store = bs.MemoryStore(read=True, write=True, read_through_write=False) 189 | stores = [write_store, read_through_write_only_store, read_store] 190 | chained_store = bs.ChainedStore(stores) 191 | 192 | # verify we read from the read-only store 193 | assert chained_store['foo'] == 42 194 | 195 | assert 'foo' in read_through_write_only_store 196 | assert 'foo' not in write_store 197 | 198 | chained_store.put('bar', 55) 199 | assert 'bar' in chained_store 200 | assert 'bar' not in read_through_write_only_store 201 | assert 'bar' in write_store 202 | -------------------------------------------------------------------------------- /tests/provenance/test_config.py: -------------------------------------------------------------------------------- 1 | import conftest as ct 2 | 3 | import provenance._config as c 4 | import provenance.blobstores as bs 5 | import provenance.repos as r 6 | 7 | 8 | def test_atomic_blobstore_config_reading(): 9 | config = { 10 | 'type': 'disk', 11 | 'cachedir': '.artifacts/', 12 | 'read': True, 13 | 'write': True, 14 | 'read_through_write': False, 15 | 'delete': True, 16 | } 17 | store = c.blobstore_from_config(config) 18 | 
assert type(store) == bs.DiskStore 19 | assert store.cachedir == bs._abspath(config['cachedir']) 20 | assert store._read == config['read'] 21 | assert store._write == config['write'] 22 | assert store._delete == config['delete'] 23 | assert store._read_through_write == config['read_through_write'] 24 | 25 | 26 | def test_prototypes_are_merged(): 27 | config = { 28 | 'local_disk': 29 | { 30 | 'type': 'disk', 31 | 'cachedir': '.artifacts/', 32 | 'read': True, 33 | 'write': True, 34 | 'read_through_write': False, 35 | 'delete': True, 36 | }, 37 | 'local_read_only': { 38 | 'prototype': 'local_disk', 39 | 'write': False, 40 | 'delete': False 41 | }, 42 | 'local_read_through_write': { 43 | 'prototype': 'local_read_only', 44 | 'read_through_write': True, 45 | }, 46 | } 47 | 48 | stores = c.blobstores_from_config(config) 49 | store = stores['local_read_through_write'] 50 | assert type(store) == bs.DiskStore 51 | assert store.cachedir == bs._abspath('.artifacts/') 52 | assert store._read 53 | assert not store._write 54 | assert not store._delete 55 | assert store._read_through_write 56 | 57 | 58 | def test_blobstores_config_reading(): 59 | config = { 60 | 'local_disk': 61 | { 62 | 'type': 'disk', 63 | 'cachedir': '.artifacts/', 64 | 'read': True, 65 | 'write': True, 66 | 'read_through_write': False, 67 | 'delete': True, 68 | }, 69 | 'mem': 70 | { 71 | 'type': 'memory', 72 | 'read': True, 73 | 'write': True, 74 | 'read_through_write': False, 75 | 'delete': True, 76 | }, 77 | 'shared_s3': 78 | { 79 | 'type': 's3', 80 | 'cachedir': '/tmp/foo', 81 | 'basepath': 'mybucket/blobs', 82 | 'delete': False, 83 | 's3_config': { 84 | 'anon': True 85 | }, 86 | }, 87 | 'chained': { 88 | 'type': 'chained', 89 | 'stores': ['local_disk', 'mem', 'shared_s3'] 90 | }, 91 | } 92 | 93 | stores = c.blobstores_from_config(config) 94 | chained = stores['chained'] 95 | assert isinstance(chained, bs.ChainedStore) 96 | assert [type(s) for s in chained.stores] == [ 97 | bs.DiskStore, 98 | bs.MemoryStore, 99 | bs.S3Store, 100 | ] 101 | 102 | 103 | def test_from_config(): 104 | config = { 105 | 'blobstores': 106 | { 107 | 'mem': 108 | { 109 | 'type': 'memory', 110 | 'read': True, 111 | 'write': True, 112 | 'read_through_write': False, 113 | 'delete': True, 114 | } 115 | }, 116 | 'artifact_repos': { 117 | 'db': { 118 | 'type': 'postgres', 119 | 'db': ct.db_conn_str(), 120 | 'store': 'mem' 121 | } 122 | }, 123 | } 124 | objs = c.from_config(config) 125 | repo = objs['repos']['db'] 126 | assert isinstance(repo, r.PostgresRepo) 127 | assert isinstance(repo.blobstore, bs.MemoryStore) 128 | -------------------------------------------------------------------------------- /tests/provenance/test_hashing.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import hypothesis.strategies as st 4 | import numpy as np 5 | from hypothesis import given 6 | from strategies import data 7 | 8 | import provenance as p 9 | import provenance.artifact_hasher as ah 10 | from provenance.hashing import hash 11 | 12 | 13 | @given(data) 14 | def test_shallow_and_deep_copies_hashing(o): 15 | original_hash = hash(o) 16 | shallow_copy = copy.copy(o) 17 | deep_copy = copy.deepcopy(o) 18 | assert hash(shallow_copy) == original_hash 19 | assert hash(deep_copy) == original_hash 20 | 21 | 22 | @given(st.data()) 23 | def test_shared_values_hashing(base_data): 24 | base_data = base_data.draw(data) 25 | base_copy = lambda: copy.deepcopy(base_data) 26 | 27 | shared_dict = {'a': base_data, 'b': base_data} 28 | 
without_sharing_dict = {'a': base_copy(), 'b': base_copy()} 29 | 30 | assert hash(shared_dict) == hash(without_sharing_dict) 31 | 32 | shared_tuple = (base_data, base_data) 33 | without_sharing_tuple = (base_copy(), base_copy()) 34 | 35 | assert hash(shared_tuple) == hash(without_sharing_tuple) 36 | 37 | shared_list = [base_data, base_data] 38 | without_sharing_list = [base_copy(), base_copy()] 39 | 40 | assert hash(shared_list) == hash(without_sharing_list) 41 | 42 | 43 | def test_hash_of_contiguous_array_is_the_same_as_noncontiguous(): 44 | a = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='F')[:, :1, :] 45 | b = np.ascontiguousarray(a) 46 | assert hash(a) == hash(b) 47 | 48 | 49 | def test_hash_of_fortran_array_is_the_same_as_c_array(): 50 | c = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='C') 51 | f = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='F') 52 | 53 | assert hash(c) == hash(f) 54 | 55 | 56 | def test_hashing_of_functions(): 57 | 58 | def foo(a, b): 59 | return a + b 60 | 61 | assert hash(foo) == hash(foo) 62 | 63 | 64 | def test_hashing_of_artifacts_and_proxies(repo): 65 | 66 | @p.provenance() 67 | def load_data(): 68 | return [1, 2, 3] 69 | 70 | original_proxy = load_data() 71 | original_artifact = original_proxy.artifact 72 | loaded_artifact = repo.get_by_id(original_artifact.id) 73 | loaded_proxy = loaded_artifact.proxy() 74 | 75 | # All artifacts should have the same hash 76 | assert hash(original_artifact) == hash(loaded_artifact) 77 | 78 | # All proxies should have the same hash 79 | assert hash(original_proxy) == hash(loaded_proxy) 80 | 81 | # All values should have the same hash 82 | assert hash(original_artifact.value) == hash(loaded_artifact.value) 83 | 84 | # Artifacts and proxies should not have the same hash 85 | assert hash(original_artifact) != hash(original_proxy) 86 | 87 | # Proxies and values should have the same hash 88 | assert hash(original_proxy) == hash(original_artifact.value) 89 | 90 | 91 | def test_hashing_with_artifact_hasher_also_returns_iter_of_artifacts_preserves_hash(repo,): 92 | 93 | @p.provenance() 94 | def load_data(): 95 | return [1, 2, 3] 96 | 97 | @p.provenance() 98 | def create_composite(data): 99 | return {'foo': 'bar', 'data': data} 100 | 101 | data = load_data() 102 | 103 | original_proxy = create_composite(data) 104 | original_artifact = original_proxy.artifact 105 | loaded_artifact = repo.get_by_id(original_artifact.id) 106 | loaded_proxy = loaded_artifact.proxy() 107 | 108 | expected_proxy_ids = frozenset((original_artifact.id, data.artifact.id)) 109 | expected_artifact_ids = frozenset((original_artifact.id,)) 110 | 111 | original_proxy_hash, artifacts = hash(original_proxy, hasher=ah.artifact_hasher()) 112 | ids = frozenset(a.id for a in artifacts) 113 | assert original_proxy_hash == hash(original_proxy) 114 | assert ids == expected_proxy_ids 115 | 116 | original_artifact_hash, artifacts = hash(original_artifact, hasher=ah.artifact_hasher()) 117 | ids = frozenset(a.id for a in artifacts) 118 | assert original_artifact_hash == hash(original_artifact) 119 | assert ids == expected_artifact_ids 120 | 121 | loaded_artifact_hash, artifacts = hash(loaded_artifact, hasher=ah.artifact_hasher()) 122 | ids = frozenset(a.id for a in artifacts) 123 | assert loaded_artifact_hash == hash(loaded_artifact) 124 | assert ids == expected_artifact_ids 125 | 126 | loaded_proxy_hash, artifacts = hash(loaded_proxy, hasher=ah.artifact_hasher()) 127 | ids = frozenset(a.id for a in artifacts) 128 | assert loaded_proxy_hash == 
hash(loaded_proxy) 129 | assert ids == expected_proxy_ids 130 | -------------------------------------------------------------------------------- /tests/provenance/test_pytorch.py: -------------------------------------------------------------------------------- 1 | from copy import copy, deepcopy 2 | 3 | import pytest 4 | 5 | import provenance as p 6 | from provenance.hashing import hash 7 | 8 | torch = pytest.importorskip('torch') 9 | 10 | 11 | class TwoLayerNet(torch.nn.Module): 12 | """ 13 | This class is copied from PyTorch's documentation and is meant to be the 14 | simplest, non-trivial custom NN we can use for testing provenance. 15 | See [here](https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html#sphx-glr-beginner-examples-nn-two-layer-net-module-py) 16 | """ 17 | 18 | def __init__(self, D_in, H, D_out): 19 | """ 20 | In the constructor we instantiate two nn.Linear modules and assign them 21 | as member variables. 22 | """ 23 | super(TwoLayerNet, self).__init__() 24 | self.linear1 = torch.nn.Linear(D_in, H) 25 | self.linear2 = torch.nn.Linear(H, D_out) 26 | 27 | def forward(self, x): 28 | """ 29 | In the forward function we accept a Tensor of input data and we must 30 | return a Tensor of output data. We can use Modules defined in the 31 | constructor as well as arbitrary operators on Tensors. 32 | """ 33 | h_relu = self.linear1(x).clamp(min=0) 34 | y_pred = self.linear2(h_relu) 35 | return y_pred 36 | 37 | 38 | def random_data(N, D_in, D_out): 39 | """ 40 | Generates random data for training/testing the PyTorch model. 41 | 42 | N is the data size 43 | D_in is the input dimension 44 | D_out is the output dimension 45 | """ 46 | 47 | # Create random Tensors to hold inputs and outputs 48 | x = torch.randn(N, D_in) 49 | y = torch.randn(N, D_out) 50 | return {'X_train': x, 'Y_train': y, 'X_test': x, 'Y_test': y} 51 | 52 | 53 | @p.provenance(returns_composite=True) 54 | def fit_model(N=64, D_in=1000, D_out=10, H=100, epochs=500, seed=None): 55 | """ 56 | An example workflow that provenance can handle from PyTorch. The model 57 | parameters, the data parameters, and the fit parameters are all passed 58 | into this function, and the output includes the PyTorch model and some 59 | metadata regarding its fit history (a list of losses after each epoch). 60 | """ 61 | if seed is not None: 62 | torch.manual_seed(seed) 63 | 64 | data = random_data(N, D_in, D_out) 65 | x = data['X_train'] 66 | y = data['Y_train'] 67 | 68 | model = TwoLayerNet(D_in, H, D_out) 69 | 70 | # Construct our loss function and an Optimizer. The call to 71 | # model.parameters() in the SGD constructor will contain the learnable 72 | # parameters of the two nn.Linear modules which are members of the model. 73 | criterion = torch.nn.MSELoss(reduction='sum') 74 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) 75 | 76 | losses = [] 77 | for t in range(epochs): 78 | # Forward pass: Compute predicted y by passing x to the model 79 | y_pred = model(x) 80 | 81 | # Compute and print loss 82 | loss = criterion(y_pred, y) 83 | losses.append(loss.item()) 84 | 85 | # Zero gradients, perform a backward pass, and update the weights. 86 | optimizer.zero_grad() 87 | loss.backward() 88 | optimizer.step() 89 | 90 | return {'model': model, 'losses': losses} 91 | 92 | 93 | def test_same_models_are_equal(dbdiskrepo): 94 | """ 95 | Validates that two separately constructed models using the same parameters 96 | hash to the same artifact in provenance terms. 
97 | """ 98 | fit1 = fit_model() 99 | fit2 = fit_model() 100 | assert fit1.artifact.id == fit2.artifact.id 101 | assert fit1.artifact.value_id == fit2.artifact.value_id 102 | assert hash(fit1) == hash(fit2) 103 | 104 | 105 | def test_copied_models_are_equal(dbdiskrepo): 106 | """ 107 | Validates that a copied model (deep or shallow copied) hashes to the same 108 | artifact as the original in provenance terms. 109 | """ 110 | original = fit_model() 111 | 112 | shallow = copy(original) 113 | assert original.artifact.id == shallow.artifact.id 114 | assert original.artifact.value_id == shallow.artifact.value_id 115 | assert hash(original) == hash(shallow) 116 | 117 | deep = deepcopy(original) 118 | assert original.artifact.id == deep.artifact.id 119 | assert original.artifact.value_id == deep.artifact.value_id 120 | assert hash(original) == hash(deep) 121 | 122 | 123 | def test_reloading_from_disk_has_same_value_id(dbdiskrepo): 124 | """ 125 | Validates that we can write and read a pytorch model as an artifact and that 126 | it is the same going in as coming out. 127 | """ 128 | original = fit_model() 129 | loaded = p.load_proxy(original.artifact.id) 130 | 131 | assert loaded.artifact.value_id == p.hash(loaded.artifact.value) 132 | assert loaded.artifact.value_id == original.artifact.value_id 133 | assert loaded.artifact.id == original.artifact.id 134 | 135 | 136 | def test_different_seeds_result_in_different_models(dbdiskrepo): 137 | """ 138 | Validates that using different pytorch seeds to the fit model results in 139 | the same artifact. 140 | """ 141 | fit1 = fit_model(seed=0) 142 | fit2 = fit_model(seed=1) 143 | 144 | assert p.hash(fit1) != p.hash(fit2) 145 | assert fit1.artifact.id != fit2.artifact.id 146 | assert fit1.artifact.value_id != fit2.artifact.value_id 147 | 148 | 149 | def test_same_seeds_result_in_same_models(dbdiskrepo): 150 | """ 151 | Validates that using the same pytorch seed to the fit model results in 152 | different artifacts. 
153 | """ 154 | fit1 = fit_model(seed=0) 155 | fit2 = fit_model(seed=0) 156 | 157 | assert p.hash(fit1) == p.hash(fit2) 158 | assert fit1.artifact.id == fit2.artifact.id 159 | assert fit1.artifact.value_id == fit2.artifact.value_id 160 | -------------------------------------------------------------------------------- /tests/provenance/test_repos.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from datetime import datetime 3 | 4 | import pandas as pd 5 | import pytest 6 | import sqlalchemy_utils.functions as sql_utils 7 | from conftest import artifact_record 8 | 9 | import provenance as p 10 | import provenance._commonstore as cs 11 | import provenance.blobstores as bs 12 | import provenance.repos as r 13 | 14 | 15 | def test_inputs_json(db_session): 16 | repo = r.DbRepo(db_session, bs.MemoryStore()) 17 | 18 | @p.provenance(version=0, name='initial_data', repo=repo) 19 | def load_data(filename, timestamp): 20 | return {'data': [1, 2, 3], 'timestamp': timestamp} 21 | 22 | @p.provenance(repo=repo) 23 | def process_data_X(data, process_x_inc, timestamp): 24 | _data = [i + process_x_inc for i in data['data']] 25 | return {'data': _data, 'timestamp': timestamp} 26 | 27 | @p.provenance(repo=repo) 28 | def process_data_Y(data, process_y_inc, timestamp): 29 | _data = [i + process_y_inc for i in data['data']] 30 | return {'data': _data, 'timestamp': timestamp} 31 | 32 | @p.provenance(repo=repo) 33 | def combine_processed_data(filename, inc_x, inc_y, timestamp): 34 | _data = [a + b for a, b in zip(inc_x['data'], inc_y['data'])] 35 | return {'data': _data, 'timestamp': timestamp} 36 | 37 | def pipeline(filename, timestamp, process_x_inc, process_y_inc): 38 | data = load_data(filename, timestamp) 39 | inc_x = process_data_X(data, process_x_inc, timestamp) 40 | inc_y = process_data_Y(data, process_y_inc, timestamp) 41 | res = combine_processed_data(filename, inc_x, inc_y, timestamp) 42 | return {'data': data, 'inc_x': inc_x, 'inc_y': inc_y, 'res': res} 43 | 44 | now = datetime(2016, 9, 27, 7, 51, 11, 613544) 45 | 46 | expected_inputs_json = { 47 | '__varargs': [], 48 | 'filename': 'foo-bar', 49 | 'timestamp': now, 50 | 'inc_x': 51 | { 52 | 'id': 'c74da9d379234901fe7a89e03fa800b0', # md5 53 | # "id": "2c33a362ebd51f830d0b245473ab6c1269674259", # sha1 54 | 'name': 'test_repos.process_data_X', 55 | 'type': 'ArtifactProxy', 56 | }, 57 | 'inc_y': 58 | { 59 | 'id': 'a1bd4d4ae1f33ae6379613618427f127', # md5 60 | # "id": "f9b1bb7a8aaf435fbf60b92cd88bf6c46604f702", # sha1 61 | 'name': 'test_repos.process_data_Y', 62 | 'type': 'ArtifactProxy', 63 | }, 64 | } 65 | 66 | results = pipeline(filename='foo-bar', process_x_inc=5, process_y_inc=10, timestamp=now) 67 | res = results['res'].artifact 68 | inputs_json = r._inputs_json(res.inputs) 69 | assert inputs_json == expected_inputs_json 70 | 71 | results = pipeline(filename='foo-bar', process_x_inc=5, process_y_inc=10, timestamp=now) 72 | res = results['res'].artifact 73 | inputs_json = r._inputs_json(res.inputs) 74 | assert inputs_json == expected_inputs_json 75 | 76 | 77 | def test_basic_repo_ops(repo): 78 | artifact = artifact_record() 79 | 80 | assert artifact.id not in repo 81 | repo.put(artifact) 82 | 83 | assert artifact.id in repo 84 | assert artifact in repo 85 | 86 | with pytest.raises(cs.KeyExistsError): 87 | repo.put(artifact) 88 | 89 | assert repo.get_by_id(artifact.id).id == artifact.id 90 | assert repo[artifact.id].id == artifact.id 91 | assert repo.get_by_value_id(artifact.value_id).id == 
artifact.id 92 | 93 | repo.delete(artifact.id) 94 | assert artifact.id not in repo 95 | if hasattr(repo, 'blobstore'): 96 | assert artifact.id not in repo.blobstore 97 | assert artifact.value_id not in repo.blobstore 98 | 99 | with pytest.raises(KeyError): 100 | repo.delete(artifact.id) 101 | 102 | with pytest.raises(KeyError): 103 | repo.get_by_id(artifact.id) 104 | 105 | with pytest.raises(KeyError): 106 | repo.get_by_value_id(artifact.id) 107 | 108 | 109 | @pytest.mark.parametrize('artifact_class', [r.ArtifactProxy, r.CallableArtifactProxy]) 110 | @pytest.mark.parametrize('copy_method', [copy.copy, copy.deepcopy]) 111 | def test_copy_Proxies(repo, artifact_class, copy_method): 112 | 113 | class Artifact: 114 | 115 | def __init__(self, id): 116 | self.id = id 117 | 118 | a = artifact_class({'a': 1, 'b': 2, 'c': 3}, Artifact('1')) 119 | b = copy_method(a) 120 | b['a'] = 10 121 | 122 | assert a['a'] != b['a'] 123 | 124 | 125 | def test_repo_set_put_and_finding(repo): 126 | artifact = artifact_record(id='123') 127 | repo.put(artifact) 128 | artifact_set = r.ArtifactSet([artifact.id], 'foo') 129 | repo.put_set(artifact_set) 130 | 131 | assert repo.get_set_by_id(artifact_set.id) == artifact_set 132 | found_set = repo.get_set_by_labels('foo') 133 | assert found_set.name == 'foo' 134 | assert found_set.artifact_ids == {'123'} 135 | 136 | 137 | def test_repo_raises_key_error_when_set_id_not_found(repo): 138 | with pytest.raises(KeyError): 139 | repo.get_set_by_id('foo') 140 | 141 | 142 | def test_repo_raises_key_error_when_set_name_not_found(repo): 143 | with pytest.raises(KeyError): 144 | repo.get_set_by_labels('foo') 145 | 146 | 147 | def test_repo_contains_set(repo): 148 | assert not repo.contains_set('foo') 149 | 150 | artifact = artifact_record(id='123') 151 | repo.put(artifact) 152 | artifact_set = r.ArtifactSet([artifact.id], 'foo') 153 | 154 | repo.put_set(artifact_set) 155 | assert repo.contains_set(artifact_set.id) 156 | 157 | 158 | def test_repo_delete_set(repo): 159 | artifact = artifact_record(id='123') 160 | repo.put(artifact) 161 | artifact_set = r.ArtifactSet(['123'], 'foo') 162 | repo.put_set(artifact_set) 163 | 164 | repo.delete_set(artifact_set.id) 165 | 166 | with pytest.raises(KeyError): 167 | repo.get_set_by_id(artifact_set.id) 168 | 169 | 170 | def test_permissions(atomic_repo): 171 | repo = atomic_repo 172 | artifact = artifact_record() 173 | 174 | repo._write = False 175 | assert not repo._write 176 | 177 | with pytest.raises(cs.PermissionError): 178 | repo.put(artifact) 179 | assert artifact not in repo 180 | 181 | repo._write = True 182 | repo.put(artifact) 183 | 184 | repo._read = False 185 | 186 | with pytest.raises(cs.PermissionError): 187 | repo.get_by_id(artifact.id) 188 | 189 | with pytest.raises(cs.PermissionError): 190 | repo.get_by_value_id(artifact.value_id) 191 | 192 | with pytest.raises(cs.PermissionError): 193 | repo.get_value(artifact.id) 194 | 195 | with pytest.raises(cs.PermissionError): 196 | repo.get_inputs(artifact) 197 | 198 | with pytest.raises(cs.PermissionError): 199 | artifact.id in repo 200 | 201 | repo._read = True 202 | assert repo.get_by_id(artifact.id) 203 | assert artifact.id in repo 204 | 205 | repo._delete = False 206 | with pytest.raises(cs.PermissionError): 207 | repo.delete(artifact.id) 208 | 209 | repo._delete = True 210 | repo.delete(artifact.id) 211 | assert artifact.id not in repo 212 | 213 | 214 | def test_chained_with_readonly(): 215 | read_repo = r.MemoryRepo([artifact_record(id='foo')], read=True, write=False, delete=False) 
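    # read_repo above is seeded with an existing artifact but is read-only
    # (write=False, delete=False); write_repo below starts empty and is the
    # only repo that new artifacts can land in.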
216 | write_repo = r.MemoryRepo(read=True, write=True, delete=False) 217 | repos = [read_repo, write_repo] 218 | chained = r.ChainedRepo(repos) 219 | 220 | # verify we read from the read-only store 221 | assert 'foo' in chained 222 | 223 | # but that it is not written to 224 | record = artifact_record(id='bar', value_id='baz') 225 | chained.put(record) 226 | assert 'bar' in chained 227 | assert 'bar' in write_repo 228 | assert 'bar' not in read_repo 229 | assert chained.get_by_value_id(record.value_id).id == record.id 230 | assert chained.get_by_id(record.id).id == record.id 231 | assert chained.get_value(record) == record.value 232 | 233 | 234 | def test_chained_read_through_write(): 235 | foo = artifact_record(id='foo') 236 | read_repo = r.MemoryRepo([foo], read=True, write=False) 237 | repo_ahead = r.MemoryRepo(read=True, write=True, read_through_write=True) 238 | read_through_write_repo = r.MemoryRepo(read=True, write=True, read_through_write=True) 239 | no_read_through_write_repo = r.MemoryRepo(read=True, write=True, read_through_write=False) 240 | repos = [no_read_through_write_repo, read_through_write_repo, read_repo, repo_ahead] 241 | chained_repo = r.ChainedRepo(repos) 242 | 243 | assert 'foo' not in read_through_write_repo 244 | assert 'foo' not in no_read_through_write_repo 245 | assert 'foo' not in repo_ahead 246 | # verify we read from the read-only store 247 | assert chained_repo['foo'].id == foo.id 248 | 249 | assert 'foo' in read_through_write_repo 250 | assert 'foo' not in repo_ahead 251 | assert 'foo' not in no_read_through_write_repo 252 | 253 | 254 | def test_chained_writes_may_be_allowed_on_read_throughs_only(): 255 | foo = artifact_record(id='foo') 256 | read_repo = r.MemoryRepo([foo], read=True, write=False) 257 | read_through_write_only_repo = r.MemoryRepo(read=True, write=False, read_through_write=True) 258 | write_repo = r.MemoryRepo(read=True, write=True, read_through_write=False) 259 | repos = [write_repo, read_through_write_only_repo, read_repo] 260 | chained_repo = r.ChainedRepo(repos) 261 | 262 | # verify we read from the read-only repo 263 | assert chained_repo['foo'].id == foo.id 264 | 265 | assert 'foo' in read_through_write_only_repo 266 | assert 'foo' not in write_repo 267 | 268 | bar = artifact_record(id='bar') 269 | chained_repo.put(bar) 270 | assert 'bar' in chained_repo 271 | assert 'bar' not in read_through_write_only_repo 272 | assert 'bar' in write_repo 273 | 274 | 275 | def test_db_is_automatically_created_and_migrated(disk_store): 276 | db_conn_str = 'postgresql://localhost/test_provenance_autocreate' 277 | if sql_utils.database_exists(db_conn_str): 278 | sql_utils.drop_database(db_conn_str) 279 | 280 | repo = r.PostgresRepo( 281 | db_conn_str, disk_store, read=True, write=True, delete=True, create_db=True 282 | ) 283 | p.set_default_repo(repo) 284 | 285 | @p.provenance() 286 | def calculate(a, b): 287 | return a + b 288 | 289 | assert sql_utils.database_exists(db_conn_str) 290 | 291 | # make sure it all works 292 | assert calculate(1, 2) == 3 293 | 294 | p.set_default_repo(None) 295 | sql_utils.drop_database(db_conn_str) 296 | 297 | 298 | def test_db_is_automatically_created_and_migrated_with_the_right_schema(disk_store): 299 | db_conn_str = 'postgresql://localhost/test_provenance_autocreate_schema' 300 | if sql_utils.database_exists(db_conn_str): 301 | sql_utils.drop_database(db_conn_str) 302 | 303 | repo = r.PostgresRepo( 304 | db_conn_str, 305 | disk_store, 306 | read=True, 307 | write=True, 308 | delete=True, 309 | create_db=True, 310 | 
schema='foobar', 311 | ) 312 | p.set_default_repo(repo) 313 | 314 | @p.provenance() 315 | def calculate(a, b): 316 | return a + b 317 | 318 | assert calculate(1, 2) == 3 319 | 320 | with repo.session() as s: 321 | res = pd.read_sql('select * from foobar.artifacts', s.connection()) 322 | 323 | repo2 = r.PostgresRepo( 324 | db_conn_str, 325 | disk_store, 326 | read=True, 327 | write=True, 328 | delete=True, 329 | create_db=True, 330 | schema='baz', 331 | ) 332 | 333 | p.set_default_repo(repo2) 334 | 335 | assert calculate(5, 5) == 10 336 | 337 | with repo2.session() as s: 338 | res = pd.read_sql('select * from baz.artifacts', s.connection()) 339 | 340 | assert res.iloc[0]['inputs_json'] == {'b': 5, 'a': 5, '__varargs': []} 341 | 342 | p.set_default_repo(None) 343 | sql_utils.drop_database(db_conn_str) 344 | 345 | 346 | def xtest_db_is_automatically_migrated(disk_store): 347 | db_conn_str = 'postgresql://localhost/test_provenance_automigrate' 348 | if sql_utils.database_exists(db_conn_str): 349 | sql_utils.drop_database(db_conn_str) 350 | 351 | sql_utils.create_database(db_conn_str) 352 | 353 | repo = r.PostgresRepo( 354 | db_conn_str, 355 | disk_store, 356 | read=True, 357 | write=True, 358 | delete=True, 359 | create_db=False, 360 | upgrade_db=True, 361 | ) 362 | p.set_default_repo(repo) 363 | 364 | @p.provenance() 365 | def calculate(a, b): 366 | return a + b 367 | 368 | # make sure it all works 369 | assert calculate(1, 2) == 3 370 | 371 | p.set_default_repo(None) 372 | sql_utils.drop_database(db_conn_str) 373 | 374 | 375 | def test_artifact_proxy_works_with_iterables(): 376 | 377 | class Foo: 378 | 379 | def __init__(self, a): 380 | self.a = a 381 | 382 | def __next__(self): 383 | return self.a 384 | 385 | foo = r.artifact_proxy(Foo(5), 'stub artifact') 386 | 387 | assert next(foo) == 5 388 | -------------------------------------------------------------------------------- /tests/provenance/test_utils.py: -------------------------------------------------------------------------------- 1 | import toolz as t 2 | 3 | import provenance.utils as u 4 | 5 | 6 | def test_fn_info_with_regular_function(): 7 | 8 | def inc(x): 9 | return x + 1 10 | 11 | info = u.fn_info(inc) 12 | assert info == {'name': 'inc', 'module': 'test_utils', 'varargs': (), 'kargs': {}} 13 | 14 | 15 | def test_fn_info_with_partial(): 16 | 17 | def mult(x, y): 18 | return x * y 19 | 20 | double = t.partial(mult, 2) 21 | info = u.fn_info(double) 22 | 23 | assert info == { 24 | 'name': 'mult', 25 | 'module': 'test_utils', 26 | 'varargs': (), 27 | 'kargs': { 28 | 'x': 2 29 | }, 30 | } 31 | 32 | 33 | def test_fn_info_with_partial_of_partial(): 34 | 35 | def mult(*args): 36 | return t.reduce(lambda a, b: a * b, args) 37 | 38 | double = t.partial(mult, 2) 39 | quad = t.partial(double, 2) 40 | info = u.fn_info(quad) 41 | 42 | assert info == { 43 | 'name': 'mult', 44 | 'module': 'test_utils', 45 | 'varargs': (2, 2), 46 | 'kargs': {}, 47 | } 48 | 49 | 50 | def test_fn_info_with_curry(): 51 | 52 | @t.curry 53 | def mult(x, y): 54 | return x * y 55 | 56 | double = mult(2) 57 | assert double(2) == 4 58 | info = u.fn_info(double) 59 | 60 | assert info == { 61 | 'name': 'mult', 62 | 'module': 'test_utils', 63 | 'varargs': (), 64 | 'kargs': { 65 | 'x': 2 66 | }, 67 | } 68 | 69 | 70 | def test_fn_info_with_multiple_curries(): 71 | 72 | @t.curry 73 | def mult(a, b, c): 74 | return a * b * c 75 | 76 | double = mult(2) 77 | quad = double(2) 78 | info = u.fn_info(quad) 79 | 80 | assert info == { 81 | 'name': 'mult', 82 | 'module': 'test_utils', 
83 | 'varargs': (), 84 | 'kargs': { 85 | 'a': 2, 86 | 'b': 2 87 | }, 88 | } 89 | 90 | 91 | def test_with_merged_defaults_basic_merging(): 92 | foo_defaults = {'a': 1, 'b': 2} 93 | 94 | @u.with_merged_defaults() 95 | def bar(foo=foo_defaults): 96 | return foo 97 | 98 | assert bar() == {'a': 1, 'b': 2} 99 | assert bar(foo={'c': 3}) == {'a': 1, 'b': 2, 'c': 3} 100 | assert bar(foo={'a': 10}) == {'a': 10, 'b': 2} 101 | 102 | 103 | def test_with_merged_defaults_with_non_dict_args(): 104 | foo_defaults = {'a': 1, 'b': 2} 105 | 106 | @u.with_merged_defaults() 107 | def bar(a, foo=foo_defaults, baz=None): 108 | return a, baz, foo 109 | 110 | assert bar(5) == (5, None, {'a': 1, 'b': 2}) 111 | assert bar(5, baz='baz', foo={'c': 3}) == (5, 'baz', {'a': 1, 'b': 2, 'c': 3}) 112 | 113 | 114 | def test_with_merged_defaults_with_args_splat(): 115 | foo_defaults = {'a': 1, 'b': 2} 116 | 117 | @u.with_merged_defaults() 118 | def bar(*args, foo=foo_defaults): 119 | return args, foo 120 | 121 | assert bar(5, 10) == ((5, 10), {'a': 1, 'b': 2}) 122 | assert bar() == ((), {'a': 1, 'b': 2}) 123 | --------------------------------------------------------------------------------
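A minimal usage sketch (not part of the repository) of the chained read-through-write behaviour exercised in tests/provenance/test_blobstores.py above, assuming only the MemoryStore and ChainedStore APIs shown in those tests:

import provenance.blobstores as bs

# A read-only 'remote' store fronted by a writable local cache. Reads that
# fall through to the remote store are copied back into the cache because it
# was constructed with read_through_write=True.
remote = bs.MemoryStore({'foo': 42}, read=True, write=False)
cache = bs.MemoryStore(read=True, write=True, read_through_write=True)
chained = bs.ChainedStore([cache, remote])

assert chained['foo'] == 42   # served by the remote store
assert 'foo' in cache         # ...and cached locally on the way back

chained.put('bar', 55)        # regular writes go to writable stores only
assert 'bar' in cache
assert 'bar' not in remote    # the read-only store is never written to

Store order matters here: only stores listed ahead of the one that satisfies a read, and flagged with read_through_write=True, receive the read-through copy, mirroring test_chained_read_through_write.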