├── .gitattributes ├── .github └── workflows │ └── linting.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .travis.yml ├── .yapfignore ├── CONTRIBUTING.rst ├── Dockerfile ├── HISTORY.rst ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── make.bat ├── readthedocs-environment.yml └── source │ ├── API.rst │ ├── conf.py │ ├── contributing.rst │ ├── hashing-and-mutation.ipynb │ ├── history.rst │ ├── images │ ├── keep_calm_pipeline_on.png │ └── lineage_example.png │ ├── index.rst │ ├── intro-guide.ipynb │ └── ml-pipeline.ipynb ├── environment.yml ├── examples ├── basic │ ├── README.md │ ├── basic_example.py │ └── environment.yml ├── experiment │ ├── README.md │ ├── config.yaml │ ├── environment.yml │ └── experiment_example.py └── sftp │ ├── README.md │ ├── environment.yml │ └── sftp_example.py ├── provenance ├── __init__.py ├── _commonstore.py ├── _config.py ├── _dependencies.py ├── _version.py ├── alembic.ini ├── artifact_hasher.py ├── blobstores.py ├── core.py ├── google_storage.py ├── hashing.py ├── migrations │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ └── e0317ab07ba4_initial_schema.py ├── models.py ├── repos.py ├── serializers.py ├── sftp │ └── __init__.py ├── test_serializers.py ├── utils.py └── vis │ ├── __init__.py │ └── utils.py ├── readthedocs.yml ├── release-procedure.md ├── requirements.txt ├── setup.cfg ├── setup.py ├── test_requirements.txt ├── tests └── provenance │ ├── conftest.py │ ├── strategies.py │ ├── test_blobstores.py │ ├── test_config.py │ ├── test_core.py │ ├── test_hashing.py │ ├── test_pytorch.py │ ├── test_repos.py │ └── test_utils.py └── versioneer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | provenance/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/linting.yaml: -------------------------------------------------------------------------------- 1 | name: code-style 2 | 3 | on: 4 | push: 5 | branches: "*" 6 | pull_request: 7 | branches: trunk 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v2 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | - name: Lint via pre-commit checks 20 | uses: pre-commit/action@v2.0.0 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | *.egg-info 4 | .benchmarks 5 | docs/build 6 | build/ 7 | dist/ 8 | .idea/ 9 | log.* 10 | log 11 | .coverage 12 | .DS_Store 13 | *.swp 14 | *.swo 15 | scratch/ 16 | .hypothesis 17 | .cache 18 | .eggs 19 | .artifacts 20 | examples/**/artifacts/ 21 | examples/**/blobstore/ 22 | .ipynb_checkpoints 23 | docs/build 24 | .pytest_cache/ 25 | .vscode/ 26 | .mypy_cache/ 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v3.3.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: end-of-file-fixer 8 | - id: check-docstring-first 9 | - id: check-json 10 | - id: check-yaml 11 | - id: double-quote-string-fixer 12 | 13 | - repo: https://github.com/pre-commit/mirrors-yapf 14 | rev: v0.30.0 15 | hooks: 16 | - id: yapf 17 | args: 
['--parallel', '--in-place'] 18 | 19 | - repo: https://github.com/asottile/seed-isort-config 20 | rev: v2.2.0 21 | hooks: 22 | - id: seed-isort-config 23 | - repo: https://github.com/pre-commit/mirrors-isort 24 | rev: v5.6.4 25 | hooks: 26 | - id: isort 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | dist: xenial 4 | addons: 5 | postgresql: "9.5" 6 | services: 7 | - postgresql 8 | before_script: 9 | - psql -c 'create database test_provenance;' -U postgres 10 | env: 11 | global: 12 | - DB=postgresql://postgres@localhost/test_provenance 13 | 14 | python: 15 | # We don't actually use the Travis Python, but this keeps it organized. For now only python 3.5 is supported. 16 | # - "2.7" 17 | # - "3.3" 18 | # - "3.4" 19 | - "3.5" 20 | 21 | install: 22 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 23 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 24 | else 25 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 26 | fi 27 | - bash miniconda.sh -b -p $HOME/miniconda 28 | - export PATH="$HOME/miniconda/bin:$PATH" 29 | - hash -r 30 | - conda config --set always_yes yes --set changeps1 no 31 | - conda update -q conda 32 | # Useful for debugging any issues with conda. 33 | - conda info -a 34 | - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION 35 | - source activate test-environment 36 | - conda install numpy 37 | - conda install -c conda-forge pyarrow 38 | - pip install -r test_requirements.txt 39 | # Due to [this issue](https://github.com/boto/botocore/issues/1872), we have 40 | # to explicitly install a specific version of dateutil. Note, this is not 41 | # being added to the requirements file as this does not affect local builds, 42 | # only the travis environment which is using boto. 43 | - pip install python-dateutil==2.8.0 44 | - python setup.py install 45 | 46 | script: pytest -v 47 | -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | versioneer.py 2 | provenance/_version.py 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every 6 | little bit helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | Types of Contributions 11 | ---------------------- 12 | 13 | Report Bugs 14 | ~~~~~~~~~~~ 15 | 16 | Report bugs at https://github.com/bmabey/provenance/issues. 17 | 18 | If you are reporting a bug, please include: 19 | 20 | * Your operating system name and version. 21 | * Any details about your local setup that might be helpful in troubleshooting. 22 | * Detailed steps to reproduce the bug. 23 | 24 | Fix Bugs 25 | ~~~~~~~~ 26 | 27 | Look through the GitHub issues for bugs. Anything tagged with "bug" 28 | is open to whoever wants to implement it. 29 | 30 | Implement Features 31 | ~~~~~~~~~~~~~~~~~~ 32 | 33 | Look through the GitHub issues for features. Anything tagged with "feature" 34 | is open to whoever wants to implement it. 
35 | 36 | Write Documentation 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | provenance could always use more documentation, whether as part of the 40 | official provenance docs, in docstrings, or even on the web in blog posts, 41 | articles, and such. 42 | 43 | Submit Feedback 44 | ~~~~~~~~~~~~~~~ 45 | 46 | The best way to send feedback is to file an issue at https://github.com/bmabey/provenance/issues. 47 | 48 | If you are proposing a feature: 49 | 50 | * Explain in detail how it would work. 51 | * Keep the scope as narrow as possible, to make it easier to implement. 52 | * Remember that this is a volunteer-driven project, and that contributions 53 | are welcome :) 54 | 55 | Get Started! 56 | ------------ 57 | 58 | Ready to contribute? Here's how to set up `provenance` for local development. 59 | 60 | 1. Fork the `provenance` repo on GitHub. 61 | 2. Clone your fork locally:: 62 | 63 | $ git clone git@github.com:your_name_here/provenance.git 64 | 65 | 3. Setup your development environment. Assuming you have conda installed, the following commands can be used to create a development environment:: 66 | 67 | Initial environment creation 68 | 69 | .. code:: bash 70 | 71 | conda env create 72 | source activate provenance-dev 73 | pip install -r requirements.txt 74 | pip install -r test_requirements.txt 75 | 76 | Reactivating the environment after it has been created 77 | 78 | .. code:: bash 79 | 80 | source activate provenance-dev 81 | 82 | 4. Create a branch for local development:: 83 | 84 | $ git checkout -b name-of-your-bugfix-or-feature 85 | 86 | Now you can make your changes locally. 87 | 88 | 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: 89 | 90 | $ flake8 provenance tests 91 | $ python setup.py test 92 | 93 | 6. Commit your changes and push your branch to GitHub:: 94 | 95 | $ git add . 96 | $ git commit -m "Your detailed description of your changes." 97 | $ git push origin name-of-your-bugfix-or-feature 98 | 99 | 7. Submit a pull request through the GitHub website. 100 | 101 | Pull Request Guidelines 102 | ----------------------- 103 | 104 | Before you submit a pull request, check that it meets these guidelines: 105 | 106 | 1. The pull request should include tests. 107 | 2. If the pull request adds functionality, the docs should be updated. Put 108 | your new functionality into a function with a docstring. Consider updating 109 | a guide or other documentation as well. 110 | 3. The pull request should pass the all the TravisCI builds. 
111 | https://travis-ci.org/bmabey/provenance/pull_requests
112 | -------------------------------------------------------------------------------- /Dockerfile: --------------------------------------------------------------------------------
1 | FROM andrewosh/binder-base
2 |
3 | MAINTAINER Ben Mabey
4 |
5 | USER root
6 |
7 | RUN apt-get update -y && \
8 | apt-get install -y postgresql postgresql-contrib && \
9 | service postgresql start
10 |
11 | USER main
12 |
13 | ADD environment.yml /home/main/environment.yml
14 | RUN /home/main/anaconda/bin/conda install nb_conda_kernels && \
15 | cd /home/main && /home/main/anaconda/bin/conda env create && \
16 | /bin/bash -c "source /home/main/anaconda/bin/activate provenance-dev && pip install git+https://github.com/bmabey/provenance"
17 |
18 |
19 | CMD ["/bin/bash"]
20 | -------------------------------------------------------------------------------- /HISTORY.rst: --------------------------------------------------------------------------------
1 | .. :changelog:
2 |
3 | History
4 | =======
5 |
6 |
7 | 0.14.0 (2020-10-22)
8 | -------------------
9 |
10 | Thanks to Anderson Banihirwe, @andersy005, for this release!
11 |
12 | * Updates joblib pin "joblib>=0.15.0" and related code.
13 | * Tests and code formatting improvements!
14 |
15 | 0.13.0 (2019-12-02)
16 | -------------------
17 |
18 | Thanks to Dan Maljovec, @dmaljovec, for these fixes and additions!
19 |
20 | * Updates ``wrapt`` dependency and makes Artifact proxies compatible.
21 | * Adds optional PyTorch model serialization.
22 | * Adds a helpful error message when a user does not set a default repo.
23 |
24 | 0.12.0 (2018-10-08)
25 | -------------------
26 | * Changes the default hashing algorithm from SHA1 to MD5 for performance reasons.
27 | * Extends serialization so the serializer used is inferred from the value's type.
28 | * Makes Parquet the default serializer for Pandas DataFrames and Series.
29 | * (breaking change!) Removes names from ArtifactSets; a JSONB of labels is used instead.
30 | * Doc tweaks.
31 |
32 | 0.11.0 (2018-08-23)
33 | -------------------
34 | * Optional Google Storage support.
35 | * Adds `persistent_connections` option to the Postgres repo so NullPool can be used when appropriate.
36 | * Doc tweaks.
37 |
38 |
39 | 0.10.0 (2016-04-30)
40 | -------------------
41 |
42 | * Changes the default artifact name from the function name to the fully qualified module and function name.
43 | This will invalidate previously cached artifacts unless the names are migrated or explicitly set.
44 | * Documentation! A start at least; more docstrings and guides will be added soon.
45 | * Adds ``use_cache`` parameter and config option for when you only want to track provenance but not look for cache hits.
46 | * Adds ``check_mutations`` option to prevent ``Artifact`` value mutations.
47 | * Adds ``tags`` parameter to the ``provenance`` decorator for tagging the resulting artifacts.
48 | * Adds experimental (alpha!) ``keras`` support.
49 | * Adds a visualization module, pretty basic and mostly for docs and to illustrate what is possible.
50 | * Adds ``ensure_proxies`` decorator to guard against non-``ArtifactProxy`` arguments being sent to functions.
51 |
52 | 0.9.4.2 (2016-03-23)
53 | ---------------------
54 |
55 | * Improved error reporting when paramiko is not present for the SFTP store.
56 |
57 | 0.9.4.1 (2016-03-22) (0.9.4 was a bad release)
58 | ----------------------------------------------
59 |
60 | * Adds the ability for a database and/or schema to be created when it doesn't exist.
61 | * Adds SFTP blobstore as separate package provenance[sftp]. 62 | * Adds examples to illustrate how the library is used. 63 | 64 | 0.9.3 (2016-02-17) 65 | --------------------- 66 | 67 | * Patch release to fix packaging problems in 0.9.2. 68 | 69 | 0.9.2 (2016-02-17) 70 | --------------------- 71 | 72 | * Adds archive_file feature. 73 | 74 | 0.9.1 (2015-10-05) 75 | --------------------- 76 | 77 | * Python versions now supported: 2.7, 3.3, 3.4, 3.5 78 | 79 | 0.9.0 (2015-10-05) 80 | --------------------- 81 | 82 | * First release on PyPI. Basic functionality but lacking in docs. 83 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Savvysherpa and contributors 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include provenance *.py 2 | recursive-include docs *.rst 3 | include provenance/migrations/* 4 | include provenance/vis/* 5 | include provenance/vis 6 | include provenance/migrations/versions/* 7 | include provenance/alembic.ini 8 | 9 | include setup.py 10 | include requirements.txt 11 | include test_requirements.txt 12 | include README.rst 13 | include LICENSE.txt 14 | include HISTORY.rst 15 | include MANIFEST.in 16 | 17 | prune docs/build 18 | include versioneer.py 19 | include provenance/_version.py 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean-pyc clean-build docs clean 2 | 3 | help: 4 | @echo "clean - remove all build, test, coverage and Python artifacts" 5 | @echo "clean-build - remove build artifacts" 6 | @echo "clean-pyc - remove Python file artifacts" 7 | @echo "clean-test - remove test and coverage artifacts" 8 | # @echo "lint - check style with flake8" 9 | @echo "test - run tests quickly with the default Python" 10 | @echo "test-all - run tests on every Python version with tox" 11 | # @echo "coverage - check code coverage quickly with the default Python" # 12 | # @echo "docs - generate Sphinx HTML documentation, including API docs" 13 | @echo "release - package and upload a release" 14 | @echo "dist - package" 15 | @echo "install - install the package to the active Python's site-packages" 16 | 17 | clean: clean-build clean-pyc clean-test 18 | 19 | clean-build: 20 | rm -rf build/ 21 | rm -rf dist/ 22 | rm -rf .eggs/ 23 | find . -name '*.egg-info' -exec rm -rf {} + 24 | find . -name '*.egg' -exec rm -rf {} + 25 | 26 | clean-pyc: 27 | find . -name '*.pyc' -exec rm -f {} + 28 | find . -name '*.pyo' -exec rm -f {} + 29 | find . -name '*~' -exec rm -f {} + 30 | find . -name '__pycache__' -exec rm -rf {} + 31 | 32 | clean-test: 33 | rm -rf .tox/ 34 | rm -f .coverage 35 | rm -rf htmlcov/ 36 | 37 | # lint: 38 | # flake8 provenance tests 39 | 40 | test: 41 | python setup.py test 42 | 43 | test-all: 44 | tox 45 | 46 | # coverage: 47 | # coverage run --source provenance setup.py test 48 | # coverage report -m 49 | # coverage html 50 | # open htmlcov/index.html 51 | 52 | 53 | release: dist 54 | twine upload dist/* 55 | 56 | dist: clean 57 | python setup.py sdist bdist_wheel 58 | ls -l dist 59 | 60 | install: clean 61 | python setup.py install 62 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | provenance 3 | ========== 4 | 5 | |version status| |conda-version status| |build status| |docs| 6 | 7 | 8 | .. |version status| image:: https://img.shields.io/pypi/v/provenance.svg 9 | :target: https://pypi.python.org/pypi/provenance 10 | :alt: Version Status 11 | .. |conda-version status| image:: https://img.shields.io/conda/vn/conda-forge/provenance 12 | :target: https://anaconda.org/conda-forge/provenance 13 | :alt: Conda version Status 14 | .. |build status| image:: https://travis-ci.org/bmabey/provenance.png?branch=trunk 15 | :target: https://travis-ci.org/bmabey/provenance 16 | :alt: Build Status 17 | .. 
|docs| image:: https://readthedocs.org/projects/provenance/badge/?version=latest 18 | :target: https://provenance.readthedocs.org 19 | :alt: Documentation Status 20 | 21 | ``provenance`` is a Python library for function-level caching and provenance that aids in 22 | creating Parsimonious Pythonic |Pipelines|. By wrapping functions in the ``provenance`` 23 | decorator computed results are cached across various tiered stores (disk, S3, SFTP) and 24 | `provenance `_ (i.e. lineage) information is tracked 25 | and stored in an artifact repository. A central artifact repository can be used to enable 26 | production pipelines, team collaboration, and reproducible results. The library is general 27 | purpose but was built with machine learning pipelines in mind. By leveraging the fantastic 28 | `joblib`_ library object serialization is optimized for ``numpy`` and other PyData libraries. 29 | 30 | What that means in practice is that you can easily keep track of how artifacts (models, 31 | features, or any object or file) are created, where they are used, and have a central place 32 | to store and share these artifacts. This basic plumbing is required (or at least desired!) 33 | in any machine learning pipeline and project. ``provenance`` can be used standalone along with 34 | a build server to run pipelines or in conjunction with more advanced workflow systems 35 | (e.g. `Airflow`_, `Luigi`_). 36 | 37 | .. |Pipelines| unicode:: Pipelines U+2122 38 | .. _joblib: https://pythonhosted.org/joblib/ 39 | .. _Airflow: http://airbnb.io/projects/airflow/ 40 | .. _Luigi: https://github.com/spotify/luigi 41 | 42 | Example 43 | ======= 44 | 45 | For an explanation of this example please see the `Introductory Guide`_. 46 | 47 | .. code-block:: python 48 | 49 | import provenance as p 50 | 51 | p.load_config(...) 52 | 53 | import time 54 | 55 | @p.provenance() 56 | def expensive_add(a, b): 57 | time.sleep(2) 58 | return a + b 59 | 60 | 61 | @p.provenance() 62 | def expensive_mult(a, b): 63 | time.sleep(2) 64 | return a * b 65 | 66 | 67 | a1 = expensive_add(4, 3) 68 | a2 = expensive_add(1, 1) 69 | 70 | result = expensive_mult(a1, a2) 71 | 72 | vis.visualize_lineage(result) 73 | 74 | 75 | .. image:: https://raw.githubusercontent.com/bmabey/provenance/trunk/docs/source/images/lineage_example.png 76 | 77 | 78 | .. _Introductory Guide: http://provenance.readthedocs.io/en/latest/intro-guide.html 79 | 80 | Installation 81 | ============ 82 | 83 | For the base functionality: 84 | 85 | .. code:: bash 86 | 87 | pip install provenance 88 | 89 | 90 | For the visualization module (which requires ``graphviz`` to be installed): 91 | 92 | .. code:: bash 93 | 94 | pip install provenance[vis] 95 | 96 | For the SFTP store: 97 | 98 | .. code:: bash 99 | 100 | pip install provenance[sftp] 101 | 102 | For everything all at once: 103 | 104 | 105 | .. code:: bash 106 | 107 | pip install provenance[all] 108 | 109 | provenance is also available from conda-forge for conda installations: 110 | 111 | .. code:: bash 112 | 113 | conda install -c conda-forge provenance 114 | 115 | 116 | 117 | Compatibility 118 | ============= 119 | 120 | ``provenance`` is currently only compatible with Python 3.5 and higher. Updating it to work with Python 2.7x 121 | should be easy, follow this `ticket`_ if you are interested in that. 122 | 123 | 124 | .. 
_ticket: https://github.com/bmabey/provenance/issues/32 125 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = provenance 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | 23 | clean-provenance: 24 | rm -rf /tmp/provenance-intro-artifacts 25 | rm -rf /tmp/provenance-ml-artifacts 26 | dropdb provenance-intro 27 | dropdb provenance-ml-guide 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=provenance 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/readthedocs-environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python==3.5 5 | - numpy 6 | - pandas 7 | - nbconvert 8 | - ipykernel 9 | - alembic 10 | - numpydoc 11 | - sphinx 12 | - pandoc 13 | # for ML examples 14 | - scikit-learn 15 | - pip: 16 | - nbsphinx 17 | - yamlmagic 18 | - sphinxcontrib-websupport 19 | - s3fs>=0.0.8 20 | - boltons>=16.5.1 21 | - joblib>=0.10.2 22 | - toolz>=0.8.2 23 | - cloudpickle>=0.2.1 24 | - psutil>=5.0.0 25 | - ordered-set>=2.0.1 26 | - sqlalchemy>=1.1.3 27 | - sqlalchemy-utils>=0.32.12 28 | - memoized-property>=1.0.2 29 | - wrapt>=1.10.8 30 | - psycopg2 31 | - numpy 32 | -------------------------------------------------------------------------------- /docs/source/API.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. currentmodule:: provenance 5 | 6 | Primary API 7 | ~~~~~~~~~~~~~~~~ 8 | 9 | .. 
autosummary:: 10 | provenance 11 | load_artifact 12 | load_proxy 13 | ensure_proxies 14 | promote 15 | provenance_set 16 | capture_set 17 | create_set 18 | load_set_by_id 19 | load_set_by_name 20 | archive_file 21 | 22 | Configuration 23 | ~~~~~~~~~~~~~ 24 | 25 | .. autosummary:: 26 | from_config 27 | load_config 28 | load_yaml_config 29 | current_config 30 | get_repo_by_name 31 | set_default_repo 32 | get_default_repo 33 | set_check_mutations 34 | get_check_mutations 35 | set_run_info_fn 36 | get_use_cache 37 | set_use_cache 38 | using_repo 39 | 40 | 41 | Utils 42 | ~~~~~ 43 | 44 | .. autosummary:: 45 | is_proxy 46 | lazy_dict 47 | lazy_proxy_dict 48 | 49 | Visualization 50 | ~~~~~~~~~~~~~ 51 | 52 | .. currentmodule:: provenance.vis 53 | 54 | .. autosummary:: 55 | visualize_lineage 56 | 57 | 58 | Detailed Docs 59 | ~~~~~~~~~~~~~ 60 | 61 | .. currentmodule:: provenance 62 | 63 | 64 | Primary API 65 | 66 | .. autofunction:: provenance 67 | .. autofunction:: load_artifact 68 | .. autofunction:: load_proxy 69 | .. autofunction:: ensure_proxies 70 | .. autofunction:: promote 71 | .. autofunction:: provenance_set 72 | .. autofunction:: capture_set 73 | .. autofunction:: create_set 74 | .. autofunction:: load_set_by_id 75 | .. autofunction:: load_set_by_name 76 | .. autofunction:: archive_file 77 | 78 | 79 | Configuration 80 | 81 | .. autofunction:: from_config 82 | .. autofunction:: load_config 83 | .. autofunction:: load_yaml_config 84 | .. autofunction:: current_config 85 | .. autofunction:: get_repo_by_name 86 | .. autofunction:: set_default_repo 87 | .. autofunction:: get_default_repo 88 | .. autofunction:: set_check_mutations 89 | .. autofunction:: get_check_mutations 90 | .. autofunction:: set_run_info_fn 91 | .. autofunction:: get_use_cache 92 | .. autofunction:: set_use_cache 93 | .. autofunction:: using_repo 94 | 95 | 96 | Utils 97 | 98 | .. autofunction:: is_proxy 99 | .. autofunction:: lazy_dict 100 | .. autofunction:: lazy_proxy_dict 101 | 102 | Visualization (beta) 103 | 104 | .. currentmodule:: provenance.vis 105 | 106 | .. autofunction:: visualize_lineage 107 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # provenance documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Apr 29 08:47:13 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | # The version info for the project you're documenting, acts as replacement for 24 | # |version| and |release|, also used in various other places throughout the 25 | # built documents. 
26 | # 27 | from provenance import __version__ as version 28 | 29 | sys.path.insert(0, os.path.abspath('../../')) 30 | 31 | # -- General configuration ------------------------------------------------ 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | # 35 | # needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | 41 | extensions = [ 42 | 'nbsphinx', 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.doctest', 46 | 'sphinx.ext.coverage', 47 | 'sphinx.ext.viewcode', 48 | 'numpydoc', 49 | ] 50 | 51 | numpydoc_show_class_members = False 52 | 53 | exclude_patterns = ['_build', '**.ipynb_checkpoints'] 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path = ['_templates'] 57 | 58 | # The suffix(es) of source filenames. 59 | # You can specify multiple suffix as a list of string: 60 | # 61 | # source_suffix = ['.rst', '.md'] 62 | source_suffix = ['.rst', '.ipynb'] 63 | 64 | # The master toctree document. 65 | master_doc = 'index' 66 | 67 | # General information about the project. 68 | project = 'provenance' 69 | copyright = '2017, Ben Mabey' 70 | author = 'Ben Mabey' 71 | 72 | release = version 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | # This patterns also effect to html_static_path and html_extra_path 84 | exclude_patterns = [] 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # If true, `todo` and `todoList` produce output, else they produce nothing. 90 | todo_include_todos = False 91 | 92 | # -- Options for HTML output ---------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | 98 | # Taken from docs.readthedocs.io: 99 | # on_rtd is whether we are on readthedocs.io 100 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 101 | 102 | if not on_rtd: # only import and set the theme if we're building docs locally 103 | import sphinx_rtd_theme 104 | 105 | html_theme = 'sphinx_rtd_theme' 106 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 111 | # 112 | # html_theme_options = {} 113 | 114 | # Add any paths that contain custom static files (such as style sheets) here, 115 | # relative to this directory. They are copied after the builtin static files, 116 | # so a file named "default.css" will overwrite the builtin "default.css". 117 | html_static_path = ['_static'] 118 | 119 | # -- Options for HTMLHelp output ------------------------------------------ 120 | 121 | # Output file base name for HTML help builder. 
122 | htmlhelp_basename = 'provenancedoc' 123 | 124 | # -- Options for LaTeX output --------------------------------------------- 125 | 126 | latex_elements = { 127 | # The paper size ('letterpaper' or 'a4paper'). 128 | # 129 | # 'papersize': 'letterpaper', 130 | # The font size ('10pt', '11pt' or '12pt'). 131 | # 132 | # 'pointsize': '10pt', 133 | # Additional stuff for the LaTeX preamble. 134 | # 135 | # 'preamble': '', 136 | # Latex figure (float) alignment 137 | # 138 | # 'figure_align': 'htbp', 139 | } 140 | 141 | # Grouping the document tree into LaTeX files. List of tuples 142 | # (source start file, target name, title, 143 | # author, documentclass [howto, manual, or own class]). 144 | latex_documents = [ 145 | (master_doc, 'provenance.tex', 'provenance Documentation', 'Ben Mabey', 'manual'), 146 | ] 147 | 148 | # -- Options for manual page output --------------------------------------- 149 | 150 | # One entry per manual page. List of tuples 151 | # (source start file, name, description, authors, manual section). 152 | man_pages = [(master_doc, 'provenance', 'provenance Documentation', [author], 1)] 153 | 154 | # -- Options for Texinfo output ------------------------------------------- 155 | 156 | # Grouping the document tree into Texinfo files. List of tuples 157 | # (source start file, target name, title, author, 158 | # dir menu entry, description, category) 159 | texinfo_documents = [ 160 | ( 161 | master_doc, 162 | 'provenance', 163 | 'provenance Documentation', 164 | author, 165 | 'provenance', 166 | 'Provenance and caching library for functions, built for creating lightweight machine learning pipelines.', 167 | 'Miscellaneous', 168 | ), 169 | ] 170 | 171 | extlinks = { 172 | 'issue': ('https://github.com/bmabey/provenance/issues/%s', 'GH#'), 173 | 'pr': ('https://github.com/bmabey/provenance/pull/%s', 'GH#'), 174 | } 175 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | ../../CONTRIBUTING.rst -------------------------------------------------------------------------------- /docs/source/history.rst: -------------------------------------------------------------------------------- 1 | ../../HISTORY.rst -------------------------------------------------------------------------------- /docs/source/images/keep_calm_pipeline_on.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmabey/provenance/d29ad2ffc39fbc389600df092da9e7df4f920100/docs/source/images/keep_calm_pipeline_on.png -------------------------------------------------------------------------------- /docs/source/images/lineage_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bmabey/provenance/d29ad2ffc39fbc389600df092da9e7df4f920100/docs/source/images/lineage_example.png -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../README.rst 2 | 3 | 4 | 5 | Index 6 | ----- 7 | 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Guides 12 | 13 | intro-guide.ipynb 14 | ml-pipeline.ipynb 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Main Concepts 19 | 20 | hashing-and-mutation.ipynb 21 | API.rst 22 | 23 | .. 
toctree:: 24 | :maxdepth: 1 25 | :caption: Project Information 26 | 27 | contributing.rst 28 | history.rst 29 | -------------------------------------------------------------------------------- /docs/source/ml-pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "ein.tags": [ 7 | "worksheet-0" 8 | ], 9 | "slideshow": { 10 | "slide_type": "-" 11 | } 12 | }, 13 | "source": [ 14 | "# Machine Learning Pipeline\n", 15 | "\n", 16 | "** WORK IN PROGRESS ** This guide isn't complete but the code examples may be useful as is.\n", 17 | "\n", 18 | "This guide assumes you are familiar with all the content in the [Introductory Guide](intro-guide.ipynb).\n", 19 | "\n", 20 | "A typical machine learning pipeline consists of loading data, extracting features, training models and storing the models for later use in a production system or further analysis. In some cases the feature extraction process is quick and the features are transitory without any need of saving them independently of the finished trained model. Other times the features are a representation of the data that you wish to reuse in different settings, e.g. in a dashboard explaining predictions, ad-hoc analysis, further model development. \n", 21 | "\n", 22 | "In the end a good deal of plumbing is required to wire up an app/service with the latest models and features in such a way that API calls can be traced back to the originating model, features, and even data sources. `provenance` abstracts much of this plumbing so you can focus on writing parsimonious pythonic pipelines™ with plain old functions." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "autoscroll": "json-false", 30 | "ein.tags": [ 31 | "worksheet-0" 32 | ], 33 | "slideshow": { 34 | "slide_type": "-" 35 | } 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "%load_ext yamlmagic" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "autoscroll": "json-false", 47 | "ein.tags": [ 48 | "worksheet-0" 49 | ], 50 | "slideshow": { 51 | "slide_type": "-" 52 | } 53 | }, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "application/javascript": [ 58 | "\n", 59 | " require(\n", 60 | " [\n", 61 | " \"notebook/js/codecell\",\n", 62 | " \"codemirror/mode/yaml/yaml\"\n", 63 | " ],\n", 64 | " function(cc){\n", 65 | " cc.CodeCell.options_default.highlight_modes.magic_yaml = {\n", 66 | " reg: [\"^%%yaml\"]\n", 67 | " }\n", 68 | " }\n", 69 | " );\n", 70 | " " 71 | ], 72 | "text/plain": [ 73 | "" 74 | ] 75 | }, 76 | "metadata": {}, 77 | "output_type": "display_data" 78 | } 79 | ], 80 | "source": [ 81 | "%%yaml basic_config\n", 82 | "blobstores:\n", 83 | " disk:\n", 84 | " type: disk\n", 85 | " cachedir: /tmp/provenance-ml-artifacts\n", 86 | " read: True\n", 87 | " write: True\n", 88 | " delete: True\n", 89 | "artifact_repos:\n", 90 | " local:\n", 91 | " type: postgres\n", 92 | " db: postgresql://localhost/provenance-ml-guide\n", 93 | " store: 'disk'\n", 94 | " read: True\n", 95 | " write: True\n", 96 | " delete: True\n", 97 | " # this option will create the database if it doesn't exist\n", 98 | " create_db: True\n", 99 | "default_repo: local" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": { 106 | "autoscroll": "json-false", 107 | "ein.tags": [ 108 | "worksheet-0" 109 | ], 110 | "slideshow": { 111 | "slide_type": "-" 112 | } 113 | }, 114 | 
"outputs": [ 115 | { 116 | "name": "stderr", 117 | "output_type": "stream", 118 | "text": [ 119 | "INFO [alembic.runtime.migration] Context impl PostgresqlImpl.\n", 120 | "INFO [alembic.runtime.migration] Will assume transactional DDL.\n", 121 | "INFO [alembic.runtime.migration] Running stamp_revision -> e0317ab07ba4\n" 122 | ] 123 | }, 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "" 128 | ] 129 | }, 130 | "execution_count": 3, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "import provenance as p\n", 137 | "\n", 138 | "\n", 139 | "p.load_config(basic_config)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": { 146 | "autoscroll": "json-false", 147 | "collapsed": true, 148 | "ein.tags": [ 149 | "worksheet-0" 150 | ], 151 | "slideshow": { 152 | "slide_type": "skip" 153 | } 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "import numpy as np\n", 158 | "import pandas as pd\n", 159 | "import time\n", 160 | "from sklearn.utils import check_random_state\n", 161 | "import toolz as t" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": { 168 | "autoscroll": "json-false", 169 | "collapsed": true, 170 | "ein.tags": [ 171 | "worksheet-0" 172 | ], 173 | "slideshow": { 174 | "slide_type": "-" 175 | } 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "@p.provenance()\n", 180 | "def load_data(query):\n", 181 | " # fetch something from the DB in real life...\n", 182 | " random_state = check_random_state(abs(hash(query)) // (10**10))\n", 183 | " return random_state.uniform(0, 10, 10)\n", 184 | "\n", 185 | "\n", 186 | "@p.provenance()\n", 187 | "def extract_features_a(data, hyperparam_a=5):\n", 188 | " time.sleep(2)\n", 189 | " rs = check_random_state(hyperparam_a)\n", 190 | " return data[0:5] + 1 + rs.rand(5)\n", 191 | "\n", 192 | "\n", 193 | "@p.provenance()\n", 194 | "def extract_features_b(data, hyperparam_x=10):\n", 195 | " time.sleep(2)\n", 196 | " rs = check_random_state(hyperparam_x)\n", 197 | " return data[5:] + 1 + rs.rand(5)\n", 198 | "\n", 199 | "\n", 200 | "@p.provenance()\n", 201 | "def build_model(features_a, features_b, num_trees=100):\n", 202 | " return {'whatever': 'special model with {} trees'.format(num_trees)}\n", 203 | "\n", 204 | "\n", 205 | "@p.provenance()\n", 206 | "def evaluate(model, data):\n", 207 | " return {'some_metric': 0.5, 'another_metric': 0.4}\n", 208 | "\n", 209 | "\n", 210 | "def pipeline(train_query='some query', valid_query=\"another query\", hyperparam_a=5, hyperparam_x=10):\n", 211 | " data = load_data(\"some query\")\n", 212 | " features_a = extract_features_a(data, hyperparam_a)\n", 213 | " features_b = extract_features_b(data, hyperparam_x)\n", 214 | " model = build_model(data, features_a, features_b)\n", 215 | "\n", 216 | " validation_data = load_data(\"another query\")\n", 217 | " evaluation = evaluate(model, validation_data)\n", 218 | "\n", 219 | " return {'features_a': features_a, 'features_b': features_b,\n", 220 | " 'model': model, 'evaluation': evaluation}\n", 221 | "\n", 222 | "\n", 223 | "@p.provenance()\n", 224 | "def make_decision(model, request):\n", 225 | " # make some sort of prediction, classification, with the model\n", 226 | " # to help make a 'decision' and return it as the result\n", 227 | " return {'prediction': 0.5, 'model': model.artifact.id}" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "**TODO** explain everything.. 
including the concept of artifact sets and how they simpify the building and deployment of models." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "metadata": { 241 | "autoscroll": "json-false", 242 | "collapsed": true, 243 | "ein.tags": [ 244 | "worksheet-0" 245 | ], 246 | "slideshow": { 247 | "slide_type": "-" 248 | } 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "def run_production_pipeline():\n", 253 | " with p.capture_set('production'):\n", 254 | " return pipeline()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 7, 260 | "metadata": { 261 | "autoscroll": "json-false", 262 | "collapsed": true, 263 | "ein.tags": [ 264 | "worksheet-0" 265 | ], 266 | "slideshow": { 267 | "slide_type": "-" 268 | } 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "res = run_production_pipeline()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 8, 278 | "metadata": { 279 | "autoscroll": "json-false", 280 | "collapsed": true, 281 | "ein.tags": [ 282 | "worksheet-0" 283 | ], 284 | "slideshow": { 285 | "slide_type": "-" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "res = p.load_set_by_name('production')" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 9, 296 | "metadata": { 297 | "autoscroll": "json-false", 298 | "ein.tags": [ 299 | "worksheet-0" 300 | ], 301 | "slideshow": { 302 | "slide_type": "-" 303 | } 304 | }, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "ArtifactSet(id='08f3c7c6a84132faa155ca9996a26c4df92bd798', artifact_ids=frozenset({'2411521185b4267706a24f85b16c46e3a24b4e66', '96c47ddbeff008e2b3a27913611c9648c3e74aa2', 'd3bb8e7625b7093b079bdc8b7d50c6eaaa62f835', '46268ac8c40932b63033b387aa0217974c82c717', 'd3c930d243d6ec4d7be481ddd1f4c3e9277d5f09', '3fdafd792f113c669d55b416bed9b5091f954029'}), created_at=datetime.datetime(2017, 5, 1, 0, 1, 9, 119196), name='production')" 310 | ] 311 | }, 312 | "execution_count": 9, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "res" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 10, 324 | "metadata": { 325 | "autoscroll": "json-false", 326 | "collapsed": true, 327 | "ein.tags": [ 328 | "worksheet-0" 329 | ], 330 | "slideshow": { 331 | "slide_type": "-" 332 | } 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "build_artifacts = res.proxy_dict(group_artifacts_of_same_name=True)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 11, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "dict_keys(['__main__.load_data', '__main__.build_model', '__main__.extract_features_b', '__main__.evaluate', '__main__.extract_features_a'])" 348 | ] 349 | }, 350 | "execution_count": 11, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "build_artifacts.keys()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "model = build_artifacts['__main__.build_model']" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 13, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "execution_count": 13, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | 
"model" 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "kernelspec": { 393 | "display_name": "Python 3", 394 | "language": "python", 395 | "name": "python3" 396 | }, 397 | "language_info": { 398 | "codemirror_mode": { 399 | "name": "ipython", 400 | "version": 3 401 | }, 402 | "file_extension": ".py", 403 | "mimetype": "text/x-python", 404 | "name": "python", 405 | "nbconvert_exporter": "python", 406 | "pygments_lexer": "ipython3", 407 | "version": "3.7.0" 408 | }, 409 | "name": "Introduction Guide.ipynb" 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 1 413 | } 414 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-dev 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python==3.5 6 | - ipython 7 | - numpy 8 | - pandas 9 | - alembic 10 | - numpydoc 11 | - sphinx 12 | - pandoc 13 | - ipykernel 14 | - pyarrow 15 | # for the docs 16 | - scikit-learn 17 | - pip: 18 | - versioneer 19 | - twine 20 | - nbsphinx 21 | - yamlmagic 22 | - sphinxcontrib-websupport 23 | - sphinx-autobuild 24 | - sphinx_rtd_theme 25 | -------------------------------------------------------------------------------- /examples/basic/README.md: -------------------------------------------------------------------------------- 1 | # provenance-basic-example 2 | ## Step 1: Run some stuff 3 | 4 | All you do is `conda env create`, `source activate provenance-basic-example`, 5 | and `./basic_example.py`. 6 | 7 | Then you can explore how the artifacts and blobs were saved in `./artifacts` and 8 | in `psql provenance-basic-example`. 9 | 10 | ## Step 2: Learn some stuff 11 | 12 | ### The gist 13 | In `basic_example.py` you'll see the decorator `@p.provenance()` above the 14 | function `my_add`. Because of this, Provenance will keep track of inputs and 15 | outputs to this function. Then if you call the function again, it won't compute 16 | the sum, rather it will say "I've already seen these inputs!" and simply look up 17 | the answer based on the inputs. It's safe to say that this is a gross 18 | simplification but it lays the ground work for going forward. 19 | 20 | ### Terminology 21 | #### Artifact 22 | An artifact is the mechanism by which Provenance stores the inputs and outputs 23 | to our function `my_add`. It actually stores more than that but we'll get there. 24 | An artifact exists as an entry in a database table. It's probably best described 25 | by looking at the columns in the artifact table. There are 21 columns but we'll 26 | start by only looking at 2 of them: `id` and `value_id`. The `id` is just that, 27 | the id of the artifact. But it's actually more then that, it's also a hash of 28 | the inputs (as well as other things like the function name). In the blobstore 29 | (see [below](#blobs-and-blobstore)) there is a blob 30 | (see [below](#blobs-and-blobstore)), the name of that blob is this same as `id` 31 | and the blob contains a pickled version of the inputs. Next is `value_id`, this 32 | is a hash of the output and similarly shares a name with a blob which contains a 33 | pickled version of the output. We won't go over the other columns in the 34 | artifact table now. 35 | 36 | #### Blobs and Blobstore 37 | A blob is a Binary Large OBject. Although in this case we don't require them to 38 | be large. A blob is simply a file, what type of file? Doesn't really matter. The 39 | blobstore is simply the place where the blobs are kept. 
In this example it is
40 | the `artifacts` directory. To be a bit more technical, we can see the blobstore
41 | defined [here](basic_example.py#L5). The `cachedir` part of the blobstore is the
42 | `artifacts` directory, but since that's really the heart of the blobstore, we'll
43 | just think of them as synonymous for now until we go into more detail about the
44 | config [below](#the-config).
45 |
46 | #### Repo (or artifact repo)
47 | A repo is the place where the artifacts are stored. You can see it
48 | defined [here](basic_example.py#L12). In this case it's just a postgres database,
49 | as you can see in the `db` part of the repo's definition. Again, there is more to
50 | a repo, but the db is its heart, so for now they are synonymous.
51 |
52 | ### Recap
53 | The first time we run `basic_example.py` we print the result of calling `my_add`
54 | with 1 and 4. We see 5 printed, along with the string 'Executed' that lets us
55 | know that the function was actually executed. The blobstore (artifacts
56 | directory) now contains two blobs (just files). An artifact (entry in a db
57 | table) is created in the repo (postgres db). The artifact has an `id` which is
58 | the hash of the inputs and some other things. One of the blobs (files) has this
59 | as its name. In that blob (file) are the pickled inputs 1 and 4. The other blob
60 | shares its name with the `value_id` of the artifact, and that blob contains a
61 | pickled 5. Now if we run the same call to my_add with 1 and 4, we won't see
62 | 'Executed' printed, but 5 will still be returned. This is evidence that the
63 | function was not run; rather, the answer was looked up. If we call my_add with
64 | different inputs, the function is executed and more artifacts and blobs are
65 | created.
66 |
67 | ### The Config
68 | The config is a map (see [here](basic_example.py#L5)). At the top level we have
69 | `blobstores`, `artifact_repos`, and `default_repo`. We won't get into the reason
70 | the first two are plural here; it will be addressed in a more advanced example.
71 | So for now we have to define one blobstore and one artifact_repo. (In the case of
72 | the plural artifact\_repos we also have a default_repo, which for us is just our
73 | only repo.) Our blobstore is called 'disk'; the name is totally up to you. It
74 | is of type 'disk', meaning on your drive. The possible types are disk, memory,
75 | s3, and chained (chained gets into the plural thing, so we'll hold off on that
76 | explanation). The cachedir is defined as discussed earlier. We'll come back to
77 | read, write, read\_through\_write, and delete. We then define our artifact\_repo,
78 | which is called 'local'; again, the name is up to you. It is of type 'postgres'. The
79 | possible types are postgres, memory, and chained. Again, our db is defined as
80 | discussed earlier. The read, write, read\_through\_write, and delete fields in
81 | the config of both the blobstore and artifact\_repo are boolean permissions: am I
82 | allowed to read, write, or delete from this blobstore or artifact\_repo? The
83 | read\_through\_write is concerned with chained blobstores and artifact_repos, and
84 | we'll continue to hold off discussing that.
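
### A quick sketch in code

To make the recap concrete, here is a minimal sketch of what the caching looks like from a Python
session. It assumes the config from `basic_example.py` has already been loaded, and it assumes that
the object returned by `p.load_artifact` exposes the stored value as `.value` (an inference from the
`id`/`value_id` description above, not something this example demonstrates). Treat it as an
illustration rather than part of the example itself.

```python
import provenance as p

# Assumes p.load_config(...) has already been run with the config from
# basic_example.py (a 'disk' blobstore plus the 'local' postgres repo).

@p.provenance()
def my_add(x, y):
    print('Executed')
    return x + y

result = my_add(1, 4)       # first run with these inputs: prints 'Executed'
print(result)               # 5 -- the returned proxy behaves like the value it wraps
print(result.artifact.id)   # the hash that also names one of the blobs on disk

cached = my_add(1, 4)       # same inputs: no 'Executed', the 5 is looked up
print(cached)               # 5

# The artifact can also be pulled back later by its id, e.g. in another session.
artifact = p.load_artifact(result.artifact.id)
print(artifact.value)       # 5 again, read from the blobstore
```

If you look up the id printed by `result.artifact.id` in the `artifacts` directory, the blob with
that name should be the pickled inputs described in the recap, and the blob named after the
artifact's `value_id` should be the pickled 5.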
85 | -------------------------------------------------------------------------------- /examples/basic/basic_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import provenance as p 4 | 5 | p.load_config( 6 | { 7 | 'blobstores': 8 | { 9 | 'disk': 10 | { 11 | 'type': 'disk', 12 | 'cachedir': 'artifacts', 13 | 'read': True, 14 | 'write': True, 15 | 'read_through_write': False, 16 | 'delete': True, 17 | } 18 | }, 19 | 'artifact_repos': 20 | { 21 | 'local': 22 | { 23 | 'type': 'postgres', 24 | 'db': 'postgresql://localhost/provenance-basic-example', 25 | 'store': 'disk', 26 | 'read': True, 27 | 'write': True, 28 | 'create_db': True, 29 | 'read_through_write': False, 30 | 'delete': True, 31 | } 32 | }, 33 | 'default_repo': 'local', 34 | } 35 | ) 36 | 37 | 38 | @p.provenance() 39 | def my_add(x, y): 40 | print('Executed') 41 | return x + y 42 | 43 | 44 | print(my_add(1, 4)) 45 | -------------------------------------------------------------------------------- /examples/basic/environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-basic-example 2 | dependencies: 3 | - python==3.5 4 | - pip: 5 | - "git+https://github.com/bmabey/provenance.git" 6 | -------------------------------------------------------------------------------- /examples/experiment/README.md: -------------------------------------------------------------------------------- 1 | # provenance-experiment-example 2 | 3 | ## Step 0: Understand sftp-example 4 | 5 | ## Step 1: Run some stuff 6 | 7 | First you run `conda env create`, `source activate 8 | provenance-experiment-example`. Now open up `config.yaml`. In the config fill in 9 | the `cachedir`, `basepath`, and `ssh_config` for the sftp blobstore. You'll find 10 | some directions acting as placeholders or you can see [here](#the-config). Now 11 | you can run `./experiment_example.py`. 12 | 13 | Then you can explore how the artifacts and blobs were saved in your specified 14 | `cachedir`,`basepath`, and in `psql provenance-experiment-example`. 15 | 16 | ## Step 2: Learn some stuff 17 | 18 | ### The gist 19 | Here we learn about archiving files, provenance\_sets and chaining blobstores. 20 | For chaining blobstores see [here](#the-config). Archiving files is really 21 | straight forward if you've understood the previous examples, you can see it in 22 | action [here](sftp\_example.py#62). Two blobs are created for each file, one is 23 | the actual file and the other is the inputs to the call to archive_file. See the 24 | comments in [sftp\_example.py](sftp\_example.py) for additional detail. 25 | 26 | At the top of the function that contains the calls to `archive_file` 27 | (see [here](sftp\_example.py#54)) you'll see we have `p.provenance_set` instead 28 | of the `p.provenance` that we've seen before. A provenance_set is simply a named 29 | set containing the id's of the artifacts in the set. In this example each entry 30 | (demographic.json and matrix.csv) are put in a set named after the entry id 31 | (0000 or 0001 etc.). Details on how to get the set back and the artifacts out 32 | have not yet been written but are coming soon. 33 | 34 | ### The Config 35 | We changed the config to be a yaml file and loaded it in. You'll notice that we 36 | define two blobstores (the same two from basic-example and sftp-example). Then 37 | there is a third. This third, called `experiment` is a chained blobstore. It 38 | chains `disk` to `sftp`. 
Remember in `sftp-example` a local blobstore was 39 | created but Provenance didn't know that it could look there when asked to 40 | retrieve an artifact. By chaining we say, first look/write in `disk`, then to 41 | `sftp`. Here's where `read_through_write` comes into play. We've set it to 42 | `True`. This means that if Provenance is trying to look up an artifact and it 43 | doesn't find it in `disk` but it does find it in `sftp` it will write it in 44 | `disk` "on the way back". Notice that we set the store for the artifact_repo to 45 | `experiment`. 46 | -------------------------------------------------------------------------------- /examples/experiment/config.yaml: -------------------------------------------------------------------------------- 1 | blobstores: 2 | disk: 3 | type: 'disk' 4 | cachedir: 'blobstore' 5 | read: True 6 | write: True 7 | read_through_write: True 8 | delete: True 9 | sftp: 10 | type: 'sftp' 11 | cachedir: | 12 | 14 | basepath: | 15 | remote machine 16 | ex. /home/me/artifacts>, you need to make sure that path directory exists.' 17 | read: True 18 | write: True 19 | read_through_write: True 20 | delete: True 21 | ssh_config: 22 | hostname: '' 23 | port: '' 24 | username: '' 25 | password: '' 26 | experiment: 27 | type: 'chained' 28 | stores: ['disk', 'sftp'] 29 | artifact_repos: 30 | local: 31 | type: 'postgres' 32 | db: 'postgresql://localhost/provenance-experiment-example' 33 | store: 'experiment' 34 | read: True 35 | write: True 36 | create_db: True 37 | read_through_write: False 38 | delete: True 39 | default_repo: 'local' 40 | -------------------------------------------------------------------------------- /examples/experiment/environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-experiment-example 2 | dependencies: 3 | - python==3.5 4 | - paramiko 5 | - pip: 6 | - "git+https://github.com/bmabey/provenance.git" 7 | - pyyaml 8 | -------------------------------------------------------------------------------- /examples/experiment/experiment_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import json 5 | import os 6 | import random 7 | import shutil 8 | 9 | import provenance as p 10 | 11 | p.load_yaml_config('config.yaml') 12 | 13 | # Suppose you are conducting an experiment to determine the correlation between 14 | # geographic location and favorite 3x3 matrix. To do this you have them sit at a 15 | # computer and enter in the information. To store the data you create a 16 | # directory structure such that each entry gets its own numbered directory in 17 | # which there are two files, info.json which has the demographic info and 18 | # data.csv which contains their favorite 3x3 matrix. Now, is this the best way 19 | # to store the data for this experiment? No. But we'll ignore that for the sake 20 | # of instruction. 21 | 22 | ################################################################################ 23 | ## Generate random data, you wouldn't actually have this code in your experiment. 
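##
## (An aside for illustration, not part of the original script: save_entry() below
## wraps each entry's file writes in p.provenance_set(set_name=id), so every entry
## is captured as a named provenance set. After simulate_experiment() has run, a
## single entry could presumably be loaded back with something along these lines,
## using load_set_by_name and ArtifactSet.proxy_dict from the provenance API docs;
## the '0000/demographic' key is only a guess based on the archive_file names used
## below:
##
##     entry_set = p.load_set_by_name('0000')
##     artifacts = entry_set.proxy_dict()
##     demographic_artifact = artifacts['0000/demographic']
##
## Treat this as a sketch rather than tested code.)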
24 | 25 | first_names = ['Eric', 'Belinda', 'Jane', 'Scott', 'Joe', 'Mike', 'Wilhelmina'] 26 | last_names = ['Thompson', 'Erikson', 'Gandalfo', 'Wesson', 'Black', 'Stephens'] 27 | 28 | 29 | def gen_name(): 30 | return random.choice(first_names) + ' ' + random.choice(last_names) 31 | 32 | 33 | def gen_age(): 34 | return random.randint(18, 100) 35 | 36 | 37 | street_names = [ 38 | 'Maple St', 39 | 'Corner Ave', 40 | 'West Helm Lp', 41 | '4th St', 42 | 'Main St', 43 | 'Center St', 44 | ] 45 | 46 | 47 | def gen_address(): 48 | return str(random.randint(1000, 10000)) + ' ' + random.choice(street_names) 49 | 50 | 51 | def gen_matrix(): 52 | return [[random.randint(0, 100) for x in range(3)] for y in range(3)] 53 | 54 | 55 | ################################################################################ 56 | ## Here's the crux. You WOULD have this code in your experiment. This function 57 | ## actually writes the data files that you want to keep track of and share with 58 | ## others. Here we introduce the provenance_set, which is basically a named set 59 | ## of artifacts. It makes sense if each entry (which includes two files) becomes 60 | ## a set. We can name the set, then use that name to retreive the latest 61 | ## version. 62 | 63 | 64 | def save_entry(id, name, age, address, matrix): 65 | directory = os.path.join('./experiment_data', id) 66 | os.mkdir(directory) 67 | demographics = {'name': name, 'age': age, 'address': address} 68 | 69 | @p.provenance_set(set_name=id) 70 | def write_entry(): 71 | with open(os.path.join(directory, 'demographic.json'), 'w') as demof: 72 | json.dump(demographics, demof) 73 | 74 | with open(os.path.join(directory, 'matrix.csv'), 'w') as matrixf: 75 | writer = csv.writer(matrixf) 76 | writer.writerows(matrix) 77 | p.archive_file( 78 | os.path.join(directory, 'demographic.json'), 79 | name=id + '/demographic', 80 | delete_original=True, 81 | ) 82 | p.archive_file( 83 | os.path.join(directory, 'matrix.csv'), 84 | name=id + '/matrix', 85 | delete_original=True, 86 | ) 87 | 88 | write_entry() 89 | 90 | 91 | ################################################################################ 92 | ## Simulate some number of participants, you wouldn't actually have this code in 93 | ## your experiment. 94 | 95 | 96 | def simulate_entry(id): 97 | name = gen_name() 98 | age = gen_age() 99 | address = gen_address() 100 | matrix = gen_matrix() 101 | save_entry(id, name, age, address, matrix) 102 | 103 | 104 | def simulate_experiment(num_participants): 105 | # I use the experiment_data as a temporary location to write the data to. 106 | # Provenance will store the files in the blobstore so... 107 | if not os.path.exists('./experiment_data'): 108 | os.mkdir('./experiment_data') 109 | 110 | for i in range(num_participants): 111 | simulate_entry(str(i).zfill(4)) 112 | 113 | # ... then I erase the folder at the end. 114 | shutil.rmtree('./experiment_data') 115 | 116 | 117 | simulate_experiment(10) 118 | -------------------------------------------------------------------------------- /examples/sftp/README.md: -------------------------------------------------------------------------------- 1 | # provenance-sftp-example 2 | 3 | ## Step 0: Understand the basic example 4 | 5 | ## Step 1: Run some stuff 6 | 7 | First you run `conda env create`, `source activate provenance-sftp-example`, and 8 | Now open up `sftp_example.py`. In the config fill in the `cachedir`, `basepath`, 9 | and `ssh_config` for the sftp blobstore. 
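For concreteness, a filled-in sftp entry ends up looking roughly like this (every value below is a placeholder for your own machine and credentials):

```python
'sftp': {
    'type': 'sftp',
    'cachedir': 'sftp-artifacts',      # local cache, created if missing
    'basepath': '/home/me/artifacts',  # must already exist on the remote host
    'read': True,
    'write': True,
    'read_through_write': False,
    'delete': True,
    'ssh_config': {
        'hostname': 'my-remote-host',
        'port': '22',
        'username': 'me',
        'password': 'secret',
    },
}
```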
You'll find some directions acting as 10 | placeholders or you can see [here](#the-config). Now you can run 11 | `./sftp_example.py`. 12 | 13 | Then you can explore how the artifacts and blobs were saved in your specified 14 | `cachedir`,`basepath`, and in `psql provenance-sftp-example`. 15 | 16 | ## Step 2: Learn some stuff 17 | 18 | ### The gist 19 | This is pretty much the same as the basic example, only the blobstore is on a 20 | remote machine. The artifacts in the postgres db are referring to the blobs in 21 | the remote blobstore. 22 | 23 | ### The Config 24 | `cachedir` here has the same meaning as it did in the basic example. This leads 25 | to the question "do I then have an implicit local blobstore?". Yes and no, yes 26 | because all the blobs will be in the `cachedir`. No because Provenance will not 27 | look for them there but will go immediately to the remote host and look in 28 | `basepath`. (You could use the local blobstore via a chained blobstore which is 29 | not covered in this example. Also this is a feature that should probably be 30 | added so it's automatic.) `basepath` is the path to the blobstore on the remote 31 | machine. While `cachedir` will be created for you if it doesn't exist, 32 | `basepath` won't be so make sure you create it. `ssh_config` is relatively 33 | straight forward as it is the standard things you need to ssh onto a machine. 34 | -------------------------------------------------------------------------------- /examples/sftp/environment.yml: -------------------------------------------------------------------------------- 1 | name: provenance-sftp-example 2 | dependencies: 3 | - python==3.5 4 | - paramiko 5 | - pip: 6 | - "git+https://github.com/bmabey/provenance.git" 7 | -------------------------------------------------------------------------------- /examples/sftp/sftp_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from joblib.disk import mkdirp 4 | 5 | import provenance as p 6 | 7 | mkdirp('./remote-machine/sftp-artifacts') 8 | 9 | p.load_config( 10 | { 11 | 'blobstores': 12 | { 13 | 'sftp': 14 | { 15 | 'type': 16 | 'sftp', 17 | 'cachedir': 18 | '', 19 | 'basepath': 20 | '< "" remote machine "" , ex. 
/home/me/artifacts>, you need to make sure that path directory exists.', 21 | 'read': 22 | True, 23 | 'write': 24 | True, 25 | 'read_through_write': 26 | False, 27 | 'delete': 28 | True, 29 | 'ssh_config': 30 | { 31 | 'hostname': '', 32 | 'port': '', 33 | 'username': '', 34 | 'password': '', 35 | }, 36 | } 37 | }, 38 | 'artifact_repos': 39 | { 40 | 'local': 41 | { 42 | 'type': 'postgres', 43 | 'db': 'postgresql://localhost/provenance-sftp-example', 44 | 'store': 'sftp', 45 | 'read': True, 46 | 'write': True, 47 | 'create_db': True, 48 | 'read_through_write': False, 49 | 'delete': True, 50 | } 51 | }, 52 | 'default_repo': 'local', 53 | } 54 | ) 55 | 56 | 57 | @p.provenance() 58 | def my_add(x, y): 59 | print('Executed') 60 | return x + y 61 | 62 | 63 | print(my_add(1, 4)) 64 | -------------------------------------------------------------------------------- /provenance/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from ._config import from_config, load_config, load_yaml_config 3 | from ._dependencies import dependencies 4 | from ._version import get_versions 5 | from .core import archive_file, ensure_proxies, promote, provenance, provenance_set 6 | from .hashing import hash, value_repr 7 | from .repos import ( 8 | capture_set, 9 | create_set, 10 | current_config, 11 | get_check_mutations, 12 | get_default_repo, 13 | get_read_only, 14 | get_repo_by_name, 15 | get_use_cache, 16 | is_proxy, 17 | lazy_dict, 18 | lazy_proxy_dict, 19 | load_artifact, 20 | load_proxy, 21 | load_set_by_id, 22 | load_set_by_labels, 23 | load_set_by_name, 24 | set_check_mutations, 25 | set_default_repo, 26 | set_read_only, 27 | set_run_info_fn, 28 | set_use_cache, 29 | using_repo, 30 | ) 31 | from .serializers import register_serializer 32 | 33 | __version__ = get_versions()['version'] 34 | del get_versions 35 | -------------------------------------------------------------------------------- /provenance/_commonstore.py: -------------------------------------------------------------------------------- 1 | import operator as op 2 | 3 | import toolz as t 4 | 5 | 6 | class PermissionError(Exception): 7 | 8 | def __init__(self, action, store, permission): 9 | message = 'A `{}` operation was attempted on {} and {} is set to `False`!'.format( 10 | action, store, permission 11 | ) 12 | self.action = action 13 | self.store = store 14 | self.permission = permission 15 | Exception.__init__(self, message) 16 | 17 | 18 | class KeyExistsError(Exception): 19 | 20 | def __init__(self, key, store): 21 | msg = 'The key {} is already present in {}, you can not overwrite it!'.format(key, store) 22 | self.key = key 23 | self.store = store 24 | Exception.__init__(self, msg) 25 | 26 | 27 | class InconsistentKeyError(Exception): 28 | 29 | def __init__(self, key, store, value): 30 | msg = 'The key {} already represents a different value in {}'.format(key, store) 31 | self.key = key 32 | self.store = store 33 | self.value = value 34 | Exception.__init__(self, msg) 35 | 36 | 37 | def find_first(pred, seq): 38 | for i in seq: 39 | if pred(i): 40 | return i 41 | 42 | 43 | def ensure_read(obj, action='get'): 44 | if not obj._read: 45 | raise PermissionError(action, obj, 'read') 46 | 47 | 48 | def ensure_write(obj, action='put'): 49 | if not obj._write: 50 | raise PermissionError(action, obj, 'write') 51 | 52 | 53 | ensure_contains = t.partial(ensure_read, action='contains') 54 | 55 | 56 | def ensure_present(obj, id): 57 | if id not in obj: 58 | raise KeyError(id, obj) 59 | 60 | 
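# The ensure_* helpers are shared guard clauses: blobstores and repos expose
# boolean flags (_read, _write, _delete, _read_through_write) and every
# operation calls the matching guard before touching data. A rough sketch of
# how a store participates (hypothetical minimal class, not part of the API):
#
#     class ReadOnlyStore:
#         _read, _write, _delete = True, False, False
#
#     ensure_read(ReadOnlyStore())   # passes
#     ensure_write(ReadOnlyStore())  # raises PermissionError('put', <store>, 'write')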
61 | def ensure_delete(obj, id=None, check_contains=True): 62 | if not obj._delete: 63 | raise PermissionError('delete', obj, 'delete') 64 | if check_contains and id is not None and id not in obj: 65 | raise KeyError(id, obj) 66 | 67 | 68 | def ensure_put(obj, id, read_through=None, check_contains=True): 69 | if read_through: 70 | if not obj._read_through_write: 71 | raise PermissionError('read_through_put', obj, 'read_through_write') 72 | elif not obj._write: 73 | raise PermissionError('put', obj, 'write') 74 | if check_contains and id in obj: 75 | raise KeyExistsError(id, obj) 76 | 77 | 78 | def chained_contains(chained, id, contains=op.contains): 79 | stores_with_read = [s for s in chained.stores if s._read] 80 | if len(stores_with_read) == 0: 81 | raise PermissionError('contains', chained, 'read') 82 | 83 | for store in stores_with_read: 84 | if store._read and contains(store, id): 85 | return True 86 | return False 87 | 88 | 89 | def chained_put(chained, id, value, put=None, overwrite=False, contains=op.contains, **kargs): 90 | stores_with_write = [s for s in chained.stores if s._write] 91 | if len(stores_with_write) == 0: 92 | raise PermissionError('put', chained, 'write') 93 | 94 | record = None 95 | putin = [] 96 | for store in stores_with_write: 97 | if overwrite or not contains(store, id): 98 | if put: 99 | record = put(store, id, value, **kargs) 100 | else: 101 | record = store.put(id, value, **kargs) 102 | putin.append(store) 103 | 104 | if len(putin) == 0 and len(stores_with_write) > 0: 105 | raise KeyExistsError(id, chained) 106 | 107 | return record 108 | 109 | 110 | def chained_get(chained, get, id, put=None): 111 | stores_with_read = [s for s in chained.stores if s._read] 112 | if len(stores_with_read) == 0: 113 | raise KeyError(id, chained) 114 | 115 | pushback = [] 116 | for store in stores_with_read: 117 | try: 118 | value = get(store, id) 119 | break 120 | except KeyError: 121 | if store._read_through_write: 122 | pushback.append(store) 123 | else: 124 | raise KeyError(id, chained) 125 | 126 | for store in pushback: 127 | if put: 128 | put(store, id, value, read_through=True) 129 | else: 130 | store.put(id, value, read_through=True) 131 | return value 132 | 133 | 134 | def chained_delete(chained, id, delete=None, contains=op.contains): 135 | stores_with_delete = [s for s in chained.stores if s._delete] 136 | if len(stores_with_delete) == 0: 137 | raise PermissionError('delete', chained, 'delete') 138 | 139 | foundin = [] 140 | for store in stores_with_delete: 141 | if contains(store, id): 142 | foundin.append(store) 143 | if delete: 144 | delete(store, id) 145 | else: 146 | store.delete(id) 147 | if len(foundin) == 0: 148 | raise KeyError(id, chained) 149 | else: 150 | return foundin 151 | 152 | 153 | def chained_filename(chained, id): 154 | if id in chained: 155 | 156 | def valid_store(s): 157 | return s._read and hasattr(s, '_filename') and id in s 158 | 159 | store = find_first(valid_store, chained.stores) 160 | 161 | if store is not None: 162 | return store._filename(id) 163 | else: 164 | raise Exception('You do not have a disk-based store setup.') 165 | -------------------------------------------------------------------------------- /provenance/_config.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | import toolz as t 5 | 6 | import provenance.blobstores as bs 7 | import provenance.repos as r 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @t.curry 13 | def 
full_config(configs, base_config): 14 | if 'type' in base_config: 15 | return base_config 16 | prototype = full_config(configs, configs[base_config['prototype']]) 17 | return t.thread_first(prototype, (t.merge, base_config), (t.dissoc, 'prototype')) 18 | 19 | 20 | def merge_prototypes(config): 21 | return t.valmap(full_config(config), config) 22 | 23 | 24 | @t.curry 25 | def atomic_item_from_config(config, type_dict, item_plural, name=None): 26 | stype = config['type'] 27 | if stype not in type_dict: 28 | raise Exception( 29 | '{} may only be created of types: {}, you had {}'.format( 30 | item_plural, tuple(type_dict.keys()), stype 31 | ) 32 | ) 33 | cls = type_dict[stype] 34 | kargs = t.dissoc(config, 'type') 35 | return cls(**kargs) 36 | 37 | 38 | BLOBSTORE_TYPES = { 39 | 'disk': bs.DiskStore, 40 | 's3': bs.S3Store, 41 | 'memory': bs.MemoryStore, 42 | 'chained': bs.ChainedStore, 43 | } 44 | 45 | try: 46 | import provenance.sftp as sftp 47 | 48 | BLOBSTORE_TYPES['sftp'] = sftp.SFTPStore 49 | 50 | except ImportError as e: 51 | 52 | class SFTPStore: 53 | _err = e 54 | 55 | def __init__(self, *args, **kargs): 56 | raise (self._err) 57 | 58 | BLOBSTORE_TYPES['sftp'] = SFTPStore 59 | 60 | try: 61 | import provenance.google_storage as gs 62 | 63 | BLOBSTORE_TYPES['gs'] = gs.GSStore 64 | 65 | except ImportError as e: 66 | 67 | class GSStore: 68 | _err = e 69 | 70 | def __init__(self, *args, **kargs): 71 | raise (self._err) 72 | 73 | BLOBSTORE_TYPES['gs'] = GSStore 74 | 75 | blobstore_from_config = atomic_item_from_config(type_dict=BLOBSTORE_TYPES, item_plural='Blobstores') 76 | 77 | REPO_TYPES = { 78 | 'postgres': r.PostgresRepo, 79 | 'memory': r.MemoryRepo, 80 | 'chained': r.ChainedRepo, 81 | } 82 | 83 | repo_from_config = atomic_item_from_config(type_dict=REPO_TYPES, item_plural='Artifact Repos') 84 | 85 | 86 | def items_from_config(config, atomic_from_config, items_name, item_type, silence_warnings): 87 | config = merge_prototypes(copy.deepcopy(config)) 88 | 89 | atomic_stores = {} 90 | for k, c in config.items(): 91 | try: 92 | if c['type'] != 'chained': 93 | store = atomic_from_config(c, name=k) 94 | if store: 95 | atomic_stores[k] = store 96 | except Exception: 97 | if not silence_warnings: 98 | logger.warning( 99 | 'Error creating %s %s from config - Skipping', 100 | item_type, 101 | k, 102 | exc_info=True, 103 | ) 104 | 105 | def create_chained(name, config): 106 | # resolve the stores 107 | chained = {n for n in config[items_name] if n in atomic_stores} 108 | if len(chained) != len(config[items_name]): 109 | missing_configs = set(config[items_name]) - chained 110 | if not silence_warnings: 111 | logger.warning( 112 | 'Skipping chained %s %s due to missing %s: %s', 113 | item_type, 114 | name, 115 | items_name, 116 | missing_configs, 117 | ) 118 | return None 119 | 120 | config[items_name] = [atomic_stores[n] for n in config[items_name]] 121 | return atomic_from_config(config, name=name) 122 | 123 | chained_stores = {} 124 | for k, c in config.items(): 125 | try: 126 | if c['type'] == 'chained': 127 | store = create_chained(k, c) 128 | if store: 129 | chained_stores[k] = store 130 | except Exception: 131 | if not silence_warnings: 132 | logger.warning( 133 | 'Error creating %s %s from config - Skipping', 134 | item_type, 135 | k, 136 | exc_info=True, 137 | ) 138 | 139 | return t.merge(chained_stores, atomic_stores) 140 | 141 | 142 | def blobstores_from_config(config, silence_warnings=False): 143 | return items_from_config(config, blobstore_from_config, 'stores', 'blobstore', 
silence_warnings) 144 | 145 | 146 | def repos_from_config(config, blobstores, silence_warnings=False): 147 | 148 | def from_config(atomic_config, name): 149 | if 'store' in atomic_config: 150 | if not atomic_config['store'] in blobstores: 151 | if not silence_warnings: 152 | logger.warning( 153 | 'Skipping %s repo due to missing store: %s', 154 | name, 155 | atomic_config['store'], 156 | ) 157 | return None 158 | 159 | atomic_config['store'] = blobstores[atomic_config['store']] 160 | return repo_from_config(atomic_config) 161 | 162 | return items_from_config(config, from_config, 'repos', 'repo', silence_warnings) 163 | 164 | 165 | def from_config(config): 166 | silence_warnings = config.get('silence_warnings', False) 167 | blobstores = blobstores_from_config(config['blobstores'], silence_warnings) 168 | repos = repos_from_config(config['artifact_repos'], blobstores, silence_warnings) 169 | return {'blobstores': blobstores, 'repos': repos} 170 | 171 | 172 | def load_config(config): 173 | objs = from_config(config) 174 | pconfig = r.Config( 175 | objs['blobstores'], 176 | objs['repos'], 177 | default_repo=config['default_repo'], 178 | run_info_fn=config.get('run_info_fn', None), 179 | use_cache=config.get('use_cache', True), 180 | read_only=config.get('read_only', False), 181 | check_mutations=config.get('check_mutations', False), 182 | ) 183 | r.Config.set_current(pconfig) 184 | return pconfig 185 | 186 | 187 | def load_yaml_config(filename): 188 | import yaml 189 | 190 | with open(filename, 'r') as f: 191 | return load_config(yaml.load(f)) 192 | -------------------------------------------------------------------------------- /provenance/_dependencies.py: -------------------------------------------------------------------------------- 1 | import io 2 | import pickle 3 | 4 | import cloudpickle 5 | 6 | from . import repos as r 7 | 8 | Pickler = cloudpickle.CloudPickler 9 | 10 | 11 | class DependencyWalker(Pickler): 12 | 13 | def __init__(self): 14 | self.stream = io.BytesIO() 15 | self.dependents = [] 16 | self.branches = [] 17 | protocol = pickle.DEFAULT_PROTOCOL 18 | Pickler.__init__(self, self.stream, protocol=protocol) 19 | 20 | def save(self, obj): 21 | if isinstance(obj, r.Artifact): 22 | self.dependents.append(obj) 23 | elif r.is_proxy(obj): 24 | self.dependents.append(obj.artifact) 25 | else: 26 | Pickler.save(self, obj) 27 | 28 | def deps(self, artifact): 29 | self.dependents = [] 30 | self.dump(artifact) 31 | return self.dependents 32 | 33 | 34 | def _deps(val): 35 | return DependencyWalker().deps(val) 36 | 37 | 38 | def _artifact_branches(artifact): 39 | objs = _deps(artifact.inputs) + _deps(artifact.value) 40 | objs.sort(key=lambda a: a.id) 41 | return objs 42 | 43 | 44 | def dependencies(artifact_or_id): 45 | """ 46 | Returns a reversed breadth first search. This guarantees that 47 | for all artifacts in the list. All of an artifacts dependencies 48 | will come before it. 
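    For example (ids and names made up for illustration): if artifact c was
    computed from b, and b from a, then dependencies(c) returns the Artifact
    objects in the order [a, b, c], so every artifact appears after everything
    it depends on. The argument may be an Artifact object or just its id.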
49 | """ 50 | artifact = r.coerce_to_artifact(artifact_or_id) 51 | visited = [] 52 | queue = [artifact] 53 | while queue: 54 | a, *queue = queue 55 | 56 | if a in visited: 57 | continue 58 | 59 | visited.append(a) 60 | queue.extend(_artifact_branches(a)) 61 | 62 | visited.reverse() 63 | return visited 64 | -------------------------------------------------------------------------------- /provenance/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. Generated by 8 | # versioneer-0.18 (https://github.com/warner/python-versioneer) 9 | """Git implementation of _version.py.""" 10 | 11 | import errno 12 | import os 13 | import re 14 | import subprocess 15 | import sys 16 | 17 | 18 | def get_keywords(): 19 | """Get the keywords needed to look up the version information.""" 20 | # these strings will be replaced by git during git-archive. 21 | # setup.py/versioneer.py will grep for the variable names, so they must 22 | # each be defined on a line of their own. _version.py will just call 23 | # get_keywords(). 24 | git_refnames = ' (HEAD -> trunk)' 25 | git_full = 'd29ad2ffc39fbc389600df092da9e7df4f920100' 26 | git_date = '2020-12-02 11:05:43 -0700' 27 | keywords = {'refnames': git_refnames, 'full': git_full, 'date': git_date} 28 | return keywords 29 | 30 | 31 | class VersioneerConfig: 32 | """Container for Versioneer configuration parameters.""" 33 | 34 | 35 | def get_config(): 36 | """Create, populate and return the VersioneerConfig() object.""" 37 | # these strings are filled in when 'setup.py versioneer' creates 38 | # _version.py 39 | cfg = VersioneerConfig() 40 | cfg.VCS = 'git' 41 | cfg.style = 'pep440' 42 | cfg.tag_prefix = '' 43 | cfg.parentdir_prefix = 'provenance-' 44 | cfg.versionfile_source = 'provenance/_version.py' 45 | cfg.verbose = False 46 | return cfg 47 | 48 | 49 | class NotThisMethod(Exception): 50 | """Exception raised if a method is not valid for the current scenario.""" 51 | 52 | 53 | LONG_VERSION_PY = {} 54 | HANDLERS = {} 55 | 56 | 57 | def register_vcs_handler(vcs, method): # decorator 58 | """Decorator to mark a method as the handler for a particular VCS.""" 59 | def decorate(f): 60 | """Store f in HANDLERS[vcs][method].""" 61 | if vcs not in HANDLERS: 62 | HANDLERS[vcs] = {} 63 | HANDLERS[vcs][method] = f 64 | return f 65 | 66 | return decorate 67 | 68 | 69 | def run_command(commands, 70 | args, 71 | cwd=None, 72 | verbose=False, 73 | hide_stderr=False, 74 | env=None): 75 | """Call the given command(s).""" 76 | assert isinstance(commands, list) 77 | p = None 78 | for c in commands: 79 | try: 80 | dispcmd = str([c] + args) 81 | # remember shell=False, so use git.cmd on windows, not just git 82 | p = subprocess.Popen( 83 | [c] + args, 84 | cwd=cwd, 85 | env=env, 86 | stdout=subprocess.PIPE, 87 | stderr=(subprocess.PIPE if hide_stderr else None), 88 | ) 89 | break 90 | except EnvironmentError: 91 | e = sys.exc_info()[1] 92 | if e.errno == errno.ENOENT: 93 | continue 94 | if verbose: 95 | print('unable to run %s' % dispcmd) 96 | print(e) 97 | return None, None 98 | else: 99 | if verbose: 100 | print('unable to find 
command, tried %s' % (commands, )) 101 | return None, None 102 | stdout = p.communicate()[0].strip() 103 | if sys.version_info[0] >= 3: 104 | stdout = stdout.decode() 105 | if p.returncode != 0: 106 | if verbose: 107 | print('unable to run %s (error)' % dispcmd) 108 | print('stdout was %s' % stdout) 109 | return None, p.returncode 110 | return stdout, p.returncode 111 | 112 | 113 | def versions_from_parentdir(parentdir_prefix, root, verbose): 114 | """Try to determine the version from the parent directory name. 115 | 116 | Source tarballs conventionally unpack into a directory that includes both 117 | the project name and a version string. We will also support searching up 118 | two directory levels for an appropriately named parent directory 119 | """ 120 | rootdirs = [] 121 | 122 | for i in range(3): 123 | dirname = os.path.basename(root) 124 | if dirname.startswith(parentdir_prefix): 125 | return { 126 | 'version': dirname[len(parentdir_prefix):], 127 | 'full-revisionid': None, 128 | 'dirty': False, 129 | 'error': None, 130 | 'date': None, 131 | } 132 | else: 133 | rootdirs.append(root) 134 | root = os.path.dirname(root) # up a level 135 | 136 | if verbose: 137 | print('Tried directories %s but none started with prefix %s' % 138 | (str(rootdirs), parentdir_prefix)) 139 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 140 | 141 | 142 | @register_vcs_handler('git', 'get_keywords') 143 | def git_get_keywords(versionfile_abs): 144 | """Extract version information from the given file.""" 145 | # the code embedded in _version.py can just fetch the value of these 146 | # keywords. When used from setup.py, we don't want to import _version.py, 147 | # so we do it with a regexp instead. This function is not used from 148 | # _version.py. 149 | keywords = {} 150 | try: 151 | f = open(versionfile_abs, 'r') 152 | for line in f.readlines(): 153 | if line.strip().startswith('git_refnames ='): 154 | mo = re.search(r'=\s*"(.*)"', line) 155 | if mo: 156 | keywords['refnames'] = mo.group(1) 157 | if line.strip().startswith('git_full ='): 158 | mo = re.search(r'=\s*"(.*)"', line) 159 | if mo: 160 | keywords['full'] = mo.group(1) 161 | if line.strip().startswith('git_date ='): 162 | mo = re.search(r'=\s*"(.*)"', line) 163 | if mo: 164 | keywords['date'] = mo.group(1) 165 | f.close() 166 | except EnvironmentError: 167 | pass 168 | return keywords 169 | 170 | 171 | @register_vcs_handler('git', 'keywords') 172 | def git_versions_from_keywords(keywords, tag_prefix, verbose): 173 | """Get version information from git keywords.""" 174 | if not keywords: 175 | raise NotThisMethod('no keywords at all, weird') 176 | date = keywords.get('date') 177 | if date is not None: 178 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 179 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 180 | # -like" string, which we must then edit to make compliant), because 181 | # it's been around since git-1.5.3, and it's too difficult to 182 | # discover which version we're using, or to work around using an 183 | # older one. 
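        # e.g. '2020-12-02 11:05:43 -0700' -> '2020-12-02T11:05:43-0700'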
184 | date = date.strip().replace(' ', 'T', 1).replace(' ', '', 1) 185 | refnames = keywords['refnames'].strip() 186 | if refnames.startswith('$Format'): 187 | if verbose: 188 | print('keywords are unexpanded, not using') 189 | raise NotThisMethod('unexpanded keywords, not a git-archive tarball') 190 | refs = set([r.strip() for r in refnames.strip('()').split(',')]) 191 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 192 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 193 | TAG = 'tag: ' 194 | tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) 195 | if not tags: 196 | # Either we're using git < 1.8.3, or there really are no tags. We use 197 | # a heuristic: assume all version tags have a digit. The old git %d 198 | # expansion behaves like git log --decorate=short and strips out the 199 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 200 | # between branches and tags. By ignoring refnames without digits, we 201 | # filter out many common branch names like "release" and 202 | # "stabilization", as well as "HEAD" and "trunk". 203 | tags = set([r for r in refs if re.search(r'\d', r)]) 204 | if verbose: 205 | print("discarding '%s', no digits" % ','.join(refs - tags)) 206 | if verbose: 207 | print('likely tags: %s' % ','.join(sorted(tags))) 208 | for ref in sorted(tags): 209 | # sorting will prefer e.g. "2.0" over "2.0rc1" 210 | if ref.startswith(tag_prefix): 211 | r = ref[len(tag_prefix):] 212 | if verbose: 213 | print('picking %s' % r) 214 | return { 215 | 'version': r, 216 | 'full-revisionid': keywords['full'].strip(), 217 | 'dirty': False, 218 | 'error': None, 219 | 'date': date, 220 | } 221 | # no suitable tags, so version is "0+unknown", but full hex is still there 222 | if verbose: 223 | print('no suitable tags, using unknown + full revision id') 224 | return { 225 | 'version': '0+unknown', 226 | 'full-revisionid': keywords['full'].strip(), 227 | 'dirty': False, 228 | 'error': 'no suitable tags', 229 | 'date': None, 230 | } 231 | 232 | 233 | @register_vcs_handler('git', 'pieces_from_vcs') 234 | def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): 235 | """Get version from 'git describe' in the root of the source tree. 236 | 237 | This only gets called if the git-archive 'subst' keywords were *not* 238 | expanded, and _version.py hasn't already been rewritten with a short 239 | version string, meaning we're inside a checked out source tree. 
240 | """ 241 | GITS = ['git'] 242 | if sys.platform == 'win32': 243 | GITS = ['git.cmd', 'git.exe'] 244 | 245 | out, rc = run_command(GITS, ['rev-parse', '--git-dir'], 246 | cwd=root, 247 | hide_stderr=True) 248 | if rc != 0: 249 | if verbose: 250 | print('Directory %s not under git control' % root) 251 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 252 | 253 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 254 | # if there isn't one, this yields HEX[-dirty] (no NUM) 255 | describe_out, rc = run_command( 256 | GITS, 257 | [ 258 | 'describe', '--tags', '--dirty', '--always', '--long', '--match', 259 | '%s*' % tag_prefix 260 | ], 261 | cwd=root, 262 | ) 263 | # --long was added in git-1.5.5 264 | if describe_out is None: 265 | raise NotThisMethod("'git describe' failed") 266 | describe_out = describe_out.strip() 267 | full_out, rc = run_command(GITS, ['rev-parse', 'HEAD'], cwd=root) 268 | if full_out is None: 269 | raise NotThisMethod("'git rev-parse' failed") 270 | full_out = full_out.strip() 271 | 272 | pieces = {} 273 | pieces['long'] = full_out 274 | pieces['short'] = full_out[:7] # maybe improved later 275 | pieces['error'] = None 276 | 277 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 278 | # TAG might have hyphens. 279 | git_describe = describe_out 280 | 281 | # look for -dirty suffix 282 | dirty = git_describe.endswith('-dirty') 283 | pieces['dirty'] = dirty 284 | if dirty: 285 | git_describe = git_describe[:git_describe.rindex('-dirty')] 286 | 287 | # now we have TAG-NUM-gHEX or HEX 288 | 289 | if '-' in git_describe: 290 | # TAG-NUM-gHEX 291 | mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) 292 | if not mo: 293 | # unparseable. Maybe git-describe is misbehaving? 294 | pieces[ 295 | 'error'] = "unable to parse git-describe output: '%s'" % describe_out 296 | return pieces 297 | 298 | # tag 299 | full_tag = mo.group(1) 300 | if not full_tag.startswith(tag_prefix): 301 | if verbose: 302 | fmt = "tag '%s' doesn't start with prefix '%s'" 303 | print(fmt % (full_tag, tag_prefix)) 304 | pieces['error'] = "tag '%s' doesn't start with prefix '%s'" % ( 305 | full_tag, tag_prefix) 306 | return pieces 307 | pieces['closest-tag'] = full_tag[len(tag_prefix):] 308 | 309 | # distance: number of commits since tag 310 | pieces['distance'] = int(mo.group(2)) 311 | 312 | # commit: short hex revision ID 313 | pieces['short'] = mo.group(3) 314 | 315 | else: 316 | # HEX: no tags 317 | pieces['closest-tag'] = None 318 | count_out, rc = run_command(GITS, ['rev-list', 'HEAD', '--count'], 319 | cwd=root) 320 | pieces['distance'] = int(count_out) # total number of commits 321 | 322 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 323 | date = run_command(GITS, ['show', '-s', '--format=%ci', 'HEAD'], 324 | cwd=root)[0].strip() 325 | pieces['date'] = date.strip().replace(' ', 'T', 1).replace(' ', '', 1) 326 | 327 | return pieces 328 | 329 | 330 | def plus_or_dot(pieces): 331 | """Return a + if we don't already have one, else return a .""" 332 | if '+' in pieces.get('closest-tag', ''): 333 | return '.' 334 | return '+' 335 | 336 | 337 | def render_pep440(pieces): 338 | """Build up version string, with post-release "local version identifier". 339 | 340 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 341 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 342 | 343 | Exceptions: 344 | 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] 345 | """ 346 | if pieces['closest-tag']: 347 | rendered = pieces['closest-tag'] 348 | if pieces['distance'] or pieces['dirty']: 349 | rendered += plus_or_dot(pieces) 350 | rendered += '%d.g%s' % (pieces['distance'], pieces['short']) 351 | if pieces['dirty']: 352 | rendered += '.dirty' 353 | else: 354 | # exception #1 355 | rendered = '0+untagged.%d.g%s' % (pieces['distance'], pieces['short']) 356 | if pieces['dirty']: 357 | rendered += '.dirty' 358 | return rendered 359 | 360 | 361 | def render_pep440_pre(pieces): 362 | """TAG[.post.devDISTANCE] -- No -dirty. 363 | 364 | Exceptions: 365 | 1: no tags. 0.post.devDISTANCE 366 | """ 367 | if pieces['closest-tag']: 368 | rendered = pieces['closest-tag'] 369 | if pieces['distance']: 370 | rendered += '.post.dev%d' % pieces['distance'] 371 | else: 372 | # exception #1 373 | rendered = '0.post.dev%d' % pieces['distance'] 374 | return rendered 375 | 376 | 377 | def render_pep440_post(pieces): 378 | """TAG[.postDISTANCE[.dev0]+gHEX] . 379 | 380 | The ".dev0" means dirty. Note that .dev0 sorts backwards 381 | (a dirty tree will appear "older" than the corresponding clean one), 382 | but you shouldn't be releasing software with -dirty anyways. 383 | 384 | Exceptions: 385 | 1: no tags. 0.postDISTANCE[.dev0] 386 | """ 387 | if pieces['closest-tag']: 388 | rendered = pieces['closest-tag'] 389 | if pieces['distance'] or pieces['dirty']: 390 | rendered += '.post%d' % pieces['distance'] 391 | if pieces['dirty']: 392 | rendered += '.dev0' 393 | rendered += plus_or_dot(pieces) 394 | rendered += 'g%s' % pieces['short'] 395 | else: 396 | # exception #1 397 | rendered = '0.post%d' % pieces['distance'] 398 | if pieces['dirty']: 399 | rendered += '.dev0' 400 | rendered += '+g%s' % pieces['short'] 401 | return rendered 402 | 403 | 404 | def render_pep440_old(pieces): 405 | """TAG[.postDISTANCE[.dev0]] . 406 | 407 | The ".dev0" means dirty. 408 | 409 | Eexceptions: 410 | 1: no tags. 0.postDISTANCE[.dev0] 411 | """ 412 | if pieces['closest-tag']: 413 | rendered = pieces['closest-tag'] 414 | if pieces['distance'] or pieces['dirty']: 415 | rendered += '.post%d' % pieces['distance'] 416 | if pieces['dirty']: 417 | rendered += '.dev0' 418 | else: 419 | # exception #1 420 | rendered = '0.post%d' % pieces['distance'] 421 | if pieces['dirty']: 422 | rendered += '.dev0' 423 | return rendered 424 | 425 | 426 | def render_git_describe(pieces): 427 | """TAG[-DISTANCE-gHEX][-dirty]. 428 | 429 | Like 'git describe --tags --dirty --always'. 430 | 431 | Exceptions: 432 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 433 | """ 434 | if pieces['closest-tag']: 435 | rendered = pieces['closest-tag'] 436 | if pieces['distance']: 437 | rendered += '-%d-g%s' % (pieces['distance'], pieces['short']) 438 | else: 439 | # exception #1 440 | rendered = pieces['short'] 441 | if pieces['dirty']: 442 | rendered += '-dirty' 443 | return rendered 444 | 445 | 446 | def render_git_describe_long(pieces): 447 | """TAG-DISTANCE-gHEX[-dirty]. 448 | 449 | Like 'git describe --tags --dirty --always -long'. 450 | The distance/hash is unconditional. 451 | 452 | Exceptions: 453 | 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) 454 | """ 455 | if pieces['closest-tag']: 456 | rendered = pieces['closest-tag'] 457 | rendered += '-%d-g%s' % (pieces['distance'], pieces['short']) 458 | else: 459 | # exception #1 460 | rendered = pieces['short'] 461 | if pieces['dirty']: 462 | rendered += '-dirty' 463 | return rendered 464 | 465 | 466 | def render(pieces, style): 467 | """Render the given version pieces into the requested style.""" 468 | if pieces['error']: 469 | return { 470 | 'version': 'unknown', 471 | 'full-revisionid': pieces.get('long'), 472 | 'dirty': None, 473 | 'error': pieces['error'], 474 | 'date': None, 475 | } 476 | 477 | if not style or style == 'default': 478 | style = 'pep440' # the default 479 | 480 | if style == 'pep440': 481 | rendered = render_pep440(pieces) 482 | elif style == 'pep440-pre': 483 | rendered = render_pep440_pre(pieces) 484 | elif style == 'pep440-post': 485 | rendered = render_pep440_post(pieces) 486 | elif style == 'pep440-old': 487 | rendered = render_pep440_old(pieces) 488 | elif style == 'git-describe': 489 | rendered = render_git_describe(pieces) 490 | elif style == 'git-describe-long': 491 | rendered = render_git_describe_long(pieces) 492 | else: 493 | raise ValueError("unknown style '%s'" % style) 494 | 495 | return { 496 | 'version': rendered, 497 | 'full-revisionid': pieces['long'], 498 | 'dirty': pieces['dirty'], 499 | 'error': None, 500 | 'date': pieces.get('date'), 501 | } 502 | 503 | 504 | def get_versions(): 505 | """Get version information or return default if unable to do so.""" 506 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 507 | # __file__, we can work backwards from there to the root. Some 508 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 509 | # case we can only use expanded keywords. 510 | 511 | cfg = get_config() 512 | verbose = cfg.verbose 513 | 514 | try: 515 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, 516 | verbose) 517 | except NotThisMethod: 518 | pass 519 | 520 | try: 521 | root = os.path.realpath(__file__) 522 | # versionfile_source is the relative path from the top of the source 523 | # tree (where the .git directory might live) to this file. Invert 524 | # this to find the root from __file__. 525 | for i in cfg.versionfile_source.split('/'): 526 | root = os.path.dirname(root) 527 | except NameError: 528 | return { 529 | 'version': '0+unknown', 530 | 'full-revisionid': None, 531 | 'dirty': None, 532 | 'error': 'unable to find root of source tree', 533 | 'date': None, 534 | } 535 | 536 | try: 537 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 538 | return render(pieces, cfg.style) 539 | except NotThisMethod: 540 | pass 541 | 542 | try: 543 | if cfg.parentdir_prefix: 544 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 545 | except NotThisMethod: 546 | pass 547 | 548 | return { 549 | 'version': '0+unknown', 550 | 'full-revisionid': None, 551 | 'dirty': None, 552 | 'error': 'unable to compute version', 553 | 'date': None, 554 | } 555 | -------------------------------------------------------------------------------- /provenance/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = migrations 6 | 7 | # template used to generate migration files 8 | # file_template = %%(rev)s_%%(slug)s 9 | 10 | # max length of characters to apply to the 11 | # "slug" field 12 | #truncate_slug_length = 40 13 | 14 | # set to 'true' to run the environment during 15 | # the 'revision' command, regardless of autogenerate 16 | # revision_environment = false 17 | 18 | # set to 'true' to allow .pyc and .pyo files without 19 | # a source .py file to be detected as revisions in the 20 | # versions/ directory 21 | # sourceless = false 22 | 23 | # version location specification; this defaults 24 | # to migrations/versions. When using multiple version 25 | # directories, initial revisions must be specified with --version-path 26 | # version_locations = %(here)s/bar %(here)s/bat migrations/versions 27 | 28 | # the output encoding used when revision files 29 | # are written from script.py.mako 30 | # output_encoding = utf-8 31 | 32 | #sqlalchemy.url = postgresql://localhost/test_provenance 33 | -------------------------------------------------------------------------------- /provenance/artifact_hasher.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from . import hashing as h, repos as r 4 | 5 | 6 | def _save(obj, artifacts): 7 | if isinstance(obj, r.Artifact): 8 | artifacts[obj.id] = obj 9 | if r.is_proxy(obj): 10 | artifacts[obj.artifact.id] = obj.artifact 11 | 12 | 13 | class ArtifactHasher(h.Hasher): 14 | 15 | def __init__(self, artifacts=None, hash_name='md5'): 16 | if artifacts is None: 17 | artifacts = {} 18 | 19 | self.artifacts = artifacts 20 | h.Hasher.__init__(self, hash_name=hash_name) 21 | 22 | def save(self, obj): 23 | _save(obj, self.artifacts) 24 | h.Hasher.save(self, obj) 25 | 26 | def hash(self, obj): 27 | return (h.Hasher.hash(self, obj), self.artifacts.values()) 28 | 29 | 30 | class NumpyArtifactHasher(h.NumpyHasher): 31 | 32 | def __init__(self, artifacts=None, hash_name='md5', coerce_mmap=True): 33 | if artifacts is None: 34 | artifacts = {} 35 | 36 | self.artifacts = artifacts 37 | h.NumpyHasher.__init__(self, hash_name=hash_name, coerce_mmap=coerce_mmap) 38 | 39 | def save(self, obj): 40 | _save(obj, self.artifacts) 41 | h.NumpyHasher.save(self, obj) 42 | 43 | def hash(self, obj): 44 | return (h.NumpyHasher.hash(self, obj), self.artifacts.values()) 45 | 46 | 47 | def artifact_hasher(*args, **kwargs): 48 | if 'numpy' in sys.modules: 49 | return NumpyArtifactHasher(*args, **kwargs) 50 | else: 51 | return ArtifactHasher(*args, **kwargs) 52 | -------------------------------------------------------------------------------- /provenance/blobstores.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import os.path 4 | import shutil 5 | import tempfile 6 | 7 | from joblib.disk import mkdirp 8 | from s3fs import S3FileSystem 9 | 10 | from . 
import _commonstore as cs 11 | from .serializers import DEFAULT_VALUE_SERIALIZER 12 | 13 | 14 | class BaseBlobStore: 15 | 16 | def __init__( 17 | self, 18 | read=True, 19 | write=True, 20 | read_through_write=True, 21 | delete=False, 22 | on_duplicate_key='skip', 23 | ): 24 | self._read = read 25 | self._write = write 26 | self._read_through_write = read_through_write 27 | self._delete = delete 28 | self._on_duplicate_key = on_duplicate_key 29 | 30 | valid_on_duplicate_keys = {'skip', 'overwrite', 'check_collision', 'raise'} 31 | if self._on_duplicate_key not in valid_on_duplicate_keys: 32 | msg = 'on_duplicate_key must be one of {}'.format(valid_on_duplicate_keys) 33 | raise RuntimeError(msg) 34 | 35 | def __getitem__(self, id, *args, **kargs): 36 | return self.get(id, *args, **kargs) 37 | 38 | def put(self, id, value, serializer=DEFAULT_VALUE_SERIALIZER, read_through=False): 39 | method = getattr(self, '_put_' + self._on_duplicate_key) 40 | return method(id, value, serializer, read_through) 41 | 42 | def _put_raise(self, id, value, serializer, read_through): 43 | cs.ensure_put(self, id, read_through) 44 | self._put_overwrite(id, value, serializer, read_through) 45 | 46 | def _put_skip(self, id, value, serializer, read_through): 47 | if id not in self: 48 | self._put_overwrite(id, value, serializer, read_through) 49 | 50 | def _put_check_collision(self, id, value, serializer, read_through): 51 | cs.ensure_put(self, id, read_through, check_contains=False) 52 | if id not in self: 53 | self._put_overwrite(id, value, serializer, read_through) 54 | else: 55 | self._check_collision(id, value, serializer) 56 | 57 | # TODO: Right now our only thought is that this can be 58 | # checked by using an alternate hash, this will require 59 | # deserializing the old value and running the hash algorithm 60 | # with an alternate hash 61 | def _check_collision(self, id, value, serializer): 62 | raise NotImplementedError() 63 | 64 | 65 | class MemoryStore(BaseBlobStore): 66 | 67 | def __init__( 68 | self, 69 | values=None, 70 | read=True, 71 | write=True, 72 | read_through_write=True, 73 | delete=True, 74 | on_duplicate_key='skip', 75 | ): 76 | super(MemoryStore, self).__init__( 77 | read=read, 78 | write=write, 79 | read_through_write=read_through_write, 80 | delete=delete, 81 | on_duplicate_key=on_duplicate_key, 82 | ) 83 | if values is None: 84 | self.values = {} 85 | else: 86 | self.values = values 87 | 88 | def __contains__(self, id): 89 | cs.ensure_contains(self) 90 | return id in self.values 91 | 92 | def _put_overwrite(self, id, value, serializer, read_through): 93 | cs.ensure_put(self, id, read_through, check_contains=False) 94 | self.values[id] = value 95 | 96 | def get(self, id, serialzier=None, **_kargs): 97 | cs.ensure_read(self) 98 | cs.ensure_present(self, id) 99 | return self.values[id] 100 | 101 | def delete(self, id): 102 | cs.ensure_delete(self, id) 103 | del self.values[id] 104 | 105 | 106 | @contextlib.contextmanager 107 | def _temp_filename(): 108 | try: 109 | temp = tempfile.NamedTemporaryFile('wb', delete=False) 110 | temp.close() 111 | yield temp.name 112 | finally: 113 | if os.path.isfile(temp.name): 114 | os.remove(temp.name) 115 | 116 | 117 | @contextlib.contextmanager 118 | def _atomic_write(filename): 119 | with _temp_filename() as temp: 120 | yield temp 121 | shutil.move(temp, filename) 122 | 123 | 124 | def _abspath(path): 125 | return os.path.abspath(os.path.expanduser(path)) 126 | 127 | 128 | class DiskStore(BaseBlobStore): 129 | 130 | def __init__( 131 | self, 132 | 
cachedir, 133 | read=True, 134 | write=True, 135 | read_through_write=True, 136 | delete=False, 137 | on_duplicate_key='skip', 138 | ): 139 | super(DiskStore, self).__init__( 140 | read=read, 141 | write=write, 142 | read_through_write=read_through_write, 143 | delete=delete, 144 | on_duplicate_key=on_duplicate_key, 145 | ) 146 | self.cachedir = _abspath(cachedir) 147 | mkdirp(self.cachedir) 148 | 149 | def _filename(self, id): 150 | return os.path.join(self.cachedir, id) 151 | 152 | def __contains__(self, id): 153 | cs.ensure_contains(self) 154 | return os.path.isfile(self._filename(id)) 155 | 156 | def _put_overwrite(self, id, value, serializer, read_through): 157 | cs.ensure_put(self, id, read_through, check_contains=False) 158 | with _atomic_write(self._filename(id)) as temp: 159 | serializer.dump(value, temp) 160 | 161 | def get(self, id, serializer=DEFAULT_VALUE_SERIALIZER, **_kargs): 162 | cs.ensure_read(self) 163 | cs.ensure_present(self, id) 164 | return serializer.load(self._filename(id)) 165 | 166 | def delete(self, id): 167 | cs.ensure_delete(self, id) 168 | os.remove(self._filename(id)) 169 | 170 | 171 | class RemoteStore(BaseBlobStore): 172 | 173 | def __init__( 174 | self, 175 | cachedir, 176 | basepath, 177 | read=True, 178 | write=True, 179 | read_through_write=True, 180 | delete=False, 181 | on_duplicate_key='skip', 182 | cleanup_cachedir=False, 183 | always_check_remote=False, 184 | ): 185 | """ 186 | Parameters 187 | ---------- 188 | always_check_remote : bool 189 | When True the remote store will be checked with every __contains__ call. Otherwise it will 190 | short-circuit if the blob is found in the cachedir. For performance reasons this 191 | should always be set to False. The only reason why you would want to use this 192 | is if you are using a RemoteStore and a DiskStore in a ChainedStore together for 193 | some reason. Since the RemoteStore basically doubles as a DiskStore with it's cachedir 194 | chaining the two doesn't really make sense though. 195 | """ 196 | super(RemoteStore, self).__init__( 197 | read=read, 198 | write=write, 199 | read_through_write=read_through_write, 200 | delete=delete, 201 | on_duplicate_key=on_duplicate_key, 202 | ) 203 | 204 | self.always_check = always_check_remote 205 | 206 | self.cachedir = _abspath(cachedir) 207 | self.basepath = basepath 208 | self.cleanup_cachedir = cleanup_cachedir 209 | mkdirp(self.cachedir) 210 | 211 | def __del__(self): 212 | if self.cleanup_cachedir: 213 | shutil.rmtree(self.cachedir) 214 | 215 | def _filename(self, id): 216 | return os.path.join(self.cachedir, id) 217 | 218 | def _path(self, id): 219 | return os.path.join(self.basepath, id) 220 | 221 | def _exists(self, path): 222 | raise NotImplementedError() 223 | 224 | def _delete_remote(self, path): 225 | raise NotImplementedError() 226 | 227 | def _upload_file(self, filename, path): 228 | raise NotImplementedError() 229 | 230 | def _download_file(self, path, dest_filename): 231 | raise NotImplementedError() 232 | 233 | def __contains__(self, id): 234 | cs.ensure_contains(self) 235 | path = self._path(id) 236 | if self.always_check: 237 | return self._exists(path) 238 | else: 239 | return os.path.exists(self._filename(id)) or self._exists(path) 240 | 241 | def _put_overwrite(self, id, value, serializer, read_through): 242 | cs.ensure_put(self, id, read_through, check_contains=False) 243 | filename = self._filename(id) 244 | # not already saved by DiskStore? 
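        # The cachedir doubles as a write-through cache: serialize the value to
        # the local file first (unless it is already there), then upload that
        # file to the remote basepath.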
245 | if not os.path.isfile(filename): 246 | with _atomic_write(filename) as temp: 247 | serializer.dump(value, temp) 248 | self._upload_file(filename, self._path(id)) 249 | 250 | def get(self, id, serializer=DEFAULT_VALUE_SERIALIZER, **_kargs): 251 | cs.ensure_read(self) 252 | cs.ensure_present(self, id) 253 | filename = self._filename(id) 254 | if not os.path.exists(filename): 255 | with _atomic_write(filename) as temp: 256 | self._download_file(self._path(id), temp) 257 | return serializer.load(filename) 258 | 259 | def delete(self, id): 260 | cs.ensure_delete(self, id) 261 | filename = self._filename(id) 262 | if os.path.exists(filename): 263 | os.remove(filename) 264 | self._delete_remote(self._path(id)) 265 | 266 | 267 | class S3Store(RemoteStore): 268 | 269 | def __init__( 270 | self, 271 | cachedir, 272 | basepath, 273 | s3_config=None, 274 | s3fs=None, 275 | read=True, 276 | write=True, 277 | read_through_write=True, 278 | delete=False, 279 | on_duplicate_key='skip', 280 | cleanup_cachedir=False, 281 | always_check_remote=False, 282 | ): 283 | """ 284 | Parameters 285 | ---------- 286 | always_check_remote : bool 287 | When True S3 will be checked with every __contains__ call. Otherwise it will 288 | short-circuit if the blob is found in the cachedir. For performance reasons this 289 | should always be set to False. The only reason why you would want to use this 290 | is if you are using a S3Store and a DiskStore in a ChainedStore together for 291 | some reason. Since the S3Store basically doubles as a DiskStore with it's cachedir 292 | chaining the two doesn't really make sense though. 293 | """ 294 | super(S3Store, self).__init__( 295 | always_check_remote=always_check_remote, 296 | cachedir=cachedir, 297 | basepath=basepath, 298 | cleanup_cachedir=cleanup_cachedir, 299 | read=read, 300 | write=write, 301 | read_through_write=read_through_write, 302 | delete=delete, 303 | on_duplicate_key=on_duplicate_key, 304 | ) 305 | 306 | if s3fs: 307 | self.s3fs = s3fs 308 | elif s3_config is not None: 309 | self.s3fs = S3FileSystem(**s3_config) 310 | else: 311 | raise ValueError('You must provide either s3_config or s3fs for a S3Store') 312 | 313 | def _exists(self, path): 314 | return self.s3fs.exists(path) 315 | 316 | def _delete_remote(self, path): 317 | self.s3fs.rm(path) 318 | 319 | def _upload_file(self, filename, path): 320 | self.s3fs.put(filename, path) 321 | 322 | def _download_file(self, remote_path, dest_filename): 323 | self.s3fs.get(remote_path, dest_filename) 324 | 325 | 326 | class ChainedStore(BaseBlobStore): 327 | 328 | def __init__( 329 | self, 330 | stores, 331 | read=True, 332 | write=True, 333 | read_through_write=True, 334 | delete=True, 335 | on_duplicate_key='skip', 336 | ): 337 | super(ChainedStore, self).__init__( 338 | read=read, 339 | write=write, 340 | read_through_write=read_through_write, 341 | delete=delete, 342 | on_duplicate_key=on_duplicate_key, 343 | ) 344 | self.stores = stores 345 | 346 | def __contains__(self, id): 347 | return cs.chained_contains(self, id) 348 | 349 | def _filename(self, id): 350 | return cs.chained_filename(self, id) 351 | 352 | def _put_overwrite(self, id, value, serializer, read_through): 353 | return cs.chained_put(self, id, value, overwrite=True, serializer=serializer) 354 | 355 | def get(self, id, serializer=DEFAULT_VALUE_SERIALIZER, **kargs): 356 | 357 | def get(store, id): 358 | return store.get(id, serializer=serializer, **kargs) 359 | 360 | return cs.chained_get(self, get, id) 361 | 362 | def __getitem__(self, id, 
**kargs): 363 | return self.get(id, **kargs) 364 | 365 | def delete(self, id): 366 | return cs.chained_delete(self, id) 367 | -------------------------------------------------------------------------------- /provenance/google_storage.py: -------------------------------------------------------------------------------- 1 | from boltons import funcutils as bfu 2 | from google.cloud import storage as gs 3 | from memoized_property import memoized_property 4 | 5 | from . import blobstores as bs 6 | 7 | # TODO: catch and retry w/new client on 8 | # BrokenPipeError: [Errno 32] Broken pipe 9 | # ConnectionResetError: [Errno 54] Connection reset by peer 10 | # more? 11 | 12 | 13 | def retry(f, max_attempts=2): 14 | 15 | @bfu.wraps(f) 16 | def with_retry(store, *args, **kargs): 17 | actual_attempts = 0 18 | while True: 19 | try: 20 | return f(store, *args, **kargs) 21 | except (BrokenPipeError, ConnectionError) as e: 22 | actual_attempts += 1 23 | if actual_attempts >= max_attempts: 24 | raise e 25 | else: 26 | store._setup_client() 27 | 28 | return with_retry 29 | 30 | 31 | class GSStore(bs.RemoteStore): 32 | 33 | def __init__( 34 | self, 35 | cachedir, 36 | bucket, 37 | basepath='', 38 | project=None, 39 | read=True, 40 | write=True, 41 | read_through_write=True, 42 | delete=False, 43 | on_duplicate_key='skip', 44 | cleanup_cachedir=False, 45 | always_check_remote=False, 46 | ): 47 | """ 48 | Parameters 49 | ---------- 50 | always_check_remote : bool 51 | When True GS (Google Storage) will be checked with every __contains__ call. Otherwise it will 52 | short-circuit if the blob is found in the cachedir. For performance reasons this 53 | should always be set to False. The only reason why you would want to use this 54 | is if you are using a GSStore and a DiskStore in a ChainedStore together for 55 | some reason. Since the GSStore basically doubles as a DiskStore with it's cachedir 56 | chaining the two doesn't really make sense though. 57 | """ 58 | super(GSStore, self).__init__( 59 | always_check_remote=always_check_remote, 60 | cachedir=cachedir, 61 | basepath=basepath, 62 | cleanup_cachedir=cleanup_cachedir, 63 | read=read, 64 | write=write, 65 | read_through_write=read_through_write, 66 | delete=delete, 67 | on_duplicate_key=on_duplicate_key, 68 | ) 69 | 70 | self.bucket_name = bucket 71 | self.project = project 72 | 73 | def _setup_client(self): 74 | del self._client 75 | del self._bucket 76 | # force re-memoization 77 | assert self.bucket is not None 78 | 79 | @memoized_property 80 | def client(self): 81 | return gs.Client(project=self.project) 82 | 83 | @memoized_property 84 | def bucket(self): 85 | return self.client.get_bucket(self.bucket_name) 86 | 87 | @retry 88 | def _exists(self, path): 89 | blobs = list(self.bucket.list_blobs(prefix=path)) 90 | return len(blobs) == 1 91 | 92 | @retry 93 | def _delete_remote(self, path): 94 | self.blob(path).delete() 95 | 96 | def _blob(self, path): 97 | return self._bucket.blob(path) 98 | 99 | @retry 100 | def _upload_file(self, filename, path): 101 | self._blob(path).upload_from_filename(filename) 102 | 103 | @retry 104 | def _download_file(self, remote_path, dest_filename): 105 | self._blob(remote_path).download_to_filename(dest_filename) 106 | -------------------------------------------------------------------------------- /provenance/hashing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fast cryptographic hash of Python objects, with a special case for fast 3 | hashing of numpy arrays. 
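The main entry point is the hash() function at the bottom of this module
(also exported as provenance.hash); it returns a hex digest string and aims
to give equal values the same digest even when they live in different
mutable objects.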
4 | 5 | 6 | This code was originally taken from joblib and modified. 7 | 8 | Author: Gael Varoquaux 9 | Copyright (c) 2009 Gael Varoquaux 10 | License: BSD Style, 3 clauses. 11 | """ 12 | 13 | # Author: Gael Varoquaux 14 | # Copyright (c) 2009 Gael Varoquaux 15 | # License: BSD Style, 3 clauses. 16 | 17 | import decimal 18 | import hashlib 19 | import io 20 | import pickle 21 | import struct 22 | import sys 23 | import types 24 | from functools import singledispatch 25 | 26 | import cloudpickle 27 | 28 | 29 | @singledispatch 30 | def value_repr(obj): 31 | method = getattr(obj, 'value_repr', None) 32 | if callable(method): 33 | return method() 34 | else: 35 | return obj 36 | 37 | 38 | Pickler = cloudpickle.CloudPickler 39 | 40 | 41 | class _ConsistentSet(object): 42 | """ Class used to ensure the hash of Sets is preserved 43 | whatever the order of its items. 44 | """ 45 | 46 | def __init__(self, _set): 47 | # Forces order of elements in set to ensure consistent hash. 48 | self._type = type(_set) 49 | try: 50 | # Trying first to order the set assuming the type of elements is 51 | # consistent and orderable. 52 | # This fails on python 3 when elements are unorderable 53 | # but we keep it in a try as it's faster. 54 | self._sequence = sorted(_set) 55 | except (TypeError, decimal.InvalidOperation): 56 | # If elements are unorderable, sorting them using their hash. 57 | # This is slower but works in any case. 58 | self._sequence = sorted((hash(e) for e in _set)) 59 | 60 | 61 | class _MyHash(object): 62 | """ Class used to hash objects that won't normally pickle """ 63 | 64 | def __init__(self, *args): 65 | self.args = args 66 | 67 | 68 | class Hasher(Pickler): 69 | """ A subclass of pickler, to do cryptographic hashing, rather than 70 | pickling. 71 | """ 72 | 73 | def __init__(self, hash_name='md5'): 74 | self.stream = io.BytesIO() 75 | # By default we want a pickle protocol that only changes with 76 | # the major python version and not the minor one 77 | protocol = pickle.DEFAULT_PROTOCOL 78 | Pickler.__init__(self, self.stream, protocol=protocol) 79 | # Initialise the hash obj 80 | self._hash = hashlib.new(hash_name) 81 | 82 | def hash(self, obj): 83 | try: 84 | self.dump(obj) 85 | except pickle.PicklingError as e: 86 | e.args += ('PicklingError while hashing %r: %r' % (obj, e),) 87 | raise 88 | dumps = self.stream.getvalue() 89 | self._hash.update(dumps) 90 | return self._hash.hexdigest() 91 | 92 | def save(self, obj): 93 | obj = value_repr(obj) 94 | if isinstance(obj, (types.MethodType, type({}.pop))): 95 | # the Pickler cannot pickle instance methods; here we decompose 96 | # them into components that make them uniquely identifiable 97 | if hasattr(obj, '__func__'): 98 | func_name = obj.__func__.__name__ 99 | else: 100 | func_name = obj.__name__ 101 | inst = obj.__self__ 102 | if type(inst) == type(pickle): 103 | obj = _MyHash(func_name, inst.__name__) 104 | elif inst is None: 105 | # type(None) or type(module) do not pickle 106 | obj = _MyHash(func_name, inst) 107 | else: 108 | cls = obj.__self__.__class__ 109 | obj = _MyHash(func_name, inst, cls) 110 | Pickler.save(self, obj) 111 | 112 | def memoize(self, obj): 113 | # don't memoize so that the hashes are completely value-based 114 | return 115 | 116 | # The dispatch table of the pickler is not accessible in Python 117 | # 3, as these lines are only bugware for IPython, we skip them. 
118 | def save_global(self, obj, name=None, pack=struct.pack): 119 | # We have to override this method in order to deal with objects 120 | # defined interactively in IPython that are not injected in 121 | # __main__ 122 | kwargs = dict(name=name, pack=pack) 123 | if sys.version_info >= (3, 4): 124 | del kwargs['pack'] 125 | try: 126 | Pickler.save_global(self, obj, **kwargs) 127 | except pickle.PicklingError: 128 | Pickler.save_global(self, obj, **kwargs) 129 | module = getattr(obj, '__module__', None) 130 | if module == '__main__': 131 | my_name = name 132 | if my_name is None: 133 | my_name = obj.__name__ 134 | mod = sys.modules[module] 135 | if not hasattr(mod, my_name): 136 | # IPython doesn't inject the variables define 137 | # interactively in __main__ 138 | setattr(mod, my_name, obj) 139 | 140 | dispatch = Pickler.dispatch.copy() 141 | # builtin 142 | dispatch[type(len)] = save_global 143 | # type 144 | dispatch[type(object)] = save_global 145 | # classobj 146 | dispatch[type(Pickler)] = save_global 147 | # function 148 | dispatch[type(pickle.dump)] = save_global 149 | 150 | def _batch_setitems(self, items): 151 | # forces order of keys in dict to ensure consistent hash. 152 | try: 153 | # Trying first to compare dict assuming the type of keys is 154 | # consistent and orderable. 155 | # This fails on python 3 when keys are unorderable 156 | # but we keep it in a try as it's faster. 157 | Pickler._batch_setitems(self, iter(sorted(items))) 158 | except TypeError: 159 | # If keys are unorderable, sorting them using their hash. This is 160 | # slower but works in any case. 161 | Pickler._batch_setitems(self, iter(sorted((hash(k), v) for k, v in items))) 162 | 163 | def save_set(self, set_items): 164 | # forces order of items in Set to ensure consistent hash 165 | Pickler.save(self, _ConsistentSet(set_items)) 166 | 167 | dispatch[type(set())] = save_set 168 | dispatch[type(frozenset())] = save_set 169 | 170 | 171 | class NumpyHasher(Hasher): 172 | """ Special case the hasher for when numpy is loaded. 173 | """ 174 | 175 | def __init__(self, hash_name='md5', coerce_mmap=True): 176 | """ 177 | Parameters 178 | ---------- 179 | hash_name: string 180 | The hash algorithm to be used 181 | coerce_mmap: boolean 182 | Make no difference between np.memmap and np.ndarray 183 | objects. 184 | """ 185 | self.coerce_mmap = coerce_mmap 186 | self.chunk_size = 200 * 1024 * 1024 # 200 Mb 187 | Hasher.__init__(self, hash_name=hash_name) 188 | # delayed import of numpy, to avoid tight coupling 189 | import numpy as np 190 | 191 | self.np = np 192 | 193 | def hash_array(self, a): 194 | self._hash.update(a.tobytes()) 195 | 196 | def save(self, obj): 197 | """ Subclass the save method, to hash ndarray subclass, rather 198 | than pickling them. Off course, this is a total abuse of 199 | the Pickler class. 
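        Arrays larger than ``self.chunk_size`` are flattened (copying only
        when the data is non-contiguous) and fed to ``hash_array`` chunk by
        chunk to bound memory use; smaller arrays are hashed in one call.
        Either way only a small ``(klass, ('HASHED', dtype, shape))`` tuple
        is handed to the pickler afterwards, never the array bytes themselves.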
200 | """ 201 | if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject: 202 | # Compute a hash of the object 203 | obj_bytes = obj.dtype.itemsize * obj.size 204 | if obj_bytes > self.chunk_size: 205 | # For arrays larger than `self.chunk_size` we will attempt 206 | # to change the shape of a shallow copy and then hash the data 207 | # in chunks 208 | try: 209 | copy = obj[:] 210 | copy.shape = (copy.size,) 211 | except AttributeError as e: 212 | if e.args[0] != 'incompatible shape for a non-contiguous array': 213 | raise e 214 | 215 | # TODO: I am punting here for now and do a reshape that will make 216 | # a copy, but it could be possible to get the bytes out of obj 217 | # without needing one 218 | copy = obj.reshape((obj.size,)) 219 | 220 | i = 0 221 | size = copy.size 222 | typed_chunk_size = self.chunk_size // copy.dtype.itemsize 223 | while i < size: 224 | end = min(i + typed_chunk_size, size) 225 | self.hash_array(copy[i:end]) 226 | i = end 227 | 228 | else: 229 | # Small arrays are hashed all at once 230 | self.hash_array(obj) 231 | 232 | # We store the class, to be able to distinguish between 233 | # Objects with the same binary content, but different 234 | # classes. 235 | if self.coerce_mmap and isinstance(obj, self.np.memmap): 236 | # We don't make the difference between memmap and 237 | # normal ndarrays, to be able to reload previously 238 | # computed results with memmap. 239 | klass = self.np.ndarray 240 | else: 241 | klass = obj.__class__ 242 | # We also return the dtype and the shape, to distinguish 243 | # different views on the same data with different dtypes. 244 | 245 | # The object will be pickled by the pickler hashed at the end. 246 | obj = (klass, ('HASHED', obj.dtype, obj.shape)) 247 | elif isinstance(obj, self.np.dtype): 248 | # Atomic dtype objects are interned by their default constructor: 249 | # np.dtype('f8') is np.dtype('f8') 250 | # This interning is not maintained by a 251 | # pickle.loads + pickle.dumps cycle, because __reduce__ 252 | # uses copy=True in the dtype constructor. This 253 | # non-deterministic behavior causes the internal memoizer 254 | # of the hasher to generate different hash values 255 | # depending on the history of the dtype object. 256 | # To prevent the hash from being sensitive to this, we use 257 | # .descr which is a full (and never interned) description of 258 | # the array dtype according to the numpy doc. 259 | klass = obj.__class__ 260 | obj = (klass, ('HASHED', obj.descr)) 261 | Hasher.save(self, obj) 262 | 263 | 264 | def hash(obj, hasher=None, hash_name='md5', coerce_mmap=True): 265 | """ Quick calculation of a hash to identify uniquely Python objects 266 | containing numpy arrays. The difference with this hash and joblib 267 | is that it tries to hash different mutable objects with the same 268 | values to the same hash. 269 | 270 | 271 | Parameters 272 | ----------- 273 | hash_name: 'md5' or 'sha1' 274 | Hashing algorithm used. sha1 is supposedly safer, but md5 is 275 | faster. 276 | coerce_mmap: boolean 277 | Make no difference between np.memmap and np.ndarray 278 | """ 279 | if hasher is None: 280 | if 'numpy' in sys.modules: 281 | hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap) 282 | else: 283 | hasher = Hasher(hash_name=hash_name) 284 | 285 | return hasher.hash(obj) 286 | 287 | 288 | def file_hash(filename, hash_name='md5'): 289 | """Streams the bytes of the given file through either md5 or sha1 290 | and returns the hexdigest. 
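    Parameters
    ----------
    filename : str
        Path of the file whose bytes will be hashed.
    hash_name : 'md5' or 'sha1'
        Hashing algorithm used; any other value raises a ValueError.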
291 | """ 292 | if hash_name not in set(['md5', 'sha1']): 293 | raise ValueError('hashname must be "md5" or "sha1"') 294 | 295 | hasher = hashlib.md5() if hash_name == 'md5' else hashlib.sha1() 296 | with open(filename, 'rb') as f: 297 | for chunk in iter(lambda: f.read(4096), b''): 298 | hasher.update(chunk) 299 | return hasher.hexdigest() 300 | -------------------------------------------------------------------------------- /provenance/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. 2 | -------------------------------------------------------------------------------- /provenance/migrations/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | from alembic import context 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from provenance import models 7 | 8 | # this is the Alembic Config object, which provides 9 | # access to the values within the .ini file in use. 10 | config = context.config 11 | 12 | # add your model's MetaData object here 13 | # for 'autogenerate' support 14 | # from myapp import mymodel 15 | # target_metadata = mymodel.Base.metadata 16 | target_metadata = models.Base.metadata 17 | 18 | # other values from the config, defined by the needs of env.py, 19 | # can be acquired: 20 | # my_important_option = config.get_main_option("my_important_option") 21 | # ... etc. 22 | 23 | 24 | def run_migrations_offline(): 25 | """Run migrations in 'offline' mode. 26 | 27 | This configures the context with just a URL 28 | and not an Engine, though an Engine is acceptable 29 | here as well. By skipping the Engine creation 30 | we don't even need a DBAPI to be available. 31 | 32 | Calls to context.execute() here emit the given string to the 33 | script output. 34 | 35 | """ 36 | url = config.get_main_option('sqlalchemy.url') 37 | context.configure(url=url, target_metadata=target_metadata, literal_binds=True) 38 | 39 | with context.begin_transaction(): 40 | context.run_migrations() 41 | 42 | 43 | def run_migrations_online(): 44 | connectable = config.attributes.get('connection', None) 45 | 46 | if connectable is None: 47 | # only create Engine if we don't have a Connection 48 | # from the outside 49 | connectable = engine_from_config( 50 | config.get_section(config.config_ini_section), 51 | prefix='sqlalchemy.', 52 | poolclass=pool.NullPool, 53 | ) 54 | 55 | # when connectable is already a Connection object, calling 56 | # connect() gives us a *branched connection*. 57 | 58 | with connectable.connect() as connection: 59 | context.configure(connection=connection, target_metadata=target_metadata) 60 | 61 | with context.begin_transaction(): 62 | context.run_migrations() 63 | 64 | 65 | if context.is_offline_mode(): 66 | run_migrations_offline() 67 | else: 68 | run_migrations_online() 69 | -------------------------------------------------------------------------------- /provenance/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 
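# (When Alembic renders this template the ``${...}`` placeholders below become
# plain literals, e.g. ``revision = 'e0317ab07ba4'`` and ``down_revision = None``
# in the generated migration that follows.)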
13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /provenance/migrations/versions/e0317ab07ba4_initial_schema.py: -------------------------------------------------------------------------------- 1 | """initial schema 2 | 3 | Revision ID: e0317ab07ba4 4 | Revises: 5 | Create Date: 2017-03-13 13:33:59.644604 6 | 7 | """ 8 | import sqlalchemy as sa 9 | import sqlalchemy.dialects.postgresql as pg 10 | from alembic import op 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'e0317ab07ba4' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table( 21 | 'artifact_set_members', 22 | sa.Column('set_id', sa.VARCHAR(length=40), nullable=False), 23 | sa.Column('artifact_id', sa.VARCHAR(length=40), nullable=False), 24 | sa.PrimaryKeyConstraint('set_id', 'artifact_id'), 25 | ) 26 | 27 | op.create_table( 28 | 'artifact_sets', 29 | sa.Column('id', sa.INTEGER(), nullable=False), 30 | sa.Column('set_id', sa.VARCHAR(length=40), nullable=True), 31 | sa.Column('name', sa.VARCHAR(length=1000), nullable=True), 32 | sa.Column('created_at', pg.TIMESTAMP(), nullable=True), 33 | sa.PrimaryKeyConstraint('id'), 34 | ) 35 | 36 | op.create_table( 37 | 'runs', 38 | sa.Column('id', sa.VARCHAR(length=40), nullable=False), 39 | sa.Column('hostname', sa.VARCHAR(length=256), nullable=True), 40 | sa.Column('info', pg.JSONB(), nullable=True), 41 | sa.Column('created_at', pg.TIMESTAMP(), nullable=True), 42 | sa.PrimaryKeyConstraint('id'), 43 | ) 44 | 45 | op.create_table( 46 | 'artifacts', 47 | sa.Column('id', sa.VARCHAR(length=40), nullable=False), 48 | sa.Column('value_id', sa.VARCHAR(length=50), nullable=True), 49 | sa.Column('run_id', sa.VARCHAR(length=40), nullable=True), 50 | sa.Column('name', sa.VARCHAR(length=1000), nullable=True), 51 | sa.Column('version', sa.INTEGER(), nullable=True), 52 | sa.Column('fn_module', sa.VARCHAR(length=100), nullable=True), 53 | sa.Column('fn_name', sa.VARCHAR(length=100), nullable=True), 54 | sa.Column('composite', sa.BOOLEAN(), nullable=True), 55 | sa.Column('value_id_duration', sa.FLOAT(), nullable=True), 56 | sa.Column('compute_duration', sa.FLOAT(), nullable=True), 57 | sa.Column('hash_duration', sa.FLOAT(), nullable=True), 58 | sa.Column('computed_at', pg.TIMESTAMP(), nullable=True), 59 | sa.Column('added_at', pg.TIMESTAMP(), nullable=True), 60 | sa.Column('input_artifact_ids', pg.ARRAY(pg.VARCHAR(length=40)), nullable=True), 61 | sa.Column('inputs_json', pg.JSONB(), nullable=True), 62 | sa.Column('serializer', sa.VARCHAR(length=128), nullable=True), 63 | sa.Column('load_kwargs', pg.JSONB(), nullable=True), 64 | sa.Column('dump_kwargs', pg.JSONB(), nullable=True), 65 | sa.Column('custom_fields', pg.JSONB(), nullable=True), 66 | sa.ForeignKeyConstraint( 67 | ['run_id'], 68 | ['runs.id'], 69 | ), 70 | sa.PrimaryKeyConstraint('id'), 71 | ) 72 | 73 | 74 | def downgrade(): 75 | op.drop_table('artifacts') 76 | op.drop_table('runs') 77 | op.drop_table('artifact_sets') 78 | op.drop_table('artifact_set_members') 79 | -------------------------------------------------------------------------------- /provenance/models.py: 
-------------------------------------------------------------------------------- 1 | import copy 2 | from datetime import datetime 3 | 4 | import sqlalchemy as sa 5 | import sqlalchemy.dialects.postgresql as pg 6 | import sqlalchemy.ext.declarative 7 | import sqlalchemy.orm 8 | from memoized_property import memoized_property 9 | 10 | Base = sa.ext.declarative.declarative_base() 11 | 12 | SHA1_LENGTH = 40 13 | VALUE_ID_LENGTH = SHA1_LENGTH + 10 # extra 10 for optional file extension info 14 | 15 | 16 | class Run(Base): 17 | __tablename__ = 'runs' 18 | 19 | id = sa.Column(pg.VARCHAR(SHA1_LENGTH), primary_key=True) 20 | hostname = sa.Column(pg.VARCHAR(256)) 21 | info = sa.Column(pg.JSONB) 22 | created_at = sa.Column(pg.TIMESTAMP, default=datetime.utcnow) 23 | artifacts = sqlalchemy.orm.relationship('Artifact') 24 | 25 | def __init__(self, info): 26 | self.id = info['id'] 27 | self.info = info 28 | self.hostname = info['host']['nodename'] 29 | self.created_at = info['created_at'] 30 | 31 | @memoized_property 32 | def info_with_datetimes(self): 33 | result = copy.copy(self.info) 34 | result['created_at'] = self.created_at 35 | return result 36 | 37 | 38 | class Artifact(Base): 39 | __tablename__ = 'artifacts' 40 | 41 | id = sa.Column(pg.VARCHAR(SHA1_LENGTH), primary_key=True) 42 | value_id = sa.Column(pg.VARCHAR(VALUE_ID_LENGTH)) 43 | run_id = sa.Column(pg.VARCHAR(SHA1_LENGTH), sa.ForeignKey('runs.id')) 44 | 45 | name = sa.Column(pg.VARCHAR(1000)) 46 | version = sa.Column(pg.INTEGER) 47 | fn_module = sa.Column(pg.VARCHAR(100)) 48 | fn_name = sa.Column(pg.VARCHAR(100)) 49 | 50 | composite = sa.Column(pg.BOOLEAN) 51 | 52 | value_id_duration = sa.Column(pg.FLOAT) 53 | compute_duration = sa.Column(pg.FLOAT) 54 | hash_duration = sa.Column(pg.FLOAT) 55 | 56 | computed_at = sa.Column(pg.TIMESTAMP) 57 | added_at = sa.Column(pg.TIMESTAMP, default=datetime.utcnow) 58 | 59 | input_artifact_ids = sa.Column(pg.ARRAY(pg.VARCHAR(SHA1_LENGTH))) 60 | inputs_json = sa.orm.deferred(sa.Column(pg.JSONB)) 61 | serializer = sa.Column(pg.VARCHAR(128), default='joblib') 62 | load_kwargs = sa.Column(pg.JSONB) 63 | dump_kwargs = sa.Column(pg.JSONB) 64 | custom_fields = sa.Column(pg.JSONB) 65 | 66 | def __init__(self, artifact, inputs_json, run): 67 | self.id = artifact.id 68 | self.run = run 69 | self.run_id = run.id 70 | self.value_id = artifact.value_id 71 | self.name = artifact.name 72 | self.version = artifact.version 73 | self.fn_module = artifact.fn_module 74 | self.fn_name = artifact.fn_name 75 | self.composite = artifact.composite 76 | self.value_id_duration = artifact.value_id_duration 77 | self.compute_duration = artifact.compute_duration 78 | self.hash_duration = artifact.hash_duration 79 | self.input_artifact_ids = artifact.input_artifact_ids 80 | self.inputs_json = inputs_json 81 | self.custom_fields = artifact.custom_fields 82 | self.computed_at = artifact.computed_at 83 | self.serializer = artifact.serializer 84 | self.load_kwargs = artifact.load_kwargs 85 | self.dump_kwargs = artifact.dump_kwargs 86 | 87 | @memoized_property 88 | def props(self): 89 | return { 90 | 'id': self.id, 91 | 'value_id': self.value_id, 92 | 'name': self.name, 93 | 'version': self.version, 94 | 'fn_module': self.fn_module, 95 | 'fn_name': self.fn_name, 96 | 'composite': self.composite, 97 | 'value_id_duration': self.value_id_duration, 98 | 'compute_duration': self.compute_duration, 99 | 'hash_duration': self.hash_duration, 100 | 'input_artifact_ids': self.input_artifact_ids, 101 | 'serializer': self.serializer, 102 | 
'load_kwargs': self.load_kwargs, 103 | 'dump_kwargs': self.dump_kwargs, 104 | 'custom_fields': self.custom_fields, 105 | 'computed_at': self.computed_at, 106 | } 107 | 108 | def __repr__(self): 109 | return '' % self.id 110 | 111 | 112 | class ArtifactSet(Base): 113 | __tablename__ = 'artifact_sets' 114 | 115 | id = sa.Column(pg.INTEGER, primary_key=True) 116 | set_id = sa.Column(pg.VARCHAR(SHA1_LENGTH)) 117 | labels = sa.Column(pg.JSONB) 118 | created_at = sa.Column(pg.TIMESTAMP, default=datetime.utcnow) 119 | 120 | def __init__(self, artifact_set): 121 | self.set_id = artifact_set.id 122 | labels = artifact_set.labels 123 | if isinstance(artifact_set.labels, str): 124 | labels = {'name': artifact_set.labels} 125 | self.labels = labels 126 | self.created_at = artifact_set.created_at 127 | 128 | @memoized_property 129 | def props(self): 130 | return {'id': self.set_id, 'labels': self.labels, 'created_at': self.created_at} 131 | 132 | def __repr__(self): 133 | return '' % (self.set_id, self.labels) 134 | 135 | 136 | class ArtifactSetMember(Base): 137 | __tablename__ = 'artifact_set_members' 138 | 139 | set_id = sa.Column( 140 | pg.VARCHAR(SHA1_LENGTH), 141 | primary_key=True, # sa.ForeignKey("artifact_sets.set_id"), 142 | ) 143 | artifact_id = sa.Column( 144 | pg.VARCHAR(SHA1_LENGTH), 145 | primary_key=True # sa.ForeignKey("artifacts.id"), 146 | ) 147 | -------------------------------------------------------------------------------- /provenance/serializers.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from functools import singledispatch 3 | 4 | import cloudpickle 5 | import joblib 6 | import toolz as t 7 | 8 | from .hashing import hash 9 | 10 | 11 | def cloudpickle_dump(obj, filename, **kwargs): 12 | with open(filename, 'wb') as f: 13 | return cloudpickle.dump(obj, f, **kwargs) 14 | 15 | 16 | def cloudpickle_load(filename, **kwargs): 17 | with open(filename, 'rb') as f: 18 | return cloudpickle.load(f, **kwargs) 19 | 20 | 21 | Serializer = namedtuple( 22 | 'Serializer', 23 | 'name, dump, load, content_type, content_encoding, content_disposition', 24 | ) 25 | 26 | 27 | def joblib_dump(obj, filename, compress=2, **kwargs): 28 | joblib.dump(obj, filename, compress=compress, **kwargs) 29 | 30 | 31 | serializers = {} 32 | 33 | 34 | @singledispatch 35 | def object_serializer(obj): 36 | """ 37 | Takes an object and returns the appropirate serializer name, dump, and load arguments. 
38 | 39 | Parameters 40 | ---------- 41 | obj : any python object or primitive 42 | 43 | Returns 44 | ------- 45 | tuple of serializer name (str), dump args (dictionary), load args (dictionary) 46 | """ 47 | return DEFAULT_VALUE_SERIALIZER.name 48 | 49 | 50 | def register_serializer( 51 | name, 52 | dump, 53 | load, 54 | content_type=None, 55 | content_encoding=None, 56 | content_disposition=None, 57 | classes=None, 58 | ): 59 | serializers[name] = Serializer( 60 | name, dump, load, content_type, content_encoding, content_disposition 61 | ) 62 | if classes is None: 63 | return 64 | for cls in classes: 65 | object_serializer.register(cls, lambda _: name) 66 | 67 | 68 | register_serializer('joblib', joblib_dump, joblib.load) 69 | register_serializer('cloudpickle', cloudpickle_dump, cloudpickle_load) 70 | 71 | 72 | def _pandas_and_parquet_present(): 73 | try: 74 | import pandas 75 | except ImportError: 76 | return False 77 | try: 78 | import pyarrow 79 | except: 80 | try: 81 | import fastparquet 82 | except ImportError: 83 | return False 84 | return True 85 | 86 | 87 | if _pandas_and_parquet_present(): 88 | import pandas as pd 89 | 90 | def pd_df_parquet_dump(df, filename, **kwargs): 91 | return df.to_parquet(filename, **kwargs) 92 | 93 | def pd_df_parquet_load(filename, **kwargs): 94 | return pd.read_parquet(filename, **kwargs) 95 | 96 | register_serializer( 97 | 'pd_df_parquet', pd_df_parquet_dump, pd_df_parquet_load, classes=[pd.DataFrame] 98 | ) 99 | 100 | def pd_series_parquet_dump(series, filename, **kwargs): 101 | if series.name is None: 102 | # pyarrow requires the column names be strings 103 | series = pd.Series(series, name='_series') 104 | return pd.DataFrame(series).to_parquet(filename, **kwargs) 105 | 106 | def pd_series_parquet_load(filename, **kwargs): 107 | series = pd.read_parquet(filename, **kwargs).ix[:, 0] 108 | if series.name == '_series': 109 | series.name = None 110 | return series 111 | 112 | register_serializer( 113 | 'pd_series_parquet', 114 | pd_series_parquet_dump, 115 | pd_series_parquet_load, 116 | classes=[pd.Series], 117 | ) 118 | 119 | 120 | def _pytorch_present(): 121 | try: 122 | import torch 123 | except: 124 | return False 125 | return True 126 | 127 | 128 | if _pytorch_present(): 129 | import torch 130 | 131 | def pytorch_model_dump(model, filename, **kwargs): 132 | return torch.save(model, filename) 133 | 134 | def pytorch_model_load(filename, **kwargs): 135 | return torch.load(filename) 136 | 137 | register_serializer( 138 | 'pytorch_model', 139 | pytorch_model_dump, 140 | pytorch_model_load, 141 | classes=[torch.nn.Module], 142 | ) 143 | 144 | 145 | @t.memoize(key=lambda *args: hash(args)) 146 | def partial_serializer(serializer_name, dump_kwargs, load_kwargs): 147 | s = serializers[serializer_name] 148 | return Serializer( 149 | s.name, 150 | t.partial(s.dump, **dump_kwargs) if dump_kwargs else s.dump, 151 | t.partial(s.load, **load_kwargs) if load_kwargs else s.load, 152 | s.content_type, 153 | s.content_encoding, 154 | s.content_disposition, 155 | ) 156 | 157 | 158 | def serializer(artifact): 159 | return partial_serializer(artifact.serializer, artifact.dump_kwargs, artifact.load_kwargs) 160 | 161 | 162 | DEFAULT_VALUE_SERIALIZER = serializers['joblib'] 163 | DEFAULT_INPUT_SERIALIZER = serializers['joblib'] 164 | -------------------------------------------------------------------------------- /provenance/sftp/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import paramiko 4 | 5 | 
from .. import blobstores as bs 6 | 7 | 8 | def _ssh_client(ssh_config): 9 | client = paramiko.SSHClient() 10 | client.load_host_keys(os.path.expanduser('~/.ssh/known_hosts')) 11 | # There still seem to be problems with some types of keys. 12 | # See https://github.com/paramiko/paramiko/issues/243 13 | # So you might try uncommenting the next line if you are using an ecdsa-sha2-nistp256 key. 14 | # client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 15 | client.connect(**ssh_config) 16 | return client 17 | 18 | 19 | class SFTPStore(bs.RemoteStore): 20 | 21 | def __init__( 22 | self, 23 | cachedir, 24 | basepath, 25 | ssh_config=None, 26 | ssh_client=None, 27 | sftp_client=None, 28 | read=True, 29 | write=True, 30 | read_through_write=True, 31 | delete=False, 32 | on_duplicate_key='skip', 33 | cleanup_cachedir=False, 34 | always_check_remote=False, 35 | ): 36 | """ 37 | Parameters 38 | ---------- 39 | always_check_remote : bool 40 | When True the SFTP server will be checked with every __contains__ call. Otherwise the check 41 | will short-circuit if the blob is found in the cachedir. For performance reasons this 42 | should normally be left False. The only reason to enable it is if you are using an 43 | SFTPStore and a DiskStore together in a ChainedStore for 44 | some reason. Since the SFTPStore already doubles as a DiskStore via its cachedir, 45 | chaining the two rarely makes sense anyway. 46 | """ 47 | super(SFTPStore, self).__init__( 48 | always_check_remote=always_check_remote, 49 | cachedir=cachedir, 50 | basepath=basepath, 51 | cleanup_cachedir=cleanup_cachedir, 52 | read=read, 53 | write=write, 54 | read_through_write=read_through_write, 55 | delete=delete, 56 | on_duplicate_key=on_duplicate_key, 57 | ) 58 | 59 | self.ssh_client = None 60 | if ssh_config is not None: 61 | self.ssh_client = _ssh_client(ssh_config) 62 | if self.ssh_client is not None: 63 | sftp_client = paramiko.SFTPClient.from_transport(self.ssh_client._transport) 64 | if sftp_client is not None: 65 | self.sftp_client = sftp_client 66 | else: 67 | # This is to allow testing the importing/subpackage aspect without 68 | # having to actually test the class by mocking an ssh connection.
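            # In normal use one of ``sftp_client``, ``ssh_client`` or
            # ``ssh_config`` must be supplied (hence the ValueError below).
            # ``ssh_config`` is passed straight through to
            # ``paramiko.SSHClient.connect``, so a minimal, purely
            # illustrative construction might look like:
            #
            #   SFTPStore('/tmp/sftp_cache', '/remote/blobs',
            #             ssh_config={'hostname': 'example.com', 'username': 'me'})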
69 | if cachedir is None and basepath is None: 70 | return 71 | raise ValueError( 72 | 'You must specify a SFTP client by passing in one of: sftp_client, ssh_config, ssh_client' 73 | ) 74 | 75 | def _exists(self, path): 76 | try: 77 | self.sftp_client.stat(path) 78 | return True 79 | except FileNotFoundError: 80 | return False 81 | 82 | def _delete_remote(self, path): 83 | self.sftp_client.remove(path) 84 | 85 | def _upload_file(self, filename, path): 86 | self.sftp_client.put(filename, path) 87 | 88 | def _download_file(self, remote_path, dest_filename): 89 | self.sftp_client.get(remote_path, dest_filename) 90 | -------------------------------------------------------------------------------- /provenance/test_serializers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import provenance.serializers as s 4 | 5 | 6 | def test_default_object_serializers(): 7 | assert s.object_serializer('foo') == 'joblib' 8 | assert s.object_serializer((1, 2, 3)) == 'joblib' 9 | assert s.object_serializer({'foo': 42}) == 'joblib' 10 | 11 | df = pd.DataFrame([{'foo': 42}, {'foo': 55}]) 12 | assert s.object_serializer(df) == 'pd_df_parquet' 13 | 14 | series = df.foo 15 | assert s.object_serializer(series) == 'pd_series_parquet' 16 | -------------------------------------------------------------------------------- /provenance/utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from collections import OrderedDict, Sequence 3 | 4 | import toolz as t 5 | import toolz.curried as tc 6 | from boltons import funcutils as bfu 7 | 8 | UNSPECIFIED_ARG = '::unspecified::' 9 | 10 | 11 | def args_extractor(f, merge_defaults=False): 12 | """ 13 | Takes a function, inspects it's parameter lists, and returns a 14 | function that will return all of the named and key arguments 15 | back as a dictionary. The varargs are also returned which don't 16 | have a names. 17 | 18 | """ 19 | spec = inspect.getfullargspec(f) 20 | if spec.defaults: 21 | param_defaults = dict(zip(spec.args[-len(spec.defaults):], spec.defaults)) 22 | else: 23 | param_defaults = {} 24 | named_param_defaults = spec.kwonlydefaults or {} 25 | default_dicts = {} 26 | num_named_args = len(spec.args) 27 | 28 | if merge_defaults is True and hasattr(f, '__merge_defaults__'): 29 | merge_defaults = f.__merge_defaults__ 30 | 31 | if merge_defaults: 32 | default_dicts = t.pipe( 33 | t.merge(named_param_defaults, param_defaults), 34 | tc.valfilter(lambda v: isinstance(v, dict)), 35 | ) 36 | 37 | if isinstance(merge_defaults, Sequence): 38 | default_dicts = {k: default_dicts[k] for k in merge_defaults} 39 | 40 | def _args_dict(args, kargs): 41 | unnamed_args = dict(zip(spec.args, args[0:num_named_args])) 42 | varargs = args[num_named_args:] 43 | kargs = t.merge(kargs, unnamed_args) 44 | for k, d in default_dicts.items(): 45 | kargs[k] = t.merge(d, kargs.get(k) or {}) 46 | return varargs, kargs 47 | 48 | else: 49 | 50 | def _args_dict(args, kargs): 51 | unnamed_args = dict(zip(spec.args, args[0:num_named_args])) 52 | varargs = args[num_named_args:] 53 | kargs = t.merge(kargs, unnamed_args) 54 | return varargs, kargs 55 | 56 | return _args_dict 57 | 58 | 59 | def with_merged_defaults(*kwargs_to_default): 60 | """ 61 | Introspects the argspec of the function being decorated to see what 62 | keyword arguments take dictionaries. 
If a dictionary is passed in when 63 | then function is called then it is merged with the dictionary defined 64 | in the parameter list. 65 | """ 66 | merge_defaults = True 67 | if len(kwargs_to_default) > 0: 68 | merge_defaults = kwargs_to_default 69 | 70 | def _with_merged_defaults(f): 71 | extract_kargs = args_extractor(f, merge_defaults) 72 | 73 | @bfu.wraps(f) 74 | def _merge_defaults(*args, **kargs): 75 | vargs, kargs = extract_kargs(args, kargs) 76 | return f(*vargs, **kargs) 77 | 78 | _merge_defaults.__merge_defaults__ = merge_defaults 79 | 80 | return _merge_defaults 81 | 82 | return _with_merged_defaults 83 | 84 | 85 | def is_curry_func(f): 86 | """ 87 | Checks if f is a toolz or cytoolz function by inspecting the available attributes. 88 | Avoids explicit type checking to accommodate all versions of the curry fn. 89 | """ 90 | return hasattr(f, 'func') and hasattr(f, 'args') and hasattr(f, 'keywords') 91 | 92 | 93 | def _func_param_info(argspec): 94 | params = argspec.args 95 | defaults = argspec.defaults or [] 96 | start_default_ix = -max(len(defaults), 1) - 1 97 | values = [UNSPECIFIED_ARG] * (len(params) - len(defaults)) + list(defaults[start_default_ix:]) 98 | return OrderedDict(zip(params, values)) 99 | 100 | 101 | def param_info(f): 102 | if is_curry_func(f): 103 | argspec = inspect.getfullargspec(f.func) 104 | num_args = len(f.args) 105 | args_to_remove = argspec.args[0:num_args] + list(f.keywords.keys()) 106 | base = _func_param_info(argspec) 107 | return t.dissoc(base, *args_to_remove) 108 | return _func_param_info(inspect.getfullargspec(f)) 109 | 110 | 111 | def inner_function(partial_fn): 112 | """Returns the wrapped function of either a partial or curried function.""" 113 | fn = partial_fn.func 114 | if '__module__' not in dir(fn): 115 | # for some reason the curry decorator nests the actual function 116 | # metadata one level deeper 117 | fn = fn.func 118 | return fn 119 | 120 | 121 | def partial_fn_info(partial_fn): 122 | fn = inner_function(partial_fn) 123 | varargs, kargs = args_extractor(fn)(partial_fn.args, partial_fn.keywords) 124 | return { 125 | 'varargs': varargs, 126 | 'kargs': kargs, 127 | 'module': fn.__module__, 128 | 'name': fn.__name__, 129 | } 130 | 131 | 132 | # TODO: consider using the functions in joblib.func_inspect, e.g. 
for the fn name 133 | def fn_info(fn): 134 | if 'func' in dir(fn): 135 | return partial_fn_info(fn) 136 | return {'name': fn.__name__, 'module': fn.__module__, 'varargs': (), 'kargs': {}} 137 | 138 | 139 | def when_type(type): 140 | 141 | def _decorator(f): 142 | 143 | @bfu.wraps(f) 144 | def _when_type(val): 145 | if isinstance(val, type): 146 | return f(val) 147 | else: 148 | return val 149 | 150 | return _when_type 151 | 152 | return _decorator 153 | -------------------------------------------------------------------------------- /provenance/vis/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import lineage_dot 2 | 3 | visualize_lineage = lineage_dot 4 | -------------------------------------------------------------------------------- /provenance/vis/utils.py: -------------------------------------------------------------------------------- 1 | import graphviz 2 | from frozendict import frozendict as fd 3 | 4 | from ..repos import is_proxy 5 | 6 | 7 | def elide(obj, length=30): 8 | table = str.maketrans({'{': r'\{', '}': r'\}', '<': r'\<', '>': r'\>'}) 9 | s = str(obj).translate(table) 10 | return (s[:length] + '..') if len(s) > length else s 11 | 12 | 13 | def artifact_id(artifact, length=7): 14 | return artifact.id[0:7] 15 | 16 | 17 | def artifact_record(artifact, elide_len=30): 18 | return '|'.join(['' + artifact_id(artifact), '' + elide(artifact.value, elide_len)]) 19 | 20 | 21 | def param_node_id(child_artifact, name, val): 22 | if is_proxy(val): 23 | artifact = val.artifact 24 | return artifact.id 25 | # hmmm... we could share the inputs to other functions if we wanted to remove the child_artifact.id... 26 | return '|'.join([child_artifact.id, name]) 27 | 28 | 29 | def node(name, label=None, **attrs): 30 | attrs['type'] = 'node' 31 | attrs['name'] = name 32 | attrs['label'] = label 33 | return fd(attrs) 34 | 35 | 36 | def edge(tail_name, head_name, **attrs): 37 | attrs['type'] = 'edge' 38 | attrs['tail_name'] = tail_name 39 | attrs['head_name'] = head_name 40 | return fd(attrs) 41 | 42 | 43 | def dicts_to_digraph(dicts): 44 | g = graphviz.Digraph() 45 | for d in dicts: 46 | d = dict(d) 47 | t = d['type'] 48 | del d['type'] 49 | if t == 'node': 50 | g.node(**d) 51 | elif t == 'edge': 52 | g.edge(**d) 53 | return g 54 | 55 | 56 | class DigraphDicts: 57 | 58 | def __init__(self): 59 | self.set = set() 60 | 61 | def node(self, name, label=None, **attrs): 62 | self.set.add(node(name, label, **attrs)) 63 | return self 64 | 65 | def edge(self, tail_name, head_name, **attrs): 66 | self.set.add(edge(tail_name, head_name, **attrs)) 67 | return self 68 | 69 | def to_dot(self): 70 | return dicts_to_digraph(self.set) 71 | 72 | def _repr_svg_(self): 73 | return self.to_dot()._repr_svg_() 74 | 75 | 76 | def _viz_artifact(artifact, g): 77 | function_id = 'fn_' + artifact.id 78 | fn_qalified_name = '.'.join([artifact.fn_module, artifact.fn_name]) 79 | fn_name = artifact.fn_name 80 | fn_params = '{fn}({params})'.format( 81 | fn=fn_qalified_name, params=','.join(artifact.inputs['kargs'].keys()) 82 | ) 83 | 84 | g.node(function_id, fn_name, shape='circle', tooltip=fn_params) 85 | g.edge(function_id, artifact.id) 86 | g.node( 87 | artifact.id, 88 | label=artifact_record(artifact, elide_len=15), 89 | shape='record', 90 | tooltip=elide(artifact.value, 50), 91 | color='red', 92 | ) 93 | 94 | # ignore varargs for now... 
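    # Each keyword input is drawn either as an edge from the artifact that
    # produced it (recursing into that artifact's own lineage) or, for plain
    # values, as a box node holding the elided literal.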
95 | for name, val in artifact.inputs['kargs'].items(): 96 | arg_node_id = param_node_id(artifact, name, val) 97 | if is_proxy(val): 98 | _viz_artifact(val.artifact, g) 99 | g.edge(val.artifact.id, function_id, label=name) 100 | else: 101 | g.node(arg_node_id, label=elide(val), shape='box') 102 | g.edge(arg_node_id, function_id, label=name) 103 | 104 | 105 | def lineage_dot(artifact): 106 | """Walks the lineage of an artifact returning a DigraphDicts object 107 | that can be turned into a graphviz.Digraph and is automatically rendered 108 | as SVG in an IPython notebook. 109 | """ 110 | g = DigraphDicts() 111 | if is_proxy(artifact): 112 | artifact = artifact.artifact 113 | _viz_artifact(artifact, g) 114 | return g 115 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: docs/readthedocs-environment.yml 3 | python: 4 | setup_py_install: false 5 | -------------------------------------------------------------------------------- /release-procedure.md: -------------------------------------------------------------------------------- 1 | 1. Verify tests pass. 2 | 3 | 2. Tag the commit 4 | 5 | git tag 1.2.3 6 | 7 | 3. Push new version bump commit and tag to github 8 | 9 | git push trunk --tags 10 | 11 | 4. Build source and wheel packages 12 | 13 | make dist 14 | 15 | 6. Upload packages to PyPI 16 | 17 | make release 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | s3fs>=0.0.9 2 | boltons>=16.5.1 3 | joblib>=0.15.0 4 | toolz>=0.8.2 5 | cloudpickle>=0.2.1 6 | psutil>=5.0.0 7 | ordered-set>=2.0.1 8 | sqlalchemy>=1.1.3 9 | alembic>=0.9.1 10 | sqlalchemy-utils>=0.32.12 11 | memoized-property>=1.0.2 12 | wrapt>=1.10.8 13 | psycopg2 14 | numpy 15 | pyarrow 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --verbose 6 | python_files = tests/*/*.py 7 | 8 | [versioneer] 9 | VCS = git 10 | style = pep440 11 | versionfile_source = provenance/_version.py 12 | versionfile_build = provenance/_version.py 13 | tag_prefix = 14 | parentdir_prefix = provenance- 15 | 16 | 17 | [flake8] 18 | exclude = docs 19 | ignore = E203,E266,E501,W503,E722,E402,C901,E731,F401 20 | max-line-length = 100 21 | max-complexity = 18 22 | select = B,C,E,F,W,T4,B9 23 | 24 | [yapf] 25 | based_on_style = google 26 | column_limit=100 27 | dedent_closing_brackets = true 28 | join_multiple_lines = false 29 | spaces_before_comment = 4 30 | split_arguments_when_comma_terminated = true 31 | split_before_first_argument = true 32 | split_before_logical_operator = true 33 | split_before_arithmetic_operator=true 34 | split_before_named_assigns = true 35 | 36 | 37 | [isort] 38 | known_first_party=provenance 39 | known_third_party=alembic,boltons,cloudpickle,conftest,frozendict,google,graphviz,hypothesis,joblib,memoized_property,numpy,pandas,paramiko,psutil,pytest,s3fs,setuptools,sqlalchemy,sqlalchemy_utils,strategies,toolz,wrapt 40 | multi_line_output=3 41 | include_trailing_comma=True 42 | force_grid_wrap=0 43 | combine_as_imports=True 44 | line_length=100 45 | skip= 46 | docs/source/conf.py 47 | setup.py 48 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from os.path import exists 4 | 5 | from setuptools import setup 6 | 7 | import versioneer 8 | 9 | subpackages = { 10 | 'sftp': ['paramiko'], 11 | 'google_storage': ['google-cloud'], 12 | 'vis': ['graphviz', 'frozendict'], 13 | } 14 | 15 | DESCRIPTION = 'Provenance and caching library for functions, built for creating lightweight machine learning pipelines.' 16 | 17 | setup( 18 | name='provenance', 19 | version=versioneer.get_version(), 20 | cmdclass=versioneer.get_cmdclass(), 21 | packages=['provenance', 'provenance.sftp', 'provenance.vis'], 22 | install_requires=[open('requirements.txt').read().strip().split('\n')], 23 | extras_require=subpackages, 24 | include_package_data=True, 25 | description=DESCRIPTION, 26 | long_description=(open('README.rst').read() if exists('README.rst') else ''), 27 | author='Ben Mabey', 28 | author_email='ben@benmabey.com', 29 | url='http://github.com/bmabey/provenance', 30 | license='MIT', 31 | classifiers=[ 32 | 'Development Status :: 4 - Beta', 33 | 'Intended Audience :: Developers', 34 | 'Intended Audience :: Science/Research', 35 | 'License :: OSI Approved :: MIT License', 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | moto>=0.4.30 2 | pytest==3.6 3 | pytest-runner 4 | pytest-pythonpath>=0.7.1 5 | # remove hypothesis if we don't end up writing property tests with it 6 | hypothesis>=3.6.0 7 | h5py 8 | pandas 9 | torch 10 | -------------------------------------------------------------------------------- /tests/provenance/conftest.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import shutil 4 | import tempfile 5 | 6 | import hypothesis.strategies as st 7 | import pytest 8 | import sqlalchemy_utils.functions as sql_utils 9 | import toolz as t 10 | from sqlalchemy import create_engine, event 11 | from sqlalchemy.orm import sessionmaker 12 | 13 | import provenance as p 14 | import provenance.blobstores as bs 15 | import provenance.core as pc 16 | import provenance.repos as r 17 | from provenance.models import Base 18 | 19 | 20 | @pytest.fixture(scope='session') 21 | def s3fs(): 22 | import moto 23 | 24 | m = moto.mock_s3() 25 | m.start() 26 | import boto3 27 | import s3fs 28 | 29 | client = boto3.client('s3') 30 | client.create_bucket(Bucket='bucket') 31 | fs = s3fs.S3FileSystem(anon=False) 32 | return fs 33 | 34 | 35 | @pytest.fixture(scope='session') 36 | def db_conn_str(): 37 | env_conn_str = os.environ.get('DB', None) 38 | return env_conn_str or 'postgresql://localhost/test_provenance' 39 | 40 | 41 | ### This should be the SQLAlchemy db_conn 42 | @pytest.fixture(scope='session') 43 | def db_engine(db_conn_str): 44 | if sql_utils.database_exists(db_conn_str): 45 | sql_utils.drop_database(db_conn_str) 46 | 47 | sql_utils.create_database(db_conn_str) 48 | engine = create_engine(db_conn_str, json_serializer=r.Encoder().encode) 49 | Base.metadata.create_all(engine) 50 | 51 | return engine 52 | 53 | 54 | @pytest.fixture() 55 | def db_session(db_engine): 56 | connection = db_engine.connect() 57 | transaction = connection.begin() 58 | session = sessionmaker()(bind=connection) 59 | 60 | session.begin_nested() 61 | 62 | 
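    # Classic SQLAlchemy test-isolation pattern: the whole test runs inside an
    # outer transaction that is rolled back at teardown, and the listener below
    # re-opens a SAVEPOINT whenever a nested transaction ends so that commits
    # made by the code under test never escape the fixture.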
@event.listens_for(session, 'after_transaction_end') 63 | def restart_savepoint(sess, trans): 64 | if trans.nested and not trans._parent.nested: 65 | sess.expire_all() 66 | sess.begin_nested() 67 | 68 | yield session 69 | 70 | session.close() 71 | transaction.rollback() 72 | connection.close() 73 | 74 | 75 | @contextlib.contextmanager 76 | def cd(newdir, cleanup=lambda: True): 77 | prevdir = os.getcwd() 78 | os.chdir(os.path.expanduser(newdir)) 79 | try: 80 | yield 81 | finally: 82 | os.chdir(prevdir) 83 | cleanup() 84 | 85 | 86 | @contextlib.contextmanager 87 | def tempdir(): 88 | dirpath = tempfile.mkdtemp() 89 | 90 | def cleanup(): 91 | shutil.rmtree(dirpath) 92 | 93 | with cd(dirpath, cleanup): 94 | yield dirpath 95 | 96 | 97 | @pytest.fixture(scope='function') 98 | def disk_store(): 99 | with tempdir() as dirname: 100 | yield bs.DiskStore(cachedir=dirname, delete=True) 101 | 102 | 103 | @pytest.fixture(scope='function') 104 | def memory_store(): 105 | return bs.MemoryStore() 106 | 107 | 108 | @pytest.fixture(scope='function') 109 | def memory_repo(): 110 | repo = r.MemoryRepo(read=True, write=True, delete=True) 111 | p.set_default_repo(repo) 112 | yield repo 113 | p.set_default_repo(None) 114 | 115 | 116 | @pytest.fixture(scope='function', params=['memory_store', 'disk_store']) 117 | def blobstore(request, memory_store, disk_store): 118 | if request.param == 'memory_store': 119 | store = memory_store 120 | else: 121 | store = disk_store 122 | return store 123 | 124 | 125 | # there must be a better way, but this is so I can get get two db_session fixtures 126 | db_session_ = db_session 127 | 128 | 129 | @pytest.fixture( 130 | scope='function', 131 | # params=['memoryrepo']) 132 | params=[ 133 | 'memoryrepo', 134 | 'dbrepo-diskstore', 135 | 'dbrepo-memorystore', 136 | 'chained-memmem', 137 | # 'chained-repo' 138 | ], 139 | ) 140 | def repo(request, db_session): 141 | # clean old config settings 142 | r.Config.set_current(r.Config({}, {}, None)) 143 | disk_store_gen = None 144 | disk_store_gen2 = None 145 | repo2 = None 146 | prevdir = os.getcwd() 147 | if request.param == 'memoryrepo': 148 | repo = r.MemoryRepo(read=True, write=True, delete=True) 149 | elif request.param == 'dbrepo-diskstore': 150 | disk_store_gen = disk_store() 151 | repo = r.DbRepo(db_session, next(disk_store_gen), read=True, write=True, delete=True) 152 | elif request.param == 'chained-memmem': 153 | repo = r.ChainedRepo( 154 | [ 155 | r.MemoryRepo(read=True, write=True, delete=True), 156 | r.MemoryRepo(read=True, write=True, delete=True), 157 | ] 158 | ) 159 | elif request.param == 'chained-repo': 160 | disk_store_gen = disk_store() 161 | disk_store_gen2 = disk_store() 162 | repo1 = r.DbRepo(db_session, next(disk_store_gen), read=True, write=True, delete=True) 163 | os.chdir(prevdir) 164 | repo2 = r.DbRepo( 165 | 'postgresql://localhost/test_provenance', 166 | next(disk_store_gen2), 167 | read=True, 168 | write=True, 169 | delete=True, 170 | schema='second_repo', 171 | ) 172 | repo = r.ChainedRepo([repo1, repo2]) 173 | else: 174 | repo = r.DbRepo(db_session, memory_store(), read=True, write=True, delete=True) 175 | 176 | p.set_default_repo(repo) 177 | yield repo 178 | p.set_default_repo(None) 179 | if repo2 is not None: 180 | repo2._db_engine.execute('drop schema second_repo cascade;') 181 | 182 | if disk_store_gen: 183 | next(disk_store_gen, 'ignore') 184 | if disk_store_gen2: 185 | next(disk_store_gen2, 'ignore') 186 | 187 | 188 | @pytest.fixture(scope='function', params=['dbrepo-diskstore']) 189 | def 
dbdiskrepo(request, db_session): 190 | repo_gen = repo(request, db_session) 191 | yield next(repo_gen) 192 | next(repo_gen, 'ignore') 193 | 194 | 195 | another_dbdiskrepo = dbdiskrepo 196 | 197 | 198 | @pytest.fixture(scope='function', params=['memoryrepo' 'dbrepo-diskstore', 'dbrepo-memorystore']) 199 | def atomic_repo(request, db_session): 200 | repo_gen = repo(request, db_session) 201 | yield next(repo_gen) 202 | next(repo_gen, 'ignore') 203 | 204 | 205 | md5 = st.text('0123456789abcdef', min_size=32, max_size=32) 206 | _artifact_record_st = st.fixed_dictionaries({'id': md5, 'value_id': md5}) 207 | 208 | 209 | def artifact_record(**kargs): 210 | artifact_props = t.merge( 211 | {k: None for k in pc.artifact_properties}, 212 | _artifact_record_st.example(), 213 | { 214 | 'inputs': { 215 | 'varargs': [1, 2, 3], 216 | 'kargs': {} 217 | }, 218 | 'fn_module': 'foo', 219 | 'fn_name': 'bar', 220 | 'value': 55, 221 | 'name': 'bar', 222 | 'version': 0, 223 | 'serializer': 'joblib', 224 | 'run_info': pc.run_info(), 225 | }, 226 | kargs, 227 | ) 228 | return pc.ArtifactRecord(**artifact_props) 229 | 230 | 231 | @pytest.fixture() 232 | def with_check_mutations(): 233 | p.set_check_mutations(True) 234 | yield True 235 | p.set_check_mutations(False) 236 | -------------------------------------------------------------------------------- /tests/provenance/strategies.py: -------------------------------------------------------------------------------- 1 | import hypothesis.strategies as st 2 | import numpy as np 3 | 4 | primitive_data = ( 5 | st.floats(allow_nan=False) | st.booleans() | st.text() | st.none() | st.fractions() | 6 | st.integers() | st.characters() 7 | ) 8 | # | st.complex_numbers() \ nanj is annoying to deal with 9 | # | st.decimals() can add back in once a new version of joblib is released with bug fix 10 | 11 | hashable_data = primitive_data | st.tuples(primitive_data) 12 | sets = st.sets(hashable_data) 13 | builtin_data = st.recursive( 14 | primitive_data | sets, 15 | lambda children: st.lists(children) | st.dictionaries(st.text(), children) | st. 16 | tuples(children), 17 | ) 18 | 19 | 20 | def rand_nparray(seed, w=3, h=3): 21 | rnd = np.random.RandomState(seed) 22 | return rnd.random_sample((w, h)) 23 | 24 | 25 | np_random_states = st.integers(0, 4294967295).map(np.random.RandomState) 26 | fixed_numpy_arrays = st.integers(0, 4294967295).map(rand_nparray) 27 | numpy_data = fixed_numpy_arrays 28 | data = st.recursive( 29 | primitive_data | sets | fixed_numpy_arrays, 30 | lambda children: st.lists(children) | st.dictionaries(st.text(), children) | st. 
31 | tuples(children), 32 | ) 33 | -------------------------------------------------------------------------------- /tests/provenance/test_blobstores.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import hypothesis.strategies as st 4 | import pytest 5 | from hypothesis import given 6 | from strategies import builtin_data 7 | 8 | import provenance._commonstore as cs 9 | import provenance.blobstores as bs 10 | 11 | 12 | def assert_store_basic_ops(store, key, data): 13 | assert key not in store 14 | store.put(key, data) 15 | assert key in store 16 | 17 | if store._on_duplicate_key == 'raise': 18 | with pytest.raises(cs.KeyExistsError): 19 | store.put(key, 'new value') 20 | 21 | assert store.get(key) == data 22 | assert store[key] == data 23 | 24 | store.delete(key) 25 | assert key not in store 26 | 27 | with pytest.raises(KeyError): 28 | store.delete(key) 29 | 30 | with pytest.raises(KeyError): 31 | store.get(key) 32 | 33 | 34 | hex_alphabet = tuple(map(str, range(0, 10))) + tuple('abcdefABCDEF') 35 | sha1 = st.text(alphabet=hex_alphabet, min_size=40, max_size=40) 36 | 37 | 38 | @given(sha1, builtin_data) 39 | def test_memory_blobstore(key, obj): 40 | store = bs.MemoryStore(read=True, write=True, delete=True) 41 | assert_store_basic_ops(store, key, obj) 42 | 43 | 44 | @given(sha1, builtin_data) 45 | def test_memory_blobstore_raises(key, obj): 46 | store = bs.MemoryStore(read=True, write=True, delete=True, on_duplicate_key='raise') 47 | assert_store_basic_ops(store, key, obj) 48 | 49 | 50 | @given(sha1, builtin_data) 51 | def test_disk_blobstore(key, obj): 52 | tmp_dir = '/tmp/prov_diskstore' 53 | shutil.rmtree(tmp_dir, ignore_errors=True) 54 | store = bs.DiskStore(tmp_dir, read=True, write=True, delete=True) 55 | assert_store_basic_ops(store, key, obj) 56 | 57 | 58 | def test_permissions(): 59 | store = bs.MemoryStore(read=True, write=True, delete=True) 60 | store.put('a', 1) 61 | assert store.get('a') == 1 62 | store.delete('a') 63 | 64 | store = bs.MemoryStore(read=False, write=False, delete=False) 65 | with pytest.raises(cs.PermissionError): 66 | store.put('a', 1) 67 | 68 | with pytest.raises(cs.PermissionError): 69 | store.get('a') 70 | 71 | with pytest.raises(cs.PermissionError): 72 | store.delete('a') 73 | 74 | 75 | def test_s3store(s3fs): 76 | tmp_dir = '/tmp/prov_s3store' 77 | shutil.rmtree(tmp_dir, ignore_errors=True) 78 | basepath = 'bucket/prov_test' 79 | store = bs.S3Store(tmp_dir, basepath, s3fs=s3fs, delete=True) 80 | key = sha1.example() 81 | obj = builtin_data.example() 82 | 83 | assert_store_basic_ops(store, key, obj) 84 | 85 | 86 | def test_sftpstore_import(): 87 | import provenance._config as c 88 | 89 | try: 90 | import paramiko 91 | 92 | _paramiko = True 93 | except ImportError: 94 | _paramiko = False 95 | try: 96 | _ = c.BLOBSTORE_TYPES['sftp'](cachedir=None, basepath=None) 97 | assert _paramiko is True 98 | except ImportError: 99 | assert _paramiko is False 100 | 101 | 102 | def test_chained_storage_with_disk_and_s3_sharing_cachedir(s3fs): 103 | tmp_dir = '/tmp/prov_shared_store' 104 | shutil.rmtree(tmp_dir, ignore_errors=True) 105 | mem_store = bs.MemoryStore(read=True, write=True, delete=True) 106 | disk_store = bs.DiskStore(tmp_dir, read=True, write=True, delete=True) 107 | s3_store = bs.S3Store( 108 | tmp_dir, 109 | s3fs=s3fs, 110 | basepath='bucket/prov_test', 111 | read=True, 112 | write=True, 113 | delete=True, 114 | always_check_remote=True, 115 | ) 116 | stores = [mem_store, disk_store, s3_store] 
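    # A single put() on the ChainedStore below is expected to propagate the
    # blob to every writable store in the chain (memory, disk and S3), which
    # is what the assertions that follow verify.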
117 | 118 | chained_store = bs.ChainedStore(stores) 119 | 120 | key = 'foobar' 121 | data = {'a': 1, 'b': 2} 122 | 123 | for store in stores: 124 | assert key not in store 125 | assert key not in store 126 | 127 | chained_store.put(key, data) 128 | assert key in store 129 | for store in stores: 130 | assert key in store 131 | 132 | assert store.get(key) == data 133 | assert store[key] == data 134 | 135 | store.delete(key) 136 | assert key not in store 137 | 138 | with pytest.raises(KeyError): 139 | store.delete(key) 140 | 141 | with pytest.raises(KeyError): 142 | store.get(key) 143 | 144 | 145 | def test_chained_with_readonly(): 146 | read_store = bs.MemoryStore({'foo': 42}, read=True, write=False, delete=False) 147 | write_store = bs.MemoryStore(read=True, write=True, delete=False) 148 | stores = [read_store, write_store] 149 | chained_store = bs.ChainedStore(stores) 150 | 151 | # verify we read from the read-only store 152 | assert chained_store['foo'] == 42 153 | 154 | # but that it is not written to 155 | chained_store.put('bar', 55) 156 | assert 'bar' in chained_store 157 | assert 'bar' in write_store 158 | assert 'bar' not in read_store 159 | 160 | 161 | def test_chained_read_through_write(): 162 | read_store = bs.MemoryStore({'foo': 42}, read=True, write=False) 163 | store_ahead = bs.MemoryStore(read=True, write=True, read_through_write=True) 164 | read_through_write_store = bs.MemoryStore(read=True, write=True, read_through_write=True) 165 | no_read_through_write_store = bs.MemoryStore(read=True, write=True, read_through_write=False) 166 | stores = [ 167 | no_read_through_write_store, 168 | read_through_write_store, 169 | read_store, 170 | store_ahead, 171 | ] 172 | chained_store = bs.ChainedStore(stores) 173 | 174 | assert 'foo' not in read_through_write_store 175 | assert 'foo' not in no_read_through_write_store 176 | assert 'foo' not in store_ahead 177 | # verify we read from the read-only store 178 | assert chained_store['foo'] == 42 179 | 180 | assert 'foo' in read_through_write_store 181 | assert 'foo' not in store_ahead 182 | assert 'foo' not in no_read_through_write_store 183 | 184 | 185 | def test_chained_writes_may_be_allowed_on_read_throughs_only(): 186 | read_store = bs.MemoryStore({'foo': 42}, read=True, write=False) 187 | read_through_write_only_store = bs.MemoryStore(read=True, write=False, read_through_write=True) 188 | write_store = bs.MemoryStore(read=True, write=True, read_through_write=False) 189 | stores = [write_store, read_through_write_only_store, read_store] 190 | chained_store = bs.ChainedStore(stores) 191 | 192 | # verify we read from the read-only store 193 | assert chained_store['foo'] == 42 194 | 195 | assert 'foo' in read_through_write_only_store 196 | assert 'foo' not in write_store 197 | 198 | chained_store.put('bar', 55) 199 | assert 'bar' in chained_store 200 | assert 'bar' not in read_through_write_only_store 201 | assert 'bar' in write_store 202 | -------------------------------------------------------------------------------- /tests/provenance/test_config.py: -------------------------------------------------------------------------------- 1 | import conftest as ct 2 | 3 | import provenance._config as c 4 | import provenance.blobstores as bs 5 | import provenance.repos as r 6 | 7 | 8 | def test_atomic_blobstore_config_reading(): 9 | config = { 10 | 'type': 'disk', 11 | 'cachedir': '.artifacts/', 12 | 'read': True, 13 | 'write': True, 14 | 'read_through_write': False, 15 | 'delete': True, 16 | } 17 | store = c.blobstore_from_config(config) 18 | 
assert type(store) == bs.DiskStore 19 | assert store.cachedir == bs._abspath(config['cachedir']) 20 | assert store._read == config['read'] 21 | assert store._write == config['write'] 22 | assert store._delete == config['delete'] 23 | assert store._read_through_write == config['read_through_write'] 24 | 25 | 26 | def test_prototypes_are_merged(): 27 | config = { 28 | 'local_disk': 29 | { 30 | 'type': 'disk', 31 | 'cachedir': '.artifacts/', 32 | 'read': True, 33 | 'write': True, 34 | 'read_through_write': False, 35 | 'delete': True, 36 | }, 37 | 'local_read_only': { 38 | 'prototype': 'local_disk', 39 | 'write': False, 40 | 'delete': False 41 | }, 42 | 'local_read_through_write': { 43 | 'prototype': 'local_read_only', 44 | 'read_through_write': True, 45 | }, 46 | } 47 | 48 | stores = c.blobstores_from_config(config) 49 | store = stores['local_read_through_write'] 50 | assert type(store) == bs.DiskStore 51 | assert store.cachedir == bs._abspath('.artifacts/') 52 | assert store._read 53 | assert not store._write 54 | assert not store._delete 55 | assert store._read_through_write 56 | 57 | 58 | def test_blobstores_config_reading(): 59 | config = { 60 | 'local_disk': 61 | { 62 | 'type': 'disk', 63 | 'cachedir': '.artifacts/', 64 | 'read': True, 65 | 'write': True, 66 | 'read_through_write': False, 67 | 'delete': True, 68 | }, 69 | 'mem': 70 | { 71 | 'type': 'memory', 72 | 'read': True, 73 | 'write': True, 74 | 'read_through_write': False, 75 | 'delete': True, 76 | }, 77 | 'shared_s3': 78 | { 79 | 'type': 's3', 80 | 'cachedir': '/tmp/foo', 81 | 'basepath': 'mybucket/blobs', 82 | 'delete': False, 83 | 's3_config': { 84 | 'anon': True 85 | }, 86 | }, 87 | 'chained': { 88 | 'type': 'chained', 89 | 'stores': ['local_disk', 'mem', 'shared_s3'] 90 | }, 91 | } 92 | 93 | stores = c.blobstores_from_config(config) 94 | chained = stores['chained'] 95 | assert isinstance(chained, bs.ChainedStore) 96 | assert [type(s) for s in chained.stores] == [ 97 | bs.DiskStore, 98 | bs.MemoryStore, 99 | bs.S3Store, 100 | ] 101 | 102 | 103 | def test_from_config(): 104 | config = { 105 | 'blobstores': 106 | { 107 | 'mem': 108 | { 109 | 'type': 'memory', 110 | 'read': True, 111 | 'write': True, 112 | 'read_through_write': False, 113 | 'delete': True, 114 | } 115 | }, 116 | 'artifact_repos': { 117 | 'db': { 118 | 'type': 'postgres', 119 | 'db': ct.db_conn_str(), 120 | 'store': 'mem' 121 | } 122 | }, 123 | } 124 | objs = c.from_config(config) 125 | repo = objs['repos']['db'] 126 | assert isinstance(repo, r.PostgresRepo) 127 | assert isinstance(repo.blobstore, bs.MemoryStore) 128 | -------------------------------------------------------------------------------- /tests/provenance/test_hashing.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import hypothesis.strategies as st 4 | import numpy as np 5 | from hypothesis import given 6 | from strategies import data 7 | 8 | import provenance as p 9 | import provenance.artifact_hasher as ah 10 | from provenance.hashing import hash 11 | 12 | 13 | @given(data) 14 | def test_shallow_and_deep_copies_hashing(o): 15 | original_hash = hash(o) 16 | shallow_copy = copy.copy(o) 17 | deep_copy = copy.deepcopy(o) 18 | assert hash(shallow_copy) == original_hash 19 | assert hash(deep_copy) == original_hash 20 | 21 | 22 | @given(st.data()) 23 | def test_shared_values_hashing(base_data): 24 | base_data = base_data.draw(data) 25 | base_copy = lambda: copy.deepcopy(base_data) 26 | 27 | shared_dict = {'a': base_data, 'b': base_data} 28 | 
without_sharing_dict = {'a': base_copy(), 'b': base_copy()} 29 | 30 | assert hash(shared_dict) == hash(without_sharing_dict) 31 | 32 | shared_tuple = (base_data, base_data) 33 | without_sharing_tuple = (base_copy(), base_copy()) 34 | 35 | assert hash(shared_tuple) == hash(without_sharing_tuple) 36 | 37 | shared_list = [base_data, base_data] 38 | without_sharing_list = [base_copy(), base_copy()] 39 | 40 | assert hash(shared_list) == hash(without_sharing_list) 41 | 42 | 43 | def test_hash_of_contiguous_array_is_the_same_as_noncontiguous(): 44 | a = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='F')[:, :1, :] 45 | b = np.ascontiguousarray(a) 46 | assert hash(a) == hash(b) 47 | 48 | 49 | def test_hash_of_fortran_array_is_the_same_as_c_array(): 50 | c = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='C') 51 | f = np.asarray(np.arange(6000).reshape((1000, 2, 3)), order='F') 52 | 53 | assert hash(c) == hash(f) 54 | 55 | 56 | def test_hashing_of_functions(): 57 | 58 | def foo(a, b): 59 | return a + b 60 | 61 | assert hash(foo) == hash(foo) 62 | 63 | 64 | def test_hashing_of_artifacts_and_proxies(repo): 65 | 66 | @p.provenance() 67 | def load_data(): 68 | return [1, 2, 3] 69 | 70 | original_proxy = load_data() 71 | original_artifact = original_proxy.artifact 72 | loaded_artifact = repo.get_by_id(original_artifact.id) 73 | loaded_proxy = loaded_artifact.proxy() 74 | 75 | # All artifacts should have the same hash 76 | assert hash(original_artifact) == hash(loaded_artifact) 77 | 78 | # All proxies should have the same hash 79 | assert hash(original_proxy) == hash(loaded_proxy) 80 | 81 | # All values should have the same hash 82 | assert hash(original_artifact.value) == hash(loaded_artifact.value) 83 | 84 | # Artifacts and proxies should not have the same hash 85 | assert hash(original_artifact) != hash(original_proxy) 86 | 87 | # Proxies and values should have the same hash 88 | assert hash(original_proxy) == hash(original_artifact.value) 89 | 90 | 91 | def test_hashing_with_artifact_hasher_also_returns_iter_of_artifacts_preserves_hash(repo,): 92 | 93 | @p.provenance() 94 | def load_data(): 95 | return [1, 2, 3] 96 | 97 | @p.provenance() 98 | def create_composite(data): 99 | return {'foo': 'bar', 'data': data} 100 | 101 | data = load_data() 102 | 103 | original_proxy = create_composite(data) 104 | original_artifact = original_proxy.artifact 105 | loaded_artifact = repo.get_by_id(original_artifact.id) 106 | loaded_proxy = loaded_artifact.proxy() 107 | 108 | expected_proxy_ids = frozenset((original_artifact.id, data.artifact.id)) 109 | expected_artifact_ids = frozenset((original_artifact.id,)) 110 | 111 | original_proxy_hash, artifacts = hash(original_proxy, hasher=ah.artifact_hasher()) 112 | ids = frozenset(a.id for a in artifacts) 113 | assert original_proxy_hash == hash(original_proxy) 114 | assert ids == expected_proxy_ids 115 | 116 | original_artifact_hash, artifacts = hash(original_artifact, hasher=ah.artifact_hasher()) 117 | ids = frozenset(a.id for a in artifacts) 118 | assert original_artifact_hash == hash(original_artifact) 119 | assert ids == expected_artifact_ids 120 | 121 | loaded_artifact_hash, artifacts = hash(loaded_artifact, hasher=ah.artifact_hasher()) 122 | ids = frozenset(a.id for a in artifacts) 123 | assert loaded_artifact_hash == hash(loaded_artifact) 124 | assert ids == expected_artifact_ids 125 | 126 | loaded_proxy_hash, artifacts = hash(loaded_proxy, hasher=ah.artifact_hasher()) 127 | ids = frozenset(a.id for a in artifacts) 128 | assert loaded_proxy_hash == 
hash(loaded_proxy) 129 | assert ids == expected_proxy_ids 130 | -------------------------------------------------------------------------------- /tests/provenance/test_pytorch.py: -------------------------------------------------------------------------------- 1 | from copy import copy, deepcopy 2 | 3 | import pytest 4 | 5 | import provenance as p 6 | from provenance.hashing import hash 7 | 8 | torch = pytest.importorskip('torch') 9 | 10 | 11 | class TwoLayerNet(torch.nn.Module): 12 | """ 13 | This class is copied from PyTorch's documentation and is meant to be the 14 | simplest, non-trivial custom NN we can use for testing provenance. 15 | See [here](https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html#sphx-glr-beginner-examples-nn-two-layer-net-module-py) 16 | """ 17 | 18 | def __init__(self, D_in, H, D_out): 19 | """ 20 | In the constructor we instantiate two nn.Linear modules and assign them 21 | as member variables. 22 | """ 23 | super(TwoLayerNet, self).__init__() 24 | self.linear1 = torch.nn.Linear(D_in, H) 25 | self.linear2 = torch.nn.Linear(H, D_out) 26 | 27 | def forward(self, x): 28 | """ 29 | In the forward function we accept a Tensor of input data and we must 30 | return a Tensor of output data. We can use Modules defined in the 31 | constructor as well as arbitrary operators on Tensors. 32 | """ 33 | h_relu = self.linear1(x).clamp(min=0) 34 | y_pred = self.linear2(h_relu) 35 | return y_pred 36 | 37 | 38 | def random_data(N, D_in, D_out): 39 | """ 40 | Generates random data for training/testing the PyTorch model. 41 | 42 | N is the data size 43 | D_in is the input dimension 44 | D_out is the output dimension 45 | """ 46 | 47 | # Create random Tensors to hold inputs and outputs 48 | x = torch.randn(N, D_in) 49 | y = torch.randn(N, D_out) 50 | return {'X_train': x, 'Y_train': y, 'X_test': x, 'Y_test': y} 51 | 52 | 53 | @p.provenance(returns_composite=True) 54 | def fit_model(N=64, D_in=1000, D_out=10, H=100, epochs=500, seed=None): 55 | """ 56 | An example workflow that provenance can handle from PyTorch. The model 57 | parameters, the data parameters, and the fit parameters are all passed 58 | into this function, and the output includes the PyTorch model and some 59 | metadata regarding its fit history (a list of losses after each epoch). 60 | """ 61 | if seed is not None: 62 | torch.manual_seed(seed) 63 | 64 | data = random_data(N, D_in, D_out) 65 | x = data['X_train'] 66 | y = data['Y_train'] 67 | 68 | model = TwoLayerNet(D_in, H, D_out) 69 | 70 | # Construct our loss function and an Optimizer. The call to 71 | # model.parameters() in the SGD constructor will contain the learnable 72 | # parameters of the two nn.Linear modules which are members of the model. 73 | criterion = torch.nn.MSELoss(reduction='sum') 74 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) 75 | 76 | losses = [] 77 | for t in range(epochs): 78 | # Forward pass: Compute predicted y by passing x to the model 79 | y_pred = model(x) 80 | 81 | # Compute and print loss 82 | loss = criterion(y_pred, y) 83 | losses.append(loss.item()) 84 | 85 | # Zero gradients, perform a backward pass, and update the weights. 86 | optimizer.zero_grad() 87 | loss.backward() 88 | optimizer.step() 89 | 90 | return {'model': model, 'losses': losses} 91 | 92 | 93 | def test_same_models_are_equal(dbdiskrepo): 94 | """ 95 | Validates that two separately constructed models using the same parameters 96 | hash to the same artifact in provenance terms. 
97 | """ 98 | fit1 = fit_model() 99 | fit2 = fit_model() 100 | assert fit1.artifact.id == fit2.artifact.id 101 | assert fit1.artifact.value_id == fit2.artifact.value_id 102 | assert hash(fit1) == hash(fit2) 103 | 104 | 105 | def test_copied_models_are_equal(dbdiskrepo): 106 | """ 107 | Validates that a copied model (deep or shallow copied) hashes to the same 108 | artifact as the original in provenance terms. 109 | """ 110 | original = fit_model() 111 | 112 | shallow = copy(original) 113 | assert original.artifact.id == shallow.artifact.id 114 | assert original.artifact.value_id == shallow.artifact.value_id 115 | assert hash(original) == hash(shallow) 116 | 117 | deep = deepcopy(original) 118 | assert original.artifact.id == deep.artifact.id 119 | assert original.artifact.value_id == deep.artifact.value_id 120 | assert hash(original) == hash(deep) 121 | 122 | 123 | def test_reloading_from_disk_has_same_value_id(dbdiskrepo): 124 | """ 125 | Validates that we can write and read a pytorch model as an artifact and that 126 | it is the same going in as coming out. 127 | """ 128 | original = fit_model() 129 | loaded = p.load_proxy(original.artifact.id) 130 | 131 | assert loaded.artifact.value_id == p.hash(loaded.artifact.value) 132 | assert loaded.artifact.value_id == original.artifact.value_id 133 | assert loaded.artifact.id == original.artifact.id 134 | 135 | 136 | def test_different_seeds_result_in_different_models(dbdiskrepo): 137 | """ 138 | Validates that using different pytorch seeds to the fit model results in 139 | the same artifact. 140 | """ 141 | fit1 = fit_model(seed=0) 142 | fit2 = fit_model(seed=1) 143 | 144 | assert p.hash(fit1) != p.hash(fit2) 145 | assert fit1.artifact.id != fit2.artifact.id 146 | assert fit1.artifact.value_id != fit2.artifact.value_id 147 | 148 | 149 | def test_same_seeds_result_in_same_models(dbdiskrepo): 150 | """ 151 | Validates that using the same pytorch seed to the fit model results in 152 | different artifacts. 
153 | """ 154 | fit1 = fit_model(seed=0) 155 | fit2 = fit_model(seed=0) 156 | 157 | assert p.hash(fit1) == p.hash(fit2) 158 | assert fit1.artifact.id == fit2.artifact.id 159 | assert fit1.artifact.value_id == fit2.artifact.value_id 160 | -------------------------------------------------------------------------------- /tests/provenance/test_repos.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from datetime import datetime 3 | 4 | import pandas as pd 5 | import pytest 6 | import sqlalchemy_utils.functions as sql_utils 7 | from conftest import artifact_record 8 | 9 | import provenance as p 10 | import provenance._commonstore as cs 11 | import provenance.blobstores as bs 12 | import provenance.repos as r 13 | 14 | 15 | def test_inputs_json(db_session): 16 | repo = r.DbRepo(db_session, bs.MemoryStore()) 17 | 18 | @p.provenance(version=0, name='initial_data', repo=repo) 19 | def load_data(filename, timestamp): 20 | return {'data': [1, 2, 3], 'timestamp': timestamp} 21 | 22 | @p.provenance(repo=repo) 23 | def process_data_X(data, process_x_inc, timestamp): 24 | _data = [i + process_x_inc for i in data['data']] 25 | return {'data': _data, 'timestamp': timestamp} 26 | 27 | @p.provenance(repo=repo) 28 | def process_data_Y(data, process_y_inc, timestamp): 29 | _data = [i + process_y_inc for i in data['data']] 30 | return {'data': _data, 'timestamp': timestamp} 31 | 32 | @p.provenance(repo=repo) 33 | def combine_processed_data(filename, inc_x, inc_y, timestamp): 34 | _data = [a + b for a, b in zip(inc_x['data'], inc_y['data'])] 35 | return {'data': _data, 'timestamp': timestamp} 36 | 37 | def pipeline(filename, timestamp, process_x_inc, process_y_inc): 38 | data = load_data(filename, timestamp) 39 | inc_x = process_data_X(data, process_x_inc, timestamp) 40 | inc_y = process_data_Y(data, process_y_inc, timestamp) 41 | res = combine_processed_data(filename, inc_x, inc_y, timestamp) 42 | return {'data': data, 'inc_x': inc_x, 'inc_y': inc_y, 'res': res} 43 | 44 | now = datetime(2016, 9, 27, 7, 51, 11, 613544) 45 | 46 | expected_inputs_json = { 47 | '__varargs': [], 48 | 'filename': 'foo-bar', 49 | 'timestamp': now, 50 | 'inc_x': 51 | { 52 | 'id': 'c74da9d379234901fe7a89e03fa800b0', # md5 53 | # "id": "2c33a362ebd51f830d0b245473ab6c1269674259", # sha1 54 | 'name': 'test_repos.process_data_X', 55 | 'type': 'ArtifactProxy', 56 | }, 57 | 'inc_y': 58 | { 59 | 'id': 'a1bd4d4ae1f33ae6379613618427f127', # md5 60 | # "id": "f9b1bb7a8aaf435fbf60b92cd88bf6c46604f702", # sha1 61 | 'name': 'test_repos.process_data_Y', 62 | 'type': 'ArtifactProxy', 63 | }, 64 | } 65 | 66 | results = pipeline(filename='foo-bar', process_x_inc=5, process_y_inc=10, timestamp=now) 67 | res = results['res'].artifact 68 | inputs_json = r._inputs_json(res.inputs) 69 | assert inputs_json == expected_inputs_json 70 | 71 | results = pipeline(filename='foo-bar', process_x_inc=5, process_y_inc=10, timestamp=now) 72 | res = results['res'].artifact 73 | inputs_json = r._inputs_json(res.inputs) 74 | assert inputs_json == expected_inputs_json 75 | 76 | 77 | def test_basic_repo_ops(repo): 78 | artifact = artifact_record() 79 | 80 | assert artifact.id not in repo 81 | repo.put(artifact) 82 | 83 | assert artifact.id in repo 84 | assert artifact in repo 85 | 86 | with pytest.raises(cs.KeyExistsError): 87 | repo.put(artifact) 88 | 89 | assert repo.get_by_id(artifact.id).id == artifact.id 90 | assert repo[artifact.id].id == artifact.id 91 | assert repo.get_by_value_id(artifact.value_id).id == 
artifact.id 92 | 93 | repo.delete(artifact.id) 94 | assert artifact.id not in repo 95 | if hasattr(repo, 'blobstore'): 96 | assert artifact.id not in repo.blobstore 97 | assert artifact.value_id not in repo.blobstore 98 | 99 | with pytest.raises(KeyError): 100 | repo.delete(artifact.id) 101 | 102 | with pytest.raises(KeyError): 103 | repo.get_by_id(artifact.id) 104 | 105 | with pytest.raises(KeyError): 106 | repo.get_by_value_id(artifact.id) 107 | 108 | 109 | @pytest.mark.parametrize('artifact_class', [r.ArtifactProxy, r.CallableArtifactProxy]) 110 | @pytest.mark.parametrize('copy_method', [copy.copy, copy.deepcopy]) 111 | def test_copy_Proxies(repo, artifact_class, copy_method): 112 | 113 | class Artifact: 114 | 115 | def __init__(self, id): 116 | self.id = id 117 | 118 | a = artifact_class({'a': 1, 'b': 2, 'c': 3}, Artifact('1')) 119 | b = copy_method(a) 120 | b['a'] = 10 121 | 122 | assert a['a'] != b['a'] 123 | 124 | 125 | def test_repo_set_put_and_finding(repo): 126 | artifact = artifact_record(id='123') 127 | repo.put(artifact) 128 | artifact_set = r.ArtifactSet([artifact.id], 'foo') 129 | repo.put_set(artifact_set) 130 | 131 | assert repo.get_set_by_id(artifact_set.id) == artifact_set 132 | found_set = repo.get_set_by_labels('foo') 133 | assert found_set.name == 'foo' 134 | assert found_set.artifact_ids == {'123'} 135 | 136 | 137 | def test_repo_raises_key_error_when_set_id_not_found(repo): 138 | with pytest.raises(KeyError): 139 | repo.get_set_by_id('foo') 140 | 141 | 142 | def test_repo_raises_key_error_when_set_name_not_found(repo): 143 | with pytest.raises(KeyError): 144 | repo.get_set_by_labels('foo') 145 | 146 | 147 | def test_repo_contains_set(repo): 148 | assert not repo.contains_set('foo') 149 | 150 | artifact = artifact_record(id='123') 151 | repo.put(artifact) 152 | artifact_set = r.ArtifactSet([artifact.id], 'foo') 153 | 154 | repo.put_set(artifact_set) 155 | assert repo.contains_set(artifact_set.id) 156 | 157 | 158 | def test_repo_delete_set(repo): 159 | artifact = artifact_record(id='123') 160 | repo.put(artifact) 161 | artifact_set = r.ArtifactSet(['123'], 'foo') 162 | repo.put_set(artifact_set) 163 | 164 | repo.delete_set(artifact_set.id) 165 | 166 | with pytest.raises(KeyError): 167 | repo.get_set_by_id(artifact_set.id) 168 | 169 | 170 | def test_permissions(atomic_repo): 171 | repo = atomic_repo 172 | artifact = artifact_record() 173 | 174 | repo._write = False 175 | assert not repo._write 176 | 177 | with pytest.raises(cs.PermissionError): 178 | repo.put(artifact) 179 | assert artifact not in repo 180 | 181 | repo._write = True 182 | repo.put(artifact) 183 | 184 | repo._read = False 185 | 186 | with pytest.raises(cs.PermissionError): 187 | repo.get_by_id(artifact.id) 188 | 189 | with pytest.raises(cs.PermissionError): 190 | repo.get_by_value_id(artifact.value_id) 191 | 192 | with pytest.raises(cs.PermissionError): 193 | repo.get_value(artifact.id) 194 | 195 | with pytest.raises(cs.PermissionError): 196 | repo.get_inputs(artifact) 197 | 198 | with pytest.raises(cs.PermissionError): 199 | artifact.id in repo 200 | 201 | repo._read = True 202 | assert repo.get_by_id(artifact.id) 203 | assert artifact.id in repo 204 | 205 | repo._delete = False 206 | with pytest.raises(cs.PermissionError): 207 | repo.delete(artifact.id) 208 | 209 | repo._delete = True 210 | repo.delete(artifact.id) 211 | assert artifact.id not in repo 212 | 213 | 214 | def test_chained_with_readonly(): 215 | read_repo = r.MemoryRepo([artifact_record(id='foo')], read=True, write=False, delete=False) 
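    # read_repo above is seeded with an existing artifact but is read-only
    # (write=False, delete=False); write_repo below starts empty and is the
    # only repo that new artifacts can land in.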
216 | write_repo = r.MemoryRepo(read=True, write=True, delete=False) 217 | repos = [read_repo, write_repo] 218 | chained = r.ChainedRepo(repos) 219 | 220 | # verify we read from the read-only store 221 | assert 'foo' in chained 222 | 223 | # but that it is not written to 224 | record = artifact_record(id='bar', value_id='baz') 225 | chained.put(record) 226 | assert 'bar' in chained 227 | assert 'bar' in write_repo 228 | assert 'bar' not in read_repo 229 | assert chained.get_by_value_id(record.value_id).id == record.id 230 | assert chained.get_by_id(record.id).id == record.id 231 | assert chained.get_value(record) == record.value 232 | 233 | 234 | def test_chained_read_through_write(): 235 | foo = artifact_record(id='foo') 236 | read_repo = r.MemoryRepo([foo], read=True, write=False) 237 | repo_ahead = r.MemoryRepo(read=True, write=True, read_through_write=True) 238 | read_through_write_repo = r.MemoryRepo(read=True, write=True, read_through_write=True) 239 | no_read_through_write_repo = r.MemoryRepo(read=True, write=True, read_through_write=False) 240 | repos = [no_read_through_write_repo, read_through_write_repo, read_repo, repo_ahead] 241 | chained_repo = r.ChainedRepo(repos) 242 | 243 | assert 'foo' not in read_through_write_repo 244 | assert 'foo' not in no_read_through_write_repo 245 | assert 'foo' not in repo_ahead 246 | # verify we read from the read-only store 247 | assert chained_repo['foo'].id == foo.id 248 | 249 | assert 'foo' in read_through_write_repo 250 | assert 'foo' not in repo_ahead 251 | assert 'foo' not in no_read_through_write_repo 252 | 253 | 254 | def test_chained_writes_may_be_allowed_on_read_throughs_only(): 255 | foo = artifact_record(id='foo') 256 | read_repo = r.MemoryRepo([foo], read=True, write=False) 257 | read_through_write_only_repo = r.MemoryRepo(read=True, write=False, read_through_write=True) 258 | write_repo = r.MemoryRepo(read=True, write=True, read_through_write=False) 259 | repos = [write_repo, read_through_write_only_repo, read_repo] 260 | chained_repo = r.ChainedRepo(repos) 261 | 262 | # verify we read from the read-only repo 263 | assert chained_repo['foo'].id == foo.id 264 | 265 | assert 'foo' in read_through_write_only_repo 266 | assert 'foo' not in write_repo 267 | 268 | bar = artifact_record(id='bar') 269 | chained_repo.put(bar) 270 | assert 'bar' in chained_repo 271 | assert 'bar' not in read_through_write_only_repo 272 | assert 'bar' in write_repo 273 | 274 | 275 | def test_db_is_automatically_created_and_migrated(disk_store): 276 | db_conn_str = 'postgresql://localhost/test_provenance_autocreate' 277 | if sql_utils.database_exists(db_conn_str): 278 | sql_utils.drop_database(db_conn_str) 279 | 280 | repo = r.PostgresRepo( 281 | db_conn_str, disk_store, read=True, write=True, delete=True, create_db=True 282 | ) 283 | p.set_default_repo(repo) 284 | 285 | @p.provenance() 286 | def calculate(a, b): 287 | return a + b 288 | 289 | assert sql_utils.database_exists(db_conn_str) 290 | 291 | # make sure it all works 292 | assert calculate(1, 2) == 3 293 | 294 | p.set_default_repo(None) 295 | sql_utils.drop_database(db_conn_str) 296 | 297 | 298 | def test_db_is_automatically_created_and_migrated_with_the_right_schema(disk_store): 299 | db_conn_str = 'postgresql://localhost/test_provenance_autocreate_schema' 300 | if sql_utils.database_exists(db_conn_str): 301 | sql_utils.drop_database(db_conn_str) 302 | 303 | repo = r.PostgresRepo( 304 | db_conn_str, 305 | disk_store, 306 | read=True, 307 | write=True, 308 | delete=True, 309 | create_db=True, 310 | 
schema='foobar', 311 | ) 312 | p.set_default_repo(repo) 313 | 314 | @p.provenance() 315 | def calculate(a, b): 316 | return a + b 317 | 318 | assert calculate(1, 2) == 3 319 | 320 | with repo.session() as s: 321 | res = pd.read_sql('select * from foobar.artifacts', s.connection()) 322 | 323 | repo2 = r.PostgresRepo( 324 | db_conn_str, 325 | disk_store, 326 | read=True, 327 | write=True, 328 | delete=True, 329 | create_db=True, 330 | schema='baz', 331 | ) 332 | 333 | p.set_default_repo(repo2) 334 | 335 | assert calculate(5, 5) == 10 336 | 337 | with repo2.session() as s: 338 | res = pd.read_sql('select * from baz.artifacts', s.connection()) 339 | 340 | assert res.iloc[0]['inputs_json'] == {'b': 5, 'a': 5, '__varargs': []} 341 | 342 | p.set_default_repo(None) 343 | sql_utils.drop_database(db_conn_str) 344 | 345 | 346 | def xtest_db_is_automatically_migrated(disk_store): 347 | db_conn_str = 'postgresql://localhost/test_provenance_automigrate' 348 | if sql_utils.database_exists(db_conn_str): 349 | sql_utils.drop_database(db_conn_str) 350 | 351 | sql_utils.create_database(db_conn_str) 352 | 353 | repo = r.PostgresRepo( 354 | db_conn_str, 355 | disk_store, 356 | read=True, 357 | write=True, 358 | delete=True, 359 | create_db=False, 360 | upgrade_db=True, 361 | ) 362 | p.set_default_repo(repo) 363 | 364 | @p.provenance() 365 | def calculate(a, b): 366 | return a + b 367 | 368 | # make sure it all works 369 | assert calculate(1, 2) == 3 370 | 371 | p.set_default_repo(None) 372 | sql_utils.drop_database(db_conn_str) 373 | 374 | 375 | def test_artifact_proxy_works_with_iterables(): 376 | 377 | class Foo: 378 | 379 | def __init__(self, a): 380 | self.a = a 381 | 382 | def __next__(self): 383 | return self.a 384 | 385 | foo = r.artifact_proxy(Foo(5), 'stub artifact') 386 | 387 | assert next(foo) == 5 388 | -------------------------------------------------------------------------------- /tests/provenance/test_utils.py: -------------------------------------------------------------------------------- 1 | import toolz as t 2 | 3 | import provenance.utils as u 4 | 5 | 6 | def test_fn_info_with_regular_function(): 7 | 8 | def inc(x): 9 | return x + 1 10 | 11 | info = u.fn_info(inc) 12 | assert info == {'name': 'inc', 'module': 'test_utils', 'varargs': (), 'kargs': {}} 13 | 14 | 15 | def test_fn_info_with_partial(): 16 | 17 | def mult(x, y): 18 | return x * y 19 | 20 | double = t.partial(mult, 2) 21 | info = u.fn_info(double) 22 | 23 | assert info == { 24 | 'name': 'mult', 25 | 'module': 'test_utils', 26 | 'varargs': (), 27 | 'kargs': { 28 | 'x': 2 29 | }, 30 | } 31 | 32 | 33 | def test_fn_info_with_partial_of_partial(): 34 | 35 | def mult(*args): 36 | return t.reduce(lambda a, b: a * b, args) 37 | 38 | double = t.partial(mult, 2) 39 | quad = t.partial(double, 2) 40 | info = u.fn_info(quad) 41 | 42 | assert info == { 43 | 'name': 'mult', 44 | 'module': 'test_utils', 45 | 'varargs': (2, 2), 46 | 'kargs': {}, 47 | } 48 | 49 | 50 | def test_fn_info_with_curry(): 51 | 52 | @t.curry 53 | def mult(x, y): 54 | return x * y 55 | 56 | double = mult(2) 57 | assert double(2) == 4 58 | info = u.fn_info(double) 59 | 60 | assert info == { 61 | 'name': 'mult', 62 | 'module': 'test_utils', 63 | 'varargs': (), 64 | 'kargs': { 65 | 'x': 2 66 | }, 67 | } 68 | 69 | 70 | def test_fn_info_with_multiple_curries(): 71 | 72 | @t.curry 73 | def mult(a, b, c): 74 | return a * b * c 75 | 76 | double = mult(2) 77 | quad = double(2) 78 | info = u.fn_info(quad) 79 | 80 | assert info == { 81 | 'name': 'mult', 82 | 'module': 'test_utils', 
83 | 'varargs': (), 84 | 'kargs': { 85 | 'a': 2, 86 | 'b': 2 87 | }, 88 | } 89 | 90 | 91 | def test_with_merged_defaults_basic_merging(): 92 | foo_defaults = {'a': 1, 'b': 2} 93 | 94 | @u.with_merged_defaults() 95 | def bar(foo=foo_defaults): 96 | return foo 97 | 98 | assert bar() == {'a': 1, 'b': 2} 99 | assert bar(foo={'c': 3}) == {'a': 1, 'b': 2, 'c': 3} 100 | assert bar(foo={'a': 10}) == {'a': 10, 'b': 2} 101 | 102 | 103 | def test_with_merged_defaults_with_non_dict_args(): 104 | foo_defaults = {'a': 1, 'b': 2} 105 | 106 | @u.with_merged_defaults() 107 | def bar(a, foo=foo_defaults, baz=None): 108 | return a, baz, foo 109 | 110 | assert bar(5) == (5, None, {'a': 1, 'b': 2}) 111 | assert bar(5, baz='baz', foo={'c': 3}) == (5, 'baz', {'a': 1, 'b': 2, 'c': 3}) 112 | 113 | 114 | def test_with_merged_defaults_with_args_splat(): 115 | foo_defaults = {'a': 1, 'b': 2} 116 | 117 | @u.with_merged_defaults() 118 | def bar(*args, foo=foo_defaults): 119 | return args, foo 120 | 121 | assert bar(5, 10) == ((5, 10), {'a': 1, 'b': 2}) 122 | assert bar() == ((), {'a': 1, 'b': 2}) 123 | --------------------------------------------------------------------------------
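A minimal usage sketch (not part of the repository) of the chained read-through-write behaviour exercised in tests/provenance/test_blobstores.py above, assuming only the MemoryStore and ChainedStore APIs shown in those tests:

import provenance.blobstores as bs

# A read-only 'remote' store fronted by a writable local cache. Reads that
# fall through to the remote store are copied back into the cache because it
# was constructed with read_through_write=True.
remote = bs.MemoryStore({'foo': 42}, read=True, write=False)
cache = bs.MemoryStore(read=True, write=True, read_through_write=True)
chained = bs.ChainedStore([cache, remote])

assert chained['foo'] == 42   # served by the remote store
assert 'foo' in cache         # ...and cached locally on the way back

chained.put('bar', 55)        # regular writes go to writable stores only
assert 'bar' in cache
assert 'bar' not in remote    # the read-only store is never written to

Store order matters here: only stores listed ahead of the one that satisfies a read, and flagged with read_through_write=True, receive the read-through copy, mirroring test_chained_read_through_write.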