├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_report.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── compliance-oss.yaml │ ├── pr.yaml │ └── prod.yaml ├── .gitignore ├── CHANGELOG.md ├── CODE-OF-CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py ├── contrib.rst ├── getting_started.rst ├── index.rst ├── stork.rst └── tutorial.rst ├── requirements-dev.in ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── stork ├── __init__.py ├── _version.py ├── api_error.py ├── cli.py ├── cli_commands.py ├── configure.py ├── create_job_cluster.py ├── file_name.py └── update_databricks_library.py └── tests ├── __init__.py ├── conftest.py ├── fixtures.py ├── test_cli.py ├── test_filename_match.py ├── test_token_permissions.py ├── test_update_databricks_library.py └── unittest_helpers.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | .github/workflows/compliance-oss.yaml @ShopRunner/compliance 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Bug Report" 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug, invalid 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | > A clear and concise description of what the bug is. 13 | 14 | **Expected behavior** 15 | 16 | > A clear and concise description of what you expected to happen. 17 | 18 | **To Reproduce** 19 | 20 | > Add a code snippet to reproduce the issue 21 | 22 | **Environment** 23 | 24 | > Add PyTorch version, OS, Python version, CUDA/cuDNN version, GPU models, etc. 25 | 26 | **Screenshots** 27 | 28 | > If applicable, add screenshots or GIFs to help explain your problem. 29 | 30 | **Any additional information?** 31 | 32 | > Add any other context or information about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Suggest an idea for Stork 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | 12 | > A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 13 | 14 | **Describe the solution you'd like** 15 | 16 | > A clear and concise description of what you want to happen. 17 | 18 | **Describe alternatives you've considered** 19 | 20 | > A clear and concise description of any alternative solutions or features you've considered. 21 | 22 | **Any additional information?** 23 | 24 | > Add any other context or information about the feature request here. 
25 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Pull Request Checklist 2 | - [ ] Pull request includes a description of the change and the reason behind it 3 | - [ ] CHANGELOG has been updated 4 | - [ ] Version in `_version.py` has been updated 5 | - [ ] README and docs have been updated (if applicable) 6 | - [ ] CI checks pass 7 | -------------------------------------------------------------------------------- /.github/workflows/compliance-oss.yaml: -------------------------------------------------------------------------------- 1 | name: "Compliance - OSS" 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - master 8 | push: 9 | branches: 10 | - main 11 | - master 12 | 13 | jobs: 14 | scan: 15 | name: Scan 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout Code 19 | uses: actions/checkout@v1 20 | - name: OSS Scan 21 | uses: shoprunner/action-oss@main 22 | -------------------------------------------------------------------------------- /.github/workflows/pr.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | pull_request: 8 | branches: [ main ] 9 | 10 | jobs: 11 | test: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.6, 3.7, 3.8] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements.txt -r requirements-dev.txt 27 | printf "[DEFAULT]\nhost = https://fake-org.cloud.databricks.com\nprod_folder = /databricks/folder\n" > ~/.storkcfg 28 | - name: Lint with flake8 29 | run: | 30 | flake8 stork tests 31 | - name: Test with pytest 32 | run: | 33 | pytest -v --deselect tests/test_token_permissions.py 34 | - name: Verify build 35 | run: | 36 | pip install --upgrade pip wheel twine 37 | python setup.py sdist bdist_wheel 38 | -------------------------------------------------------------------------------- /.github/workflows/prod.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | 10 | jobs: 11 | test: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.6, 3.7, 3.8] 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements.txt -r requirements-dev.txt 27 | printf "[DEFAULT]\nhost = https://fake-org.cloud.databricks.com\nprod_folder = /databricks/folder\n" > ~/.storkcfg 28 | - name: Lint with flake8 29 
| run: | 30 | # stop the build if there are Python syntax errors or undefined names 31 | flake8 stork tests 32 | - name: Test with pytest 33 | run: | 34 | pytest -v --deselect tests/test_token_permissions.py 35 | 36 | publish: 37 | 38 | runs-on: ubuntu-latest 39 | needs: [test] 40 | steps: 41 | - uses: actions/checkout@v2 42 | - name: Set up Python 43 | uses: actions/setup-python@v2 44 | with: 45 | python-version: '3.x' 46 | - name: Install dependencies 47 | run: | 48 | python -m pip install --upgrade pip 49 | pip install setuptools wheel twine 50 | - name: Get changed files 51 | id: changed-files 52 | uses: jitterbit/get-changed-files@v1 53 | with: 54 | format: space-delimited 55 | - name: Build and publish 56 | env: 57 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 58 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 59 | if: contains(steps.changed-files.outputs.modified, '_version.py') 60 | run: | 61 | python setup.py sdist bdist_wheel 62 | twine upload dist/* 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | docs/source/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # mac stuff 106 | .DS_Store 107 | 108 | # IntelliJ 109 | *.iml 110 | .idea 111 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project uses [Semantic Versioning](http://semver.org/). 
5 | 6 | # [3.2.1] - 2021-04-02 7 | ### Added 8 | - PyPI version badge to `README.md` 9 | 10 | # [3.2.0] - 2021-01-21 11 | ### Changed 12 | - Library mapping is created using the workspace 2.0 API 13 | - Exception classes and Filename classes moved to separate files 14 | 15 | # [3.1.0] - 2020-12-16 16 | ### Added 17 | - command to create a cluster matching a Databricks job cluster 18 | 19 | # [3.0.1] - 2020-07-24 20 | ### Fixed 21 | - fixed readthedocs link + build 22 | - re-compiled dependencies to fix security issue in a pinned dependency 23 | - removed requirements-read-the-docs.txt (no longer needed now that stork is installed in requirements-dev.txt) 24 | 25 | # [3.0.0] - 2020-07-23 26 | ### Changed 27 | - renamed repo to stork 28 | 29 | ========= prior development under the name apparate ========= 30 | 31 | # [2.3.0] - 2020-07-22 32 | ### Added 33 | - repo rename warning 34 | - github actions for CI/CD steps 35 | 36 | # [2.2.3] - 2020-06-15 37 | ### Changed 38 | - add repo name note 39 | - define requirements with pip-tools 40 | 41 | # [2.2.2] - 2019-02-14 42 | ### Fixed 43 | - allow user to specify path to config file when running pytest 44 | 45 | # [2.2.1] - 2019-02-14 46 | ### Fixed 47 | - added license file to setup.py so apparate can be installed from tarball 48 | - added note to docs that apparate only works on AWS 49 | 50 | # [2.2.0] - 2018-11-15 51 | ### Added 52 | - Support for loading jars 53 | - DEBUG logging 54 | ### Changed 55 | - Moved print statements to INFO logging 56 | - Updated dependency versions 57 | 58 | # [2.1.0] - 2018-10-11 59 | ### Added 60 | - Now with deployment pipeline! 61 | - Fixes markdown rendering on PyPi 62 | 63 | # [2.0.1] - 2018-10-10 64 | ### Fixed 65 | - Req file and link changes for hosting documentation on readthedocs.io 66 | 67 | # [2.0.0] - 2018-10-10 68 | ### Added 69 | - Initial open-source release of apparate! 70 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
8 | 9 | ## Our Standards 10 | 11 | Examples of behavior that contributes to a positive environment for our community include: 12 | 13 | * Demonstrating empathy and kindness toward other people 14 | * Being respectful of differing opinions, viewpoints, and experiences 15 | * Giving and gracefully accepting constructive feedback 16 | * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience 17 | * Focusing on what is best not just for us as individuals, but for the overall community 18 | 19 | Examples of unacceptable behavior include: 20 | 21 | * The use of sexualized language or imagery, and sexual attention or 22 | advances of any kind 23 | * Trolling, insulting or derogatory comments, and personal or political attacks 24 | * Public or private harassment 25 | * Publishing others' private information, such as a physical or email 26 | address, without their explicit permission 27 | * Other conduct which could reasonably be considered inappropriate in a 28 | professional setting 29 | 30 | ## Enforcement Responsibilities 31 | 32 | Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 33 | 34 | Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 35 | 36 | ## Scope 37 | 38 | This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 39 | 40 | ## Enforcement 41 | 42 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement by submitting this [anonymous form](https://forms.gle/11DcyKpYkVjmRDKV9) or by sending an email to [opensource@shoprunner.com](mailto:opensource@shoprunner.com). All complaints will be reviewed and investigated promptly and fairly. 43 | 44 | All community leaders are obligated to respect the privacy and security of the reporter of any incident. 45 | 46 | ## Enforcement Guidelines 47 | 48 | Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: 49 | 50 | ### 1. Correction 51 | 52 | **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. 53 | 54 | **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. 55 | 56 | ### 2. Warning 57 | 58 | **Community Impact**: A violation through a single incident or series of actions. 59 | 60 | **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. 
This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. 61 | 62 | ### 3. Temporary Ban 63 | 64 | **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. 65 | 66 | **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. 67 | 68 | ### 4. Permanent Ban 69 | 70 | **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 71 | 72 | **Consequence**: A permanent ban from any sort of public interaction within the project community. 73 | 74 | ## Attribution 75 | 76 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, 77 | available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 78 | 79 | Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). 80 | 81 | [homepage]: https://www.contributor-covenant.org 82 | 83 | For answers to common questions about this code of conduct, see the FAQ at 84 | https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 85 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## How to Contribute 4 | 5 | We welcome contributions in the form of issues or pull requests! 6 | 7 | We want this to be a place where all are welcome to discuss and contribute, so please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms. Find the code of conduct in the ``CODE-OF-CONDUCT.md`` file on GitHub. 8 | 9 | If you have a problem using stork or see a possible improvement, open an issue in the GitHub issue tracker. Please be as specific as you can. 10 | 11 | If you see an open issue you'd like to be fixed, take a stab at it and open a PR! 12 | 13 | ### Steps for making a pull request: 14 | 15 | 1. Fork the project from GitHub 16 | 17 | 2. Clone the forked repo to your local disk and ``cd`` into it:: 18 | 19 | git clone https://github.com/<your_username>/stork.git 20 | cd stork 21 | 22 | 3. Create a new branch:: 23 | 24 | git checkout -b my_awesome_new_feature 25 | 26 | 4. Install requirements (virtualenvs always recommended!):: 27 | 28 | pip install -r requirements-dev.txt 29 | 30 | 5. Write some awesome useful code 31 | 32 | 6. Update unittests, docs, and CHANGELOG - to view docs locally:: 33 | 34 | cd docs/ 35 | make docs 36 | open _build/html/index.html 37 | 38 | 7. Double-check that unittests pass and the linter doesn't complain:: 39 | 40 | pytest 41 | flake8 stork tests 42 | 43 | 8. Submit a PR! Once you open a PR, GitHub Actions will run tests and linting. Once those pass, someone will review your code and merge it into the main codebase.
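Several of the tests read settings from a ``.storkcfg`` file (see the note below). If you'd rather not run ``stork configure``, you can create a minimal file by hand - this sketch mirrors the one the CI workflows generate, with a placeholder host and folder you should replace with your own values (add a ``token`` line if you also want to run the tests that call the Databricks API)::

    [DEFAULT]
    host = https://fake-org.cloud.databricks.com
    prod_folder = /databricks/folder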
44 | 45 | 46 | Note: several of the tests rely on the ``.storkcfg`` file, so make sure to run ``stork configure`` before running tests. If you want to run tests using a different token than is in your ``.storkcfg`` file, you can also pass in the values directly, as shown in the second example. Values passed as options will override those in the config. 47 | 48 | To run unittests using defaults in ``.storkcfg``:: 49 | 50 | pytest 51 | 52 | To run unittests using defaults in a ``.storkcfg`` file somewhere other than the root directory:: 53 | 54 | pytest --cfg=/Users/my_user/other_folder/.storkcfg 55 | 56 | To run unittests with a different token:: 57 | 58 | pytest --token abc123 59 | 60 | Warning: tests in ``test_token_permissions`` make actual API calls. They only make read calls, but do require an internet connection. To run only tests that are isolated, use:: 61 | 62 | pytest --deselect tests/test_token_permissions.py 63 | 64 | This package follows PEP8 standards, uses numpy-type docstrings, and should be tested in python3. 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021 ShopRunner, Inc. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stork 2 | Command line helpers for Databricks! 
3 | 4 | [![PyPI version](https://badge.fury.io/py/stork.svg)](https://badge.fury.io/py/stork) 5 | [![Python package](https://github.com/ShopRunner/stork/workflows/Python%20package/badge.svg)](https://github.com/ShopRunner/stork/actions/workflows/prod.yaml) 6 | [![Documentation Status](https://readthedocs.org/projects/stork-library/badge/?version=latest)](https://stork-library.readthedocs.io/en/latest/?badge=latest) 7 | 8 | ## Maintenance Note 9 | ⚠️ [2021/07/08] After recent updates to the Databricks platform it is now possible to install jars and wheels from internal repositories (such as an Artifactory instance). We recommend this approach moving forward, since it allows more standard version management, and wheels have several advantages over eggs for Python libraries. Stork currently still works, but the library management relies on a deprecated API and thus may break at some point in the future; we will likely not attempt to fix it at that point. 10 | 11 | 12 | ## Why we built this 13 | 14 | When our team started setting up CI/CD for the various packages we maintain, we encountered some difficulties integrating Jenkins with Databricks. 15 | 16 | We write a lot of Python + PySpark packages in our data science work, and we often deploy these as batch jobs run on a schedule using Databricks. However, each time we merged in a new change to one of these libraries we would have to manually create an egg, upload it using the Databricks GUI, go find all the jobs that used the library, and update each one to point to the new version. As our team and set of libraries and jobs grew, this became unsustainable (not to mention a big break from the CI/CD philosophy...). 17 | 18 | As we set out to automate this using Databricks' library API, we realized that this task required using two versions of the API and many dependent API calls. Instead of trying to recreate that logic in each Jenkinsfile, we wrote stork. Now you can enjoy the magic as well! 19 | 20 | Stork now works for both `.egg` and `.jar` files to support Python + PySpark and Scala + Spark libraries. 21 | To take advantage of stork's ability to update jobs, make sure you're following one of these naming conventions: 22 | ``` 23 | new_library-1.0.0-py3.6.egg 24 | new_library-1.0.0-SNAPSHOT-py3.6.egg 25 | new_library-1.0.0-SNAPSHOT-my-branch-py3.6.egg 26 | new_library-1.0.0.egg 27 | new_library-1.0.0-SNAPSHOT.egg 28 | new_library-1.0.0-SNAPSHOT-my-branch.egg 29 | new_library-1.0.0.jar 30 | new_library-1.0.0-SNAPSHOT.jar 31 | new_library-1.0.0-SNAPSHOT-my-branch.jar 32 | ``` 33 | Where the first number in the version (in this case `1`) is the major version, signaling breaking changes (an illustrative sketch of how these names break down appears at the end of the next section). 34 | 35 | ## What it does 36 | 37 | Stork is a set of command line helpers for Databricks. Some commands are for managing libraries in Databricks in an automated fashion. This allows you to move away from the point-and-click interface for your development work and for deploying production-level libraries for use in scheduled Databricks jobs. Another command allows you to create an interactive cluster that replicates the settings used on a job cluster. 38 | 39 | For a more detailed API and tutorials, check out the [docs](https://stork-library.readthedocs.io/en/latest/index.html).
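As a rough illustration of the naming convention above, here is a sketch of how a file name breaks down into the pieces stork cares about - the library name and the major version used to match jobs. This is illustrative only, not the parser stork uses internally (stork's own file name handling lives in `stork/file_name.py`):

```
import re

# Illustrative only: a loose approximation of the naming convention above,
# not stork's internal parser.
PATTERN = re.compile(
    r'^(?P<name>[\w.]+)-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(?P<suffix>-.+)?\.(?P<ext>egg|jar)$'
)

def parse(filename):
    match = PATTERN.match(filename)
    if match is None:
        raise ValueError('{} does not follow the naming convention'.format(filename))
    return match.groupdict()

print(parse('new_library-1.0.0-py3.6.egg')['major'])     # '1'
print(parse('new_library-1.1.0-SNAPSHOT.jar')['major'])  # also '1' - same major version
```

Since jobs are only updated when the major version matches, releasing `2.0.0` of a library will never silently replace a `1.x` version on an existing job.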
40 | 41 | ## Installation 42 | 43 | Note: stork requires Python 3, and currently only works on Databricks accounts that run on AWS (not Azure). 44 | 45 | Stork is hosted on PyPI, so to get the latest version simply install via pip: 46 | ``` 47 | pip install stork 48 | ``` 49 | 50 | You can also install from source by cloning the git repository https://github.com/ShopRunner/stork.git and installing via easy_install: 51 | ``` 52 | git clone https://github.com/ShopRunner/stork.git 53 | cd stork 54 | easy_install . 55 | ``` 56 | 57 | ## Setup 58 | 59 | ### Configuration 60 | 61 | Stork uses a `.storkcfg` file to store information about your Databricks account and setup. To create this file, run: 62 | ``` 63 | stork configure 64 | ``` 65 | 66 | You will be asked for your Databricks host name (the URL you use to access the account - something like `https://my-organization.cloud.databricks.com`), an access token, and your production folder. This should be a folder your team creates to keep production-ready libraries. By isolating production-ready libraries in their own folder, you ensure that stork will never update a job to use a library still in development/testing. 67 | 68 | ### Databricks API token 69 | 70 | API tokens can be generated in Databricks under Account Settings -> Access Tokens. To upload an egg to any folder in Databricks, you can use any token. To update jobs, you will need a token with admin permissions, which can be created in the same manner by an admin on the account. 71 | 72 | ## Usage notes 73 | 74 | While libraries can be uploaded to folders other than your specified production folder, no libraries outside of this folder will ever be deleted and no jobs using libraries outside of this folder will be updated. 75 | 76 | If you try to upload a library to Databricks that already exists there with the same version, a warning will be printed instructing the user to update the version if a change has been made. Without a version change the new library will not be uploaded. 77 | 78 | ## Contributing 79 | See a way for stork to improve? We welcome contributions in the form of issues or pull requests! 80 | 81 | Please check out the [contributing](https://stork-library.readthedocs.io/en/latest/contrib.html) page for more information. 82 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | # 4 | # To build all docs, make sure you have stork installed and run `make docs` 5 | 6 | # You can set these variables from the command line. 7 | SPHINXOPTS = 8 | SPHINXBUILD = sphinx-build 9 | PAPER = 10 | BUILDDIR = _build 11 | 12 | # Internal variables. 13 | PAPEROPT_a4 = -D latex_paper_size=a4 14 | PAPEROPT_letter = -D latex_paper_size=letter 15 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " changes to make an overview of all changed/added/deprecated items" 35 | @echo " linkcheck to check all external links for integrity" 36 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 37 | 38 | clean: 39 | -rm -rf $(BUILDDIR)/* source/ 40 | 41 | html: 42 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 43 | @echo 44 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 45 | 46 | dirhtml: 47 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 48 | @echo 49 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 50 | 51 | singlehtml: 52 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 53 | @echo 54 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 55 | 56 | pickle: 57 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 58 | @echo 59 | @echo "Build finished; now you can process the pickle files." 60 | 61 | json: 62 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 63 | @echo 64 | @echo "Build finished; now you can process the JSON files." 65 | 66 | htmlhelp: 67 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 68 | @echo 69 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 70 | ".hhp project file in $(BUILDDIR)/htmlhelp." 71 | 72 | qthelp: 73 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 74 | @echo 75 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 76 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 77 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Mapnik.qhcp" 78 | @echo "To view the help file:" 79 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Mapnik.qhc" 80 | 81 | devhelp: 82 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 83 | @echo 84 | @echo "Build finished." 85 | @echo "To view the help file:" 86 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Mapnik" 87 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Mapnik" 88 | @echo "# devhelp" 89 | 90 | epub: 91 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 92 | @echo 93 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 94 | 95 | latex: 96 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 97 | @echo 98 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 99 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 100 | "(use \`make latexpdf' here to do that automatically)." 
101 | 102 | latexpdf: 103 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 104 | @echo "Running LaTeX files through pdflatex..." 105 | make -C $(BUILDDIR)/latex all-pdf 106 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 107 | 108 | text: 109 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 110 | @echo 111 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 112 | 113 | man: 114 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 115 | @echo 116 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 117 | 118 | changes: 119 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 120 | @echo 121 | @echo "The overview file is in $(BUILDDIR)/changes." 122 | 123 | linkcheck: 124 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 125 | @echo 126 | @echo "Link check complete; look for any errors" 127 | 128 | docs: html 129 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../../stork')) 18 | 19 | HERE = os.path.dirname(__file__) 20 | TOP_BINDIR = os.path.normpath(os.path.join(HERE, "..", "bin")) 21 | os.environ["PATH"] = os.pathsep.join([ 22 | os.path.abspath(TOP_BINDIR), os.environ.get("PATH", "") ]) 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = 'stork' 27 | copyright = '2018, ShopRunner' 28 | author = 'Hanna Torrence' 29 | 30 | 31 | with open("../stork/_version.py") as version_file: 32 | exec(version_file.read()) 33 | 34 | # The short X.Y version 35 | version = '.'.join(__version__.split('.')[:-1]) 36 | # The full version, including alpha/beta/rc tags 37 | release = __version__ 38 | 39 | 40 | # -- General configuration --------------------------------------------------- 41 | 42 | # If your documentation needs a minimal Sphinx version, state it here. 43 | # 44 | # needs_sphinx = '1.0' 45 | 46 | # Add any Sphinx extension module names here, as strings. They can be 47 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 48 | # ones. 49 | extensions = [ 50 | 'm2r2', 51 | 'sphinx.ext.autodoc', 52 | 'sphinx.ext.todo', 53 | 'sphinx.ext.coverage', 54 | 'sphinxcontrib.programoutput', 55 | 'numpydoc', # numpydoc extension has to be last 56 | ] 57 | 58 | # Add any paths that contain templates here, relative to this directory. 59 | templates_path = ['_templates'] 60 | 61 | # The suffix(es) of source filenames. 62 | # You can specify multiple suffix as a list of string: 63 | # 64 | source_suffix = ['.rst', '.md'] 65 | 66 | # The master toctree document. 67 | master_doc = 'index' 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 
71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This pattern also affects html_static_path and html_extra_path . 79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'sphinx' 83 | 84 | 85 | # -- Options for HTML output ------------------------------------------------- 86 | 87 | # The theme to use for HTML and HTML Help pages. See the documentation for 88 | # a list of builtin themes. 89 | # 90 | html_theme = 'alabaster' 91 | 92 | # Theme options are theme-specific and customize the look and feel of a theme 93 | # further. For a list of options available for each theme, see the 94 | # documentation. 95 | # 96 | # html_theme_options = {} 97 | 98 | # Add any paths that contain custom static files (such as style sheets) here, 99 | # relative to this directory. They are copied after the builtin static files, 100 | # so a file named "default.css" will overwrite the builtin "default.css". 101 | html_static_path = [] 102 | 103 | # Custom sidebar templates, must be a dictionary that maps document names 104 | # to template names. 105 | # 106 | # The default sidebars (for documents that don't match any pattern) are 107 | # defined by theme itself. Builtin themes are using these templates by 108 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 109 | # 'searchbox.html']``. 110 | # 111 | # html_sidebars = {} 112 | 113 | 114 | # -- Options for HTMLHelp output --------------------------------------------- 115 | 116 | # Output file base name for HTML help builder. 117 | htmlhelp_basename = 'storkdoc' 118 | 119 | 120 | # -- Options for LaTeX output ------------------------------------------------ 121 | 122 | latex_elements = { 123 | # The paper size ('letterpaper' or 'a4paper'). 124 | # 125 | # 'papersize': 'letterpaper', 126 | 127 | # The font size ('10pt', '11pt' or '12pt'). 128 | # 129 | # 'pointsize': '10pt', 130 | 131 | # Additional stuff for the LaTeX preamble. 132 | # 133 | # 'preamble': '', 134 | 135 | # Latex figure (float) alignment 136 | # 137 | # 'figure_align': 'htbp', 138 | } 139 | 140 | # Grouping the document tree into LaTeX files. List of tuples 141 | # (source start file, target name, title, 142 | # author, documentclass [howto, manual, or own class]). 143 | latex_documents = [ 144 | (master_doc, 'stork.tex', 'stork Documentation', 145 | 'Hanna Torrence', 'manual'), 146 | ] 147 | 148 | 149 | # -- Options for manual page output ------------------------------------------ 150 | 151 | # One entry per manual page. List of tuples 152 | # (source start file, name, description, authors, manual section). 153 | man_pages = [ 154 | (master_doc, 'stork', 'stork Documentation', 155 | [author], 1) 156 | ] 157 | 158 | 159 | # -- Options for Texinfo output ---------------------------------------------- 160 | 161 | # Grouping the document tree into Texinfo files. 
List of tuples 162 | # (source start file, target name, title, author, 163 | # dir menu entry, description, category) 164 | texinfo_documents = [ 165 | (master_doc, 'stork', 'stork Documentation', 166 | author, 'stork', 'One line description of project.', 167 | 'Miscellaneous'), 168 | ] 169 | 170 | 171 | # -- Extension configuration ------------------------------------------------- 172 | 173 | # -- Options for todo extension ---------------------------------------------- 174 | 175 | # If true, `todo` and `todoList` produce output, else they produce nothing. 176 | todo_include_todos = True 177 | -------------------------------------------------------------------------------- /docs/contrib.rst: -------------------------------------------------------------------------------- 1 | .. _contrib: 2 | 3 | .. mdinclude:: ../CONTRIBUTING.md 4 | 5 | .. mdinclude:: ../CODE-OF-CONDUCT.md 6 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | .. _getting_started: 2 | 3 | Getting Started 4 | =============== 5 | 6 | .. _why: 7 | 8 | Why did we build this? 9 | ---------------------- 10 | 11 | When our team started setting up CI/CD for the various packages we maintain, we encountered some difficulties integrating Jenkins with Databricks. 12 | 13 | We write a lot of Python + PySpark packages in our data science work, and we often deploy these as batch jobs run on a schedule using Databricks. However, each time we merged in a new change to one of these libraries we would have to manually create an egg, upload it using the Databricks GUI, go find all the jobs that used the library, and update each one to point to the new version. As our team and set of libraries and jobs grew, this became unsustainable (not to mention a big break from the CI/CD philosophy...). 14 | 15 | As we set out to automate this using Databricks' library API, we realized that this task required using two versions of the API and many dependent API calls. Instead of trying to recreate that logic in each Jenkinsfile, we wrote stork. Now you can enjoy the magic as well! 16 | 17 | Note: Stork only works on Databricks accounts that run on AWS, not those that run on Azure. The V1 library API is required, and it only exists on AWS accounts. 18 | 19 | To get started, check out :ref:`install` or :ref:`start`. 20 | 21 | To learn more about how to use stork, check out :ref:`tutorial` or :ref:`usage_details`. 22 | 23 | To help improve stork, check out :ref:`contrib`. 24 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. stork documentation master file, created by 2 | sphinx-quickstart on Fri Jun 29 16:33:02 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to stork's documentation! 7 | ==================================== 8 | 9 | Command line helpers for Databricks! 10 | 11 | What is stork? 12 | ----------------- 13 | 14 | Stork is a set of command line helpers for Databricks. Some commands are for managing libraries in Databricks in an automated fashion. This allows you to move away from the point-and-click interface for your development work and for deploying production-level libraries for use in scheduled Databricks jobs.
Another command allows you to create an interactive cluster that replicates the settings used on a job cluster. 15 | 16 | .. _install: 17 | 18 | Installation 19 | ------------ 20 | 21 | Stork is hosted on PyPI, so to get the latest version simply install via ``pip``:: 22 | 23 | pip install stork 24 | 25 | You can also install from source by cloning the git repository ``https://github.com/ShopRunner/stork.git`` and installing via ``easy_install``:: 26 | 27 | git clone https://github.com/ShopRunner/stork.git 28 | cd stork 29 | easy_install . 30 | 31 | .. _start: 32 | 33 | Quickstart 34 | ---------- 35 | 36 | To get started, first run ``stork configure`` and answer the questions. 37 | 38 | Then you are ready to upload libraries to Databricks, using the ``stork upload`` and ``stork upload-and-update`` commands. 39 | 40 | Please see :ref:`getting_started` for an introduction to the package, and :ref:`usage_details` for specifics on available options. 41 | 42 | Table of Contents 43 | ----------------- 44 | .. toctree:: 45 | :maxdepth: 2 46 | :caption: Contents: 47 | 48 | getting_started.rst 49 | stork.rst 50 | tutorial.rst 51 | contrib.rst 52 | 53 | Indices and tables 54 | ------------------ 55 | 56 | * :ref:`genindex` 57 | * :ref:`search` 58 | -------------------------------------------------------------------------------- /docs/stork.rst: -------------------------------------------------------------------------------- 1 | .. _usage_details: 2 | 3 | Stork 4 | ======== 5 | 6 | The stork CLI is your point of contact for managing continuous delivery of 7 | Python packages for use in Databricks. 8 | 9 | Configure 10 | --------- 11 | 12 | To get started, configure your Databricks account information. You'll need your Databricks account connection info, and you will also be asked to name a production folder. To learn more about how these values will be used and where to find this information, check out the :ref:`getting_started` page. 13 | 14 | When you're ready to go, run ``stork configure``. 15 | 16 | .. command-output:: stork configure --help 17 | 18 | Now you're all set to start using stork! The two main commands available in stork are ``upload`` and ``upload-and-update``. 19 | 20 | Upload 21 | ------ 22 | 23 | ``upload`` can be used anytime by anyone and promises not to break anything. It simply uploads an egg or jar file, and will throw an error if a file with the same name already exists. 24 | 25 | If you've set up your ``.storkcfg`` file using the ``configure`` command, you only need to provide a path to the ``.egg`` or ``.jar`` file, but can also override the default API token and destination folder if desired. 26 | 27 | If you try to upload a library to Databricks that already exists there with the same version, a warning will be printed instructing the user to update the version if a change has been made. Without a version change the new library will not be uploaded. 28 | 29 | This command will print out a message letting you know the name of the egg or jar that was uploaded. 30 | 31 | .. command-output:: stork upload --help 32 | 33 | Upload and Update 34 | ----------------- 35 | 36 | ``upload-and-update`` requires a token with admin-level permissions. It does have the capacity to delete libraries, but if used in a CI/CD system will not cause any issues. For advice on how to set this up, check out the *Getting Started* page. 37 | 38 | Used with default settings, ``upload-and-update`` will start by uploading the ``.egg`` or ``.jar`` file.
It will then go find all jobs that use the same major version of the library and update them to point to the new version. Finally, it will clean up outdated versions in the production folder. No libraries in any other folders will ever be deleted. 39 | 40 | If you're nervous about deleting files, you can always use the ``--no-cleanup`` flag and no files will be deleted or overwritten. If you're confident in your CI/CD system, however, leaving the cleanup variable set to ``True`` will keep your production folder tidy, with only the most current version of each major release of each library. 41 | 42 | This command will print out a message letting you know (1) the name of the egg or jar that was uploaded, (2) the list of jobs currently using the same major version of this library, (3) the list of jobs updated - this should match number 2, and (4) any old versions removed - if you haven't used the ``--no-cleanup`` flag. 43 | 44 | In the same way as ``upload``, if you try to upload a library to Databricks that already exists there with the same version, a warning will be printed instructing the user to update the version if a change has been made. Without a version change the new library will not be uploaded. 45 | 46 | .. command-output:: stork upload-and-update --help 47 | 48 | For more info about usage, check out the :ref:`tutorial`. 49 | 50 | Create cluster 51 | -------------- 52 | 53 | ``create-cluster`` can be used anytime by anyone and promises not to break anything. It simply creates a new cluster and will create a second cluster if a cluster with the same name already exists. Note: this command calls APIs on a Databricks account that runs on AWS (not Azure); there is no guarantee it will work with an Azure Databricks account. 54 | 55 | If you've set up your ``.storkcfg`` file using the ``configure`` command, you only need to provide a job_id and optionally a cluster_name, but can also override the default API token if desired. 56 | 57 | This command will print out a message letting you know the name of the cluster that was created. 58 | 59 | .. command-output:: stork create-cluster --help 60 | -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Tutorials 4 | ========= 5 | 6 | For testing and development 7 | --------------------------- 8 | 9 | When developing libraries, we often found it frustrating to frequently re-upload a library that was changing daily as we worked out a new feature. With stork, this workflow is much simpler. 10 | 11 | When you are ready to test out changes to your library, start by deleting the current version. (Unfortunately moving or renaming the old version is insufficient, and it must be fully deleted AND removed from the trash folder before the cluster will recognize the new copy). Next, restart your cluster, so it wipes the old version from its imports. 12 | 13 | Create a new egg file from your Python package using:: 14 | 15 | python setup.py bdist_egg 16 | 17 | or create a new jar file from your Scala package. 18 | 19 | Upload the library to your preferred development folder using:: 20 | 21 | stork upload -p ./dist/my_library-1.0.1-py3.6.egg -f /Users/my_email@fake_organization.com/dev_folder 22 | 23 | stork upload -p ./libs/my_library-1.0.1.jar -f /Users/my_email@fake_organization.com/dev_folder 24 | 25 | Finally, attach the new library to your cluster, and you're ready to test away!
26 | 27 | For production libraries 28 | ------------------------ 29 | 30 | While useful for testing libraries, the real reason we wrote this package involved frustrations we encountered building out our continuous integration/continuous deployment infrastructure. If you are using a CI/CD setup with tools such as Jenkins or Travis, stork works in these tools to cleanly integrate your Python packages with production jobs in Databricks. As we use Jenkins here at ShopRunner to manage CI/CD, I will continue with that example, but this should work in any similar tool. 31 | 32 | First, you will need a Databricks token with admin permissions accessible in Jenkins, here represented by the environment variable ``TOKEN``. You also need to set up the ``.storkcfg`` file. While the ``stork configure`` tool makes this easy to do locally, in an automated setup it's often easier to provide the file directly, using a command like:: 33 | 34 | echo """[DEFAULT] 35 | host = https://my-organization.cloud.databricks.com 36 | token = ${TOKEN} 37 | prod_folder = /Shared/production_libraries""" > ~/.storkcfg 38 | 39 | A standard Jenkinsfile for one of our Python packages will run a linting tool, run unittests, push the egg to our artifact store, and then use stork to push the egg to Databricks. This final step works as follows:: 40 | 41 | stork upload-and-update -p `ls dist/*.egg` 42 | 43 | The ```ls dist/*.egg``` lists the egg files in the ``dist`` subfolder (which should just be the egg you want to upload). 44 | 45 | We've also found it useful to redirect the printed statements to a Slack channel, so we get notifications when jobs are updated. This makes it easy to diagnose which library version caused problems if jobs ever fail. 46 | 47 | For more details on options available with these two commands, check out :ref:`usage_details`. 48 | -------------------------------------------------------------------------------- /requirements-dev.in: -------------------------------------------------------------------------------- 1 | -e file:.[dev] 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements-dev.in 6 | # 7 | -e file:.
8 | # via -r requirements-dev.in 9 | alabaster==0.7.12 10 | # via sphinx 11 | attrs==20.3.0 12 | # via pytest 13 | babel==2.9.0 14 | # via sphinx 15 | certifi==2020.12.5 16 | # via requests 17 | chardet==4.0.0 18 | # via requests 19 | click-log==0.3.2 20 | # via stork 21 | click==7.1.2 22 | # via 23 | # click-log 24 | # stork 25 | configparser==5.0.2 26 | # via stork 27 | coverage==5.5 28 | # via pytest-cov 29 | docutils==0.16 30 | # via 31 | # m2r2 32 | # sphinx 33 | flake8==3.9.1 34 | # via stork 35 | idna==2.10 36 | # via requests 37 | imagesize==1.2.0 38 | # via sphinx 39 | iniconfig==1.1.1 40 | # via pytest 41 | jinja2==2.11.3 42 | # via 43 | # numpydoc 44 | # sphinx 45 | m2r2==0.2.7 46 | # via stork 47 | markupsafe==1.1.1 48 | # via jinja2 49 | mccabe==0.6.1 50 | # via flake8 51 | mistune==0.8.4 52 | # via m2r2 53 | numpydoc==1.1.0 54 | # via stork 55 | packaging==20.9 56 | # via 57 | # pytest 58 | # sphinx 59 | pluggy==0.13.1 60 | # via pytest 61 | py==1.10.0 62 | # via pytest 63 | pycodestyle==2.7.0 64 | # via flake8 65 | pyflakes==2.3.1 66 | # via flake8 67 | pygments==2.8.1 68 | # via sphinx 69 | pyparsing==2.4.7 70 | # via packaging 71 | pytest-cov==2.11.1 72 | # via stork 73 | pytest==6.2.3 74 | # via 75 | # pytest-cov 76 | # stork 77 | pytz==2021.1 78 | # via babel 79 | requests==2.25.1 80 | # via 81 | # responses 82 | # sphinx 83 | # stork 84 | responses==0.13.2 85 | # via stork 86 | simplejson==3.17.2 87 | # via stork 88 | six==1.15.0 89 | # via responses 90 | snowballstemmer==2.1.0 91 | # via sphinx 92 | sphinx==3.5.4 93 | # via 94 | # numpydoc 95 | # sphinxcontrib-programoutput 96 | # stork 97 | sphinxcontrib-applehelp==1.0.2 98 | # via sphinx 99 | sphinxcontrib-devhelp==1.0.2 100 | # via sphinx 101 | sphinxcontrib-htmlhelp==1.0.3 102 | # via sphinx 103 | sphinxcontrib-jsmath==1.0.1 104 | # via sphinx 105 | sphinxcontrib-programoutput==0.17 106 | # via stork 107 | sphinxcontrib-qthelp==1.0.3 108 | # via sphinx 109 | sphinxcontrib-serializinghtml==1.1.4 110 | # via sphinx 111 | toml==0.10.2 112 | # via pytest 113 | urllib3==1.26.4 114 | # via 115 | # requests 116 | # responses 117 | 118 | # The following packages are considered to be unsafe in a requirements file: 119 | # setuptools 120 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2020.12.5 8 | # via requests 9 | chardet==4.0.0 10 | # via requests 11 | click-log==0.3.2 12 | # via stork (setup.py) 13 | click==7.1.2 14 | # via 15 | # click-log 16 | # stork (setup.py) 17 | configparser==5.0.2 18 | # via stork (setup.py) 19 | idna==2.10 20 | # via requests 21 | requests==2.25.1 22 | # via stork (setup.py) 23 | simplejson==3.17.2 24 | # via stork (setup.py) 25 | urllib3==1.26.4 26 | # via requests 27 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [easy_install] 2 | 3 | [flake8] 4 | exclude = 5 | stork/__init__.py, 6 | conftest.py, 7 | 8 | [tool:pytest] 9 | log_level = info 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("stork/_version.py") as version_file: 4 | 
exec(version_file.read()) 5 | 6 | with open('README.md') as r: 7 | readme = r.read() 8 | 9 | with open('LICENSE') as l: 10 | license = l.read() 11 | 12 | setup( 13 | name='stork', 14 | version=__version__, 15 | description='Update libraries on Databricks', 16 | long_description=readme+'\n\n\nLicense\n-------\n'+license, 17 | long_description_content_type='text/markdown', 18 | author='Hanna Torrence', 19 | author_email='data-science@shoprunner.com', 20 | url='https://github.com/shoprunner/stork', 21 | license='BSD-3-Clause', 22 | packages=['stork'], 23 | data_files=[('', ['LICENSE'])], 24 | install_requires=[ 25 | 'click', 26 | 'click_log', 27 | 'configparser', 28 | 'requests', 29 | 'simplejson' 30 | ], 31 | extras_require={ 32 | 'dev': [ 33 | 'flake8', 34 | 'numpydoc', 35 | 'm2r2', 36 | 'pytest', 37 | 'pytest-cov', 38 | 'responses', 39 | 'sphinx', 40 | 'sphinxcontrib-programoutput', 41 | ] 42 | }, 43 | entry_points={'console_scripts': ['stork = stork.cli:cli']} 44 | ) 45 | -------------------------------------------------------------------------------- /stork/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | 3 | from .configure import configure 4 | from .update_databricks_library import update_databricks 5 | -------------------------------------------------------------------------------- /stork/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '3.2.1' 2 | -------------------------------------------------------------------------------- /stork/api_error.py: -------------------------------------------------------------------------------- 1 | from simplejson.errors import JSONDecodeError 2 | 3 | 4 | class APIError(Exception): 5 | """ 6 | exception to handle unifying two generations of API error responses 7 | from Databricks 8 | """ 9 | def __init__(self, response): 10 | Exception.__init__(self, response) 11 | try: 12 | res_body = response.json() 13 | except JSONDecodeError: 14 | self.code = 'http {}'.format(response.status_code) 15 | # non-json error message, didn't bother parsing neatly 16 | self.message = response.text 17 | else: 18 | if 'error_code' in res_body.keys(): 19 | self.code = res_body['error_code'] 20 | self.message = res_body['message'] 21 | else: 22 | self.code = 'http {}'.format(response.status_code) 23 | self.message = res_body['error'] 24 | 25 | def __str__(self): 26 | return '{}: {}'.format(self.code, self.message) 27 | -------------------------------------------------------------------------------- /stork/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from . 
import __version__ 4 | from .cli_commands import create_cluster, upload, upload_and_update 5 | from .configure import configure 6 | 7 | 8 | def print_version(ctx, param, value): 9 | if not value or ctx.resilient_parsing: 10 | return 11 | click.echo('Version {}'.format(__version__)) 12 | ctx.exit() 13 | 14 | 15 | @click.group() 16 | @click.option('--version', '-v', is_flag=True, callback=print_version, 17 | help=__version__) 18 | def cli(version): 19 | pass 20 | 21 | 22 | cli.add_command(configure) 23 | cli.add_command(create_cluster) 24 | cli.add_command(upload) 25 | cli.add_command(upload_and_update) 26 | -------------------------------------------------------------------------------- /stork/cli_commands.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | import click_log 5 | from configparser import NoOptionError 6 | 7 | from .configure import _load_config, CFG_FILE, PROFILE 8 | from .create_job_cluster import create_job_library 9 | from .update_databricks_library import update_databricks 10 | 11 | logger = logging.getLogger(__name__) 12 | click_log.basic_config(logger) 13 | 14 | 15 | def _resolve_input(variable, variable_name, config_key, config): 16 | """ 17 | Resolve input entered as option values with config values 18 | 19 | If option values are provided (passed in as `variable`), then they are 20 | returned unchanged. If `variable` is None, then we first look for a config 21 | value to use. 22 | If no config value is found, then raise an error. 23 | 24 | Parameters 25 | ---------- 26 | variable: string or numeric 27 | value passed in as input by the user 28 | variable_name: string 29 | name of the variable, for clarity in the error message 30 | config_key: string 31 | key in the config whose value could be used to fill in the variable 32 | config: ConfigParser 33 | contains keys/values in .storkcfg 34 | """ 35 | if variable is None: 36 | try: 37 | variable = config.get(PROFILE, config_key) 38 | except NoOptionError: 39 | raise ValueError(( 40 | 'no {} found - either provide a command line argument or ' 41 | 'set up a default by running `stork configure`' 42 | ).format(variable_name)) 43 | return variable 44 | 45 | 46 | @click.command(short_help='upload an egg or jar') 47 | @click.option( 48 | '-p', 49 | '--path', 50 | help=('path to egg or jar file with name as output from setuptools ' 51 | '(e.g. dist/new_library-1.0.0-py3.6.egg ' 52 | 'or libs/new_library-1.0.0.jar)'), 53 | required=True 54 | ) 55 | @click.option( 56 | '-t', 57 | '--token', 58 | help=('Databricks API key - ' 59 | 'optional, read from `.storkcfg` if not provided'), 60 | ) 61 | @click.option( 62 | '-f', 63 | '--folder', 64 | type=str, 65 | help=('Databricks folder to upload to ' 66 | '(e.g. `/Users/my_email@fake_organization.com`) ' 67 | '- optional, read from `.storkcfg` if not provided'), 68 | ) 69 | @click_log.simple_verbosity_option(logger) 70 | def upload(path, token, folder): 71 | """ 72 | The egg that the provided path points to will be uploaded to Databricks. 
73 | """ 74 | config = _load_config(CFG_FILE) 75 | token = _resolve_input(token, 'token', 'token', config) 76 | folder = _resolve_input(folder, 'folder', 'prod_folder', config) 77 | 78 | update_databricks( 79 | logger, 80 | path, 81 | token, 82 | folder, 83 | update_jobs=False, 84 | cleanup=False 85 | ) 86 | 87 | 88 | @click.command(short_help='upload an egg and update jobs') 89 | @click.option( 90 | '-p', 91 | '--path', 92 | help=('path to egg file with name as output from setuptools ' 93 | '(e.g. dist/new_library-1.0.0-py3.6.egg)'), 94 | required=True, 95 | ) 96 | @click.option( 97 | '-t', 98 | '--token', 99 | help=('Databricks API key with admin permissions on all jobs using library' 100 | ' - optional, read from `.storkcfg` if not provided'), 101 | ) 102 | @click.option( 103 | '--cleanup/--no-cleanup', 104 | help=('if cleanup, remove outdated files from production folder; ' 105 | 'if no-cleanup, remove nothing'), 106 | default=True, 107 | show_default=True, 108 | ) 109 | @click_log.simple_verbosity_option(logger) 110 | def upload_and_update(path, token, cleanup): 111 | """ 112 | The egg that the provided path points to will be uploaded to Databricks. 113 | All jobs which use the same major version of the library will be updated 114 | to use the new version, and all versions of this library in the production 115 | folder with the same major version and a lower minor version will 116 | be deleted. 117 | 118 | Unlike `upload`, `upload_and_update` does not ask for a folder because it 119 | relies on the production folder specified in the config. This is to 120 | protect against accidentally updating jobs to versions of a library still 121 | in testing/development. 122 | 123 | All egg names already in Databricks must be properly formatted 124 | with versions of the form -0.0.0. 
125 | """ 126 | config = _load_config(CFG_FILE) 127 | token = _resolve_input(token, 'token', 'token', config) 128 | folder = _resolve_input(None, 'folder', 'prod_folder', config) 129 | 130 | update_databricks( 131 | logger, 132 | path, 133 | token, 134 | folder, 135 | update_jobs=True, 136 | cleanup=cleanup 137 | ) 138 | 139 | 140 | @click.command(short_help='create a cluster based on a job_id') 141 | @click.option( 142 | '-j', 143 | '--job_id', 144 | help='job id of job you want to debug', 145 | required=True 146 | ) 147 | @click.option( 148 | '-c', 149 | '--cluster_name', 150 | default=None, 151 | help=('Cluster Name- ' 152 | 'optional, use default value if not provided'), 153 | ) 154 | @click.option( 155 | '-t', 156 | '--token', 157 | help=('Databricks API key - ' 158 | 'optional, read from `.storkcfg` if not provided'), 159 | ) 160 | @click_log.simple_verbosity_option(logger) 161 | def create_cluster(job_id, cluster_name, token): 162 | """ 163 | Create a cluster based on a job id 164 | """ 165 | config = _load_config(CFG_FILE) 166 | token = _resolve_input(token, 'token', 'token', config) 167 | 168 | create_job_library( 169 | logger, 170 | job_id, 171 | cluster_name, 172 | token 173 | ) 174 | -------------------------------------------------------------------------------- /stork/configure.py: -------------------------------------------------------------------------------- 1 | import click 2 | from configparser import ConfigParser 3 | from os.path import expanduser, join 4 | 5 | 6 | CFG_FILE = join(expanduser('~'), '.storkcfg') 7 | PROFILE = 'DEFAULT' 8 | 9 | 10 | def _load_config(filename): 11 | """ 12 | Reads in existing config 13 | 14 | Returns 15 | ------- 16 | config class with values read from existing file 17 | """ 18 | config = ConfigParser() 19 | config.read(filename) 20 | return config 21 | 22 | 23 | def _update_value(config, key, instruction, is_sensitive): 24 | """ 25 | creates (if needed) and updates the value of the key in the config with a 26 | value entered by the user 27 | 28 | Parameters 29 | ---------- 30 | config: ConfigParser object 31 | existing configuration 32 | key: string 33 | key to update 34 | instruction: string 35 | text to show in the prompt 36 | is_sensitive: bool 37 | if true, require confirmation and do not show typed characters 38 | 39 | Notes 40 | ----- 41 | sets key in config passed in 42 | """ 43 | if config.has_option(PROFILE, key): 44 | current_value = config.get(PROFILE, key) 45 | else: 46 | current_value = None 47 | 48 | proposed = click.prompt( 49 | instruction, 50 | default=current_value, 51 | hide_input=is_sensitive, 52 | confirmation_prompt=is_sensitive, 53 | ) 54 | 55 | if key == 'host' or key == 'prod_folder': 56 | if proposed[-1] == '/': 57 | proposed = proposed[:-1] 58 | 59 | if key == 'prod_folder': 60 | if proposed[0] != '/': 61 | proposed = '/' + proposed 62 | 63 | if key == 'host': 64 | if 'http' != proposed[:4]: 65 | proposed = click.prompt( 66 | ("looks like there's an issue - " 67 | 'make sure the host name starts with http'), 68 | default=current_value, 69 | hide_input=is_sensitive, 70 | confirmation_prompt=is_sensitive, 71 | ) 72 | config.set(PROFILE, key, proposed) 73 | 74 | 75 | @click.command(short_help='configure Databricks connection information') 76 | def configure(): 77 | """ 78 | Configure information about Databricks account and default behavior. 79 | 80 | Configuration is stored in a `.storkcfg` file. 
A config file must exist 81 | before this package can be used, and can be supplied either directly as a 82 | text file or generated using this configuration tool. 83 | """ 84 | config = _load_config(CFG_FILE) 85 | 86 | _update_value( 87 | config, 88 | 'host', 89 | 'Databricks host (e.g. https://my-organization.cloud.databricks.com)', 90 | is_sensitive=False, 91 | ) 92 | _update_value( 93 | config, 94 | 'token', 95 | 'Databricks API token', 96 | is_sensitive=True, 97 | ) 98 | _update_value( 99 | config, 100 | 'prod_folder', 101 | 'Databricks folder for production libraries', 102 | is_sensitive=False, 103 | ) 104 | 105 | with open(CFG_FILE, 'w+') as f: 106 | config.write(f) 107 | -------------------------------------------------------------------------------- /stork/create_job_cluster.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file handles all the logic and API calls involved in creating 3 | an interactive Databricks cluster configured as a specific job cluster. 4 | """ 5 | import json 6 | import time 7 | 8 | import requests 9 | from configparser import NoOptionError 10 | 11 | from .api_error import APIError 12 | from .configure import _load_config, CFG_FILE, PROFILE 13 | 14 | 15 | def get_job_cluster_config(job_id, token, host): 16 | """ 17 | Get cluster config for a job. 18 | 19 | Parameters 20 | ---------- 21 | job_id: int 22 | id of the job you are trying to debug 23 | token: string 24 | Databricks API key 25 | host: string 26 | Databricks host (e.g. https://my-organization.cloud.databricks.com) 27 | """ 28 | res = requests.get( 29 | host + f'/api/2.0/jobs/get/?job_id={job_id}', 30 | auth=('token', token), 31 | ) 32 | 33 | if res.status_code != 200: 34 | raise APIError(res) 35 | else: 36 | cluster_config = res.json()['settings'] 37 | if 'existing_cluster_id' in cluster_config.keys(): 38 | raise Exception(f""" 39 | This job uses an interactive cluster: 40 | {cluster_config['existing_cluster_id']}. 41 | """) 42 | return cluster_config 43 | 44 | 45 | def create_new_cluster(job_id, cluster_name, cluster_config, token, host): 46 | """ 47 | Create a new cluster based on a job cluster config. 48 | 49 | Parameters 50 | ---------- 51 | job_id: int 52 | id of the job you are trying to debug 53 | cluster_name: string 54 | Name for your cluster, will be default if None 55 | cluster_config: dict 56 | dict containing the config details of the job cluster 57 | token: string 58 | Databricks API key 59 | host: string 60 | Databricks host (e.g. 
https://my-organization.cloud.databricks.com) 61 | 62 | Side Effects 63 | ------------ 64 | Creates a new cluster on Databricks 65 | """ 66 | if cluster_name is None: 67 | current_time = time.gmtime() 68 | 69 | current_time_formatted = time.strftime('%Y%m%d_%H%M%S', current_time) 70 | 71 | cluster_name = f'private-debug-job-{job_id}-{current_time_formatted}' 72 | 73 | data = { 74 | 'cluster_name': cluster_name, 75 | 'spark_version': cluster_config['new_cluster']['spark_version'], 76 | 'node_type_id': cluster_config['new_cluster']['node_type_id'], 77 | 'aws_attributes': cluster_config['new_cluster']['aws_attributes'], 78 | 'autotermination_minutes': 120 79 | } 80 | 81 | if 'autoscale' in cluster_config['new_cluster'].keys(): 82 | data['autoscale'] = cluster_config['new_cluster']['autoscale'] 83 | 84 | if 'driver_node_type_id' in cluster_config['new_cluster'].keys(): 85 | data['driver_node_type_id'] = ( 86 | cluster_config['new_cluster']['driver_node_type_id'] 87 | ) 88 | 89 | if 'num_workers' in cluster_config['new_cluster'].keys(): 90 | data['num_workers'] = cluster_config['new_cluster']['num_workers'] 91 | 92 | if 'spark_conf' in cluster_config['new_cluster'].keys(): 93 | data['spark_conf'] = cluster_config['new_cluster']['spark_conf'] 94 | 95 | res = requests.post( 96 | host + '/api/2.0/clusters/create', 97 | auth=('token', token), 98 | data=json.dumps(data) 99 | ) 100 | 101 | if res.status_code != 200: 102 | raise APIError(res) 103 | else: 104 | cluster_id = res.json()['cluster_id'] 105 | return cluster_id, cluster_name 106 | 107 | 108 | def attach_job_libraries_to_cluster(cluster_id, cluster_config, token, host): 109 | """ 110 | Attach job libraries to cluster 111 | 112 | Parameters 113 | ---------- 114 | cluster_id: int 115 | id of the cluster you want to attach libraries to 116 | cluster_config: dict 117 | dict containing the config details of the job cluster 118 | token: string 119 | Databricks API key 120 | host: string 121 | Databricks host (e.g. 
https://my-organization.cloud.databricks.com) 122 | 123 | Side Effects 124 | ------------ 125 | Attaches libraries to a cluster on Databricks 126 | """ 127 | data = { 128 | 'cluster_id': cluster_id, 129 | 'libraries': cluster_config['libraries'] 130 | } 131 | 132 | res = requests.post( 133 | host + '/api/2.0/libraries/install', 134 | auth=('token', token), 135 | data=json.dumps(data) 136 | ) 137 | 138 | if res.status_code != 200: 139 | raise APIError(res) 140 | 141 | 142 | def create_job_library(logger, job_id, cluster_name, token): 143 | """ 144 | Pull down a job cluster config, creates a new cluster with that config, 145 | and attaches job libraries to cluster 146 | 147 | Parameters 148 | ---------- 149 | logger: logging object 150 | configured in cli_commands.py 151 | job_id: int 152 | id of the job you are trying to debug 153 | cluster_name: string 154 | Name for your cluster, will be default if None 155 | token: string 156 | Databricks API key 157 | 158 | Side Effects 159 | ------------ 160 | creates new cluster in Databricks 161 | """ 162 | 163 | config = _load_config(CFG_FILE) 164 | try: 165 | host = config.get(PROFILE, 'host') 166 | except NoOptionError: 167 | raise ValueError('no host provided: please run `stork configure`' 168 | ' to get set up') 169 | 170 | try: 171 | cluster_config = get_job_cluster_config(job_id, token, host) 172 | 173 | cluster_id, cluster_name = create_new_cluster( 174 | job_id, 175 | cluster_name, 176 | cluster_config, 177 | token, 178 | host 179 | ) 180 | 181 | logger.info( 182 | f'Cluster {cluster_name} will come up in 20 seconds' 183 | ) 184 | 185 | time.sleep(20) # Wait for cluster to be up before attaching libraries 186 | 187 | attach_job_libraries_to_cluster( 188 | cluster_id, 189 | cluster_config, 190 | token, 191 | host 192 | ) 193 | 194 | logger.info( 195 | f'New cluster {cluster_name} created on Databricks' 196 | ) 197 | except APIError as err: 198 | raise err 199 | -------------------------------------------------------------------------------- /stork/file_name.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class FileNameError(Exception): 5 | """ 6 | exception to handle when filename is not of correct pattern 7 | """ 8 | def __init__(self, filename): 9 | Exception.__init__( 10 | self, 11 | 'Filename \'{}\' was not correct pattern'.format(filename) 12 | ) 13 | self.filename = filename 14 | 15 | 16 | class FileNameMatch(object): 17 | """ 18 | Matches eggs or jars for both released and snapshot versions 19 | 20 | Supported Patterns: 21 | new_library-1.0.0-py3.6.egg 22 | new_library-1.0.0-SNAPSHOT-py3.6.egg 23 | new_library-1.0.0-SNAPSHOT-my-branch-py3.6.egg 24 | 25 | new_library-1.0.0.egg 26 | new_library-1.0.0-SNAPSHOT.egg 27 | new_library-1.0.0-SNAPSHOT-my-branch.egg 28 | 29 | new_library-1.0.0.jar 30 | new_library-1.0.0-SNAPSHOT.jar 31 | new_library-1.0.0-SNAPSHOT-my-branch.jar 32 | 33 | Parameters 34 | ---------- 35 | library_name: string 36 | base name of library (e.g. 'test_library') 37 | version: string 38 | version of library (e.g. 
'1.0.0') 39 | 40 | """ 41 | file_pattern = ( 42 | r'([a-zA-Z0-9-\._]+)-((\d+)\.(\d+\.\d+)' 43 | r'(?:-SNAPSHOT(?:[a-zA-Z_\-\.]+)?)?)(?:-py.+)?\.(egg|jar)' 44 | ) 45 | 46 | def __init__(self, filename): 47 | match = re.match(FileNameMatch.file_pattern, filename) 48 | try: 49 | self.filename = filename 50 | self.library_name = match.group(1) 51 | self.version = match.group(2) 52 | self.major_version = match.group(3) 53 | self.minor_version = match.group(4) 54 | self.suffix = match.group(5) 55 | if self.suffix == 'jar': 56 | self.lib_type = 'java-jar' 57 | elif self.suffix == 'egg': 58 | self.lib_type = 'python-egg' 59 | except (IndexError, AttributeError): 60 | raise FileNameError(filename) 61 | 62 | def __eq__(self, other): 63 | if isinstance(other, self.__class__): 64 | self_attrs = {k: v for k, v in vars(self).items()} 65 | self_attrs.pop('filename') 66 | other_attrs = {k: v for k, v in vars(other).items()} 67 | other_attrs.pop('filename') 68 | return self_attrs == other_attrs 69 | else: 70 | return False 71 | 72 | def replace_version(self, other, logger): 73 | """ 74 | True if self can safely replace other 75 | 76 | based on version numbers only - snapshot and branch tags are ignored 77 | """ 78 | 79 | if other.library_name != self.library_name: 80 | logger.debug( 81 | 'not replacable: {} != {} ()' 82 | .format(other.library_name, self.library_name) 83 | ) 84 | return False 85 | elif int(other.major_version) != int(self.major_version): 86 | logger.debug( 87 | 'not replacable: {} != {} ({})' 88 | .format( 89 | int(self.major_version), 90 | int(other.major_version), 91 | other.filename, 92 | ) 93 | ) 94 | return False 95 | elif float(other.minor_version) >= float(self.minor_version): 96 | logger.debug( 97 | 'not replacable: {} >= {} ({})' 98 | .format( 99 | other.minor_version, 100 | self.minor_version, 101 | other.filename, 102 | ) 103 | ) 104 | return False 105 | else: 106 | return True 107 | -------------------------------------------------------------------------------- /stork/update_databricks_library.py: -------------------------------------------------------------------------------- 1 | """ 2 | The core of the program, update_databricks_library handles all the logic and 3 | API calls involved in creating continuous deployment for Python packages 4 | in Databricks. 5 | """ 6 | import json 7 | from os.path import basename 8 | 9 | import requests 10 | from configparser import NoOptionError 11 | 12 | from .api_error import APIError 13 | from .configure import _load_config, CFG_FILE, PROFILE 14 | from .file_name import FileNameError, FileNameMatch 15 | 16 | 17 | def load_library(filename, match, folder, token, host): 18 | """ 19 | upload an egg to the Databricks filesystem. 20 | 21 | Parameters 22 | ---------- 23 | filename: string 24 | local location of file to upload 25 | match: FilenameMatch object 26 | match object with library_type, library_name, and version 27 | folder: string 28 | Databricks folder to upload to 29 | (e.g. '/Users/htorrence@shoprunner.com/') 30 | token: string 31 | Databricks API key 32 | host: string 33 | Databricks host (e.g. 
https://my-organization.cloud.databricks.com) 34 | 35 | Side Effects 36 | ------------ 37 | uploads egg to Databricks 38 | """ 39 | with open(filename, 'rb') as file_obj: 40 | res = requests.post( 41 | host + '/api/1.2/libraries/upload', 42 | auth=('token', token), 43 | data={ 44 | 'libType': match.lib_type, 45 | 'name': '{0}-{1}'.format(match.library_name, match.version), 46 | 'folder': folder, 47 | }, 48 | files={'uri': file_obj} 49 | ) 50 | 51 | if res.status_code != 200: 52 | raise APIError(res) 53 | 54 | 55 | def get_job_list(logger, match, library_mapping, token, host): 56 | """ 57 | get a list of jobs using the major version of the given library 58 | 59 | Parameters 60 | ---------- 61 | logger: logging object 62 | configured in cli_commands.py 63 | match: FilenameMatch object 64 | match object with suffix 65 | library_mapping: dict 66 | first element of get_library_mapping output 67 | token: string 68 | Databricks API key 69 | host: string 70 | Databricks host (e.g. https://my-organization.cloud.databricks.com) 71 | 72 | Returns 73 | ------- 74 | list of dictionaries containing the job id, job name, and library path 75 | for each job 76 | """ 77 | res = requests.get( 78 | host + '/api/2.0/jobs/list', 79 | auth=('token', token), 80 | ) 81 | if res.status_code == 200: 82 | job_list = [] 83 | if len(res.json()['jobs']) == 0: 84 | return [] 85 | for job in res.json()['jobs']: 86 | logger.debug('job: {}'.format(job['settings']['name'])) 87 | if 'libraries' in job['settings'].keys(): 88 | for library in job['settings']['libraries']: 89 | if match.suffix in library.keys(): 90 | try: # if in prod_folder, mapping turns uri into name 91 | job_library_uri = basename(library[match.suffix]) 92 | job_match = library_mapping[job_library_uri] 93 | except KeyError: 94 | logger.debug( 95 | 'not in library map: {}' 96 | .format(job_library_uri) 97 | ) 98 | pass 99 | else: 100 | if match.replace_version(job_match, logger): 101 | job_list.append({ 102 | 'job_id': job['job_id'], 103 | 'job_name': job['settings']['name'], 104 | 'library_path': library[match.suffix], 105 | }) 106 | else: 107 | logger.debug( 108 | 'not replacable: {}' 109 | .format(job_match.filename) 110 | ) 111 | else: 112 | logger.debug( 113 | 'no matching suffix: looking for {}, found {}' 114 | .format(match.suffix, str(library.keys())) 115 | ) 116 | return job_list 117 | else: 118 | raise APIError(res) 119 | 120 | 121 | def get_library_mapping(logger, prod_folder, token, host): 122 | """ 123 | returns a pair of library mappings, the first mapping library uri to a 124 | library name for all libraries in the production folder, and the second 125 | mapping library name to info for libraries in the production folder with 126 | parsable versions 127 | 128 | Parameters 129 | ---------- 130 | logger: logging object 131 | configured in cli_commands.py 132 | prod_folder: string 133 | name of folder in Databricks UI containing production libraries 134 | token: string 135 | Databricks API key 136 | host: string 137 | Databricks account url 138 | (e.g. 
https://fake-organization.cloud.databricks.com) 139 | 140 | Returns 141 | ------- 142 | dictionary mapping a library uri to a library name 143 | dictionary mapping library UI path to base name, major version, 144 | minor version, and id number 145 | """ 146 | res = requests.get( 147 | host + f'/api/2.0/workspace/list?path={prod_folder}', 148 | auth=('token', token), 149 | ) 150 | if res.status_code == 200: 151 | file_list = res.json()['objects'] 152 | library_map = {} 153 | id_nums = {} 154 | for file in file_list: 155 | if file['object_type'] == 'LIBRARY': 156 | library_id = file['object_id'] 157 | status_res = ( 158 | requests 159 | .get( 160 | host + '/api/1.2/libraries/status?libraryId={}' 161 | .format(library_id), 162 | auth=('token', token), 163 | ) 164 | ) 165 | if status_res.status_code == 200: 166 | library_info = status_res.json() 167 | if library_info['libType'] == 'python-egg': 168 | full_name = library_info['name'] + '.egg' 169 | elif library_info['libType'] == 'java-jar': 170 | full_name = library_info['name'] + '.jar' 171 | else: 172 | logger.debug( 173 | 'excluded library type: {} is of libType {}, ' 174 | 'not jar or egg' 175 | .format( 176 | library_info['name'], 177 | library_info['libType'], 178 | ) 179 | ) 180 | continue 181 | try: 182 | name_match = FileNameMatch(full_name) 183 | # map uri to name match object 184 | library_map[library_info['files'][0]] = name_match 185 | # map name to name match object and id number 186 | # we'll need the id number to clean up old libraries 187 | id_nums[library_info['name']] = { 188 | 'name_match': name_match, 189 | 'id_num': library_id, 190 | } 191 | except FileNameError: 192 | logger.debug( 193 | 'FileNameError: {} file name is not parsable' 194 | .format(full_name) 195 | ) 196 | pass 197 | else: 198 | raise APIError(status_res) 199 | return library_map, id_nums 200 | else: 201 | raise APIError(res) 202 | 203 | 204 | def update_job_libraries( 205 | logger, 206 | job_list, 207 | match, 208 | new_library_path, 209 | token, 210 | host, 211 | ): 212 | """ 213 | update libraries on jobs using same major version 214 | 215 | Parameters 216 | ---------- 217 | logger: logging object 218 | configured in cli_commands.py 219 | job_list: list of strings 220 | output of get_job_list 221 | match: FilenameMatch object 222 | match object with suffix 223 | new_library_path: string 224 | path to library in dbfs (including uri) 225 | token: string 226 | Databricks API key with admin permissions 227 | host: string 228 | Databricks account url 229 | (e.g. 
https://fake-organization.cloud.databricks.com) 230 | 231 | Side Effects 232 | ------------ 233 | jobs now require updated version of library 234 | """ 235 | 236 | for job in job_list: 237 | get_res = requests.get( 238 | host + '/api/2.0/jobs/get?job_id={}'.format(job['job_id']), 239 | auth=('token', token), 240 | ) 241 | if get_res.status_code == 200: 242 | job_specs = get_res.json() # copy current job specs 243 | settings = job_specs['settings'] 244 | job_specs.pop('settings') 245 | new_libraries = [] 246 | for lib in settings['libraries']: 247 | if ( 248 | match.suffix in lib.keys() 249 | and lib[match.suffix] == job['library_path'] 250 | ): 251 | # replace entry for old library path with new one 252 | new_libraries.append({match.suffix: new_library_path}) 253 | else: 254 | new_libraries.append(lib) 255 | settings['libraries'] = new_libraries 256 | job_specs['new_settings'] = settings 257 | post_res = requests.post( 258 | host + '/api/2.0/jobs/reset', 259 | auth=('token', token), 260 | data=json.dumps(job_specs) 261 | ) 262 | if post_res.status_code != 200: 263 | raise APIError(post_res) 264 | else: 265 | raise APIError(get_res) 266 | 267 | 268 | def delete_old_versions( 269 | logger, 270 | new_library_match, 271 | id_nums, 272 | token, 273 | prod_folder, 274 | host 275 | ): 276 | """ 277 | delete any other versions of the same library where: 278 | it has the same major version 279 | it has a smaller minor version 280 | it lives in prod_folder 281 | 282 | Parameters 283 | ---------- 284 | logger: logging object 285 | configured in cli_commands.py 286 | new_library_match: FilenameMatch object 287 | match object with library_name_, major_version, minor_version 288 | id_nums: dict 289 | second output of get_library_mapping 290 | token: string 291 | Databricks API key with admin permissions 292 | prod_folder: string 293 | name of folder in Databricks UI containing production libraries 294 | host: string 295 | Databricks account url 296 | (e.g. https://fake-organization.cloud.databricks.com) 297 | 298 | Side Effects 299 | ------------ 300 | delete any other versions of the same library with the same major version 301 | and smaller minor versions 302 | """ 303 | 304 | old_versions = [] 305 | for name, lib in id_nums.items(): 306 | if new_library_match.replace_version(lib['name_match'], logger): 307 | old_versions.append(lib['name_match'].filename) 308 | res = requests.post( 309 | host + '/api/1.2/libraries/delete', 310 | auth=('token', token), 311 | data={'libraryId': lib['id_num']}, 312 | ) 313 | if res.status_code != 200: 314 | raise APIError(res) 315 | return old_versions 316 | 317 | 318 | def update_databricks(logger, path, token, folder, update_jobs, cleanup): 319 | """ 320 | upload library, update jobs using the same major version, 321 | and delete libraries with the same major and lower minor versions 322 | (depending on update_jobs and cleanup flags) 323 | 324 | Parameters 325 | ---------- 326 | logger: logging object 327 | configured in cli_commands.py 328 | path: string 329 | path with name of egg as output from setuptools 330 | (e.g. dist/new_library-1.0.0-py3.6.egg) 331 | token: string 332 | Databricks API key 333 | folder: string 334 | Databricks folder to upload to 335 | (e.g. 
'/Users/my_email@fake_organization.com/') 336 | update_jobs: bool 337 | if true, jobs using this library will be updated to point to the 338 | new version 339 | if false, will not touch jobs or other library versions at all 340 | cleanup: bool 341 | if true, outdated libraries will be deleted 342 | if false, nothing will be deleted 343 | 344 | Side Effects 345 | ------------ 346 | new library in Databricks 347 | if update_jobs is true, then updated jobs 348 | if update_jobs and cleanup are true, removed outdated libraries 349 | """ 350 | 351 | config = _load_config(CFG_FILE) 352 | try: 353 | host = config.get(PROFILE, 'host') 354 | except NoOptionError: 355 | raise ValueError('no host provided: please run `stork configure`' 356 | ' to get set up') 357 | try: 358 | prod_folder = config.get(PROFILE, 'prod_folder') 359 | except NoOptionError: 360 | raise ValueError('no prod_folder provided: please run ' 361 | '`stork configure` to get set up') 362 | 363 | match = FileNameMatch(basename(path)) 364 | 365 | try: 366 | load_library(path, match, folder, token, host) 367 | logger.info( 368 | 'new library {}-{} loaded to Databricks' 369 | .format(match.library_name, match.version) 370 | ) 371 | except APIError as err: 372 | if err.code == 'http 500' and 'already exists' in err.message: 373 | logger.info( 374 | 'This version ({}) already exists: '.format(match.version) + 375 | 'if a change has been made please update your version number. ' 376 | 'Note this error can also occur if you are uploading a jar ' 377 | 'and an egg already exists with the same name and version, ' 378 | 'or vice versa. In this case you will need to choose a ' 379 | 'different library name or a different folder for either the ' 380 | 'egg or the jar.' 381 | ) 382 | return 383 | else: 384 | raise err 385 | 386 | if update_jobs and folder == prod_folder: 387 | library_map, id_nums = get_library_mapping( 388 | logger, 389 | prod_folder, 390 | token, 391 | host, 392 | ) 393 | library_uri = [ 394 | uri for uri, tmp_match in library_map.items() 395 | if ( 396 | match.library_name == tmp_match.library_name 397 | and match.version == tmp_match.version 398 | ) 399 | ][0] 400 | library_path = 'dbfs:/FileStore/jars/' + library_uri 401 | job_list = get_job_list(logger, match, library_map, token, host) 402 | logger.info( 403 | 'current major version of library used by jobs: {}' 404 | .format(', '.join([i['job_name'] for i in job_list])) 405 | ) 406 | 407 | if len(job_list) != 0: 408 | update_job_libraries( 409 | logger, 410 | job_list, 411 | match, 412 | library_path, 413 | token, 414 | host, 415 | ) 416 | logger.info( 417 | 'updated jobs: {}' 418 | .format(', '.join([i['job_name'] for i in job_list])) 419 | ) 420 | 421 | if cleanup: 422 | old_versions = delete_old_versions( 423 | logger, 424 | match, 425 | id_nums=id_nums, 426 | token=token, 427 | prod_folder=prod_folder, 428 | host=host, 429 | ) 430 | logger.info( 431 | 'removed old versions: {}'.format(', '.join(old_versions)) 432 | ) 433 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShopRunner1/stork/df49a593f0272fa1bb882585335ceb1fa363d15b/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from .fixtures import * 2 | 3 | from stork.configure import _load_config, 
CFG_FILE, PROFILE 4 | 5 | 6 | def pytest_addoption(parser): 7 | # for passing Databricks token as a command line parameter 8 | parser.addoption( 9 | '--cfg', 10 | action='store', 11 | default=CFG_FILE, 12 | help='path for config file', 13 | ) 14 | parser.addoption( 15 | '--token', 16 | action='store', 17 | default=None, 18 | help='test token', 19 | ) 20 | parser.addoption( 21 | '--host', 22 | action='store', 23 | default=None, 24 | help='test host', 25 | ) 26 | parser.addoption( 27 | '--prod_folder', 28 | action='store', 29 | default=None, 30 | help='test production folder', 31 | ) 32 | 33 | 34 | def _resolve_test_config(metafunc, config, key): 35 | # makes value availible as a fixture with name key 36 | value = getattr(metafunc.config.option, key) 37 | if value is None and config.has_option(PROFILE, key): 38 | value = config.get(PROFILE, key) 39 | if key in metafunc.fixturenames and value is not None: 40 | metafunc.parametrize(key, [value]) 41 | 42 | 43 | def pytest_generate_tests(metafunc): 44 | # This is called for every test. Only get/set command line arguments 45 | # if the argument is specified in the list of test "fixturenames". 46 | cfg_path = getattr(metafunc.config.option, 'cfg') 47 | config = _load_config(cfg_path) 48 | _resolve_test_config(metafunc, config, 'cfg') 49 | _resolve_test_config(metafunc, config, 'token') 50 | _resolve_test_config(metafunc, config, 'host') 51 | _resolve_test_config(metafunc, config, 'prod_folder') 52 | -------------------------------------------------------------------------------- /tests/fixtures.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa E501 2 | 3 | import pytest 4 | from configparser import ConfigParser 5 | from stork.update_databricks_library import FileNameMatch 6 | 7 | 8 | @pytest.fixture 9 | def delete_library_response_list(): 10 | return [{'libraryId': '6'}, {'libraryId': '7'}] 11 | 12 | 13 | @pytest.fixture 14 | def id_nums(): 15 | id_nums = { 16 | 'awesome_library_b-4.2.3': { 17 | 'name_match': FileNameMatch('awesome_library_b-4.2.3.egg'), 18 | 'id_num': 1, 19 | }, 20 | 'awesome_library_a-0.10.1': { 21 | 'name_match': FileNameMatch('awesome_library_a-0.10.1.egg'), 22 | 'id_num': 2, 23 | }, 24 | 'test-library-0.0.0': { 25 | 'name_match': FileNameMatch('test-library-0.0.0.egg'), 26 | 'id_num': 3, 27 | }, 28 | 'test-library-plus-stuff-0.0.0': { 29 | 'name_match': FileNameMatch('test-library-plus-stuff-0.0.0.egg'), 30 | 'id_num': 4, 31 | }, 32 | 'test-library-1.0.1': { 33 | 'name_match': FileNameMatch('test-library-1.0.1.egg'), 34 | 'id_num': 5, 35 | }, 36 | 'test-library-1.0.2': { 37 | 'name_match': FileNameMatch('test-library-1.0.2.egg'), 38 | 'id_num': 6, 39 | }, 40 | 'test-library-1.0.3': { 41 | 'name_match': FileNameMatch('test-library-1.0.3.egg'), 42 | 'id_num': 7, 43 | } 44 | } 45 | return id_nums 46 | 47 | 48 | @pytest.fixture 49 | def job_list(): 50 | job_list = [ 51 | { 52 | 'job_id': 3, 53 | 'job_name': 'job_3', 54 | 'library_path': 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_1_py3_6-e5f8c.egg', 55 | }, 56 | ] 57 | return job_list 58 | 59 | 60 | @pytest.fixture 61 | def job_list_response(): 62 | job_list_response = { 63 | 'jobs': [ 64 | { 65 | 'job_id': 1, 66 | 'settings': { 67 | 'name': 'job_1', 68 | 'new_cluster': {'cluster_attributes': 'attrs'}, 69 | 'libraries': [ 70 | {'pypi': {'package': 'boto3'}}, 71 | {'maven': {'coordinates': 'maven_library'}}, 72 | {'egg': 'dbfs:/FileStore/jars/996c949b-awesome_library_a_0_10_1_py3_6-266f.egg'}, 73 | {'egg': 
'dbfs:/FileStore/jars/47fb08a7-awesome_library_b_4_2_3_py3_6-e5f8c.egg'}, 74 | {'egg': 'dbfs:/FileStore/jars/47fb08a7-test-library_0_0_0_py3_6-e5f8c.egg'}, 75 | ], 76 | }, 77 | 'creator_user_name': 'tests@shoprunner' 78 | }, 79 | { 80 | 'job_id': 2, 81 | 'settings': { 82 | 'name': 'job_2', 83 | 'new_cluster': {'cluster_attributes': 'attrs'}, 84 | 'libraries': [ 85 | {'egg': 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_0_py3_6-e5f8c.egg'}, 86 | {'egg': 'dbfs:/FileStore/jars/01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg'}, 87 | ], 88 | }, 89 | 'creator_user_name': 'tests@shoprunner' 90 | }, 91 | { 92 | 'job_id': 3, 93 | 'settings': { 94 | 'name': 'job_3', 95 | 'new_cluster': {'cluster_attributes': 'attrs'}, 96 | 'libraries': [ 97 | {'egg': 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_1_py3_6-e5f8c.egg'}, 98 | {'egg': 'dbfs:/FileStore/jars/01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg'}, 99 | ], 100 | }, 101 | 'creator_user_name': 'tests@shoprunner' 102 | }, 103 | { 104 | 'job_id': 4, 105 | 'settings': { 106 | 'name': 'job_4', 107 | 'new_cluster': {'cluster_attributes': 'attrs'}, 108 | 'libraries': [ 109 | {'egg': 'dbfs:/FileStore/jars/01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg'} 110 | ], 111 | }, 112 | 'creator_user_name': 'tests@shoprunner' 113 | }, 114 | ] 115 | } 116 | return job_list_response 117 | 118 | 119 | @pytest.fixture 120 | def job_update_response_list_new(): 121 | job_update_response_list = [ 122 | { 123 | 'job_id': 3, 124 | 'new_settings': { 125 | 'name': 'job_3', 126 | 'new_cluster': { 127 | 'cluster_attributes': 'attrs' 128 | }, 129 | 'libraries': [ 130 | {'egg': 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_3_py3_6-e5f8c.egg'}, 131 | {'egg': 'dbfs:/FileStore/jars/01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg'} 132 | ] 133 | }, 134 | 'creator_user_name': 'tests@shoprunner' 135 | }, 136 | ] 137 | return job_update_response_list 138 | 139 | 140 | @pytest.fixture 141 | def job_update_response_list_old(): 142 | job_update_response_list = [ 143 | { 144 | 'job_id': 3, 145 | 'settings': { 146 | 'name': 'job_3', 147 | 'new_cluster': { 148 | 'cluster_attributes': 'attrs' 149 | }, 150 | 'libraries': [ 151 | {'egg': 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_3_py3_6-e5f8c.egg'}, 152 | {'egg': 'dbfs:/FileStore/jars/01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg'} 153 | ] 154 | }, 155 | 'creator_user_name': 'tests@shoprunner' 156 | }, 157 | ] 158 | return job_update_response_list 159 | 160 | 161 | @pytest.fixture 162 | def library_1(prod_folder): 163 | return { 164 | 'id': '1', 165 | 'name': 'awesome_library_b-4.2.3', 166 | 'folder': prod_folder, 167 | 'libType': 'python-egg', 168 | 'files': ['47fb08a7-awesome_library_b_4_2_3_py3_6-e5f8c.egg'], 169 | 'attachAllClusters': False, 170 | 'statuses': [], 171 | } 172 | 173 | 174 | @pytest.fixture 175 | def library_2(prod_folder): 176 | return { 177 | 'id': '2', 178 | 'name': 'awesome_library_a-0.10.1', 179 | 'folder': prod_folder, 180 | 'libType': 'python-egg', 181 | 'files': ['996c949b-awesome_library_a_0_10_1_py3_6-266f.egg'], 182 | 'attachAllClusters': False, 183 | 'statuses': [], 184 | } 185 | 186 | 187 | @pytest.fixture 188 | def library_3(prod_folder): 189 | return { 190 | 'id': '3', 191 | 'name': 'test-library-0.0.0', 192 | 'folder': prod_folder, 193 | 'libType': 'python-egg', 194 | 'files': ['47fb08a7-test-library_0_0_0_py3_6-e5f8c.egg'], 195 | 'attachAllClusters': False, 196 | 'statuses': [], 197 | } 198 | 199 | 200 | @pytest.fixture 201 | def library_4(prod_folder): 202 | 
return { 203 | 'id': '4', 204 | 'name': 'test-library-plus-stuff-0.0.0', 205 | 'folder': prod_folder, 206 | 'libType': 'python-egg', 207 | 'files': ['01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg'], 208 | 'attachAllClusters': False, 209 | 'statuses': [], 210 | } 211 | 212 | 213 | @pytest.fixture 214 | def library_5(prod_folder): 215 | return { 216 | 'id': '5', 217 | 'name': 'test-library-1.0.1', 218 | 'folder': prod_folder, 219 | 'libType': 'python-egg', 220 | 'files': ['47fb08a7-test-library_1_0_1_py3_6-e5f8c.egg'], 221 | 'attachAllClusters': False, 222 | 'statuses': [], 223 | } 224 | 225 | 226 | @pytest.fixture 227 | def library_6(prod_folder): 228 | return { 229 | 'id': '6', 230 | 'name': 'test-library-1.0.2', 231 | 'folder': prod_folder, 232 | 'libType': 'python-egg', 233 | 'files': ['47fb08a7-test-library_1_0_2_py3_6-e5f8c.egg'], 234 | 'attachAllClusters': False, 235 | 'statuses': [], 236 | } 237 | 238 | 239 | @pytest.fixture 240 | def library_7(prod_folder): 241 | return { 242 | 'id': '7', 243 | 'name': 'test-library-1.0.3', 244 | 'folder': prod_folder, 245 | 'libType': 'python-egg', 246 | 'files': ['47fb08a7-test-library_1_0_3_py3_6-e5f8c.egg'], 247 | 'attachAllClusters': False, 248 | 'statuses': [], 249 | } 250 | 251 | 252 | @pytest.fixture 253 | def workspace_list_response(prod_folder): 254 | workspace_list_response = { 255 | 'objects': [ 256 | { 257 | 'object_type': 'LIBRARY', 258 | 'path': f'/{prod_folder}/awesome_library_b-4.2.3', 259 | 'object_id': 1 260 | }, 261 | { 262 | 'object_type': 'LIBRARY', 263 | 'path': f'/{prod_folder}/awesome_library_a-0.10.1', 264 | 'object_id': 2 265 | }, 266 | { 267 | 'object_type': 'LIBRARY', 268 | 'path': f'/{prod_folder}/test-library-0.0.0', 269 | 'object_id': 3 270 | }, 271 | { 272 | 'object_type': 'LIBRARY', 273 | 'path': f'/{prod_folder}/test-library-plus-stuff-0.0.0', 274 | 'object_id': 4 275 | }, 276 | { 277 | 'object_type': 'LIBRARY', 278 | 'path': f'/{prod_folder}/test-library-1.0.1', 279 | 'object_id': 5 280 | }, 281 | { 282 | 'object_type': 'LIBRARY', 283 | 'path': f'/{prod_folder}/test-library-1.0.2', 284 | 'object_id': 6 285 | }, 286 | { 287 | 'object_type': 'LIBRARY', 288 | 'path': f'/{prod_folder}/test-library-1.0.3', 289 | 'object_id': 7 290 | }, 291 | ] 292 | } 293 | 294 | return workspace_list_response 295 | 296 | 297 | @pytest.fixture 298 | def library_mapping(): 299 | library_mapping = { 300 | '47fb08a7-test-library_1_0_2_py3_6-e5f8c.egg': FileNameMatch('test-library-1.0.2.egg'), 301 | '47fb08a7-test-library_1_0_1_py3_6-e5f8c.egg': FileNameMatch('test-library-1.0.1.egg'), 302 | '47fb08a7-test-library_0_0_0_py3_6-e5f8c.egg': FileNameMatch('test-library-0.0.0.egg'), 303 | '01832402-test-library-plus-stuff_0_0_0_py3_6-e5f8c.egg': FileNameMatch('test-library-plus-stuff-0.0.0.egg'), 304 | '996c949b-awesome_library_a_0_10_1_py3_6-266f.egg': FileNameMatch('awesome_library_a-0.10.1.egg'), 305 | '47fb08a7-awesome_library_b_4_2_3_py3_6-e5f8c.egg': FileNameMatch('awesome_library_b-4.2.3.egg'), 306 | '47fb08a7-test-library_1_0_3_py3_6-e5f8c.egg': FileNameMatch('test-library-1.0.3.egg'), 307 | } 308 | return library_mapping 309 | 310 | 311 | @pytest.fixture 312 | def existing_config(): 313 | existing_config = ConfigParser() 314 | existing_config['DEFAULT'] = { 315 | 'host': 'test_host', 316 | 'token': 'test_token', 317 | 'prod_folder': '/test_folder', 318 | } 319 | return existing_config 320 | 321 | @pytest.fixture 322 | def empty_config(): 323 | empty_config = ConfigParser() 324 | empty_config['DEFAULT'] = {} 325 | return 
empty_config 326 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from os.path import expanduser, join 3 | from unittest import mock 4 | 5 | from click.testing import CliRunner 6 | from configparser import ConfigParser 7 | 8 | from stork.configure import configure 9 | from stork.cli_commands import upload, upload_and_update 10 | 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | logger = logging.getLogger('stork.cli_commands') 14 | 15 | 16 | def test_configure_no_existing_config(): 17 | expected_stdout = ( 18 | 'Databricks host (e.g. https://my-organization.cloud.databricks.com): ' 19 | 'https://test_host\n' 20 | 'Databricks API token: \n' 21 | 'Repeat for confirmation: \n' 22 | 'Databricks folder for production libraries: /test_folder\n' 23 | ) 24 | 25 | filename = join(expanduser('~'), '.storkcfg') 26 | expected_call_list = [ 27 | mock.call(filename, encoding=None), 28 | mock.call(filename, 'w+'), 29 | mock.call().write('[DEFAULT]\n'), 30 | mock.call().write('host = https://test_host\n'), 31 | mock.call().write('token = test_token\n'), 32 | mock.call().write('prod_folder = /test_folder\n'), 33 | mock.call().write('\n'), 34 | ] 35 | 36 | with mock.patch('builtins.open', mock.mock_open(read_data='')) as m_open: 37 | runner = CliRunner() 38 | result = runner.invoke( 39 | configure, 40 | input=( 41 | 'https://test_host\n' 42 | 'test_token\n' 43 | 'test_token\n' 44 | '/test_folder\n' 45 | ), 46 | ) 47 | m_open.assert_has_calls(expected_call_list, any_order=True) 48 | 49 | assert not result.exception 50 | assert result.output == expected_stdout 51 | 52 | 53 | def test_configure_extra_slash_in_host(): 54 | expected_stdout = ( 55 | 'Databricks host (e.g. https://my-organization.cloud.databricks.com): ' 56 | 'https://test_host/\n' 57 | 'Databricks API token: \n' 58 | 'Repeat for confirmation: \n' 59 | 'Databricks folder for production libraries: /test_folder\n' 60 | ) 61 | 62 | filename = join(expanduser('~'), '.storkcfg') 63 | expected_call_list = [ 64 | mock.call(filename, encoding=None), 65 | mock.call(filename, 'w+'), 66 | mock.call().write('[DEFAULT]\n'), 67 | mock.call().write('host = https://test_host\n'), 68 | mock.call().write('token = test_token\n'), 69 | mock.call().write('prod_folder = /test_folder\n'), 70 | mock.call().write('\n'), 71 | ] 72 | 73 | with mock.patch('builtins.open', mock.mock_open(read_data='')) as m_open: 74 | runner = CliRunner() 75 | result = runner.invoke( 76 | configure, 77 | input=( 78 | 'https://test_host/\n' 79 | 'test_token\n' 80 | 'test_token\n' 81 | '/test_folder\n' 82 | ), 83 | ) 84 | m_open.assert_has_calls(expected_call_list, any_order=True) 85 | 86 | assert not result.exception 87 | assert result.output == expected_stdout 88 | 89 | 90 | def test_configure_extra_slash_in_folder(): 91 | expected_stdout = ( 92 | 'Databricks host (e.g. 
https://my-organization.cloud.databricks.com): ' 93 | 'https://test_host\n' 94 | 'Databricks API token: \n' 95 | 'Repeat for confirmation: \n' 96 | 'Databricks folder for production libraries: /test_folder/\n' 97 | ) 98 | 99 | filename = join(expanduser('~'), '.storkcfg') 100 | expected_call_list = [ 101 | mock.call(filename, encoding=None), 102 | mock.call(filename, 'w+'), 103 | mock.call().write('[DEFAULT]\n'), 104 | mock.call().write('host = https://test_host\n'), 105 | mock.call().write('token = test_token\n'), 106 | mock.call().write('prod_folder = /test_folder\n'), 107 | mock.call().write('\n'), 108 | ] 109 | 110 | with mock.patch('builtins.open', mock.mock_open(read_data='')) as m_open: 111 | runner = CliRunner() 112 | result = runner.invoke( 113 | configure, 114 | input=( 115 | 'https://test_host\n' 116 | 'test_token\n' 117 | 'test_token\n' 118 | '/test_folder/\n' 119 | ), 120 | ) 121 | m_open.assert_has_calls(expected_call_list, any_order=True) 122 | 123 | assert not result.exception 124 | assert result.output == expected_stdout 125 | 126 | 127 | def test_configure_no_http_in_host(): 128 | expected_stdout = ( 129 | 'Databricks host (e.g. https://my-organization.cloud.databricks.com): ' 130 | 'test_host\n' 131 | "looks like there's an issue - make sure the host name starts " 132 | 'with http: https://test_host\n' 133 | 'Databricks API token: \n' 134 | 'Repeat for confirmation: \n' 135 | 'Databricks folder for production libraries: /test_folder\n' 136 | ) 137 | 138 | filename = join(expanduser('~'), '.storkcfg') 139 | expected_call_list = [ 140 | mock.call(filename, encoding=None), 141 | mock.call(filename, 'w+'), 142 | mock.call().write('[DEFAULT]\n'), 143 | mock.call().write('host = https://test_host\n'), 144 | mock.call().write('token = test_token\n'), 145 | mock.call().write('prod_folder = /test_folder\n'), 146 | mock.call().write('\n'), 147 | ] 148 | 149 | with mock.patch('builtins.open', mock.mock_open(read_data='')) as m_open: 150 | runner = CliRunner() 151 | result = runner.invoke( 152 | configure, 153 | input=( 154 | 'test_host\n' 155 | 'https://test_host\n' 156 | 'test_token\n' 157 | 'test_token\n' 158 | '/test_folder\n' 159 | ), 160 | ) 161 | m_open.assert_has_calls(expected_call_list, any_order=True) 162 | 163 | assert not result.exception 164 | assert result.output == expected_stdout 165 | 166 | 167 | @mock.patch('stork.cli_commands._load_config') 168 | @mock.patch('stork.cli_commands.update_databricks') 169 | def test_upload(update_databricks_mock, config_mock, existing_config): 170 | 171 | config_mock.return_value = existing_config 172 | 173 | runner = CliRunner() 174 | result = runner.invoke( 175 | upload, 176 | ['--path', '/path/to/egg'] 177 | ) 178 | 179 | config_mock.assert_called_once() 180 | update_databricks_mock.assert_called_with( 181 | logger, 182 | '/path/to/egg', 183 | 'test_token', 184 | '/test_folder', 185 | cleanup=False, 186 | update_jobs=False, 187 | ) 188 | assert not result.exception 189 | 190 | 191 | @mock.patch('stork.cli_commands._load_config') 192 | @mock.patch('stork.cli_commands.update_databricks') 193 | def test_upload_all_options( 194 | update_databricks_mock, 195 | config_mock, 196 | existing_config 197 | ): 198 | 199 | config_mock.return_value = existing_config 200 | 201 | runner = CliRunner() 202 | result = runner.invoke( 203 | upload, 204 | [ 205 | '--path', 206 | '/path/to/egg', 207 | '--token', 208 | 'new_token', 209 | '--folder', 210 | 'new_folder' 211 | ] 212 | ) 213 | 214 | config_mock.assert_called_once() 215 | 
update_databricks_mock.assert_called_with( 216 | logger, 217 | '/path/to/egg', 218 | 'new_token', 219 | 'new_folder', 220 | cleanup=False, 221 | update_jobs=False, 222 | ) 223 | assert not result.exception 224 | 225 | 226 | @mock.patch('stork.cli_commands._load_config') 227 | def test_upload_missing_token(config_mock, empty_config): 228 | 229 | config_mock.return_value = empty_config 230 | 231 | runner = CliRunner() 232 | result = runner.invoke( 233 | upload, 234 | ['--path', '/path/to/egg', '--folder', 'test_folder'] 235 | ) 236 | 237 | assert str(result.exception) == ( 238 | 'no token found - either provide a command line argument or set up' 239 | ' a default by running `stork configure`' 240 | ) 241 | 242 | 243 | @mock.patch('stork.cli_commands._load_config') 244 | def test_upload_missing_folder(config_mock, empty_config): 245 | 246 | config_mock.return_value = empty_config 247 | 248 | runner = CliRunner() 249 | result = runner.invoke( 250 | upload, 251 | ['--path', '/path/to/egg', '--token', 'test_token'] 252 | ) 253 | 254 | assert str(result.exception) == ( 255 | 'no folder found - either provide a command line argument or set up' 256 | ' a default by running `stork configure`' 257 | ) 258 | 259 | 260 | @mock.patch('stork.cli_commands._load_config') 261 | @mock.patch('stork.cli_commands.update_databricks') 262 | def test_upload_and_update_cleanup( 263 | update_databricks_mock, 264 | config_mock, 265 | existing_config 266 | ): 267 | 268 | config_mock.return_value = existing_config 269 | 270 | runner = CliRunner() 271 | result = runner.invoke( 272 | upload_and_update, 273 | ['--path', '/path/to/egg'] 274 | ) 275 | 276 | config_mock.assert_called_once() 277 | update_databricks_mock.assert_called_with( 278 | logger, 279 | '/path/to/egg', 280 | 'test_token', 281 | '/test_folder', 282 | cleanup=True, 283 | update_jobs=True, 284 | ) 285 | assert not result.exception 286 | 287 | 288 | @mock.patch('stork.cli_commands._load_config') 289 | @mock.patch('stork.cli_commands.update_databricks') 290 | def test_upload_and_update_no_cleanup( 291 | update_databricks_mock, 292 | config_mock, 293 | existing_config 294 | ): 295 | 296 | config_mock.return_value = existing_config 297 | 298 | runner = CliRunner() 299 | result = runner.invoke( 300 | upload_and_update, 301 | ['--path', '/path/to/egg', '--no-cleanup'] 302 | ) 303 | 304 | config_mock.assert_called_once() 305 | update_databricks_mock.assert_called_with( 306 | logger, 307 | '/path/to/egg', 308 | 'test_token', 309 | '/test_folder', 310 | cleanup=False, 311 | update_jobs=True, 312 | ) 313 | assert not result.exception 314 | 315 | 316 | @mock.patch('stork.cli_commands._load_config') 317 | def test_upload_and_update_missing_token(config_mock): 318 | 319 | existing_config = ConfigParser() 320 | existing_config['DEFAULT'] = {'prod_folder': 'test_folder'} 321 | config_mock.return_value = existing_config 322 | 323 | runner = CliRunner() 324 | result = runner.invoke( 325 | upload_and_update, 326 | ['--path', '/path/to/egg'] 327 | ) 328 | 329 | config_mock.assert_called_once() 330 | assert str(result.exception) == ( 331 | 'no token found - either provide a command line argument or set up' 332 | ' a default by running `stork configure`' 333 | ) 334 | 335 | 336 | @mock.patch('stork.cli_commands._load_config') 337 | def test_upload_and_update_missing_folder(config_mock, empty_config): 338 | 339 | config_mock.return_value = empty_config 340 | 341 | runner = CliRunner() 342 | result = runner.invoke( 343 | upload_and_update, 344 | ['-p', '/path/to/egg', '--token', 
'test_token'] 345 | ) 346 | 347 | config_mock.assert_called_once() 348 | assert str(result.exception) == ( 349 | 'no folder found - either provide a command line argument or set up' 350 | ' a default by running `stork configure`' 351 | ) 352 | -------------------------------------------------------------------------------- /tests/test_filename_match.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import pytest 4 | 5 | from stork.update_databricks_library import ( 6 | FileNameError, 7 | FileNameMatch, 8 | ) 9 | 10 | 11 | def test_filename_match_egg_with_py(): 12 | match = FileNameMatch('new_library-1.0.0-py3.6.egg') 13 | assert match.library_name == 'new_library' 14 | assert match.version == '1.0.0' 15 | assert match.major_version == '1' 16 | assert match.minor_version == '0.0' 17 | assert match.suffix == 'egg' 18 | assert match.lib_type == 'python-egg' 19 | 20 | 21 | def test_filename_match_egg_snapshot_with_py(): 22 | match = FileNameMatch('new_library-1.0.0-SNAPSHOT-py3.6.egg') 23 | assert match.library_name == 'new_library' 24 | assert match.version == '1.0.0-SNAPSHOT' 25 | assert match.major_version == '1' 26 | assert match.minor_version == '0.0' 27 | assert match.suffix == 'egg' 28 | assert match.lib_type == 'python-egg' 29 | 30 | 31 | def test_filename_match_egg_snapshot_branch_with_py(): 32 | match = FileNameMatch('new_library-1.0.0-SNAPSHOT-my-branch-py3.6.egg') 33 | assert match.library_name == 'new_library' 34 | assert match.version == '1.0.0-SNAPSHOT-my-branch' 35 | assert match.major_version == '1' 36 | assert match.minor_version == '0.0' 37 | assert match.suffix == 'egg' 38 | assert match.lib_type == 'python-egg' 39 | 40 | 41 | def test_filename_match_egg(): 42 | match = FileNameMatch('new_library-1.0.0.egg') 43 | assert match.library_name == 'new_library' 44 | assert match.version == '1.0.0' 45 | assert match.major_version == '1' 46 | assert match.minor_version == '0.0' 47 | assert match.suffix == 'egg' 48 | assert match.lib_type == 'python-egg' 49 | 50 | 51 | def test_filename_match_egg_snapshot(): 52 | match = FileNameMatch('new_library-1.0.0-SNAPSHOT.egg') 53 | assert match.library_name == 'new_library' 54 | assert match.version == '1.0.0-SNAPSHOT' 55 | assert match.major_version == '1' 56 | assert match.minor_version == '0.0' 57 | assert match.suffix == 'egg' 58 | assert match.lib_type == 'python-egg' 59 | 60 | 61 | def test_filename_match_egg_snapshot_branch(): 62 | match = FileNameMatch('new_library-1.0.0-SNAPSHOT-my-branch.egg') 63 | assert match.library_name == 'new_library' 64 | assert match.version == '1.0.0-SNAPSHOT-my-branch' 65 | assert match.major_version == '1' 66 | assert match.minor_version == '0.0' 67 | assert match.suffix == 'egg' 68 | assert match.lib_type == 'python-egg' 69 | 70 | 71 | def test_filename_match_jar(): 72 | match = FileNameMatch('new_library-1.0.0.jar') 73 | assert match.library_name == 'new_library' 74 | assert match.version == '1.0.0' 75 | assert match.major_version == '1' 76 | assert match.minor_version == '0.0' 77 | assert match.suffix == 'jar' 78 | assert match.lib_type == 'java-jar' 79 | 80 | 81 | def test_filename_match_jar_snapshot(): 82 | match = FileNameMatch('new_library-1.0.0-SNAPSHOT.jar') 83 | assert match.library_name == 'new_library' 84 | assert match.version == '1.0.0-SNAPSHOT' 85 | assert match.major_version == '1' 86 | assert match.minor_version == '0.0' 87 | assert match.suffix == 'jar' 88 | assert match.lib_type == 'java-jar' 89 | 90 | 91 | def 
test_filename_match_jar_snapshot_branch(): 92 | match = FileNameMatch('new_library-1.0.0-SNAPSHOT-my-branch.jar') 93 | assert match.library_name == 'new_library' 94 | assert match.version == '1.0.0-SNAPSHOT-my-branch' 95 | assert match.major_version == '1' 96 | assert match.minor_version == '0.0' 97 | assert match.suffix == 'jar' 98 | assert match.lib_type == 'java-jar' 99 | 100 | 101 | def test_filename_match_wrong_file_type(): 102 | with pytest.raises(FileNameError) as err: 103 | FileNameMatch('test-library-1.0.3.zip') 104 | assert err.value.filename == 'test-library-1.0.3.zip' 105 | 106 | 107 | def test_filename_match_garbage_version(): 108 | with pytest.raises(FileNameError) as err: 109 | FileNameMatch('test-library-1.0.3-askjdhfa.egg') 110 | assert err.value.filename == 'test-library-1.0.3-askjdhfa.egg' 111 | 112 | 113 | def test_filename_match_equal(): 114 | match_1 = FileNameMatch('test-library-1.0.3.egg') 115 | match_2 = FileNameMatch('test-library-1.0.3-py3.5.egg') 116 | assert match_1 == match_2 117 | 118 | 119 | def test_filename_match_not_equal(): 120 | match_1 = FileNameMatch('test-library-1.0.3.egg') 121 | match_2 = FileNameMatch('test-library-1.0.3-SNAPSHOT.egg') 122 | assert match_1 != match_2 123 | 124 | 125 | def test_filename_match_should_replace(): 126 | match_1 = FileNameMatch('test-library-1.1.3.egg') 127 | match_2 = FileNameMatch('test-library-1.0.3.egg') 128 | assert match_1.replace_version(match_2, mock.MagicMock()) 129 | 130 | 131 | def test_filename_match_should_replace_snapshot(): 132 | match_1 = FileNameMatch('test-library-1.1.3.egg') 133 | match_2 = FileNameMatch('test-library-1.0.3-SNAPSHOT.egg') 134 | assert match_1.replace_version(match_2, mock.MagicMock()) 135 | 136 | 137 | def test_filename_match_should_not_replace_snapshot(): 138 | match_1 = FileNameMatch('test-library-1.1.3.egg') 139 | match_2 = FileNameMatch('test-library-1.0.3-SNAPSHOT.egg') 140 | assert not match_1.replace_version(match_2, mock.MagicMock()) 141 | -------------------------------------------------------------------------------- /tests/test_token_permissions.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def test_permission_basic(token, host): 5 | res1 = requests.get( 6 | host + '/api/2.0/libraries/all-cluster-statuses', 7 | auth=('token', token), 8 | ) 9 | assert res1.status_code == 200 10 | 11 | 12 | def test_permission_admin(token, host): 13 | res1 = requests.get( 14 | host + '/api/1.2/libraries/list', 15 | auth=('token', token), 16 | ) 17 | assert res1.status_code == 200 18 | 19 | library_id = res1.json()[0]['id'] 20 | res2 = requests.get( 21 | host + '/api/1.2/libraries/status?libraryId={}' 22 | .format(library_id), 23 | auth=('token', token), 24 | ) 25 | assert res2.status_code == 200 26 | -------------------------------------------------------------------------------- /tests/test_update_databricks_library.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unittest import mock 3 | 4 | import json 5 | import pytest 6 | import responses 7 | import requests 8 | 9 | from .unittest_helpers import strip_whitespace 10 | from stork.update_databricks_library import ( 11 | APIError, 12 | FileNameError, 13 | FileNameMatch, 14 | load_library, 15 | get_job_list, 16 | get_library_mapping, 17 | update_job_libraries, 18 | delete_old_versions, 19 | update_databricks, 20 | ) 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | # helper for responses library callbacks: return a 200 response that echoes the request body 26 | def request_callback(request): 27 | return (200, {}, request.body) 28 | 29 | 30 | @responses.activate 31 | def test_load_library_egg(host, prod_folder): 32 | filename = 'test-library-1.0.3-py3.6.egg' 33 | 34 | responses.add( 35 | responses.POST, 36 | host + '/api/1.2/libraries/upload', 37 | status=200, 38 | ) 39 | 40 | with mock.patch( 41 | 'builtins.open', 42 | mock.mock_open(read_data='egg file contents') 43 | ): 44 | load_library( 45 | filename=filename, 46 | match=FileNameMatch(filename), 47 | folder=prod_folder, 48 | token='', 49 | host=host, 50 | ) 51 | 52 | 53 | @responses.activate 54 | def test_load_library_jar(host, prod_folder): 55 | filename = 'test-library-1.0.3.jar' 56 | 57 | responses.add( 58 | responses.POST, 59 | host + '/api/1.2/libraries/upload', 60 | status=200, 61 | ) 62 | 63 | with mock.patch( 64 | 'builtins.open', 65 | mock.mock_open(read_data='jar file contents') 66 | ): 67 | load_library( 68 | filename=filename, 69 | match=FileNameMatch(filename), 70 | folder=prod_folder, 71 | token='', 72 | host=host, 73 | ) 74 | 75 | 76 | @responses.activate 77 | def test_load_library_APIError(host, prod_folder): 78 | filename = 'test-library-1.0.3-py3.6.egg' 79 | 80 | responses.add( 81 | responses.POST, 82 | host + '/api/1.2/libraries/upload', 83 | status=401, 84 | ) 85 | 86 | with pytest.raises(APIError) as err: 87 | with mock.patch( 88 | 'builtins.open', 89 | mock.mock_open(read_data='egg file contents') 90 | ): 91 | load_library( 92 | filename=filename, 93 | match=FileNameMatch(filename), 94 | folder=prod_folder, 95 | token='', 96 | host=host, 97 | ) 98 | assert err.value.code == 'http 401' 99 | 100 | 101 | @responses.activate 102 | def test_get_job_list(library_mapping, job_list, job_list_response, host): 103 | 104 | responses.add( 105 | responses.GET, 106 | host + '/api/2.0/jobs/list', 107 | status=200, 108 | json=job_list_response, 109 | ) 110 | match = FileNameMatch('test-library-1.1.2.egg') 111 | job_list_actual = get_job_list( 112 | logger, 113 | match=match, 114 | library_mapping=library_mapping, 115 | token='', 116 | host=host, 117 | ) 118 | 119 | assert len(responses.calls) == 1 120 | assert job_list_actual == job_list 121 | 122 | 123 | @responses.activate 124 | def test_get_library_mapping( 125 | workspace_list_response, 126 | library_1, 127 | library_2, 128 | library_3, 129 | library_4, 130 | library_5, 131 | library_6, 132 | library_7, 133 | id_nums, 134 | library_mapping, 135 | host, 136 | prod_folder, 137 | ): 138 | responses.add( 139 | responses.GET, 140 | host + '/api/2.0/workspace/list', 141 | status=200, 142 | json=workspace_list_response, 143 | ) 144 | for i, lib in enumerate([ 145 | library_1, 146 | library_2, 147 | library_3, 148 | library_4, 149 | library_5, 150 | library_6, 151 | library_7 152 | ]): 153 | responses.add( 154 | responses.GET, 155 | host + '/api/1.2/libraries/status?libraryId={}'.format(i+1), 156 | status=200, 157 | json=lib, 158 | ) 159 | 160 | library_map_actual, id_nums_actual = get_library_mapping( 161 | logger, 162 | token='', 163 | host=host, 164 | prod_folder=prod_folder, 165 | ) 166 | 167 | assert len(responses.calls) == 8 168 | assert id_nums == id_nums_actual 169 | assert library_mapping == library_map_actual 170 | 171 | 172 | @responses.activate 173 | def test_update_job_libraries( 174 | job_list, 175 | job_update_response_list_old, 176 | job_update_response_list_new, 177 | host, 178 | ): 179 | for job in job_update_response_list_old: 180 | responses.add( 181 | responses.GET, 182 | host +
'/api/2.0/jobs/get?job_id={}'.format(job['job_id']), 183 | status=200, 184 | json=job, 185 | ) 186 | responses.add_callback( 187 | responses.POST, 188 | host + '/api/2.0/jobs/reset', 189 | callback=request_callback, 190 | ) 191 | 192 | update_job_libraries( 193 | logger, 194 | job_list, 195 | FileNameMatch('test_library-1.2.3.egg'), 196 | 'dbfs:/FileStore/jars/some_library_uri', 197 | '', 198 | host, 199 | ) 200 | 201 | assert len(responses.calls) == 2 202 | assert ( 203 | json.loads(responses.calls[1].response.text) == 204 | job_update_response_list_new[0] 205 | ) 206 | 207 | 208 | @pytest.mark.usefixtures('id_nums') 209 | @responses.activate 210 | def test_delete_old_versions(id_nums, host, prod_folder): 211 | for i in range(2): 212 | responses.add_callback( 213 | responses.POST, 214 | host + '/api/1.2/libraries/delete', 215 | callback=request_callback, 216 | ) 217 | 218 | actual_deleted_libraries = delete_old_versions( 219 | logger, 220 | FileNameMatch('test-library-1.0.3-SNAPSHOT.egg'), 221 | id_nums, 222 | token='', 223 | prod_folder=prod_folder, 224 | host=host, 225 | ) 226 | 227 | assert len(responses.calls) == 2 228 | actual_responses = [res.response.text for res in responses.calls] 229 | assert set(actual_responses) == {'libraryId=5', 'libraryId=6'} 230 | assert ( 231 | set(actual_deleted_libraries) == 232 | {'test-library-1.0.1.egg', 'test-library-1.0.2.egg'} 233 | ) 234 | 235 | 236 | @mock.patch('stork.update_databricks_library.load_library') 237 | @responses.activate 238 | def test_update_databricks_already_exists( 239 | load_mock, 240 | caplog, 241 | prod_folder, 242 | host, 243 | cfg, 244 | ): 245 | responses.add( 246 | responses.GET, 247 | 'https://test-api', 248 | status=500, 249 | content_type='text/plain', 250 | json={ 251 | 'error_code': 'http 500', 252 | 'message': ( 253 | 'NameConflictException: ' 254 | 'Node named "test-library" already exists' 255 | ) 256 | }, 257 | ) 258 | res = requests.get('https://test-api') 259 | load_mock.side_effect = APIError(res) 260 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 261 | update_databricks( 262 | logger, 263 | path='some/path/to/test-library-1.0.1-py3.6.egg', 264 | token='', 265 | folder='/other/folder', 266 | update_jobs=False, 267 | cleanup=False, 268 | ) 269 | out = caplog.record_tuples[0][2] 270 | expected_out = ( 271 | 'This version (1.0.1) already exists: ' 272 | 'if a change has been made please update your version number. ' 273 | 'Note this error can also occur if you are uploading a jar ' 274 | 'and an egg already exists with the same name and version, ' 275 | 'or vice versa. In this case you will need to choose a ' 276 | 'different library name or a different folder for either the ' 277 | 'egg or the jar.' 
278 | ) 279 | 280 | assert strip_whitespace(out) == strip_whitespace(expected_out) 281 | load_mock.assert_called_with( 282 | 'some/path/to/test-library-1.0.1-py3.6.egg', 283 | FileNameMatch('test-library-1.0.1-py3.6.egg'), 284 | '/other/folder', 285 | '', 286 | host, 287 | ) 288 | 289 | 290 | @mock.patch('stork.update_databricks_library.load_library') 291 | @mock.patch('stork.update_databricks_library.get_job_list') 292 | @mock.patch('stork.update_databricks_library.get_library_mapping') 293 | @mock.patch('stork.update_databricks_library.update_job_libraries') 294 | @mock.patch('stork.update_databricks_library.delete_old_versions') 295 | def test_update_databricks_update_jobs( 296 | delete_mock, 297 | update_mock, 298 | lib_mock, 299 | job_mock, 300 | load_mock, 301 | library_mapping, 302 | id_nums, 303 | job_list, 304 | caplog, 305 | prod_folder, 306 | host, 307 | cfg, 308 | ): 309 | path = 'some/path/to/test-library-1.0.3-py3.6.egg' 310 | delete_mock.return_value = ['test-library-1.0.1', 'test-library-1.0.2'] 311 | job_mock.return_value = job_list 312 | lib_mock.return_value = (library_mapping, id_nums) 313 | 314 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 315 | update_databricks( 316 | logger, 317 | path=path, 318 | token='', 319 | folder=prod_folder, 320 | update_jobs=True, 321 | cleanup=True, 322 | ) 323 | 324 | out = [r[2] for r in caplog.record_tuples] 325 | expected_out = [ 326 | 'new library test-library-1.0.3 loaded to Databricks', 327 | 'current major version of library used by jobs: job_3', 328 | 'updated jobs: job_3', 329 | 'removed old versions: test-library-1.0.1, test-library-1.0.2', 330 | ] 331 | match = FileNameMatch('test-library-1.0.3-py3.6.egg') 332 | assert out == expected_out 333 | load_mock.assert_called_with(path, match, prod_folder, '', host) 334 | job_mock.assert_called_with(logger, match, library_mapping, '', host) 335 | lib_mock.assert_called_with(logger, prod_folder, '', host) 336 | update_mock.assert_called_with( 337 | logger, 338 | job_list, 339 | match, 340 | 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_3_py3_6-e5f8c.egg', 341 | '', 342 | host, 343 | ) 344 | delete_mock.assert_called_with( 345 | logger, 346 | match, 347 | id_nums=id_nums, 348 | token='', 349 | prod_folder=prod_folder, 350 | host=host, 351 | ) 352 | 353 | 354 | @mock.patch('stork.update_databricks_library.load_library') 355 | @mock.patch('stork.update_databricks_library.get_job_list') 356 | @mock.patch('stork.update_databricks_library.get_library_mapping') 357 | @mock.patch('stork.update_databricks_library.update_job_libraries') 358 | def test_update_databricks_update_jobs_no_cleanup( 359 | update_mock, 360 | lib_mock, 361 | job_mock, 362 | load_mock, 363 | library_mapping, 364 | id_nums, 365 | job_list, 366 | caplog, 367 | prod_folder, 368 | host, 369 | cfg, 370 | ): 371 | path = 'some/path/to/test-library-1.0.3-py3.6.egg' 372 | job_mock.return_value = job_list 373 | lib_mock.return_value = (library_mapping, id_nums) 374 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 375 | update_databricks( 376 | logger, 377 | path=path, 378 | token='', 379 | folder=prod_folder, 380 | update_jobs=True, 381 | cleanup=False, 382 | ) 383 | out = [r[2] for r in caplog.record_tuples] 384 | expected_out = [ 385 | 'new library test-library-1.0.3 loaded to Databricks', 386 | 'current major version of library used by jobs: job_3', 387 | 'updated jobs: job_3', 388 | ] 389 | assert out == expected_out 390 | 391 | match = FileNameMatch('test-library-1.0.3-py3.6.egg') 392 
| load_mock.assert_called_with( 393 | path, match, prod_folder, '', host, 394 | ) 395 | job_mock.assert_called_with(logger, match, library_mapping, '', host) 396 | lib_mock.assert_called_with(logger, prod_folder, '', host) 397 | update_mock.assert_called_with( 398 | logger, 399 | job_list, 400 | match, 401 | 'dbfs:/FileStore/jars/47fb08a7-test-library_1_0_3_py3_6-e5f8c.egg', 402 | '', 403 | host, 404 | ) 405 | 406 | 407 | @mock.patch('stork.update_databricks_library.load_library') 408 | def test_update_databricks_only_upload( 409 | load_mock, 410 | caplog, 411 | prod_folder, 412 | host, 413 | cfg, 414 | ): 415 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 416 | update_databricks( 417 | logger, 418 | path='some/path/to/test-library-1.0.3-py3.6.egg', 419 | token='', 420 | folder=prod_folder, 421 | update_jobs=False, 422 | cleanup=False, 423 | ) 424 | out = caplog.record_tuples[0][2] 425 | expected_out = 'new library test-library-1.0.3 loaded to Databricks' 426 | assert strip_whitespace(out) == strip_whitespace(expected_out) 427 | load_mock.assert_called_with( 428 | 'some/path/to/test-library-1.0.3-py3.6.egg', 429 | FileNameMatch('test-library-1.0.3-py3.6.egg'), 430 | prod_folder, 431 | '', 432 | host, 433 | ) 434 | 435 | 436 | @mock.patch('stork.update_databricks_library.load_library') 437 | def test_update_databricks_wrong_folder(load_mock, caplog, host, cfg): 438 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 439 | update_databricks( 440 | logger, 441 | path='some/path/to/test-library-1.0.3-py3.6.egg', 442 | token='', 443 | folder='/other/folder', 444 | update_jobs=True, 445 | cleanup=True, 446 | ) 447 | out = caplog.record_tuples[0][2] 448 | expected_out = 'new library test-library-1.0.3 loaded to Databricks' 449 | assert strip_whitespace(out) == strip_whitespace(expected_out) 450 | load_mock.assert_called_with( 451 | 'some/path/to/test-library-1.0.3-py3.6.egg', 452 | FileNameMatch('test-library-1.0.3-py3.6.egg'), 453 | '/other/folder', 454 | '', 455 | host, 456 | ) 457 | 458 | 459 | @mock.patch('stork.update_databricks_library.load_library') 460 | def test_update_databricks_with_jar_only_upload( 461 | load_mock, 462 | caplog, 463 | prod_folder, 464 | host, 465 | cfg, 466 | ): 467 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 468 | update_databricks( 469 | logger, 470 | path='some/path/to/test-library-1.0.3.jar', 471 | token='', 472 | folder=prod_folder, 473 | update_jobs=False, 474 | cleanup=False, 475 | ) 476 | out = caplog.record_tuples[0][2] 477 | expected_out = 'new library test-library-1.0.3 loaded to Databricks' 478 | assert strip_whitespace(out) == strip_whitespace(expected_out) 479 | load_mock.assert_called_with( 480 | 'some/path/to/test-library-1.0.3.jar', 481 | FileNameMatch('test-library-1.0.3.jar'), 482 | prod_folder, 483 | '', 484 | host, 485 | ) 486 | 487 | 488 | @mock.patch('stork.update_databricks_library.load_library') 489 | def test_update_databricks_filename_not_match( 490 | load_mock, 491 | prod_folder, 492 | host, 493 | cfg, 494 | ): 495 | with mock.patch('stork.update_databricks_library.CFG_FILE', cfg): 496 | with pytest.raises(FileNameError) as err: 497 | update_databricks( 498 | logger, 499 | path='some/path/to/test-library-1.0.3.zip', 500 | token='', 501 | folder=prod_folder, 502 | update_jobs=False, 503 | cleanup=False, 504 | ) 505 | assert err.value.filename == 'test-library-1.0.3.zip' 506 | -------------------------------------------------------------------------------- /tests/unittest_helpers.py:
-------------------------------------------------------------------------------- 1 | def strip_whitespace(string_value): 2 | """ 3 | Return the input string without space, tab, 4 | or newline characters (for comparing strings) 5 | """ 6 | return ''.join( 7 | [c for c in string_value if c not in ' \t\n'] 8 | ) 9 | --------------------------------------------------------------------------------