├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── bin ├── get-package.sh ├── get-version.sh └── push-tag.sh ├── docs ├── assets │ └── images │ │ ├── prodigy_train_curve.jpg │ │ ├── project_document.jpg │ │ ├── projects.png │ │ ├── projects.svg │ │ └── spacy-streamlit.png ├── cli.md └── tutorial │ ├── custom-scripts.md │ ├── directory-and-assets.md │ ├── integrations.md │ ├── remote-storage.md │ └── workflow.md ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── weasel ├── __init__.py ├── __main__.py ├── about.py ├── cli ├── __init__.py ├── assets.py ├── clone.py ├── document.py ├── dvc.py ├── main.py ├── pull.py ├── push.py ├── remote_storage.py └── run.py ├── compat.py ├── errors.py ├── schemas.py ├── tests ├── __init__.py ├── cli │ ├── __init__.py │ ├── test_cli.py │ ├── test_cli_app.py │ ├── test_document.py │ └── test_remote.py ├── demo_project │ ├── project.yml │ └── scripts │ │ └── check.py ├── test_schemas.py ├── test_validation.py └── util.py └── util ├── __init__.py ├── commands.py ├── config.py ├── environment.py ├── filesystem.py ├── frozen.py ├── git.py ├── hashing.py ├── logging.py ├── modules.py ├── remote.py ├── validation.py └── versions.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 10 | 11 | ### Types of change 12 | 13 | 15 | 16 | ## Checklist 17 | 18 | 20 | 21 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license. 22 | - [ ] I ran the test suite, and all new and existing tests passed. 23 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 
24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - "*.md" 7 | pull_request: 8 | types: [opened, synchronize, reopened, edited] 9 | paths-ignore: 10 | - "*.md" 11 | 12 | env: 13 | MODULE_NAME: "weasel" 14 | RUN_MYPY: "true" 15 | 16 | jobs: 17 | validate: 18 | name: Validate 19 | if: github.repository_owner == 'explosion' 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - name: Check out repo 24 | uses: actions/checkout@v4 25 | 26 | - name: Configure Python version 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: "3.11" 30 | 31 | - name: Set PY variable 32 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV 33 | 34 | - uses: actions/cache@v3 35 | with: 36 | path: ~/.cache/pre-commit 37 | key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} 38 | 39 | - name: Install pre-commit 40 | run: | 41 | pip install 'pre-commit>=3.2.0,<4.0.0' 42 | pre-commit install 43 | 44 | - name: Run pre-commit 45 | run: SKIP=no-commit-to-branch pre-commit run --all-files 46 | 47 | tests: 48 | name: Test 49 | needs: Validate 50 | if: github.repository_owner == 'explosion' 51 | strategy: 52 | fail-fast: true 53 | matrix: 54 | os: [ubuntu-latest, windows-latest, macos-latest] 55 | python_version: ["3.12"] 56 | include: 57 | - os: ubuntu-latest 58 | python_version: "3.7" 59 | - os: windows-latest 60 | python_version: "3.8" 61 | - os: macos-latest 62 | python_version: "3.9" 63 | - os: macos-latest 64 | python_version: "3.10" 65 | - os: windows-latest 66 | python_version: "3.11" 67 | runs-on: ${{ matrix.os }} 68 | 69 | steps: 70 | - name: Check out repo 71 | uses: actions/checkout@v4 72 | 73 | - name: Configure Python version 74 | uses: actions/setup-python@v5 75 | with: 76 | python-version: ${{ matrix.python_version }} 77 | 78 | - 
name: Build sdist 79 | run: | 80 | python -m pip install -U build pip setuptools 81 | python -m pip install -U -r requirements.txt 82 | python -m build --sdist 83 | 84 | - name: Delete source directory 85 | shell: bash 86 | run: | 87 | rm -rf $MODULE_NAME 88 | 89 | - name: Uninstall all packages 90 | run: | 91 | python -m pip freeze > installed.txt 92 | python -m pip uninstall -y -r installed.txt 93 | 94 | - name: Install from sdist 95 | shell: bash 96 | run: | 97 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) 98 | python -m pip install dist/$SDIST 99 | 100 | - name: Test import 101 | shell: bash 102 | run: | 103 | python -c "import $MODULE_NAME" -Werror 104 | 105 | - name: Test CLI 106 | run: | 107 | python -m weasel --help 108 | 109 | - name: Install test requirements 110 | run: | 111 | python -m pip install -U -r requirements.txt 112 | 113 | - name: Run tests 114 | shell: bash 115 | run: | 116 | python -m pytest --pyargs $MODULE_NAME -Werror 117 | 118 | - name: Test 'spacy project' CLI help/info messages 119 | shell: bash 120 | run: | 121 | python -m pip install spacy 122 | python -m spacy project clone pipelines/ner_demo | grep -q "spacy project assets" 123 | cd ner_demo 124 | python -m spacy project run --help | grep -q "spacy project run" 125 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Vim / VSCode / editors 2 | *.swp 3 | *.sw* 4 | Profile.prof 5 | .vscode 6 | .sass-cache 7 | 8 | # Python 9 | .Python 10 | .python-version 11 | __pycache__/ 12 | .pytest_cache 13 | *.py[cod] 14 | .env/ 15 | .env* 16 | .~env/ 17 | .venv 18 | env3.6/ 19 | venv/ 20 | env3.*/ 21 | .dev 22 | .denv 23 | .pypyenv 24 | .pytest_cache/ 25 | .mypy_cache/ 26 | .hypothesis/ 27 | 28 | # Distribution / packaging 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | 
wheelhouse/ 40 | *.egg-info/ 41 | pip-wheel-metadata/ 42 | Pipfile.lock 43 | .installed.cfg 44 | *.egg 45 | .eggs 46 | MANIFEST 47 | 48 | # Temporary files 49 | *.~* 50 | tmp/ 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | 64 | # Translations 65 | *.mo 66 | 67 | # Mr Developer 68 | .mr.developer.cfg 69 | .project 70 | .pydevproject 71 | 72 | # Rope 73 | .ropeproject 74 | 75 | # Django stuff: 76 | *.log 77 | *.pot 78 | 79 | # Windows 80 | *.bat 81 | Thumbs.db 82 | Desktop.ini 83 | 84 | # Mac OS X 85 | *.DS_Store 86 | 87 | # Komodo project files 88 | *.komodoproject 89 | 90 | # Other 91 | *.tgz 92 | 93 | # Pycharm project files 94 | *.idea 95 | 96 | # IPython 97 | .ipynb_checkpoints/ 98 | *.ipynb 99 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: no-commit-to-branch 9 | args: [--branch, main] 10 | - id: end-of-file-fixer 11 | - id: check-yaml 12 | args: [--unsafe] 13 | - id: check-toml 14 | - id: check-json 15 | - id: check-symlinks 16 | - id: check-docstring-first 17 | - id: check-added-large-files 18 | - id: detect-private-key 19 | # - id: requirements-txt-fixer 20 | 21 | - repo: https://github.com/charliermarsh/ruff-pre-commit 22 | rev: v0.0.254 23 | hooks: 24 | - id: ruff 25 | args: [--fix, --exit-non-zero-on-fix] 26 | 27 | - repo: https://github.com/pre-commit/mirrors-mypy 28 | rev: v1.0.1 29 | hooks: 30 | - id: mypy 31 | additional_dependencies: 32 | - "types-requests" 33 | - "types-setuptools" 34 | - "pydantic" 
35 | 36 | - repo: https://github.com/pycqa/isort 37 | rev: 5.12.0 38 | hooks: 39 | - id: isort 40 | name: isort (python) 41 | - id: isort 42 | name: isort (cython) 43 | types: [cython] 44 | - id: isort 45 | name: isort (pyi) 46 | types: [pyi] 47 | 48 | - repo: https://github.com/psf/black 49 | rev: 22.3.0 50 | hooks: 51 | - id: black 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (C) 2022 ExplosionAI GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Weasel: A small and easy workflow system 4 | 5 | Weasel lets you manage and share **end-to-end workflows** for 6 | different **use cases and domains**, and orchestrate training, packaging and 7 | serving your custom pipelines. You can start off by cloning a pre-defined 8 | project template, adjust it to fit your needs, load in your data, train a 9 | pipeline, export it as a Python package, upload your outputs to a remote storage 10 | and share your results with your team. Weasel can be used via the 11 | [`weasel`](https://github.com/explosion/weasel/blob/main/docs/cli.md) command and we provide templates in our 12 | [`projects`](https://github.com/explosion/projects) repo. 13 | 14 | ![Illustration of project workflow and commands](https://raw.githubusercontent.com/explosion/weasel/main/docs/assets/images/projects.svg) 15 | 16 | ## 💡 Example: Get started with a project template 17 | 18 | The easiest way to get started is to clone a project template and run it – for 19 | example, this [end-to-end template](https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud) 20 | that lets you train a spaCy **part-of-speech 21 | tagger** and **dependency parser** on a Universal Dependencies treebank. 22 | 23 | ```shell 24 | python -m weasel clone pipelines/tagger_parser_ud 25 | ``` 26 | 27 | > **Note** 28 | > 29 | > Our [`projects`](https://github.com/explosion/projects) repo includes various 30 | > project templates for different NLP tasks, models, workflows and integrations 31 | > that you can clone and run. The easiest way to get started is to pick a 32 | > template, clone it and start modifying it! 
33 | 34 | ## 📕 Documentation 35 | 36 | Get started with the documentation: 37 | 38 | - [Learn how to create a Weasel workflow](https://github.com/explosion/weasel/blob/main/docs/tutorial/workflow.md) 39 | - [Working with directory and assets](https://github.com/explosion/weasel/blob/main/docs/tutorial/directory-and-assets.md) 40 | - [Running custom scripts](https://github.com/explosion/weasel/blob/main/docs/tutorial/custom-scripts.md) 41 | - [Using remote storage](https://github.com/explosion/weasel/blob/main/docs/tutorial/remote-storage.md) 42 | - [Weasel integrations](https://github.com/explosion/weasel/blob/main/docs/tutorial/integrations.md) 43 | - [Command line interface description](https://github.com/explosion/weasel/blob/main/docs/cli.md) 44 | 45 | ## Migrating from spaCy Projects 46 | 47 | Weasel is a standalone replacement for spaCy Projects. 48 | There are a few backward incompatibilities that you should be aware of: 49 | 50 | - The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked. 51 | You can set configuration overrides using `WEASEL_CONFIG_OVERRIDES`. 52 | - Support for the `spacy_version` configuration key has been dropped. 53 | - Support for the `check_requirements` configuration key has been dropped. 54 | - Support for `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been dropped. 55 | - Error codes are now Weasel-specific, and do not follow spaCy error codes. 56 | 57 | Weasel checks for the first three incompatibilities and will issue a 58 | warning if you're using it with spaCy-specific configuration options. 
59 | -------------------------------------------------------------------------------- /bin/get-package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "name = " setup.cfg) 6 | version=${version/__title__ = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/get-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "version = " setup.cfg) 6 | version=${version/version = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/push-tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | git checkout $1 9 | git pull origin $1 10 | git push origin $1 11 | 12 | version=$(grep "version = " setup.cfg) 13 | version=${version/version = } 14 | version=${version/\'/} 15 | version=${version/\'/} 16 | version=${version/\"/} 17 | version=${version/\"/} 18 | git tag "v$version" 19 | git push origin "v$version" 20 | -------------------------------------------------------------------------------- /docs/assets/images/prodigy_train_curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/prodigy_train_curve.jpg -------------------------------------------------------------------------------- /docs/assets/images/project_document.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/project_document.jpg -------------------------------------------------------------------------------- /docs/assets/images/projects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/projects.png -------------------------------------------------------------------------------- /docs/assets/images/spacy-streamlit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/spacy-streamlit.png -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | # Command Line Interface 2 | 3 | The `weasel` CLI includes subcommands for working with Weasel projects, 4 | end-to-end workflows for building and deploying custom pipelines. 5 | 6 | ## :clipboard: clone 7 | 8 | Clone a project template from a Git repository. Calls into `git` under the hood 9 | and can use the sparse checkout feature if available, so you're only downloading 10 | what you need. By default, Weasel's 11 | [project templates repo](https://github.com/explosion/projects) is used, but you 12 | can provide any other repo (public or private) that you have access to using the 13 | `--repo` option. 
14 | 15 | ```bash 16 | python -m weasel clone [name] [dest] [--repo] [--branch] [--sparse] 17 | ``` 18 | 19 | > :bulb: **Example usage** 20 | > 21 | > ```bash 22 | > $ python -m weasel clone pipelines/ner_wikiner 23 | > ``` 24 | 25 | | Name | Description | 26 | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | 27 | | `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ | 28 | | `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ | 29 | | `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ | 30 | | `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ | 31 | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | 32 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | 33 | | **CREATES** | The cloned [project directory](tutorial/directory-and-assets.md). | 34 | 35 | ## :open_file_folder: assets 36 | 37 | Fetch project assets like datasets and pretrained weights. Assets are defined in 38 | the `assets` section of the 39 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If a `checksum` 40 | is provided, the file is only downloaded if no local file with the same checksum 41 | exists and Weasel will show an error if the checksum of the downloaded file 42 | doesn't match. If assets don't specify a `url` they're considered "private" and 43 | you have to take care of putting them into the destination directory yourself. 44 | If a local path is provided, the asset is copied into the current project. 
45 | 46 | ```bash 47 | python -m weasel assets [project_dir] 48 | ``` 49 | 50 | | Name | Description | 51 | | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | 52 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | 53 | | `--extra`, `-e` 3.3.1 | Download assets marked as "extra". Default false. ~~bool (flag)~~ | 54 | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | 55 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | 56 | | **CREATES** | Downloaded or copied assets defined in the `project.yml`. | 57 | 58 | ## :rocket: run 59 | 60 | Run a named command or workflow defined in the 61 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If a workflow 62 | name is specified, all commands in the workflow are run, in order. If commands 63 | define 64 | [dependencies or outputs](tutorial/directory-and-assets.md#dependencies-and-outputs), 65 | they will only be re-run if state has changed. For example, if the input dataset 66 | changes, a preprocessing command that depends on those files will be re-run. 67 | 68 | ```bash 69 | python -m weasel run [subcommand] [project_dir] [--force] [--dry] 70 | ``` 71 | 72 | | Name | Description | 73 | | --------------- | --------------------------------------------------------------------------------------- | 74 | | `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ | 75 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | 76 | | `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ | 77 | | `--dry`, `-D` | Perform a dry run and don't execute scripts. 
~~bool (flag)~~ | 78 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | 79 | | **EXECUTES** | The command defined in the `project.yml`. | 80 | 81 | ## :arrow_up: push 82 | 83 | Upload all available files or directories listed as in the `outputs` section of 84 | commands to a remote storage. Outputs are archived and compressed prior to 85 | upload, and addressed in the remote storage using the output's relative path 86 | (URL encoded), a hash of its command string and dependencies, and a hash of its 87 | file contents. This means `push` should **never overwrite** a file in your 88 | remote. If all the hashes match, the contents are the same and nothing happens. 89 | If the contents are different, the new version of the file is uploaded. Deleting 90 | obsolete files is left up to you. 91 | 92 | Remotes can be defined in the `remotes` section of the 93 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). Under the hood, 94 | Weasel uses [`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate 95 | with the remote storages, so you can use any protocol that `CloudPath` supports, 96 | including [S3](https://aws.amazon.com/s3/), 97 | [Google Cloud Storage](https://cloud.google.com/storage), and the local 98 | filesystem, although you may need to install extra dependencies to use certain 99 | protocols. 100 | 101 | ```bash 102 | python -m weasel push [remote] [project_dir] 103 | ``` 104 | 105 | > :bulb: **Example** 106 | > 107 | > ```bash 108 | > $ python -m weasel push my_bucket 109 | > ``` 110 | > 111 | > ```yaml title="project.yml" 112 | > remotes: 113 | > my_bucket: 's3://my-weasel-bucket' 114 | > ``` 115 | 116 | | Name | Description | 117 | | -------------- | --------------------------------------------------------------------------------------- | 118 | | `remote` | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~ | 119 | | `project_dir` | Path to project directory. 
Defaults to current working directory. ~~Path (positional)~~ | 120 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | 121 | | **UPLOADS** | All project outputs that exist and are not already stored in the remote. | 122 | 123 | ## :arrow_down: pull 124 | 125 | Download all files or directories listed as `outputs` for commands, unless they 126 | are already present locally. When searching for files in the remote, `pull` 127 | won't just look at the output path, but will also consider the **command 128 | string** and the **hashes of the dependencies**. For instance, let's say you've 129 | previously pushed a checkpoint to the remote, but now you've changed some 130 | hyper-parameters. Because you've changed the inputs to the command, if you run 131 | `pull`, you won't retrieve the stale result. If you train your pipeline and push 132 | the outputs to the remote, the outputs will be saved alongside the prior 133 | outputs, so if you change the config back, you'll be able to fetch back the 134 | result. 135 | 136 | Remotes can be defined in the `remotes` section of the 137 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). Under the hood, 138 | Weasel uses [`cloudpathlib`](https://cloudpathlib.drivendata.org/) to 139 | communicate with the remote storages, so you can use any protocol that 140 | `CloudPath` supports, including [S3](https://aws.amazon.com/s3/), 141 | [Google Cloud Storage](https://cloud.google.com/storage), and the local 142 | filesystem, although you may need to install extra dependencies to use certain 143 | protocols. 
144 | 145 | ```bash 146 | python -m weasel pull [remote] [project_dir] 147 | ``` 148 | 149 | > :bulb: **Example** 150 | > 151 | > ```bash 152 | > $ python -m weasel pull my_bucket 153 | > ``` 154 | > 155 | > ```yaml title="project.yml" 156 | > remotes: 157 | > my_bucket: 's3://my-weasel-bucket' 158 | > ``` 159 | 160 | | Name | Description | 161 | | -------------- | --------------------------------------------------------------------------------------- | 162 | | `remote` | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~ | 163 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | 164 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | 165 | | **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. | 166 | 167 | ## :closed_book: document 168 | 169 | Auto-generate a pretty Markdown-formatted `README` for your project, based on 170 | its [`project.yml`](tutorial/directory-and-assets.md#project-yml). Will create 171 | sections that document the available commands, workflows and assets. The 172 | auto-generated content will be placed between two hidden markers, so you can add 173 | your own custom content before or after the auto-generated documentation. When 174 | you re-run the `project document` command, only the auto-generated part is 175 | replaced. 176 | 177 | ```bash 178 | python -m weasel document [project_dir] [--output] [--no-emoji] 179 | ``` 180 | 181 | > :bulb: **Example usage** 182 | > 183 | > ```bash 184 | > $ python -m weasel document --output README.md 185 | > ``` 186 | > 187 | > For more examples, see the templates in our 188 | > [`projects`](https://github.com/explosion/projects) repo. 
189 | 190 | | Name | Description | 191 | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 192 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | 193 | | `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ | 194 | | `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ | 195 | | **CREATES** | The Markdown-formatted project documentation. | 196 | 197 | ## :repeat: dvc 198 | 199 | Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls 200 | [`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under 201 | the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline, 202 | so you need to specify one workflow defined in the 203 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If no workflow is 204 | specified, the first defined workflow is used. The DVC config will only be 205 | updated if the `project.yml` changed. For details, see the 206 | [DVC integration](tutorial/integrations.md#data-version-control-dvc) docs. 207 | 208 | > **Warning** 209 | > 210 | > This command requires DVC to be installed and initialized in the project 211 | > directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init). 212 | > You'll also need to add the assets you want to track with 213 | > [`dvc add`](https://dvc.org/doc/command-reference/add). 
214 | 215 | ```bash 216 | python -m weasel dvc [project_dir] [workflow] [--force] [--verbose] [--quiet] 217 | ``` 218 | 219 | > :bulb: **Example** 220 | > 221 | > ```bash 222 | > $ git init 223 | > $ dvc init 224 | > $ python -m weasel dvc all 225 | > ``` 226 | 227 | | Name | Description | 228 | | ----------------- | ------------------------------------------------------------------------------------------------------------- | 229 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | 230 | | `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ | 231 | | `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | 232 | | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ | 233 | | `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ | 234 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | 235 | | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | 236 | -------------------------------------------------------------------------------- /docs/tutorial/custom-scripts.md: -------------------------------------------------------------------------------- 1 | # Custom scripts and projects 2 | 3 | The `project.yml` lets you define any custom commands and run them as part of 4 | your training, evaluation or deployment workflows. The `script` section defines 5 | a list of commands that are called in a subprocess, in order. This lets you 6 | execute other Python scripts or command-line tools. 7 | 8 | Let's say you're training a spaCy pipeline, and you've written a 9 | few integration tests that load the best model produced by the training command 10 | and check that it works correctly. 
You can now define a `test` command that 11 | calls into [`pytest`](https://docs.pytest.org/en/latest/), runs your tests and 12 | uses [`pytest-html`](https://github.com/pytest-dev/pytest-html) to export a test 13 | report: 14 | 15 | > :bulb: **Example configuration** 16 | > 17 | > ```yaml title="project.yml" 18 | > commands: 19 | > - name: test 20 | > help: 'Test the trained pipeline' 21 | > script: 22 | > - 'pip install pytest pytest-html' 23 | > - 'python -m pytest ./scripts/tests --html=metrics/test-report.html' 24 | > deps: 25 | > - 'training/model-best' 26 | > outputs: 27 | > - 'metrics/test-report.html' 28 | > no_skip: true 29 | > ``` 30 | 31 | Adding `training/model-best` to the command's `deps` lets you ensure that the 32 | file is available. If not, Weasel will show an error and the command won't run. 33 | Setting `no_skip: true` means that the command will always run, even if the 34 | dependencies (the trained pipeline) haven't changed. This makes sense here, 35 | because you typically don't want to skip your tests. 36 | 37 | ## Writing custom scripts 38 | 39 | Your project commands can include any custom scripts – essentially, anything you 40 | can run from the command line. Here's an example of a custom script that uses 41 | [`typer`](https://typer.tiangolo.com/) for quick and easy command-line arguments 42 | that you can define via your `project.yml`: 43 | 44 | ```python title="scripts/custom_evaluation.py" 45 | import typer 46 | 47 | def custom_evaluation(batch_size: int = 128, model_path: str, data_path: str): 48 | # The arguments are now available as positional CLI arguments 49 | print(batch_size, model_path, data_path) 50 | 51 | if __name__ == "__main__": 52 | typer.run(custom_evaluation) 53 | ``` 54 | 55 | > :information_source: **About Typer** 56 | > 57 | > [`typer`](https://typer.tiangolo.com/) is a modern library for building Python 58 | > CLIs using type hints. 
It's a dependency of Weasel, so it will already be 59 | > pre-installed in your environment. Function arguments automatically become 60 | > positional CLI arguments and using Python type hints, you can define the value 61 | > types. For instance, `batch_size: int` means that the value provided via the 62 | > command line is converted to an integer. 63 | 64 | In your `project.yml`, you can then run the script by calling 65 | `python scripts/custom_evaluation.py` with the function arguments. You can also 66 | use the `vars` section to define reusable variables that will be substituted in 67 | commands, paths and URLs. In the following example, the batch size is defined as a 68 | variable will be added in place of `${vars.batch_size}` in the script. 69 | 70 | > :bulb: **Example usage of `vars`** 71 | > 72 | > ```yaml title="project.yml" 73 | > vars: 74 | > batch_size: 128 75 | > 76 | > commands: 77 | > - name: evaluate 78 | > script: 79 | > - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json' 80 | > deps: 81 | > - 'training/model-best' 82 | > - 'corpus/eval.json' 83 | > ``` 84 | 85 | > :information_source: **Calling into Python** 86 | > 87 | > If any of your command scripts call into `python`, Weasel will take care of 88 | > replacing that with your `sys.executable`, to make sure you're executing 89 | > everything with the same Python (not some other Python installed on your 90 | > system). It also normalizes references to `python3`, `pip3` and `pip`. 91 | 92 | You can also use the `env` section to reference **environment variables** and 93 | make their values available to the commands. This can be useful for overriding 94 | settings on the command line and passing through system-level settings. 
95 | 96 | > :bulb: **Example usage of EnvVars** 97 | > 98 | > ```bash 99 | > export GPU_ID=1 100 | > BATCH_SIZE=128 python -m weasel run evaluate 101 | > ``` 102 | > 103 | > ```yaml title="project.yml" 104 | > env: 105 | > batch_size: BATCH_SIZE 106 | > gpu_id: GPU_ID 107 | > 108 | > commands: 109 | > - name: evaluate 110 | > script: 111 | > - 'python scripts/custom_evaluation.py ${env.batch_size}' 112 | > ``` 113 | 114 | ## Documenting your project 115 | 116 | > :bulb: **Examples** 117 | > 118 | > For more examples, see the [`projects`](https://github.com/explosion/projects) 119 | > repo. 120 | > 121 | > ![Screenshot of auto-generated Markdown Readme](../assets/images/project_document.jpg) 122 | 123 | When your custom project is ready and you want to share it with others, you can 124 | use the [`weasel document`](../cli.md#closed_book-document) command to 125 | **auto-generate** a pretty, Markdown-formatted `README` file based on your 126 | project's `project.yml`. It will list all commands, workflows and assets defined 127 | in the project and include details on how to run the project, as well as links 128 | to the relevant Weasel documentation to make it easy for others to get started 129 | using your project. 130 | 131 | ```bash 132 | python -m weasel document --output README.md 133 | ``` 134 | 135 | Under the hood, hidden markers are added to identify where the auto-generated 136 | content starts and ends. This means that you can add your own custom content 137 | before or after it and re-running the `document` command will **only 138 | update the auto-generated part**. This makes it easy to keep your documentation 139 | up to date. 140 | 141 | > **Warning** 142 | > 143 | > Note that the contents of an existing file will be **replaced** if no existing 144 | > auto-generated docs are found. If you want Weasel to ignore a file and not update 145 | > it, you can add the comment marker `{/* WEASEL: IGNORE */}` anywhere in 146 | > your markup. 
147 | 148 | ## Cloning from your own repo 149 | 150 | The [`weasel clone`](../cli.md#clipboard-clone) command lets you customize 151 | the repo to clone from using the `--repo` option. It calls into `git`, so you'll 152 | be able to clone from any repo that you have access to, including private repos. 153 | 154 | ```bash 155 | python -m weasel clone your_project --repo https://github.com/you/repo 156 | ``` 157 | 158 | At a minimum, a valid project template needs to contain a 159 | [`project.yml`](./directory-and-assets.md#projectyml). It can also include 160 | [other files](./directory-and-assets.md), like custom scripts, a 161 | `requirements.txt` listing additional dependencies, 162 | a machine learning model and meta templates, or Jupyter 163 | notebooks with usage examples. 164 | 165 | > :warning: **Important note about assets** 166 | > 167 | > It's typically not a good idea to check large data assets, trained pipelines or 168 | > other artifacts into a Git repo and you should exclude them from your project 169 | > template by adding a `.gitignore`. If you want to version your data and models, 170 | > check out [Data Version Control](./integrations.md#data-version-control-dvc) (DVC), 171 | > which integrates with Weasel. 172 | -------------------------------------------------------------------------------- /docs/tutorial/directory-and-assets.md: -------------------------------------------------------------------------------- 1 | # Project directory and assets 2 | 3 | ## `project.yml` 4 | 5 | The `project.yml` defines the assets a project depends on, like datasets and 6 | pretrained weights, as well as a series of commands that can be run separately 7 | or as a workflow – for instance, to preprocess the data, convert it to Weasel's 8 | format, train a pipeline, evaluate it and export metrics, package it and spin up 9 | a quick web demo. It looks pretty similar to a config file used to define CI 10 | pipelines.
11 | 12 | > :boom: **Tip: Multi-line YAML** 13 | > 14 | > YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful 15 | > for readability with longer values such as project descriptions or commands 16 | > that take several arguments. 17 | 18 | > :boom: **Tip: Variable override** 19 | > 20 | > If you want to override one or more variables on the CLI and are not already 21 | > specifying a project directory, you need to add `.` as a placeholder: 22 | > 23 | > ``` 24 | > python -m weasel run test . --vars.foo bar 25 | > ``` 26 | 27 | > :boom: **Tip: Environment variables** 28 | > 29 | > Commands in a project file are not executed in a shell, so they don't have 30 | > direct access to environment variables. But you can insert environment 31 | > variables using the `env` dictionary to make values available for 32 | > interpolation, just like values in `vars`. Here's an example `env` dict that 33 | > makes `$PATH` available as `ENV_PATH`: 34 | > 35 | > ```yaml 36 | > env: 37 | > ENV_PATH: PATH 38 | > ``` 39 | > 40 | > This can be used in a project command like so: 41 | > 42 | > ```yaml 43 | > - name: 'echo-path' 44 | > script: 45 | > - 'echo ${env.ENV_PATH}' 46 | > ``` 47 | 48 | `project.yml` adheres to the following schema: 49 | 50 | | Section | Description | 51 | | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 52 | | `title` | An optional project title used in `--help` message and [auto-generated docs](../cli.md#closed_book-document). 
| 53 | | `description` | An optional project description used in [auto-generated docs](../cli.md#closed_book-document). | 54 | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](https://spacy.io/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | 55 | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | 56 | | `directories` | An optional list of [directories](#data-assets) that should be created in the project for assets, training outputs, metrics etc. Weasel will make sure that these directories always exist. | 57 | | `assets` | A list of assets that can be fetched with the [`assets`](../cli.md#open_file_folder-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | 58 | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`run`](../cli.md#rocket-run) command. | 59 | | `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets Weasel determine whether a command needs to be re-run because its dependencies or outputs changed.
Commands can be run as part of a workflow, or separately with the [`run`](../cli.md#rocket-run) command. | 60 | 61 | ## Data assets 62 | 63 | Assets are any files that your project might need, like training and development 64 | corpora or pretrained weights for initializing your model. Assets are defined in 65 | the `assets` block of your `project.yml` and can be downloaded using the 66 | [`assets`](../cli.md#open_file_folder-assets) command. Defining checksums lets you 67 | verify that someone else running your project will use the same files you used. 68 | Asset URLs can be a number of different **protocols**: HTTP, HTTPS, FTP, SSH, 69 | and even **cloud storage** such as GCS and S3. You can also download assets from 70 | a **Git repo** instead. 71 | 72 | ### Downloading from a URL or cloud storage 73 | 74 | Under the hood, Weasel uses the 75 | [`smart_open`](https://github.com/RaRe-Technologies/smart_open) library so you 76 | can use any protocol it supports. Note that you may need to install extra 77 | dependencies to use certain protocols. 78 | 79 | > :bulb: **Example configuration** 80 | > 81 | > ```yaml title="project.yml" 82 | > assets: 83 | > # Download from public HTTPS URL 84 | > - dest: 'assets/training.spacy' 85 | > url: 'https://example.com/data.spacy' 86 | > checksum: '63373dd656daa1fd3043ce166a59474c' 87 | > # Optional download from Google Cloud Storage bucket 88 | > - dest: 'assets/development.spacy' 89 | > extra: True 90 | > url: 'gs://your-bucket/corpora' 91 | > checksum: '5113dc04e03f079525edd8df3f4f39e3' 92 | > ``` 93 | 94 | | Name | Description | 95 | | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 96 | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. 
| 97 | | `extra` | Optional flag determining whether this asset is downloaded only if `weasel assets` is run with `--extra`. `False` by default. | 98 | | `url` | The URL to download from, using the respective protocol. | 99 | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | 100 | | `description` | Optional asset description, used in [auto-generated docs](../cli.md#closed_book-document). | 101 | 102 | ### Downloading from a Git repo 103 | 104 | If a `git` block is provided, the asset is downloaded from the given Git 105 | repository. You can download from any repo that you have access to. Under the 106 | hood, this uses Git's "sparse checkout" feature, so you're only downloading the 107 | files you need and not the whole repo. 108 | 109 | > :bulb: **Example configuration** 110 | > 111 | > ```yaml title="project.yml" 112 | > assets: 113 | > - dest: 'assets/training.spacy' 114 | > git: 115 | > repo: 'https://github.com/example/repo' 116 | > branch: 'master' 117 | > path: 'path/training.spacy' 118 | > checksum: '63373dd656daa1fd3043ce166a59474c' 119 | > description: 'The training data (5000 examples)' 120 | > ``` 121 | 122 | | Name | Description | 123 | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 124 | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | 125 | | `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.
`branch`: The branch to download from. Defaults to `"master"`. | 126 | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | 127 | | `description` | Optional asset description, used in [auto-generated docs](../cli.md#closed_book-document). | 128 | 129 | ### Working with private assets 130 | 131 | > :bulb: **Example configuration** 132 | > 133 | > ```yaml title="project.yml" 134 | > assets: 135 | > - dest: 'assets/private_training_data.json' 136 | > checksum: '63373dd656daa1fd3043ce166a59474c' 137 | > - dest: 'assets/private_vectors.bin' 138 | > checksum: '5113dc04e03f079525edd8df3f4f39e3' 139 | > ``` 140 | 141 | For many projects, the datasets and weights you're working with might be 142 | company-internal and not available over the internet. In that case, you can 143 | specify the destination paths and a checksum, and leave out the URL. When your 144 | teammates clone and run your project, they can place the files in the respective 145 | directory themselves. The [`assets`](../cli.md#open_file_folder-assets) command 146 | will alert you about missing files and mismatched checksums, so you can ensure 147 | that others are running your project with the same data. 148 | 149 | ## Dependencies and outputs 150 | 151 | Each command defined in the `project.yml` can optionally define a list of 152 | dependencies and outputs. These are the files the command requires and creates. 153 | For example, a command for training a spaCy pipeline may depend on a 154 | [`config.cfg`](https://spacy.io/usage/training#config) and the training and evaluation data, and 155 | it will export a directory `model-best`, which you can then re-use in other 156 | commands. 
157 | 158 | > :bulb: **Example configuration** 159 | > 160 | > ```yaml title="project.yml" 161 | > commands: 162 | > - name: train 163 | > help: 'Train a spaCy pipeline using the specified corpus and config' 164 | > script: 165 | > - 'python -m spacy train ./configs/config.cfg -o training/ --paths.train ./corpus/training.spacy --paths.dev ./corpus/evaluation.spacy' 166 | > deps: 167 | > - 'configs/config.cfg' 168 | > - 'corpus/training.spacy' 169 | > - 'corpus/evaluation.spacy' 170 | > outputs: 171 | > - 'training/model-best' 172 | > ``` 173 | 174 | > :boom: **Tip: Re-running vs. skipping** 175 | > 176 | > Under the hood, Weasel uses a `project.lock` lockfile that stores the details 177 | > for each command, as well as its dependencies and outputs and their checksums. 178 | > It's updated on each run. If any of this information changes, the command will 179 | > be re-run. Otherwise, it will be skipped. 180 | 181 | If you're running a command and it depends on files that are missing, Weasel will 182 | show you an error. If a command defines dependencies and outputs that haven't 183 | changed since the last run, the command will be skipped. This means that you're 184 | only re-running commands if they need to be re-run. Commands can also set 185 | `no_skip: true` if they should never be skipped – for example commands that run 186 | tests. Commands without outputs are also never skipped. To force re-running a 187 | command or workflow, even if nothing changed, you can set the `--force` flag. 188 | 189 | Note that [`weasel`](../cli.md) doesn't compile any dependency 190 | graphs based on the dependencies and outputs, and won't re-run previous steps 191 | automatically. For instance, if you only run the command `train` that depends on 192 | data created by `preprocess` and those files are missing, Weasel will show an 193 | error – it won't just re-run `preprocess`. 
If you're looking for more advanced 194 | data management, check out the [Data Version Control (DVC) integration](./integrations.md#data-version-control-dvc). 195 | If you're planning on integrating your Weasel project with DVC, you can also use 196 | `outputs_no_cache` instead of `outputs` to define outputs that won't be cached 197 | or tracked. 198 | 199 | ## Files and directory structure 200 | 201 | The `project.yml` can define a list of `directories` that should be created 202 | within a project – for instance, `assets`, `training`, `corpus` and so on. Weasel 203 | will make sure that these directories are always available, so your commands can 204 | write to and read from them. Project directories will also include all files and 205 | directories copied from the project template with 206 | [`weasel clone`](../cli.md#clipboard-clone). Here's an example of a project 207 | directory: 208 | 209 | > :bulb: **Example configuration** 210 | > 211 | > ```yaml title="project.yml" 212 | > directories: 213 | > - 'assets' 214 | > - 'configs' 215 | > - 'corpus' 216 | > - 'metas' 217 | > - 'metrics' 218 | > - 'notebooks' 219 | > - 'packages' 220 | > - 'scripts' 221 | > - 'training' 222 | > ``` 223 | > 224 | >``` title="Example directory structure" 225 | >├── project.yml # the project settings 226 | >├── project.lock # lockfile that tracks inputs/outputs 227 | >├── assets/ # downloaded data assets 228 | >├── configs/ # pipeline config.cfg files used for training 229 | >├── corpus/ # output directory for training corpus 230 | >├── metas/ # pipeline meta.json templates used for packaging 231 | >├── metrics/ # output directory for evaluation metrics 232 | >├── notebooks/ # directory for Jupyter notebooks 233 | >├── packages/ # output directory for pipeline Python packages 234 | >├── scripts/ # directory for scripts, e.g. referenced in commands 235 | >├── training/ # output directory for trained pipelines 236 | >└── ... # any other files, like a requirements.txt etc. 
237 | >``` 238 | 239 | If you don't want a project to create a directory, you can delete it and remove 240 | its entry from the `project.yml` – just make sure it's not required by any of 241 | the commands. [Custom templates](./custom-scripts.md) can use any directories they need – 242 | the only file that's required for a project is the `project.yml`. 243 | -------------------------------------------------------------------------------- /docs/tutorial/integrations.md: -------------------------------------------------------------------------------- 1 | # Integrations 2 | 3 | ## Data Version Control (DVC) 4 | 5 | Data assets like training corpora or pretrained weights are at the core of any 6 | NLP project, but they're often difficult to manage: you can't just check them 7 | into your Git repo to version and keep track of them. And if you have multiple 8 | steps that depend on each other, like a preprocessing step that generates your 9 | training data, you need to make sure the data is always up-to-date, and re-run 10 | all steps of your process every time, just to be safe. 11 | 12 | [Data Version Control](https://dvc.org) (DVC) is a standalone open-source tool 13 | that integrates into your workflow like Git, builds a dependency graph for your 14 | data pipelines and tracks and caches your data files. If you're downloading data 15 | from an external source, like a storage bucket, DVC can tell whether the 16 | resource has changed. It can also determine whether to re-run a step, depending 17 | on whether its inputs have changed or not. All metadata can be checked into a Git 18 | repo, so you'll always be able to reproduce your experiments. 19 | 20 | To set up DVC, install the package and initialize your Weasel project as a Git 21 | and DVC repo. You can also 22 | [customize your DVC installation](https://dvc.org/doc/install/macos#install-with-pip) 23 | to include support for remote storage like Google Cloud Storage, S3, Azure, SSH 24 | and more.
25 | 26 | ```bash 27 | pip install dvc # Install DVC 28 | git init # Initialize a Git repo 29 | dvc init # Initialize a DVC project 30 | ``` 31 | 32 | > :warning: **Important note on privacy** 33 | > 34 | > DVC enables usage analytics by default, so if you're working in a 35 | > privacy-sensitive environment, make sure to 36 | > [**opt-out manually**](https://dvc.org/doc/user-guide/analytics#opting-out). 37 | 38 | The [`weasel dvc`](../cli.md#repeat-dvc) command creates a `dvc.yaml` 39 | config file based on a workflow defined in your `project.yml`. Whenever you 40 | update your project, you can re-run the command to update your DVC config. You 41 | can then manage your Weasel project like any other DVC project, run 42 | [`dvc add`](https://dvc.org/doc/command-reference/add) to add and track assets 43 | and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the 44 | workflow or individual commands. 45 | 46 | ```bash 47 | python -m weasel dvc [project_dir] [workflow_name] 48 | ``` 49 | 50 | > :warning: **Important note for multiple workflows** 51 | > 52 | > DVC currently expects a single workflow per project, so when creating the config 53 | > with [`weasel dvc`](../cli.md#repeat-dvc), you need to specify the name 54 | > of a workflow defined in your `project.yml`. You can still use multiple 55 | > workflows, but only one can be tracked by DVC. 56 | -------------------------------------------------------------------------------- /docs/tutorial/remote-storage.md: -------------------------------------------------------------------------------- 1 | # Remote Storage 2 | 3 | You can persist your project outputs to a remote storage using the 4 | [`push`](../cli.md#arrow_up-push) command. This can help you **export** your 5 | pipeline packages, **share** work with your team, or **cache results** to avoid 6 | repeating work. 
The [`pull`](../cli.md#arrow_down-pull) command will download 7 | any outputs that are in the remote storage and aren't available locally. 8 | 9 | You can list one or more remotes in the `remotes` section of your 10 | [`project.yml`](./directory-and-assets.md#projectyml) by mapping a string name 11 | to the URL of the storage. Under the hood, Weasel uses 12 | [`cloudpathlib`](https://cloudpathlib.drivendata.org/) to communicate with the 13 | remote storages, so you can use any protocol that `CloudPath` supports, 14 | including [S3](https://aws.amazon.com/s3/), 15 | [Google Cloud Storage](https://cloud.google.com/storage), and the local 16 | filesystem, although you may need to install extra dependencies to use certain 17 | protocols. 18 | 19 | > :bulb: **Example using remote storage** 20 | > 21 | > ```bash 22 | > $ python -m weasel pull local 23 | > ``` 24 | > 25 | > ```yaml title="project.yml" 26 | > remotes: 27 | > default: 's3://my-weasel-bucket' 28 | > local: '/mnt/scratch/cache' 29 | > ``` 30 | 31 | > :information_source: **How it works** 32 | > 33 | > Inside the remote storage, Weasel uses a clever **directory structure** to 34 | > avoid overwriting files. The top level of the directory structure is a 35 | > URL-encoded version of the output's path. Within this directory are 36 | > subdirectories named according to a hash of the command string and the 37 | > command's dependencies. Finally, within those directories are files, named 38 | > according to an MD5 hash of their contents. 
39 | > 40 | > ``` 41 | > └── urlencoded_file_path # Path of original file 42 | > ├── some_command_hash # Hash of command you ran 43 | > │ ├── some_content_hash # Hash of file content 44 | > │ └── another_content_hash 45 | > └── another_command_hash 46 | > └── third_content_hash 47 | > ``` 48 | 49 | For instance, let's say you had the following spaCy command in your 50 | `project.yml`: 51 | 52 | ```yaml title="project.yml" 53 | - name: train 54 | help: 'Train a spaCy pipeline using the specified corpus and config' 55 | script: 56 | - 'spacy train ./config.cfg --output training/' 57 | deps: 58 | - 'corpus/train' 59 | - 'corpus/dev' 60 | - 'config.cfg' 61 | outputs: 62 | - 'training/model-best' 63 | ``` 64 | 65 | After you finish training, you run [`push`](../cli.md#arrow_up-push) to make 66 | sure the `training/model-best` output is saved to remote storage. Weasel will 67 | then construct a hash from your command script and the listed dependencies, 68 | `corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the 69 | execution context of your output. It would then compute an MD5 hash of the 70 | `training/model-best` directory, and use those three pieces of information to 71 | construct the storage URL. 72 | 73 | ```bash 74 | python -m weasel run train 75 | python -m weasel push 76 | ``` 77 | 78 | ```title="Overview of the S3 bucket" 79 | └── s3://my-weasel-bucket/training%2Fmodel-best 80 | └── 1d8cb33a06cc345ad3761c6050934a1b 81 | └── d8e20c3537a084c5c10d95899fe0b1ff 82 | ``` 83 | 84 | If you change the command or one of its dependencies (for instance, by editing 85 | the [`config.cfg`](https://spacy.io/usage/training#config) file to tune the 86 | hyperparameters), a different creation hash will be calculated, so when you use 87 | [`push`](../cli.md#arrow_up-push) you won't be overwriting your previous file. 
88 | The system even supports multiple outputs for the same file and the same 89 | context, which can happen if your training process is not deterministic, or if 90 | you have dependencies that aren't represented in the command. 91 | 92 | In summary, the `weasel` remote storages are designed to make a particular set 93 | of trade-offs. Priority is placed on **convenience**, **correctness** and 94 | **avoiding data loss**. You can use [`push`](../cli.md#arrow_up-push) freely, as 95 | you'll never overwrite remote state, and you don't have to come up with names or 96 | version numbers. However, it's up to you to manage the size of your remote 97 | storage, and to remove files that are no longer relevant to you. 98 | -------------------------------------------------------------------------------- /docs/tutorial/workflow.md: -------------------------------------------------------------------------------- 1 | # Workflow 2 | 3 | ## 1. Clone a project template 4 | 5 | > :information_source: **Cloning under the hood** 6 | > 7 | > To clone a project, Weasel calls into `git` and uses the "sparse checkout" 8 | > feature to only clone the relevant directory or directories. 9 | 10 | The [`weasel clone`](../cli.md#clipboard-clone) command clones an existing 11 | project template and copies the files to a local directory. You can then run the 12 | project, e.g. to train a pipeline and edit the commands and scripts to build 13 | fully custom workflows. 14 | 15 | ```bash 16 | python -m weasel clone pipelines/tagger_parser_ud 17 | ``` 18 | 19 | By default, the project will be cloned into the current working directory. You 20 | can specify an optional second argument to define the output directory. The 21 | `--repo` option lets you define a custom repo to clone from if you don't want to 22 | use the default [`projects`](https://github.com/explosion/projects) repo. You can 23 | also use any private repo you have access to with Git. 24 | 25 | ## 2. 
Fetch the project assets 26 | 27 | Assets are data files your project needs – for example, the training and 28 | evaluation data or pretrained vectors and embeddings to initialize your model 29 | with. Each project template comes with a `project.yml` that defines the assets 30 | to download and where to put them. The [`weasel assets`](../cli.md#open_file_folder-assets) 31 | command will fetch the project assets for you. 32 | 33 | > :bulb: **Example usage** 34 | > 35 | > ```yaml title="project.yml" 36 | > assets: 37 | > - dest: 'assets/training.spacy' 38 | > url: 'https://example.com/data.spacy' 39 | > checksum: '63373dd656daa1fd3043ce166a59474c' 40 | > - dest: 'assets/development.spacy' 41 | > git: 42 | > repo: 'https://github.com/example/repo' 43 | > branch: 'master' 44 | > path: 'path/development.spacy' 45 | > checksum: '5113dc04e03f079525edd8df3f4f39e3' 46 | > ``` 47 | > 48 | > Let Weasel fetch the assets: 49 | > 50 | > ```bash 51 | > python -m weasel assets 52 | > ``` 53 | 54 | Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and 55 | even cloud storage such as GCS and S3. You can also fetch assets using git, by 56 | replacing the `url` string with a `git` block. Weasel will use Git's "sparse 57 | checkout" feature to avoid downloading the whole repository. 58 | 59 | Sometimes your project configuration may include large assets that you don't 60 | necessarily want to download when you run `weasel assets`. That's why 61 | assets can be marked as [`extra`](./directory-and-assets.md#data-assets) - by default, these assets 62 | are not downloaded. If they should be, run `weasel assets --extra`. 63 | 64 | ## 3. Run a command 65 | 66 | Commands consist of one or more steps and can be run with 67 | [`weasel run`](../cli.md#rocket-run).
The following will run the command 68 | `preprocess` defined in the `project.yml`: 69 | 70 | > :bulb: **Example usage** 71 | > 72 | > ```yaml title="project.yml" 73 | > commands: 74 | > - name: preprocess 75 | > help: "Convert the input data to spaCy's format" 76 | > script: 77 | > - 'python -m spacy convert assets/train.conllu corpus/' 78 | > - 'python -m spacy convert assets/eval.conllu corpus/' 79 | > deps: 80 | > - 'assets/train.conllu' 81 | > - 'assets/eval.conllu' 82 | > outputs: 83 | > - 'corpus/train.spacy' 84 | > - 'corpus/eval.spacy' 85 | > ``` 86 | > 87 | > Run the command: 88 | > 89 | > ```bash 90 | > python -m weasel run preprocess 91 | > ``` 92 | 93 | Commands can define their expected [dependencies and outputs](./directory-and-assets.md#dependencies-and-outputs) 94 | using the `deps` (files the commands require) and `outputs` (files the commands 95 | create) keys. This allows your project to track changes and determine whether a 96 | command needs to be re-run. For instance, if your input data changes, you want 97 | to re-run the `preprocess` command. But if nothing changed, this step can be 98 | skipped. You can also set `--force` to force re-running a command, or `--dry` to 99 | perform a "dry run" and see what would happen (without actually running the 100 | script). 101 | 102 | ## 4. Run a workflow 103 | 104 | Workflows are series of commands that are run in order and often depend on each 105 | other. For instance, to generate a spaCy pipeline package, you might start by 106 | converting your data, then run [`spacy train`](https://spacy.io/api/cli#train) to train your 107 | pipeline on the converted data and if that's successful, run 108 | [`spacy package`](https://spacy.io/api/cli#package) to turn the best trained artifact into an 109 | installable Python package. 
The following command runs the workflow named `all` 110 | defined in the `project.yml`, and executes the commands it specifies, in order: 111 | 112 | > :bulb: **Example usage** 113 | > 114 | > ```yaml title="project.yml" 115 | > workflows: 116 | > all: 117 | > - preprocess 118 | > - train 119 | > - package 120 | > ``` 121 | > 122 | > ```bash 123 | > python -m weasel run all 124 | > ``` 125 | 126 | Using the expected [dependencies and outputs](./directory-and-assets.md#dependencies-and-outputs) 127 | defined in the commands, Weasel can determine whether to re-run a command (if its inputs or 128 | outputs have changed) or whether to skip it. If you're looking to implement more 129 | advanced data pipelines and track your changes in Git, check out the 130 | [Data Version Control (DVC) integration](./integrations.md#data-version-control-dvc). The 131 | [`weasel dvc`](../cli.md#repeat-dvc) command generates a DVC config file 132 | from a workflow defined in your `project.yml` so you can manage your Weasel 133 | project as a DVC repo. 134 | 135 | ## 5. Optional: Push to remote storage 136 | 137 | After training a pipeline, you can optionally use the 138 | [`weasel push`](../cli.md#arrow_up-push) command to upload your outputs to 139 | a remote storage, using protocols like [S3](https://aws.amazon.com/s3/), 140 | [Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help 141 | you **export** your pipeline packages, **share** work with your team, or **cache 142 | results** to avoid repeating work. 143 | 144 | > :bulb: **Example usage** 145 | > 146 | > ```yaml title="project.yml" 147 | > remotes: 148 | > default: 's3://my-weasel-bucket' 149 | > local: '/mnt/scratch/cache' 150 | > ``` 151 | > 152 | > Push to remote: 153 | > 154 | > ```bash 155 | > python -m weasel push 156 | > ``` 157 | 158 | The `remotes` section in your `project.yml` lets you assign names to the 159 | different storages. 
To download state from a remote storage, you can use the 160 | [`weasel pull`](../cli.md#arrow_down-pull) command. For more details, see the 161 | docs on [remote storage](./remote-storage.md). 162 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | ignore = [ 7 | "E501", 8 | ] 9 | select = [ 10 | "E", # pycodestyle errors 11 | "W", # pycodestyle warnings 12 | "F", # Pyflakes 13 | "Q", # flake8-quotes 14 | ] 15 | 16 | [tool.ruff.per-file-ignores] 17 | # Ignore unused imports in __init__ files 18 | "__init__.py" = ["F401"] 19 | 20 | 21 | [tool.isort] 22 | multi_line_output = 9 23 | profile = "black" 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Our libraries 2 | confection>=0.0.4,<0.2.0 3 | wasabi>=0.9.1,<1.2.0 4 | srsly>=2.4.3,<3.0.0 5 | typer>=0.3.0,<1.0.0 6 | cloudpathlib>=0.7.0,<1.0.0 7 | smart-open>=5.2.1,<8.0.0 8 | # Third party dependencies 9 | requests>=2.13.0,<3.0.0 10 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 11 | # Official Python utilities 12 | packaging>=20.0 13 | # Development dependencies 14 | black==22.3.0 15 | pytest>=5.2.0,!=7.1.0 16 | mypy>=1.5.0,<1.7.0; python_version >= "3.8" 17 | types-requests 18 | types-setuptools>=57.0.0 19 | ruff>=0.0.259 20 | isort>=5.12.0,<6.0; python_version > "3.7" 21 | pre-commit>=3.2.0,<4.0.0; python_version > "3.7" 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = weasel 3 | version = 0.4.1 4 | description = Weasel: A small and easy workflow system 5 | url = 
https://github.com/explosion/weasel/ 6 | author = Explosion 7 | author_email = contact@explosion.ai 8 | license = MIT 9 | long_description = file: README.md 10 | long_description_content_type = text/markdown 11 | classifiers = 12 | Environment :: Console 13 | Intended Audience :: Developers 14 | Intended Audience :: Science/Research 15 | License :: OSI Approved :: MIT License 16 | Operating System :: POSIX :: Linux 17 | Operating System :: MacOS :: MacOS X 18 | Operating System :: Microsoft :: Windows 19 | Programming Language :: Python :: 3 20 | Programming Language :: Python :: 3.7 21 | Programming Language :: Python :: 3.8 22 | Programming Language :: Python :: 3.9 23 | Programming Language :: Python :: 3.10 24 | Programming Language :: Python :: 3.11 25 | Programming Language :: Python :: 3.12 26 | Topic :: Scientific/Engineering 27 | project_urls = 28 | Release notes = https://github.com/explosion/weasel/releases 29 | Source = https://github.com/explosion/weasel/ 30 | 31 | [options] 32 | python_requires = >=3.7 33 | install_requires = 34 | confection>=0.0.4,<0.2.0 35 | packaging>=20.0 36 | wasabi>=0.9.1,<1.2.0 37 | srsly>=2.4.3,<3.0.0 38 | typer>=0.3.0,<1.0.0 39 | cloudpathlib>=0.7.0,<1.0.0 40 | smart-open>=5.2.1,<8.0.0 41 | requests>=2.13.0,<3.0.0 42 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 43 | 44 | 45 | [options.entry_points] 46 | console_scripts = 47 | weasel = weasel.cli:app 48 | 49 | [tool:pytest] 50 | markers = 51 | issue: references specific issue 52 | 53 | [mypy] 54 | ignore_missing_imports = True 55 | no_implicit_optional = True 56 | plugins = pydantic.mypy 57 | allow_redefinition = True 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | if __name__ == "__main__": 5 | from setuptools import find_packages, setup 6 | 7 | setup(packages=find_packages()) 8 | 
-------------------------------------------------------------------------------- /weasel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import app 2 | -------------------------------------------------------------------------------- /weasel/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli.main import COMMAND, app 2 | 3 | app(prog_name=COMMAND) 4 | -------------------------------------------------------------------------------- /weasel/about.py: -------------------------------------------------------------------------------- 1 | __projects__ = "https://github.com/explosion/projects" 2 | __projects_branch__ = "v3" 3 | -------------------------------------------------------------------------------- /weasel/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import app # isort: skip 2 | 3 | from .assets import project_assets 4 | from .clone import project_clone 5 | from .document import project_document 6 | from .dvc import project_update_dvc 7 | from .pull import project_pull 8 | from .push import project_push 9 | from .run import project_run 10 | -------------------------------------------------------------------------------- /weasel/cli/assets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | from pathlib import Path 5 | from typing import Any, Dict, Optional 6 | 7 | import requests 8 | import typer 9 | from wasabi import msg 10 | 11 | from ..util import SimpleFrozenDict, download_file, ensure_path, get_checksum 12 | from ..util import get_git_version, git_checkout, load_project_config 13 | from ..util import parse_config_overrides, working_dir 14 | from .main import PROJECT_FILE, Arg, Opt, app 15 | 16 | # Whether assets are extra if `extra` is not set. 
17 | EXTRA_DEFAULT = False 18 | 19 | 20 | @app.command( 21 | "assets", 22 | context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, 23 | ) 24 | def project_assets_cli( 25 | # fmt: off 26 | ctx: typer.Context, # This is only used to read additional arguments 27 | project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), 28 | sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."), 29 | extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.") 30 | # fmt: on 31 | ): 32 | """Fetch project assets like datasets and pretrained weights. Assets are 33 | defined in the "assets" section of the project.yml. If a checksum is 34 | provided in the project.yml, the file is only downloaded if no local file 35 | with the same checksum exists. 36 | 37 | DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md 38 | """ 39 | overrides = parse_config_overrides(ctx.args) 40 | project_assets( 41 | project_dir, 42 | overrides=overrides, 43 | sparse_checkout=sparse_checkout, 44 | extra=extra, 45 | ) 46 | 47 | 48 | def project_assets( 49 | project_dir: Path, 50 | *, 51 | overrides: Dict[str, Any] = SimpleFrozenDict(), 52 | sparse_checkout: bool = False, 53 | extra: bool = False, 54 | ) -> None: 55 | """Fetch assets for a project using DVC if possible. 56 | 57 | project_dir (Path): Path to project directory. 58 | sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files 59 | needed. 60 | extra (bool): Whether to download all assets, including those marked as 'extra'. 
61 | """ 62 | project_path = ensure_path(project_dir) 63 | config = load_project_config(project_path, overrides=overrides) 64 | assets = [ 65 | asset 66 | for asset in config.get("assets", []) 67 | if extra or not asset.get("extra", EXTRA_DEFAULT) 68 | ] 69 | if not assets: 70 | msg.warn( 71 | f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)", 72 | exits=0, 73 | ) 74 | msg.info(f"Fetching {len(assets)} asset(s)") 75 | 76 | for asset in assets: 77 | dest = (project_dir / asset["dest"]).resolve() 78 | checksum = asset.get("checksum") 79 | if "git" in asset: 80 | git_err = ( 81 | "Cloning Weasel project templates requires Git and the 'git' command. " 82 | "Make sure it's installed and that the executable is available." 83 | ) 84 | get_git_version(error=git_err) 85 | if dest.exists(): 86 | # If there's already a file, check for checksum 87 | if checksum and checksum == get_checksum(dest): 88 | msg.good( 89 | f"Skipping download with matching checksum: {asset['dest']}" 90 | ) 91 | continue 92 | else: 93 | if dest.is_dir(): 94 | shutil.rmtree(dest) 95 | else: 96 | dest.unlink() 97 | if "repo" not in asset["git"] or asset["git"]["repo"] is None: 98 | msg.fail( 99 | "A git asset must include 'repo', the repository address.", exits=1 100 | ) 101 | if "path" not in asset["git"] or asset["git"]["path"] is None: 102 | msg.fail( 103 | "A git asset must include 'path' - use \"\" to get the entire repository.", 104 | exits=1, 105 | ) 106 | git_checkout( 107 | asset["git"]["repo"], 108 | asset["git"]["path"], 109 | dest, 110 | branch=asset["git"].get("branch"), 111 | sparse=sparse_checkout, 112 | ) 113 | msg.good(f"Downloaded asset {dest}") 114 | else: 115 | url = asset.get("url") 116 | if not url: 117 | # project.yml defines asset without URL that the user has to place 118 | check_private_asset(dest, checksum) 119 | continue 120 | fetch_asset(project_path, url, dest, checksum) 121 | 122 | 123 | def check_private_asset(dest: Path, 
checksum: Optional[str] = None) -> None: 124 | """Check and validate assets without a URL (private assets that the user 125 | has to provide themselves) and give feedback about the checksum. 126 | 127 | dest (Path): Destination path of the asset. 128 | checksum (Optional[str]): Optional checksum of the expected file. 129 | """ 130 | if not Path(dest).exists(): 131 | err = f"No URL provided for asset. You need to add this file yourself: {dest}" 132 | msg.warn(err) 133 | else: 134 | if not checksum: 135 | msg.good(f"Asset already exists: {dest}") 136 | elif checksum == get_checksum(dest): 137 | msg.good(f"Asset exists with matching checksum: {dest}") 138 | else: 139 | msg.fail(f"Asset available but with incorrect checksum: {dest}") 140 | 141 | 142 | def fetch_asset( 143 | project_path: Path, url: str, dest: Path, checksum: Optional[str] = None 144 | ) -> None: 145 | """Fetch an asset from a given URL or path. If a checksum is provided and a 146 | local file exists, it's only re-downloaded if the checksum doesn't match. 147 | 148 | project_path (Path): Path to project directory. 149 | url (str): URL or path to asset. 150 | checksum (Optional[str]): Optional expected checksum of local file. 151 | RETURNS (Optional[Path]): The path to the fetched asset or None if fetching 152 | the asset failed. 
153 | """ 154 | dest_path = (project_path / dest).resolve() 155 | if dest_path.exists(): 156 | # If there's already a file, check for checksum 157 | if checksum: 158 | if checksum == get_checksum(dest_path): 159 | msg.good(f"Skipping download with matching checksum: {dest}") 160 | return 161 | else: 162 | # If there's not a checksum, make sure the file is a possibly valid size 163 | if os.path.getsize(dest_path) == 0: 164 | msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}") 165 | os.remove(dest_path) 166 | # We might as well support the user here and create parent directories in 167 | # case the asset dir isn't listed as a dir to create in the project.yml 168 | if not dest_path.parent.exists(): 169 | dest_path.parent.mkdir(parents=True) 170 | with working_dir(project_path): 171 | url = convert_asset_url(url) 172 | try: 173 | download_file(url, dest_path) 174 | msg.good(f"Downloaded asset {dest}") 175 | except requests.exceptions.RequestException as e: 176 | if Path(url).exists() and Path(url).is_file(): 177 | # If it's a local file, copy to destination 178 | shutil.copy(url, str(dest_path)) 179 | msg.good(f"Copied local asset {dest}") 180 | else: 181 | msg.fail(f"Download failed: {dest}", e) 182 | if checksum and checksum != get_checksum(dest_path): 183 | msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") 184 | 185 | 186 | def convert_asset_url(url: str) -> str: 187 | """Check and convert the asset URL if needed. 188 | 189 | url (str): The asset URL. 190 | RETURNS (str): The converted URL. 191 | """ 192 | # If the asset URL is a regular GitHub URL it's likely a mistake 193 | if ( 194 | re.match(r"(http(s?)):\/\/github.com", url) 195 | and "releases/download" not in url 196 | and "/raw/" not in url 197 | ): 198 | converted = url.replace("github.com", "raw.githubusercontent.com") 199 | converted = re.sub(r"/(tree|blob)/", "/", converted) 200 | msg.warn( 201 | "Downloading from a regular GitHub URL. 
This will only download " 202 | "the source of the page, not the actual file. Converting the URL " 203 | "to a raw URL.", 204 | converted, 205 | ) 206 | return converted 207 | return url 208 | -------------------------------------------------------------------------------- /weasel/cli/clone.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | import typer 7 | from wasabi import msg 8 | 9 | from .. import about 10 | from ..util import ensure_path, get_git_version, git_checkout, git_repo_branch_exists 11 | from .main import COMMAND, PROJECT_FILE, Arg, Opt, _get_parent_command, app 12 | 13 | DEFAULT_REPO = about.__projects__ 14 | DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ 15 | DEFAULT_BRANCHES = ["main", "master"] 16 | 17 | 18 | @app.command("clone") 19 | def project_clone_cli( 20 | # fmt: off 21 | ctx: typer.Context, # This is only used to read the parent command 22 | name: str = Arg(..., help="The name of the template to clone"), 23 | dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), 24 | repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), 25 | branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"), 26 | sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+."), 27 | # fmt: on 28 | ): 29 | """Clone a project template from a repository. Calls into "git" and will 30 | only download the files from the given subdirectory. The GitHub repo 31 | defaults to the official Weasel template repo, but can be customized 32 | (including using a private repo). 
33 | 34 | DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone 35 | """ 36 | if dest is None: 37 | dest = Path.cwd() / Path(name).parts[-1] 38 | if repo == DEFAULT_REPO and branch is None: 39 | branch = DEFAULT_PROJECTS_BRANCH 40 | 41 | if branch is None: 42 | for default_branch in DEFAULT_BRANCHES: 43 | if git_repo_branch_exists(repo, default_branch): 44 | branch = default_branch 45 | break 46 | if branch is None: 47 | default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES) 48 | msg.fail( 49 | "No branch provided and attempted default " 50 | f"branches {default_branches_msg} do not exist.", 51 | exits=1, 52 | ) 53 | else: 54 | if not git_repo_branch_exists(repo, branch): 55 | msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1) 56 | assert isinstance(branch, str) 57 | parent_command = _get_parent_command(ctx) 58 | project_clone( 59 | name, 60 | dest, 61 | repo=repo, 62 | branch=branch, 63 | sparse_checkout=sparse_checkout, 64 | parent_command=parent_command, 65 | ) 66 | 67 | 68 | def project_clone( 69 | name: str, 70 | dest: Path, 71 | *, 72 | repo: str = about.__projects__, 73 | branch: str = about.__projects_branch__, 74 | sparse_checkout: bool = False, 75 | parent_command: str = COMMAND, 76 | ) -> None: 77 | """Clone a project template from a repository. 78 | 79 | name (str): Name of subdirectory to clone. 80 | dest (Path): Destination path of cloned project. 81 | repo (str): URL of Git repo containing project templates. 
branch (str): The branch to clone from
    """
    dest = ensure_path(dest)
    # Validate Git availability and the destination before touching anything.
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Strip the GitHub URL prefix so messages show "owner/repo" instead of the full URL.
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
        msg.fail(err, exits=1)
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good("Your project is now ready!")
        # parent_command reflects how the CLI was invoked (e.g. "python -m weasel").
        print(f"To fetch the assets, run:\n{parent_command} assets {dest}")


def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    git_err = (
        f"Cloning Weasel project templates requires Git and the 'git' command. "
        f"To clone a project without Git, copy the files from the '{name}' "
        f"directory in the {repo} to {dest} manually."
    )
    # Exits with git_err if no usable Git executable is found.
    get_git_version(error=git_err)
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}.
" 124 | f"Create the necessary folder(s) first before continuing.", 125 | exits=1, 126 | ) 127 | -------------------------------------------------------------------------------- /weasel/cli/document.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from wasabi import MarkdownRenderer, msg 4 | 5 | from ..util import load_project_config, working_dir 6 | from .main import PROJECT_FILE, Arg, Opt, app 7 | 8 | DOCS_URL = "https://github.com/explosion/weasel" 9 | INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the 10 | project, as well as the available commands and workflows. For details, see the 11 | [Weasel documentation]({DOCS_URL}).""" 12 | INTRO_COMMANDS = f"""The following commands are defined by the project. They 13 | can be executed using [`weasel run [name]`]({DOCS_URL}/tree/main/docs/cli.md#rocket-run). 14 | Commands are only re-run if their inputs have changed.""" 15 | INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They 16 | can be executed using [`weasel run [name]`]({DOCS_URL}/tree/main/docs/cli.md#rocket-run) 17 | and will run the specified commands in order. Commands are only re-run if their 18 | inputs have changed.""" 19 | INTRO_ASSETS = f"""The following assets are defined by the project. They can 20 | be fetched by running [`weasel assets`]({DOCS_URL}/tree/main/docs/cli.md#open_file_folder-assets) 21 | in the project directory.""" 22 | # These markers are added to the Markdown and can be used to update the file in 23 | # place if it already exists. Only the auto-generated part will be replaced. 
MARKER_TAGS = ("WEASEL", "SPACY PROJECT")
# NOTE(review): these marker templates appear here as empty strings, yet they
# are used below with .format(tag=...) and searched for in existing READMEs.
# They look like HTML-comment templates whose content was lost in transit —
# confirm the real marker strings against the repository.
MARKER_START = ""
MARKER_END = ""
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = ""


@app.command("document")
def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document
    """
    project_document(project_dir, output_file, no_emoji=no_emoji)


def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    """Render Markdown documentation for the project config and either print
    it to stdout (output_file == "-") or write/merge it into output_file.

    project_dir (Path): Path to the project directory containing project.yml.
    output_file (Path): Output path, or "-" for standard output.
    no_emoji (bool): Don't use emoji in the generated headings.
    """
    # "-" means write to standard output instead of a file.
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    md.add(MARKER_START.format(tag="WEASEL"))
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"Weasel Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", "⏯"))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs =
config.get("workflows", {}).items() 73 | data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] 74 | if data: 75 | md.add(md.title(3, "Workflows", "⏭")) 76 | md.add(INTRO_WORKFLOWS) 77 | md.add(md.table(data, ["Workflow", "Steps"])) 78 | # Assets 79 | assets = config.get("assets", []) 80 | data = [] 81 | for a in assets: 82 | source = "Git" if a.get("git") else "URL" if a.get("url") else "Local" 83 | dest_path = a["dest"] 84 | dest = md.code(dest_path) 85 | if source == "Local": 86 | # Only link assets if they're in the repo 87 | with working_dir(project_dir) as p: 88 | if (p / dest_path).exists(): 89 | dest = md.link(dest, dest_path) 90 | data.append((dest, source, a.get("description", ""))) 91 | if data: 92 | md.add(md.title(3, "Assets", "🗂")) 93 | md.add(INTRO_ASSETS) 94 | md.add(md.table(data, ["File", "Source", "Description"])) 95 | md.add(MARKER_END.format(tag="WEASEL")) 96 | # Output result 97 | if is_stdout: 98 | print(md.text) 99 | else: 100 | content = md.text 101 | if output_file.exists(): 102 | with output_file.open("r", encoding="utf8") as f: 103 | existing = f.read() 104 | 105 | for marker_tag in MARKER_TAGS: 106 | if MARKER_IGNORE.format(tag=marker_tag) in existing: 107 | msg.warn( 108 | "Found ignore marker in existing file: skipping", output_file 109 | ) 110 | return 111 | 112 | marker_tag_found = False 113 | for marker_tag in MARKER_TAGS: 114 | markers = { 115 | "start": MARKER_START.format(tag=marker_tag), 116 | "end": MARKER_END.format(tag=marker_tag), 117 | } 118 | if markers["start"] in existing and markers["end"] in existing: 119 | marker_tag_found = True 120 | msg.info("Found existing file: only replacing auto-generated docs") 121 | before = existing.split(markers["start"])[0] 122 | after = existing.split(markers["end"])[1] 123 | content = f"{before}{content}{after}" 124 | break 125 | if not marker_tag_found: 126 | msg.warn("Replacing existing file") 127 | 128 | with output_file.open("w", encoding="utf8") as f: 129 | 
f.write(content) 130 | msg.good("Saved project documentation", output_file) 131 | -------------------------------------------------------------------------------- /weasel/cli/dvc.py: -------------------------------------------------------------------------------- 1 | """This module contains helpers and subcommands for integrating Weasel 2 | with Data Version Control (DVC). https://dvc.org""" 3 | import subprocess 4 | from pathlib import Path 5 | from typing import Any, Dict, List, Optional 6 | 7 | from wasabi import msg 8 | 9 | from ..util import get_hash, join_command, load_project_config, run_command, working_dir 10 | from .main import COMMAND, NAME, PROJECT_FILE, Arg, Opt, app 11 | 12 | DVC_CONFIG = "dvc.yaml" 13 | DVC_DIR = ".dvc" 14 | UPDATE_COMMAND = "dvc" 15 | DVC_CONFIG_COMMENT = f"""# This file is auto-generated by Weasel based on your {PROJECT_FILE}. If you've 16 | # edited your {PROJECT_FILE}, you can regenerate this file by running: 17 | # {COMMAND} {UPDATE_COMMAND}""" 18 | 19 | 20 | @app.command(UPDATE_COMMAND) 21 | def project_update_dvc_cli( 22 | # fmt: off 23 | project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), 24 | workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."), 25 | verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), 26 | quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"), 27 | force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), 28 | # fmt: on 29 | ): 30 | """Auto-generate Data Version Control (DVC) config. A DVC 31 | project can only define one pipeline, so you need to specify one workflow 32 | defined in the project.yml. If no workflow is specified, the first defined 33 | workflow is used. The DVC config will only be updated if the project.yml 34 | changed. 
DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)


def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    # update_dvc_config returns True only when dvc.yaml was (re)generated.
    updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
    )
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
    else:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)


def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
86 | verbose (bool): Whether to print additional info (via DVC). 87 | quiet (bool): Don't output anything (via DVC). 88 | force (bool): Force update, even if hashes match. 89 | RETURNS (bool): Whether the DVC config file was updated. 90 | """ 91 | ensure_dvc(path) 92 | workflows = config.get("workflows", {}) 93 | workflow_names = list(workflows.keys()) 94 | check_workflows(workflow_names, workflow) 95 | if not workflow: 96 | workflow = workflow_names[0] 97 | config_hash = get_hash(config) 98 | path = path.resolve() 99 | dvc_config_path = path / DVC_CONFIG 100 | if dvc_config_path.exists(): 101 | # Check if the file was generated using the current config, if not, redo 102 | with dvc_config_path.open("r", encoding="utf8") as f: 103 | ref_hash = f.readline().strip().replace("# ", "") 104 | if ref_hash == config_hash and not force: 105 | return False # Nothing has changed in project.yml, don't need to update 106 | dvc_config_path.unlink() 107 | dvc_commands = [] 108 | config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} 109 | 110 | # some flags that apply to every command 111 | flags = [] 112 | if verbose: 113 | flags.append("--verbose") 114 | if quiet: 115 | flags.append("--quiet") 116 | 117 | for name in workflows[workflow]: 118 | command = config_commands[name] 119 | deps = command.get("deps", []) 120 | outputs = command.get("outputs", []) 121 | outputs_no_cache = command.get("outputs_no_cache", []) 122 | if not deps and not outputs and not outputs_no_cache: 123 | continue 124 | # Default to the working dir as the project path since dvc.yaml is auto-generated 125 | # and we don't want arbitrary paths in there 126 | project_cmd = ["python", "-m", NAME, "project", "run", name] 127 | deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] 128 | outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] 129 | outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] 130 | 131 | dvc_cmd = ["run", *flags, "-n", 
name, "-w", str(path), "--no-exec"] 132 | if command.get("no_skip"): 133 | dvc_cmd.append("--always-changed") 134 | full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd] 135 | dvc_commands.append(join_command(full_cmd)) 136 | 137 | if not dvc_commands: 138 | # If we don't check for this, then there will be an error when reading the 139 | # config, since DVC wouldn't create it. 140 | msg.fail( 141 | "No usable commands for DVC found. This can happen if none of your " 142 | "commands have dependencies or outputs.", 143 | exits=1, 144 | ) 145 | 146 | with working_dir(path): 147 | for c in dvc_commands: 148 | dvc_command = "dvc " + c 149 | run_command(dvc_command) 150 | with dvc_config_path.open("r+", encoding="utf8") as f: 151 | content = f.read() 152 | f.seek(0, 0) 153 | f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}") 154 | return True 155 | 156 | 157 | def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None: 158 | """Validate workflows provided in project.yml and check that a given 159 | workflow can be used to generate a DVC config. 160 | 161 | workflows (List[str]): Names of the available workflows. 162 | workflow (Optional[str]): The name of the workflow to convert. 163 | """ 164 | if not workflows: 165 | msg.fail( 166 | f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, " 167 | f"define at least one list of commands.", 168 | exits=1, 169 | ) 170 | if workflow is not None and workflow not in workflows: 171 | msg.fail( 172 | f"Workflow '{workflow}' not defined in {PROJECT_FILE}. " 173 | f"Available workflows: {', '.join(workflows)}", 174 | exits=1, 175 | ) 176 | if not workflow: 177 | msg.warn( 178 | f"No workflow specified for DVC pipeline. 
Using the first workflow " 179 | f"defined in {PROJECT_FILE}: '{workflows[0]}'" 180 | ) 181 | 182 | 183 | def ensure_dvc(project_dir: Path) -> None: 184 | """Ensure that the "dvc" command is available and that the current project 185 | directory is an initialized DVC project. 186 | """ 187 | try: 188 | subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL) 189 | except Exception: 190 | msg.fail( 191 | "To use Weasel with DVC (Data Version Control), DVC needs " 192 | "to be installed and the 'dvc' command needs to be available", 193 | "You can install the Python package from pip (pip install dvc) or " 194 | "conda (conda install -c conda-forge dvc). For more details, see the " 195 | "documentation: https://dvc.org/doc/install", 196 | exits=1, 197 | ) 198 | if not (project_dir / ".dvc").exists(): 199 | msg.fail( 200 | "Project not initialized as a DVC project", 201 | "To initialize a DVC project, you can run 'dvc init' in the project " 202 | "directory. For more details, see the documentation: " 203 | "https://dvc.org/doc/command-reference/init", 204 | exits=1, 205 | ) 206 | -------------------------------------------------------------------------------- /weasel/cli/main.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | COMMAND = "python -m weasel" 4 | NAME = "weasel" 5 | HELP = """weasel Command-line Interface 6 | 7 | DOCS: https://github.com/explosion/weasel 8 | """ 9 | 10 | PROJECT_FILE = "project.yml" 11 | PROJECT_LOCK = "project.lock" 12 | 13 | # Wrappers for Typer's annotations. Initially created to set defaults and to 14 | # keep the names short, but not needed at the moment. 
15 | Arg = typer.Argument 16 | Opt = typer.Option 17 | 18 | app = typer.Typer(name=NAME, help=HELP, no_args_is_help=True) 19 | 20 | 21 | def _get_parent_command(ctx: typer.Context) -> str: 22 | parent_command = "" 23 | ctx_parent = ctx.parent 24 | while ctx_parent: 25 | if ctx_parent.info_name: 26 | parent_command = ctx_parent.info_name + " " + parent_command 27 | ctx_parent = ctx_parent.parent 28 | else: 29 | return COMMAND 30 | return parent_command.strip() 31 | -------------------------------------------------------------------------------- /weasel/cli/pull.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from wasabi import msg 4 | 5 | from ..util import load_project_config, logger 6 | from .main import Arg, app 7 | from .remote_storage import RemoteStorage, get_command_hash 8 | from .run import update_lockfile 9 | 10 | 11 | @app.command("pull") 12 | def project_pull_cli( 13 | # fmt: off 14 | remote: str = Arg("default", help="Name or path of remote storage"), 15 | project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), 16 | # fmt: on 17 | ): 18 | """Retrieve available precomputed outputs from a remote storage. 19 | You can alias remotes in your project.yml by mapping them to storage paths. 20 | A storage can be anything that the smart_open library can upload to, e.g. 21 | AWS, Google Cloud Storage, SSH, local directories etc. 22 | 23 | DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-push 24 | """ 25 | for url, output_path in project_pull(project_dir, remote): 26 | if url is not None: 27 | msg.good(f"Pulled {output_path} from {url}") 28 | 29 | 30 | def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): 31 | # TODO: We don't have tests for this :(. It would take a bit of mockery to 32 | # set up. I guess see if it breaks first? 
33 | config = load_project_config(project_dir) 34 | if remote in config.get("remotes", {}): 35 | remote = config["remotes"][remote] 36 | storage = RemoteStorage(project_dir, remote) 37 | commands = list(config.get("commands", [])) 38 | # We use a while loop here because we don't know how the commands 39 | # will be ordered. A command might need dependencies from one that's later 40 | # in the list. 41 | while commands: 42 | for i, cmd in enumerate(list(commands)): 43 | logger.debug("CMD: %s.", cmd["name"]) 44 | deps = [project_dir / dep for dep in cmd.get("deps", [])] 45 | if all(dep.exists() for dep in deps): 46 | cmd_hash = get_command_hash("", "", deps, cmd["script"]) 47 | for output_path in cmd.get("outputs", []): 48 | url = storage.pull(output_path, command_hash=cmd_hash) 49 | logger.debug( 50 | "URL: %s for %s with command hash %s", 51 | url, 52 | output_path, 53 | cmd_hash, 54 | ) 55 | yield url, output_path 56 | 57 | out_locs = [project_dir / out for out in cmd.get("outputs", [])] 58 | if all(loc.exists() for loc in out_locs): 59 | update_lockfile(project_dir, cmd) 60 | # We remove the command from the list here, and break, so that 61 | # we iterate over the loop again. 62 | commands.pop(i) 63 | break 64 | else: 65 | logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) 66 | else: 67 | # If we didn't break the for loop, break the while loop. 
68 | break 69 | -------------------------------------------------------------------------------- /weasel/cli/push.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from wasabi import msg 4 | 5 | from ..util import load_project_config, logger 6 | from .main import Arg, app 7 | from .remote_storage import RemoteStorage, get_command_hash, get_content_hash 8 | 9 | 10 | @app.command("push") 11 | def project_push_cli( 12 | # fmt: off 13 | remote: str = Arg("default", help="Name or path of remote storage"), 14 | project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), 15 | # fmt: on 16 | ): 17 | """Persist outputs to a remote storage. You can alias remotes in your 18 | project.yml by mapping them to storage paths. A storage can be anything that 19 | the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, 20 | local directories etc. 21 | 22 | DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push 23 | """ 24 | for output_path, url in project_push(project_dir, remote): 25 | if url is None: 26 | msg.info(f"Skipping {output_path}") 27 | else: 28 | msg.good(f"Pushed {output_path} to {url}") 29 | 30 | 31 | def project_push(project_dir: Path, remote: str): 32 | """Persist outputs to a remote storage. You can alias remotes in your project.yml 33 | by mapping them to storage paths. A storage can be anything that the smart_open 34 | library can upload to, e.g. 
gcs, aws, ssh, local directories etc 35 | """ 36 | config = load_project_config(project_dir) 37 | if remote in config.get("remotes", {}): 38 | remote = config["remotes"][remote] 39 | storage = RemoteStorage(project_dir, remote) 40 | for cmd in config.get("commands", []): 41 | logger.debug("CMD: %s", cmd["name"]) 42 | deps = [project_dir / dep for dep in cmd.get("deps", [])] 43 | if any(not dep.exists() for dep in deps): 44 | logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) 45 | continue 46 | cmd_hash = get_command_hash( 47 | "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] 48 | ) 49 | logger.debug("CMD_HASH: %s", cmd_hash) 50 | for output_path in cmd.get("outputs", []): 51 | output_loc = project_dir / output_path 52 | if output_loc.exists() and _is_not_empty_dir(output_loc): 53 | url = storage.push( 54 | output_path, 55 | command_hash=cmd_hash, 56 | content_hash=get_content_hash(output_loc), 57 | ) 58 | logger.debug( 59 | "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash 60 | ) 61 | yield output_path, url 62 | 63 | 64 | def _is_not_empty_dir(loc: Path): 65 | if not loc.is_dir(): 66 | return True 67 | elif any(_is_not_empty_dir(child) for child in loc.iterdir()): 68 | return True 69 | else: 70 | return False 71 | -------------------------------------------------------------------------------- /weasel/cli/remote_storage.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import site 4 | import sys 5 | import tarfile 6 | import urllib.parse 7 | from pathlib import Path 8 | from typing import TYPE_CHECKING, Dict, List, Optional 9 | 10 | from wasabi import msg 11 | 12 | from ..errors import Errors 13 | from ..util import check_spacy_env_vars, download_file, ensure_pathy, get_checksum 14 | from ..util import get_hash, make_tempdir, upload_file 15 | 16 | if TYPE_CHECKING: 17 | from cloudpathlib import CloudPath 18 | 19 | 20 | class 
class RemoteStorage:
    """Push and pull outputs to and from a remote file storage.

    Remotes can be anything that `smart_open` can support: AWS, GCS, file system,
    ssh, etc.
    """

    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        """project_root (Path): Directory all stored paths are relative to.
        url (str): Location of the remote storage.
        compression (str): tarfile compression suffix; falsy means no compression.
        """
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "CloudPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            # Same path and hashes imply identical content: skip the upload.
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                tar_file.add(str(loc), arcname=str(path))
            upload_file(tar_loc, url)
        return url

    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["CloudPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are returned. If no results are available, an error is raised.
        """
        dest = self.root / path
        if dest.exists():
            # Never overwrite outputs that already exist locally.
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()

                    # Disallow paths outside the current directory for the tar
                    # file (CVE-2007-4559, directory traversal vulnerability)
                    def is_within_directory(directory, target):
                        abs_directory = os.path.abspath(directory)
                        abs_target = os.path.abspath(target)
                        prefix = os.path.commonprefix([abs_directory, abs_target])
                        return prefix == abs_directory

                    def safe_extract(tar, path):
                        for member in tar.getmembers():
                            member_path = os.path.join(path, member.name)
                            if not is_within_directory(path, member_path):
                                # FIX: the unsafe-path-traversal message is
                                # defined as Errors.E801 in weasel/errors.py;
                                # Errors.E201 does not exist and would raise
                                # AttributeError instead of the intended error.
                                raise ValueError(Errors.E801)
                        if sys.version_info >= (3, 12):
                            tar.extractall(path, filter="data")
                        else:
                            tar.extractall(path)

                    safe_extract(tar_file, self.root)
        return url

    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["CloudPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
            if content_hash is not None:
                urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            try:
                # Prefer the most recently modified candidate.
                urls.sort(key=lambda x: x.stat().st_mtime)
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "CloudPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)


def get_content_hash(loc: Path) -> str:
    """Return the checksum of a file or directory's contents."""
    return get_checksum(loc)
def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command. This includes the
    currently installed packages, whatever environment variables have been marked
    as relevant, and the command.

    site_hash (str): Hash of the installed packages.
    env_hash (str): Hash of the relevant environment variables.
    deps (List[Path]): Paths of the command's file dependencies.
    cmd (List[str]): The script lines of the command.
    RETURNS (str): Hex digest of the combined hash.
    """
    check_spacy_env_vars()
    # Sort deps so the hash doesn't depend on declaration order.
    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
    hashes = [site_hash, env_hash] + dep_checksums
    hashes.extend(cmd)
    creation_bytes = "".join(hashes).encode("utf8")
    return hashlib.md5(creation_bytes).hexdigest()


def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.
    """
    site_dirs = site.getsitepackages()
    if site.ENABLE_USER_SITE:
        # FIX: site.getusersitepackages() returns a single path *string*, so it
        # must be appended; extend() would add one entry per character.
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    # FIX: hashlib has no `md5sum` attribute (AttributeError); use hashlib.md5.
    return hashlib.md5(package_bytes).hexdigest()


def get_env_hash(env: Dict[str, str]) -> str:
    """Construct a hash of the environment variables that will be passed into
    the commands.

    Values in the env dict may be references to the current os.environ, using
    the syntax $ENV_VAR to mean os.environ[ENV_VAR]

    env (Dict[str, str]): Mapping of names to values or $VAR references.
    RETURNS (str): A stable hash of the resolved mapping.
    """
    env_vars = {}
    for key, value in env.items():
        if value.startswith("$"):
            # Resolve $NAME against the live environment; missing vars become "".
            env_vars[key] = os.environ.get(value[1:], "")
        else:
            env_vars[key] = value
    return get_hash(env_vars)
@app.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#rocket-run
    """
    parent = _get_parent_command(ctx)
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand, parent)
        return
    project_run(
        project_dir,
        subcommand,
        overrides=parse_config_overrides(ctx.args),
        force=force,
        dry=dry,
        parent_command=parent,
    )


def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
    parent_command: str = COMMAND,
) -> None:
    """Run a named command or workflow defined in the project.yml. Workflow
    steps are executed in order; individual commands are skipped when their
    lockfile entry shows nothing changed (unless forced).

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command or workflow to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): No longer used, deprecated.
    parent_command (str): Command string used to invoke Weasel.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {entry["name"]: entry for entry in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)

    if subcommand in workflows:
        # A workflow: run each of its steps recursively, in order.
        msg.info(f"Running workflow '{subcommand}'")
        for step_name in workflows[subcommand]:
            project_run(
                project_dir,
                step_name,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
            )
        return

    command = commands[subcommand]
    for dep in command.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
        # In a dry run we report the problem but don't exit.
        msg.fail(err, err_help, exits=None if dry else 1)
    check_spacy_env_vars()
    with working_dir(project_dir) as current_dir:
        msg.divider(subcommand)
        needs_rerun = check_rerun(current_dir, command)
        if needs_rerun or force:
            run_commands(command["script"], dry=dry, capture=capture)
            if not dry:
                update_lockfile(current_dir, command)
        else:
            msg.info(f"Skipping '{command['name']}': nothing changed")
def print_run_help(
    project_dir: Path, subcommand: Optional[str] = None, parent_command: str = COMMAND
) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    parent_command (str): Command string used to invoke Weasel.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {entry["name"]: entry for entry in config_commands}
    workflows = config.get("workflows", {})
    # Only show the project location when it isn't the current directory.
    project_loc = "" if is_cwd(project_dir) else project_dir
    if not subcommand:
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {parent_command} run [COMMAND] {project_loc}")
            msg.table([(entry["name"], entry.get("help", "")) for entry in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {parent_command} run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
        return
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    print(f"Usage: {parent_command} run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = [
            (f"{i + 1}. {step}", commands[step].get("help", ""))
            for i, step in enumerate(steps)
        ]
        msg.table(steps_data)
        help_cmd = f"{parent_command} run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")


def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (Iterable[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    for raw in commands:
        parts = split_command(raw)
        # Rewrite bare "python"/"pip" to the interpreter Weasel runs under, so
        # scripts always execute in the same environment as Weasel itself.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if parts and parts[0] in ("python", "python3"):
            parts[0] = sys.executable
        elif parts and parts[0] in ("pip", "pip3"):
            parts = [sys.executable, "-m", "pip", *parts[1:]]
        if not silent:
            print(f"Running command: {join_command(parts)}")
        if not dry:
            run_command(parts, capture=capture)


def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    help_msg = []
    if subcommand in ("assets", "asset"):
        help_msg.append("Did you mean to run: python -m weasel assets?")
    if commands:
        help_msg.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        help_msg.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(help_msg),
        exits=1,
    )
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    # Always rerun if no-skip is set
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():
        # No lockfile yet, so the command has never been recorded: run it.
        return True
    lock_data = srsly.read_yaml(lock_path)
    if command["name"] not in lock_data:
        # No entry for this command in the lockfile: run it.
        return True
    entry = lock_data[command["name"]]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # Rerun only when the freshly computed lock entry differs from the stored
    # one, i.e. some input/output hash or script line changed.
    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)


def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        srsly.write_yaml(lock_path, {})
        data = {}
    data[command["name"]] = get_lock_entry(project_dir, command)
    srsly.write_yaml(lock_path, data)


def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], *, parent_command: str = COMMAND
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    parent_command (str): Command string used to invoke Weasel.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps = get_fileinfo(project_dir, command.get("deps", []))
    outs = get_fileinfo(project_dir, command.get("outputs", []))
    no_cache_outs = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
    return {
        "cmd": f"{parent_command} run {command['name']}",
        "script": command["script"],
        "deps": deps,
        "outs": [*outs, *no_cache_outs],
    }
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    RETURNS (List[Dict[str, Optional[str]]]): The lockfile entries for the files.
    """
    info = []
    for rel_path in paths:
        target = project_dir / rel_path
        # Missing files get a None checksum rather than raising.
        checksum = get_checksum(target) if target.exists() else None
        info.append({"path": rel_path, "md5": checksum})
    return info


# Platform detection flags (weasel/compat.py).
is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"


class ErrorsWithCodes(type):
    """Metaclass that prefixes every message attribute with its code, so
    Errors.E001 renders as "[E001] ..." without repeating the code by hand."""

    def __getattribute__(self, code):
        raw = super().__getattribute__(code)
        if code.startswith("__"):  # python system attributes like __class__
            return raw
        return "[{code}] {msg}".format(code=code, msg=raw)


class Warnings(metaclass=ErrorsWithCodes):
    # File system
    W801 = "Could not clean/remove the temp directory at {dir}: {msg}."
    W802 = (
        "Remote storage is not yet supported for Python 3.12 with "
        "cloudpathlib. Please use Python 3.11 or earlier for remote storage."
    )


class Errors(metaclass=ErrorsWithCodes):
    # API - Datastructure
    E001 = (
        "Can't write to frozen dictionary. This is likely an internal "
        "error. Are you writing to a default function argument?"
    )
    E002 = (
        "Can't write to frozen list. Maybe you're trying to modify a computed "
        "property or default function argument?"
    )

    # Workflow
    E501 = "Can not execute command '{str_command}'. Do you have '{tool}' installed?"

    # File system
    E801 = "The tar file pulled from the remote attempted an unsafe path " "traversal."
class ProjectConfigAssetGit(BaseModel):
    """An asset downloaded from a Git repository (sparse checkout)."""
    # fmt: off
    git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    description: Optional[StrictStr] = Field(None, title="Description of asset")
    # fmt: on


class ProjectConfigCommand(BaseModel):
    # fmt: off
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
    # fmt: on

    class Config:
        title = "A single named command specified in a project config"
        extra = "forbid"


class ProjectConfigSchema(BaseModel):
    # fmt: off
    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    # FIX: corrected title typo "shortucts" -> "shortcuts".
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    title: Optional[str] = Field(None, title="Project title")
    # fmt: on

    class Config:
        title = "Schema for project configuration file"

    @root_validator(pre=True)
    def check_legacy_keys(cls, obj: Dict[str, Any]) -> Dict[str, Any]:
        """Warn (but don't fail) on config keys from the spaCy-projects era
        that Weasel no longer acts on."""
        if "spacy_version" in obj:
            msg.warn(
                "Your project configuration file includes a `spacy_version` key, "
                "which is now deprecated. Weasel will not validate your version of spaCy.",
            )
        if "check_requirements" in obj:
            msg.warn(
                "Your project configuration file includes a `check_requirements` key, "
                "which is now deprecated. Weasel will not validate your requirements.",
            )
        return obj
def test_project_config_validation_full():
    """A fully-featured, valid project config should yield zero errors."""
    assets = [
        {
            "dest": "x",
            "extra": True,
            "url": "https://example.com",
            "checksum": "63373dd656daa1fd3043ce166a59474c",
        },
        {
            "dest": "y",
            "git": {
                "repo": "https://github.com/example/repo",
                "branch": "develop",
                "path": "y",
            },
        },
        {
            "dest": "z",
            "extra": False,
            "url": "https://example.com",
            "checksum": "63373dd656daa1fd3043ce166a59474c",
        },
    ]
    commands = [
        {
            "name": "train",
            "help": "Train a model",
            "script": ["python -m spacy train config.cfg -o training"],
            "deps": ["config.cfg", "corpus/training.spcy"],
            "outputs": ["training/model-best"],
        },
        {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
    ]
    config = {
        "vars": {"some_var": 20},
        "directories": ["assets", "configs", "corpus", "scripts", "training"],
        "assets": assets,
        "commands": commands,
        "workflows": {"all": ["train", "test"], "train": ["train"]},
    }
    assert validate(ProjectConfigSchema, config) == []
@pytest.mark.parametrize(
    "parent,child,expected",
    [
        ("/tmp", "/tmp", True),
        ("/tmp", "/", False),
        ("/tmp", "/tmp/subdir", True),
        ("/tmp", "/tmpdir", False),
        ("/tmp", "/tmp/subdir/..", True),
        ("/tmp", "/tmp/..", False),
    ],
)
def test_is_subpath_of(parent, child, expected):
    """`is_subpath_of` should resolve `..` and reject sibling-prefix paths."""
    result = is_subpath_of(parent, child)
    assert result == expected
def test_local_remote_storage_pull_missing():
    """Pulling from a remote that was never pushed to is a graceful no-op."""
    with make_tempdir() as tmp:
        target = "a.txt"
        storage = RemoteStorage(tmp / "root", str(tmp / "remote"))
        # Neither a hash-qualified nor an unqualified pull should raise;
        # both simply return None when nothing exists remotely.
        assert storage.pull(target, command_hash="aaaa") is None
        assert storage.pull(target) is None
def has_git():
    """Return True if a usable git executable is available on this system."""
    try:
        get_git_version()
    except RuntimeError:
        return False
    return True
def test_project_run(project_dir: Path):
    """`run` should honor --dry, create declared outputs, and surface stdout."""
    test_file = project_dir / "abc.txt"

    # A dry run must not create the output file.
    dry = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert dry.exit_code == 0
    assert not test_file.is_file()

    # A real run creates it.
    created = CliRunner().invoke(app, ["run", "create", str(project_dir)])
    assert created.exit_code == 0
    assert test_file.is_file()

    # Command output is captured on stdout.
    ok = CliRunner().invoke(app, ["run", "ok", str(project_dir)])
    assert ok.exit_code == 0
    assert "okokok" in ok.stdout
@pytest.mark.skipif(not has_git(), reason="git not installed")
@pytest.mark.parametrize(
    "options_string",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(tmp_path: Path, options_string: str):
    """Cloning a template project should succeed with various CLI options."""
    out = tmp_path / "project_clone"
    target = "benchmarks/ner_conll03"
    options = options_string.split() if options_string else []
    result = CliRunner().invoke(app, ["clone", target, *options, str(out)])
    assert result.exit_code == 0
    # The post-clone hint mentions the next command to run.
    assert "weasel assets" in result.output
    assert (out / "README.md").is_file()
"project.yml").write_text(proj_text) 177 | 178 | test_file = project_dir / "abc.txt" 179 | result = CliRunner().invoke(app, ["run", "create", str(project_dir)]) 180 | assert result.exit_code == 0 181 | assert test_file.is_file() 182 | result = CliRunner().invoke(app, ["push", remote, str(project_dir)]) 183 | assert result.exit_code == 0 184 | result = CliRunner().invoke(app, ["run", "clean", str(project_dir)]) 185 | assert result.exit_code == 0 186 | assert not test_file.exists() 187 | result = CliRunner().invoke(app, ["pull", remote, str(project_dir)]) 188 | assert result.exit_code == 0 189 | assert test_file.is_file() 190 | -------------------------------------------------------------------------------- /weasel/tests/cli/test_document.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict 3 | 4 | import pytest 5 | import srsly 6 | from typer.testing import CliRunner 7 | 8 | from weasel import app 9 | from weasel.cli.document import MARKER_END, MARKER_IGNORE, MARKER_START, MARKER_TAGS 10 | 11 | runner = CliRunner() 12 | 13 | SAMPLE_PROJECT: Dict[str, Any] = { 14 | "title": "Sample project", 15 | "description": "This is a project for testing", 16 | "assets": [ 17 | { 18 | "dest": "assets/weasel-readme.md", 19 | "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/README.md", 20 | "checksum": "65f4c426a9b153b7683738c92d0d20f9", 21 | }, 22 | { 23 | "dest": "assets/pyproject.toml", 24 | "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/pyproject.toml", 25 | "checksum": "1e2da3a3030d6611520952d5322cd94e", 26 | "extra": True, 27 | }, 28 | ], 29 | "commands": [ 30 | { 31 | "name": "ok", 32 | "help": "print ok", 33 | "script": ["python -c \"print('okokok')\""], 34 | }, 35 | { 36 | "name": "create", 37 | "help": "make a file", 38 | "script": ["touch abc.txt"], 39 | "outputs": ["abc.txt"], 40 | }, 41 | { 42 | 
@pytest.mark.parametrize("marker", MARKER_TAGS)
def test_markers(tmp_path_factory: pytest.TempPathFactory, marker: str):
    """Weasel should be able to handle both 'SPACY PROJECT' and 'WEASEL' markers."""
    project: Dict[str, Any] = {
        "title": "Awesome project",
        "description": "Project using spacy projects and gets migrated to weasel.",
    }
    additional_text = (
        "\n\n## Some additional information\n\nHere is some additional information about this project "
        "that is not autogenerated from the [`project.yml`](project.yml)."
    )

    # Create project file.
    test_dir = tmp_path_factory.mktemp("project")
    path = test_dir / "project.yml"
    path.write_text(srsly.yaml_dumps(project))

    # Generate the readme, rewrite its markers to the tag under test, then
    # append hand-written content that `document` must preserve on re-runs.
    # (Removed a stale commented-out runner.invoke() line here.)
    readme_path = test_dir / "readme.md"
    with open(readme_path, "w+", encoding="utf-8") as file:
        readme = runner.invoke(app, ["document", str(path.parent)]).output
        for to_replace in (MARKER_START, MARKER_END, MARKER_IGNORE):
            readme = readme.replace(
                to_replace.format(tag="WEASEL"), to_replace.format(tag=marker)
            )
        file.writelines(readme)
        file.writelines(additional_text)

    # Run `document` again on the existing readme file. Ensure the additional
    # information is still there.
    runner.invoke(
        app, ["document", str(path.parent), "--output", str(readme_path)]
    )
    with open(readme_path, "r", encoding="utf-8") as file:
        assert additional_text in file.read()
@pytest.mark.skipif(not has_git(), reason="git not installed")
def test_remote(project_dir: Path, remote_url: Path):
    """Round-trip push/pull against a local directory acting as the remote."""
    result = runner.invoke(app, ["assets", str(project_dir)])
    assert result.exit_code == 0
    assert (project_dir / "assets/README.md").exists()

    result = runner.invoke(app, ["run", "prep", str(project_dir)])
    assert result.exit_code == 0

    # Register the temporary directory as the project's default remote.
    with open(project_dir / "project.yml", "a") as project_file:
        project_file.write(f"\nremotes:\n default: {remote_url}\n")

    result = runner.invoke(app, ["push", "default", str(project_dir)])
    assert result.exit_code == 0

    # Delete an output file, and make sure pull restores it.
    output_file = project_dir / "corpus/stuff.txt"
    output_file.unlink()

    result = runner.invoke(app, ["pull", "default", str(project_dir)])
    assert result.exit_code == 0
    assert output_file.exists()
import pathlib

# Resolve the demo-project root: this script lives in <root>/scripts/, so the
# parent of the resolved parent directory is the project root.
workdir = pathlib.Path(__file__).parent.resolve().parent

# ASCII-art banner (spells "weasel"); doubled backslashes keep the figure
# intact inside the Python string literal.
text = """
                           _
__      _____  __ _ ___  ___| |
\\ \\ /\\ / / _ \\/ _` / __|/ _ \\ |
 \\ V  V /  __/ (_| \\__ \\  __/ |
  \\_/\\_/ \\___|\\__,_|___/\\___|_|

"""

# Write the banner as the `prep` command's declared output so the push/pull
# tests have a file to round-trip through remote storage.
with open(workdir / "corpus/stuff.txt", "w") as outfile:
    outfile.write(text)
@pytest.mark.parametrize(
    "parent,child,expected",
    [
        ("/tmp", "/tmp", True),
        ("/tmp", "/", False),
        ("/tmp", "/tmp/subdir", True),
        ("/tmp", "/tmpdir", False),
        ("/tmp", "/tmp/subdir/..", True),
        ("/tmp", "/tmp/..", False),
    ],
)
def test_is_subpath_of(parent, child, expected):
    """Containment checks must resolve `..` and not match sibling prefixes.

    NOTE(review): this duplicates the identical test in cli/test_cli.py —
    consider keeping only one copy.
    """
    assert is_subpath_of(parent, child) == expected
@pytest.mark.parametrize(
    "int_value",
    [10, pytest.param("10", marks=pytest.mark.xfail)],
)
def test_project_config_interpolation(int_value):
    """Variables (including nested ones) are interpolated into command scripts,
    and referencing an undefined variable raises a validation error."""
    variables = {"a": int_value, "b": {"c": "foo", "d": True}}
    commands = [
        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
    ]
    project = {"commands": commands, "vars": variables}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        # Use isinstance() for type checks instead of `type(x) == T`.
        assert isinstance(cfg, dict)
        assert isinstance(cfg["commands"], list)
        assert cfg["commands"][0]["script"][0] == "hello 10 foo"
        assert cfg["commands"][1]["script"][0] == "foo true"
    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
    project = {"commands": commands, "vars": variables}
    with pytest.raises(ConfigValidationError):
        substitute_project_variables(project)
@contextlib.contextmanager
def make_tempfile(mode="r"):
    """Context manager yielding an anonymous temporary file.

    mode (str): The mode to open the file with (default: text read).

    Uses the file object's own context manager so the handle is closed even
    if the caller's block raises — the previous implementation called
    `f.close()` after `yield` without a try/finally and leaked the handle
    on exceptions.
    """
    with tempfile.TemporaryFile(mode=mode) as f:
        yield f
# Compiled once at import time; hoists the regex-cache lookup out of the call.
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_whitespace(s):
    """Replace every run of whitespace in *s* with a single space.

    s (str): The string to normalize.
    RETURNS (str): The normalized string (leading/trailing runs become a
        single space; they are not stripped).
    """
    return _WHITESPACE_RE.sub(" ", s)
def join_command(command: List[str]) -> str:
    """Join a command using shlex. shlex.join is only available for Python 3.8+,
    so we're using a workaround here.

    command (List[str]): The command to join.
    RETURNS (str): The joined command
    """
    # Quote each part individually, then stitch them with single spaces.
    quoted_parts = [shlex.quote(part) for part in command]
    return " ".join(quoted_parts)
68 | raise FileNotFoundError( 69 | Errors.E501.format(str_command=cmd_str, tool=cmd_list[0]) 70 | ) from None 71 | if ret.returncode != 0 and capture: 72 | message = f"Error running command:\n\n{cmd_str}\n\n" 73 | message += f"Subprocess exited with status {ret.returncode}" 74 | if ret.stdout is not None: 75 | message += "\n\nProcess log (stdout and stderr):\n\n" 76 | message += ret.stdout 77 | error = subprocess.SubprocessError(message) 78 | error.ret = ret # type: ignore[attr-defined] 79 | error.command = cmd_str # type: ignore[attr-defined] 80 | raise error 81 | elif ret.returncode != 0: 82 | sys.exit(ret.returncode) 83 | return ret 84 | -------------------------------------------------------------------------------- /weasel/util/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | from typing import Any, Dict, List, Optional 5 | 6 | import srsly 7 | from click import NoSuchOption 8 | from click.parser import split_arg_string 9 | from confection import Config 10 | from wasabi import msg 11 | 12 | from ..cli.main import PROJECT_FILE 13 | from ..schemas import ProjectConfigSchema, validate 14 | from .environment import ENV_VARS 15 | from .frozen import SimpleFrozenDict 16 | from .logging import logger 17 | from .validation import show_validation_error, validate_project_commands 18 | 19 | 20 | def parse_config_overrides( 21 | args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES 22 | ) -> Dict[str, Any]: 23 | """Generate a dictionary of config overrides based on the extra arguments 24 | provided on the CLI, e.g. --training.batch_size to override 25 | "training.batch_size". Arguments without a "." are considered invalid, 26 | since the config only allows top-level sections to exist. 27 | 28 | env_vars (Optional[str]): Optional environment variable to read from. 29 | RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting. 
30 | """ 31 | env_string = os.environ.get(env_var, "") if env_var else "" 32 | env_overrides = _parse_overrides(split_arg_string(env_string)) 33 | cli_overrides = _parse_overrides(args, is_cli=True) 34 | if cli_overrides: 35 | keys = [k for k in cli_overrides if k not in env_overrides] 36 | logger.debug("Config overrides from CLI: %s", keys) 37 | if env_overrides: 38 | logger.debug("Config overrides from env variables: %s", list(env_overrides)) 39 | return {**cli_overrides, **env_overrides} 40 | 41 | 42 | def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]: 43 | result = {} 44 | while args: 45 | opt = args.pop(0) 46 | err = f"Invalid config override '{opt}'" 47 | if opt.startswith("--"): # new argument 48 | orig_opt = opt 49 | opt = opt.replace("--", "") 50 | if "." not in opt: 51 | if is_cli: 52 | raise NoSuchOption(orig_opt) 53 | else: 54 | msg.fail(f"{err}: can't override top-level sections", exits=1) 55 | if "=" in opt: # we have --opt=value 56 | opt, value = opt.split("=", 1) 57 | opt = opt.replace("-", "_") 58 | else: 59 | if not args or args[0].startswith("--"): # flag with no value 60 | value = "true" 61 | else: 62 | value = args.pop(0) 63 | result[opt] = _parse_override(value) 64 | else: 65 | msg.fail(f"{err}: name should start with --", exits=1) 66 | return result 67 | 68 | 69 | def _parse_override(value: Any) -> Any: 70 | # Just like we do in the config, we're calling json.loads on the 71 | # values. But since they come from the CLI, it'd be unintuitive to 72 | # explicitly mark strings with escaped quotes. So we're working 73 | # around that here by falling back to a string if parsing fails. 74 | # TODO: improve logic to handle simple types like list of strings? 
75 | try: 76 | return srsly.json_loads(value) 77 | except ValueError: 78 | return str(value) 79 | 80 | 81 | def load_project_config( 82 | path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() 83 | ) -> Dict[str, Any]: 84 | """Load the project.yml file from a directory and validate it. Also make 85 | sure that all directories defined in the config exist. 86 | 87 | path (Path): The path to the project directory. 88 | interpolate (bool): Whether to substitute project variables. 89 | overrides (Dict[str, Any]): Optional config overrides. 90 | RETURNS (Dict[str, Any]): The loaded project.yml. 91 | """ 92 | config_path = path / PROJECT_FILE 93 | if not config_path.exists(): 94 | msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1) 95 | invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct." 96 | try: 97 | config = srsly.read_yaml(config_path) 98 | except ValueError as e: 99 | msg.fail(invalid_err, e, exits=1) 100 | errors = validate(ProjectConfigSchema, config) 101 | if errors: 102 | msg.fail(invalid_err) 103 | print("\n".join(errors)) 104 | sys.exit(1) 105 | validate_project_commands(config) 106 | if interpolate: 107 | err = f"{PROJECT_FILE} validation error" 108 | with show_validation_error(title=err, hint_fill=False): 109 | config = substitute_project_variables(config, overrides) 110 | # Make sure directories defined in config exist 111 | for subdir in config.get("directories", []): 112 | dir_path = path / subdir 113 | if not dir_path.exists(): 114 | dir_path.mkdir(parents=True) 115 | return config 116 | 117 | 118 | def substitute_project_variables( 119 | config: Dict[str, Any], 120 | overrides: Dict[str, Any] = SimpleFrozenDict(), 121 | key: str = "vars", 122 | env_key: str = "env", 123 | ) -> Dict[str, Any]: 124 | """Interpolate variables in the project file using the config system. 125 | 126 | config (Dict[str, Any]): The project config. 127 | overrides (Dict[str, Any]): Optional config overrides. 
128 | key (str): Key containing variables in project config. 129 | env_key (str): Key containing environment variable mapping in project config. 130 | RETURNS (Dict[str, Any]): The interpolated project config. 131 | """ 132 | config.setdefault(key, {}) 133 | config.setdefault(env_key, {}) 134 | # Substitute references to env vars with their values 135 | for config_var, env_var in config[env_key].items(): 136 | config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) 137 | # Need to put variables in the top scope again so we can have a top-level 138 | # section "project" (otherwise, a list of commands in the top scope wouldn't) 139 | # be allowed by Thinc's config system 140 | cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) 141 | cfg = Config().from_str(cfg.to_str(), overrides=overrides) 142 | interpolated = cfg.interpolate() 143 | return dict(interpolated["project"]) 144 | -------------------------------------------------------------------------------- /weasel/util/environment.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from wasabi import msg 4 | 5 | 6 | class ENV_VARS: 7 | CONFIG_OVERRIDES = "WEASEL_CONFIG_OVERRIDES" 8 | 9 | 10 | def check_spacy_env_vars(): 11 | if "SPACY_CONFIG_OVERRIDES" in os.environ: 12 | msg.warn( 13 | "You've set a `SPACY_CONFIG_OVERRIDES` environment variable, " 14 | "which is now deprecated. Weasel will not use it. " 15 | "You can use `WEASEL_CONFIG_OVERRIDES` instead." 16 | ) 17 | if "SPACY_PROJECT_USE_GIT_VERSION" in os.environ: 18 | msg.warn( 19 | "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable, " 20 | "which is now deprecated. Weasel will not use it." 21 | ) 22 | 23 | 24 | def check_bool_env_var(env_var: str) -> bool: 25 | """Convert the value of an environment variable to a boolean. Add special 26 | check for "0" (falsy) and consider everything else truthy, except unset. 
27 | 28 | env_var (str): The name of the environment variable to check. 29 | RETURNS (bool): Its boolean value. 30 | """ 31 | value = os.environ.get(env_var, False) 32 | if value == "0": 33 | return False 34 | return bool(value) 35 | -------------------------------------------------------------------------------- /weasel/util/filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import stat 4 | import sys 5 | import tempfile 6 | import warnings 7 | from contextlib import contextmanager 8 | from pathlib import Path 9 | from typing import Any, Generator, Iterator, Union 10 | 11 | from ..errors import Warnings 12 | 13 | 14 | @contextmanager 15 | def working_dir(path: Union[str, Path]) -> Iterator[Path]: 16 | """Change current working directory and returns to previous on exit. 17 | 18 | path (str / Path): The directory to navigate to. 19 | YIELDS (Path): The absolute path to the current working directory. This 20 | should be used if the block needs to perform actions within the working 21 | directory, to prevent mismatches with relative paths. 22 | """ 23 | prev_cwd = Path.cwd() 24 | current = Path(path).resolve() 25 | os.chdir(str(current)) 26 | try: 27 | yield current 28 | finally: 29 | os.chdir(str(prev_cwd)) 30 | 31 | 32 | @contextmanager 33 | def make_tempdir() -> Generator[Path, None, None]: 34 | """Execute a block in a temporary directory and remove the directory and 35 | its contents at the end of the with block. 36 | 37 | YIELDS (Path): The path of the temp directory. 38 | """ 39 | d = Path(tempfile.mkdtemp()) 40 | yield d 41 | 42 | # On Windows, git clones use read-only files, which cause permission errors 43 | # when being deleted. This forcibly fixes permissions. 
44 | def force_remove(rmfunc, path, ex): 45 | os.chmod(path, stat.S_IWRITE) 46 | rmfunc(path) 47 | 48 | try: 49 | if sys.version_info >= (3, 12): 50 | shutil.rmtree(str(d), onexc=force_remove) 51 | else: 52 | shutil.rmtree(str(d), onerror=force_remove) 53 | except PermissionError as e: 54 | warnings.warn(Warnings.W801.format(dir=d, msg=e)) 55 | 56 | 57 | def is_cwd(path: Union[Path, str]) -> bool: 58 | """Check whether a path is the current working directory. 59 | 60 | path (Union[Path, str]): The directory path. 61 | RETURNS (bool): Whether the path is the current working directory. 62 | """ 63 | return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower() 64 | 65 | 66 | def ensure_path(path: Any) -> Any: 67 | """Ensure string is converted to a Path. 68 | 69 | path (Any): Anything. If string, it's converted to Path. 70 | RETURNS: Path or original argument. 71 | """ 72 | if isinstance(path, str): 73 | return Path(path) 74 | else: 75 | return path 76 | 77 | 78 | def ensure_pathy(path): 79 | """Temporary helper to prevent importing cloudpathlib globally (which was 80 | originally added due to a slow and annoying Google Cloud warning with 81 | Pathy)""" 82 | from cloudpathlib import AnyPath # noqa: F811 83 | 84 | return AnyPath(path) 85 | 86 | 87 | def is_subpath_of(parent, child): 88 | """ 89 | Check whether `child` is a path contained within `parent`. 90 | """ 91 | # Based on https://stackoverflow.com/a/37095733 . 92 | 93 | # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so 94 | # we can stop using crusty old os.path functions. 
95 | parent_realpath = os.path.realpath(parent) 96 | child_realpath = os.path.realpath(child) 97 | return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath 98 | -------------------------------------------------------------------------------- /weasel/util/frozen.py: -------------------------------------------------------------------------------- 1 | from ..errors import Errors 2 | 3 | 4 | class SimpleFrozenDict(dict): 5 | """Simplified implementation of a frozen dict, mainly used as default 6 | function or method argument (for arguments that should default to empty 7 | dictionary). Will raise an error if user or Weasel attempts to add to dict. 8 | """ 9 | 10 | def __init__(self, *args, error: str = Errors.E001, **kwargs) -> None: 11 | """Initialize the frozen dict. Can be initialized with pre-defined 12 | values. 13 | 14 | error (str): The error message when user tries to assign to dict. 15 | """ 16 | super().__init__(*args, **kwargs) 17 | self.error = error 18 | 19 | def __setitem__(self, key, value): 20 | raise NotImplementedError(self.error) 21 | 22 | def pop(self, key, default=None): 23 | raise NotImplementedError(self.error) 24 | 25 | def update(self, other): 26 | raise NotImplementedError(self.error) 27 | 28 | 29 | class SimpleFrozenList(list): 30 | """Wrapper class around a list that lets us raise custom errors if certain 31 | attributes/methods are accessed. Mostly used for properties like 32 | Language.pipeline that return an immutable list (and that we don't want to 33 | convert to a tuple to not break too much backwards compatibility). If a user 34 | accidentally calls nlp.pipeline.append(), we can raise a more helpful error. 35 | """ 36 | 37 | def __init__(self, *args, error: str = Errors.E002) -> None: 38 | """Initialize the frozen list. 39 | 40 | error (str): The error message when user tries to mutate the list. 
41 | """ 42 | self.error = error 43 | super().__init__(*args) 44 | 45 | def append(self, *args, **kwargs): 46 | raise NotImplementedError(self.error) 47 | 48 | def clear(self, *args, **kwargs): 49 | raise NotImplementedError(self.error) 50 | 51 | def extend(self, *args, **kwargs): 52 | raise NotImplementedError(self.error) 53 | 54 | def insert(self, *args, **kwargs): 55 | raise NotImplementedError(self.error) 56 | 57 | def pop(self, *args, **kwargs): 58 | raise NotImplementedError(self.error) 59 | 60 | def remove(self, *args, **kwargs): 61 | raise NotImplementedError(self.error) 62 | 63 | def reverse(self, *args, **kwargs): 64 | raise NotImplementedError(self.error) 65 | 66 | def sort(self, *args, **kwargs): 67 | raise NotImplementedError(self.error) 68 | -------------------------------------------------------------------------------- /weasel/util/git.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | from typing import Tuple 5 | 6 | from wasabi import msg 7 | 8 | from .commands import run_command 9 | from .filesystem import is_subpath_of, make_tempdir 10 | 11 | 12 | def git_checkout( 13 | repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False 14 | ): 15 | git_version = get_git_version() 16 | if dest.exists(): 17 | msg.fail("Destination of checkout must not exist", exits=1) 18 | if not dest.parent.exists(): 19 | msg.fail("Parent of destination of checkout must exist", exits=1) 20 | if sparse and git_version >= (2, 22): 21 | return git_sparse_checkout(repo, subpath, dest, branch) 22 | elif sparse: 23 | # Only show warnings if the user explicitly wants sparse checkout but 24 | # the Git version doesn't support it 25 | err_old = ( 26 | f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " 27 | f"that doesn't fully support sparse checkout yet." 
28 | ) 29 | err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." 30 | msg.warn( 31 | f"{err_unk if git_version == (0, 0) else err_old} " 32 | f"This means that more files than necessary may be downloaded " 33 | f"temporarily. To only download the files needed, make sure " 34 | f"you're using Git v2.22 or above." 35 | ) 36 | with make_tempdir() as tmp_dir: 37 | cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" 38 | run_command(cmd, capture=True) 39 | # We need Path(name) to make sure we also support subdirectories 40 | try: 41 | source_path = tmp_dir / Path(subpath) 42 | if not is_subpath_of(tmp_dir, source_path): 43 | err = f"'{subpath}' is a path outside of the cloned repository." 44 | msg.fail(err, repo, exits=1) 45 | if os.path.isdir(source_path): 46 | shutil.copytree(source_path, dest) 47 | else: 48 | shutil.copyfile(source_path, dest) 49 | except FileNotFoundError: 50 | err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')" 51 | msg.fail(err, repo, exits=1) 52 | 53 | 54 | def git_sparse_checkout(repo, subpath, dest, branch): 55 | # We're using Git, partial clone and sparse checkout to 56 | # only clone the files we need 57 | # This ends up being RIDICULOUS. omg. 58 | # So, every tutorial and SO post talks about 'sparse checkout'...But they 59 | # go and *clone* the whole repo. Worthless. And cloning part of a repo 60 | # turns out to be completely broken. The only way to specify a "path" is.. 61 | # a path *on the server*? The contents of which, specifies the paths. Wat. 62 | # Obviously this is hopelessly broken and insecure, because you can query 63 | # arbitrary paths on the server! So nobody enables this. 64 | # What we have to do is disable *all* files. We could then just checkout 65 | # the path, and it'd "work", but be hopelessly slow...Because it goes and 66 | # transfers every missing object one-by-one. 
So the final piece is that we 67 | # need to use some weird git internals to fetch the missings in bulk, and 68 | # *that* we can do by path. 69 | # We're using Git and sparse checkout to only clone the files we need 70 | with make_tempdir() as tmp_dir: 71 | # This is the "clone, but don't download anything" part. 72 | cmd = ( 73 | f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " 74 | f"-b {branch} --filter=blob:none" 75 | ) 76 | run_command(cmd) 77 | # Now we need to find the missing filenames for the subpath we want. 78 | # Looking for this 'rev-list' command in the git --help? Hah. 79 | cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" 80 | ret = run_command(cmd, capture=True) 81 | git_repo = _http_to_git(repo) 82 | # Now pass those missings into another bit of git internals 83 | missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) 84 | if not missings: 85 | err = ( 86 | f"Could not find any relevant files for '{subpath}'. " 87 | f"Did you specify a correct and complete path within repo '{repo}' " 88 | f"and branch {branch}?" 89 | ) 90 | msg.fail(err, exits=1) 91 | cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" 92 | run_command(cmd, capture=True) 93 | # And finally, we can checkout our subpath 94 | cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" 95 | run_command(cmd, capture=True) 96 | 97 | # Get a subdirectory of the cloned path, if appropriate 98 | source_path = tmp_dir / Path(subpath) 99 | if not is_subpath_of(tmp_dir, source_path): 100 | err = f"'{subpath}' is a path outside of the cloned repository." 101 | msg.fail(err, repo, exits=1) 102 | 103 | shutil.move(str(source_path), str(dest)) 104 | 105 | 106 | def git_repo_branch_exists(repo: str, branch: str) -> bool: 107 | """Uses 'git ls-remote' to check if a repository and branch exists 108 | 109 | repo (str): URL to get repo. 110 | branch (str): Branch on repo to check. 111 | RETURNS (bool): True if repo:branch exists. 
112 | """ 113 | get_git_version() 114 | cmd = f"git ls-remote {repo} {branch}" 115 | # We might be tempted to use `--exit-code` with `git ls-remote`, but 116 | # `run_command` handles the `returncode` for us, so we'll rely on 117 | # the fact that stdout returns '' if the requested branch doesn't exist 118 | ret = run_command(cmd, capture=True) 119 | exists = ret.stdout != "" 120 | return exists 121 | 122 | 123 | def get_git_version( 124 | error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", 125 | ) -> Tuple[int, int]: 126 | """Get the version of git and raise an error if calling 'git --version' fails. 127 | 128 | error (str): The error message to show. 129 | RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns 130 | (0, 0) if the version couldn't be determined. 131 | """ 132 | try: 133 | ret = run_command("git --version", capture=True) 134 | except Exception: 135 | raise RuntimeError(error) 136 | stdout = ret.stdout.strip() 137 | if not stdout or not stdout.startswith("git version"): 138 | return 0, 0 139 | version = stdout[11:].strip().split(".") 140 | return int(version[0]), int(version[1]) 141 | 142 | 143 | def _http_to_git(repo: str) -> str: 144 | if repo.startswith("http://"): 145 | repo = repo.replace(r"http://", r"https://") 146 | if repo.startswith(r"https://"): 147 | repo = repo.replace("https://", "git@").replace("/", ":", 1) 148 | if repo.endswith("/"): 149 | repo = repo[:-1] 150 | repo = f"{repo}.git" 151 | return repo 152 | -------------------------------------------------------------------------------- /weasel/util/hashing.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from pathlib import Path 3 | from typing import Iterable, Union 4 | 5 | import srsly 6 | from wasabi import msg 7 | 8 | 9 | def get_hash(data, exclude: Iterable[str] = tuple()) -> str: 10 | """Get the hash for a JSON-serializable object. 
11 | 12 | data: The data to hash. 13 | exclude (Iterable[str]): Top-level keys to exclude if data is a dict. 14 | RETURNS (str): The hash. 15 | """ 16 | if isinstance(data, dict): 17 | data = {k: v for k, v in data.items() if k not in exclude} 18 | data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") 19 | return hashlib.md5(data_str).hexdigest() 20 | 21 | 22 | def get_checksum(path: Union[Path, str]) -> str: 23 | """Get the checksum for a file or directory given its file path. If a 24 | directory path is provided, this uses all files in that directory. 25 | 26 | path (Union[Path, str]): The file or directory path. 27 | RETURNS (str): The checksum. 28 | """ 29 | path = Path(path) 30 | if not (path.is_file() or path.is_dir()): 31 | msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1) 32 | if path.is_file(): 33 | return hashlib.md5(Path(path).read_bytes()).hexdigest() 34 | else: 35 | # TODO: this is currently pretty slow 36 | dir_checksum = hashlib.md5() 37 | for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()): 38 | dir_checksum.update(sub_file.read_bytes()) 39 | return dir_checksum.hexdigest() 40 | -------------------------------------------------------------------------------- /weasel/util/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger("weasel") 4 | logger_stream_handler = logging.StreamHandler() 5 | logger_stream_handler.setFormatter( 6 | logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") 7 | ) 8 | logger.addHandler(logger_stream_handler) 9 | -------------------------------------------------------------------------------- /weasel/util/modules.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from pathlib import Path 3 | from types import ModuleType 4 | from typing import Union 5 | 6 | 7 | def import_file(name: str, loc: Union[str, Path]) -> 
ModuleType: 8 | """Import module from a file. Used to load models from a directory. 9 | 10 | name (str): Name of module to load. 11 | loc (str / Path): Path to the file. 12 | RETURNS: The loaded module. 13 | """ 14 | spec = importlib.util.spec_from_file_location(name, str(loc)) # type: ignore 15 | module = importlib.util.module_from_spec(spec) # type: ignore 16 | spec.loader.exec_module(module) # type: ignore[union-attr] 17 | return module 18 | -------------------------------------------------------------------------------- /weasel/util/remote.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | from typing import TYPE_CHECKING, Union 4 | 5 | if TYPE_CHECKING: 6 | from cloudpathlib import CloudPath 7 | 8 | 9 | def upload_file(src: Path, dest: Union[str, "CloudPath"]) -> None: 10 | """Upload a file. 11 | 12 | src (Path): The source path. 13 | url (str): The destination URL to upload to. 14 | """ 15 | import smart_open 16 | 17 | # Create parent directories for local paths 18 | if isinstance(dest, Path): 19 | if not dest.parent.exists(): 20 | dest.parent.mkdir(parents=True) 21 | 22 | dest = str(dest) 23 | with smart_open.open(dest, mode="wb") as output_file: 24 | with src.open(mode="rb") as input_file: 25 | output_file.write(input_file.read()) 26 | 27 | 28 | def download_file( 29 | src: Union[str, "CloudPath"], dest: Path, *, force: bool = False 30 | ) -> None: 31 | """Download a file using smart_open. 32 | 33 | url (str): The URL of the file. 34 | dest (Path): The destination path. 35 | force (bool): Whether to force download even if file exists. 36 | If False, the download will be skipped. 
37 | """ 38 | import smart_open 39 | 40 | if dest.exists() and not force: 41 | return None 42 | src = str(src) 43 | with smart_open.open(src, mode="rb", compression="disable") as input_file: 44 | with dest.open(mode="wb") as output_file: 45 | shutil.copyfileobj(input_file, output_file) 46 | -------------------------------------------------------------------------------- /weasel/util/validation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from configparser import InterpolationError 3 | from contextlib import contextmanager 4 | from pathlib import Path 5 | from typing import TYPE_CHECKING, Any, Dict, Optional, Union 6 | 7 | from confection import ConfigValidationError 8 | from wasabi import msg 9 | 10 | from ..cli.main import PROJECT_FILE 11 | 12 | if TYPE_CHECKING: 13 | pass 14 | 15 | 16 | @contextmanager 17 | def show_validation_error( 18 | file_path: Optional[Union[str, Path]] = None, 19 | *, 20 | title: Optional[str] = None, 21 | desc: str = "", 22 | show_config: Optional[bool] = None, 23 | hint_fill: bool = True, 24 | ): 25 | """Helper to show custom config validation errors on the CLI. 26 | 27 | file_path (str / Path): Optional file path of config file, used in hints. 28 | title (str): Override title of custom formatted error. 29 | desc (str): Override description of custom formatted error. 30 | show_config (bool): Whether to output the config the error refers to. 31 | hint_fill (bool): Show hint about filling config. 
32 | """ 33 | try: 34 | yield 35 | except ConfigValidationError as e: 36 | title = title if title is not None else e.title 37 | if e.desc: 38 | desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}" 39 | # Re-generate a new error object with overrides 40 | err = e.from_error(e, title="", desc=desc, show_config=show_config) 41 | msg.fail(title) 42 | print(err.text.strip()) 43 | if hint_fill and "value_error.missing" in err.error_types: 44 | config_path = ( 45 | file_path 46 | if file_path is not None and str(file_path) != "-" 47 | else "config.cfg" 48 | ) 49 | msg.text( 50 | "If your config contains missing values, you can run the 'init " 51 | "fill-config' command to fill in all the defaults, if possible:", 52 | spaced=True, 53 | ) 54 | print(f"python -m spacy init fill-config {config_path} {config_path} \n") 55 | sys.exit(1) 56 | except InterpolationError as e: 57 | msg.fail("Config validation error", e, exits=1) 58 | 59 | 60 | def validate_project_commands(config: Dict[str, Any]) -> None: 61 | """Check that project commands and workflows are valid, don't contain 62 | duplicates, don't clash and only refer to commands that exist. 63 | 64 | config (Dict[str, Any]): The loaded config. 
65 | """ 66 | command_names = [cmd["name"] for cmd in config.get("commands", [])] 67 | workflows = config.get("workflows", {}) 68 | duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1]) 69 | if duplicates: 70 | err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}" 71 | msg.fail(err, exits=1) 72 | for workflow_name, workflow_steps in workflows.items(): 73 | if workflow_name in command_names: 74 | err = f"Can't use workflow name '{workflow_name}': name already exists as a command" 75 | msg.fail(err, exits=1) 76 | for step in workflow_steps: 77 | if step not in command_names: 78 | msg.fail( 79 | f"Unknown command specified in workflow '{workflow_name}': {step}", 80 | f"Workflows can only refer to commands defined in the 'commands' " 81 | f"section of the {PROJECT_FILE}.", 82 | exits=1, 83 | ) 84 | -------------------------------------------------------------------------------- /weasel/util/versions.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from packaging.specifiers import InvalidSpecifier, SpecifierSet 4 | from packaging.version import InvalidVersion, Version 5 | 6 | 7 | def is_compatible_version( 8 | version: str, constraint: str, prereleases: bool = True 9 | ) -> Optional[bool]: 10 | """Check if a version (e.g. "2.0.0") is compatible given a version 11 | constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version, 12 | it's interpreted as =={version}. 13 | 14 | version (str): The version to check. 15 | constraint (str): The constraint string. 16 | prereleases (bool): Whether to allow prereleases. If set to False, 17 | prerelease versions will be considered incompatible. 18 | RETURNS (bool / None): Whether the version is compatible, or None if the 19 | version or constraint are invalid. 
20 | """ 21 | # Handle cases where exact version is provided as constraint 22 | if constraint[0].isdigit(): 23 | constraint = f"=={constraint}" 24 | try: 25 | spec = SpecifierSet(constraint) 26 | version = Version(version) # type: ignore[assignment] 27 | except (InvalidSpecifier, InvalidVersion): 28 | return None 29 | spec.prereleases = prereleases 30 | return version in spec 31 | 32 | 33 | def get_minor_version(version: str) -> Optional[str]: 34 | """Get the major + minor version (without patch or prerelease identifiers). 35 | 36 | version (str): The version. 37 | RETURNS (str): The major + minor version or None if version is invalid. 38 | """ 39 | try: 40 | v = Version(version) 41 | except (TypeError, InvalidVersion): 42 | return None 43 | return f"{v.major}.{v.minor}" 44 | 45 | 46 | def is_minor_version_match(version_a: str, version_b: str) -> bool: 47 | """Compare two versions and check if they match in major and minor, without 48 | patch or prerelease identifiers. Used internally for compatibility checks 49 | that should be insensitive to patch releases. 50 | 51 | version_a (str): The first version 52 | version_b (str): The second version. 53 | RETURNS (bool): Whether the versions match. 54 | """ 55 | a = get_minor_version(version_a) 56 | b = get_minor_version(version_b) 57 | return a is not None and b is not None and a == b 58 | --------------------------------------------------------------------------------