├── .github
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── bin
├── get-package.sh
├── get-version.sh
└── push-tag.sh
├── docs
├── assets
│ └── images
│ │ ├── prodigy_train_curve.jpg
│ │ ├── project_document.jpg
│ │ ├── projects.png
│ │ ├── projects.svg
│ │ └── spacy-streamlit.png
├── cli.md
└── tutorial
│ ├── custom-scripts.md
│ ├── directory-and-assets.md
│ ├── integrations.md
│ ├── remote-storage.md
│ └── workflow.md
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
└── weasel
├── __init__.py
├── __main__.py
├── about.py
├── cli
├── __init__.py
├── assets.py
├── clone.py
├── document.py
├── dvc.py
├── main.py
├── pull.py
├── push.py
├── remote_storage.py
└── run.py
├── compat.py
├── errors.py
├── schemas.py
├── tests
├── __init__.py
├── cli
│ ├── __init__.py
│ ├── test_cli.py
│ ├── test_cli_app.py
│ ├── test_document.py
│ └── test_remote.py
├── demo_project
│ ├── project.yml
│ └── scripts
│ │ └── check.py
├── test_schemas.py
├── test_validation.py
└── util.py
└── util
├── __init__.py
├── commands.py
├── config.py
├── environment.py
├── filesystem.py
├── frozen.py
├── git.py
├── hashing.py
├── logging.py
├── modules.py
├── remote.py
├── validation.py
└── versions.py
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Description
4 |
5 |
10 |
11 | ### Types of change
12 |
13 |
15 |
16 | ## Checklist
17 |
18 |
20 |
21 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license.
22 | - [ ] I ran the test suite, and all new and existing tests passed.
23 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
24 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | paths-ignore:
6 | - "*.md"
7 | pull_request:
8 | types: [opened, synchronize, reopened, edited]
9 | paths-ignore:
10 | - "*.md"
11 |
12 | env:
13 | MODULE_NAME: "weasel"
14 | RUN_MYPY: "true"
15 |
16 | jobs:
17 | validate:
18 | name: Validate
19 | if: github.repository_owner == 'explosion'
20 | runs-on: ubuntu-latest
21 |
22 | steps:
23 | - name: Check out repo
24 | uses: actions/checkout@v4
25 |
26 | - name: Configure Python version
27 | uses: actions/setup-python@v5
28 | with:
29 | python-version: "3.11"
30 |
31 | - name: Set PY variable
32 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
33 |
34 | - uses: actions/cache@v3
35 | with:
36 | path: ~/.cache/pre-commit
37 | key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
38 |
39 | - name: Install pre-commit
40 | run: |
41 | pip install 'pre-commit>=3.2.0,<4.0.0'
42 | pre-commit install
43 |
44 | - name: Run pre-commit
45 | run: SKIP=no-commit-to-branch pre-commit run --all-files
46 |
47 | tests:
48 | name: Test
49 | needs: Validate
50 | if: github.repository_owner == 'explosion'
51 | strategy:
52 | fail-fast: true
53 | matrix:
54 | os: [ubuntu-latest, windows-latest, macos-latest]
55 | python_version: ["3.12"]
56 | include:
57 | - os: ubuntu-latest
58 | python_version: "3.7"
59 | - os: windows-latest
60 | python_version: "3.8"
61 | - os: macos-latest
62 | python_version: "3.9"
63 | - os: macos-latest
64 | python_version: "3.10"
65 | - os: windows-latest
66 | python_version: "3.11"
67 | runs-on: ${{ matrix.os }}
68 |
69 | steps:
70 | - name: Check out repo
71 | uses: actions/checkout@v4
72 |
73 | - name: Configure Python version
74 | uses: actions/setup-python@v5
75 | with:
76 | python-version: ${{ matrix.python_version }}
77 |
78 | - name: Build sdist
79 | run: |
80 | python -m pip install -U build pip setuptools
81 | python -m pip install -U -r requirements.txt
82 | python -m build --sdist
83 |
84 | - name: Delete source directory
85 | shell: bash
86 | run: |
87 | rm -rf $MODULE_NAME
88 |
89 | - name: Uninstall all packages
90 | run: |
91 | python -m pip freeze > installed.txt
92 | python -m pip uninstall -y -r installed.txt
93 |
94 | - name: Install from sdist
95 | shell: bash
96 | run: |
97 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
98 | python -m pip install dist/$SDIST
99 |
100 | - name: Test import
101 | shell: bash
102 | run: |
103 | python -Werror -c "import $MODULE_NAME"
104 |
105 | - name: Test CLI
106 | run: |
107 | python -m weasel --help
108 |
109 | - name: Install test requirements
110 | run: |
111 | python -m pip install -U -r requirements.txt
112 |
113 | - name: Run tests
114 | shell: bash
115 | run: |
116 | python -m pytest --pyargs $MODULE_NAME -Werror
117 |
118 | - name: Test 'spacy project' CLI help/info messages
119 | shell: bash
120 | run: |
121 | python -m pip install spacy
122 | python -m spacy project clone pipelines/ner_demo | grep -q "spacy project assets"
123 | cd ner_demo
124 | python -m spacy project run --help | grep -q "spacy project run"
125 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Vim / VSCode / editors
2 | *.swp
3 | *.sw*
4 | Profile.prof
5 | .vscode
6 | .sass-cache
7 |
8 | # Python
9 | .Python
10 | .python-version
11 | __pycache__/
12 | .pytest_cache
13 | *.py[cod]
14 | .env/
15 | .env*
16 | .~env/
17 | .venv
18 | env3.6/
19 | venv/
20 | env3.*/
21 | .dev
22 | .denv
23 | .pypyenv
24 | .pytest_cache/
25 | .mypy_cache/
26 | .hypothesis/
27 |
28 | # Distribution / packaging
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | wheelhouse/
40 | *.egg-info/
41 | pip-wheel-metadata/
42 | Pipfile.lock
43 | .installed.cfg
44 | *.egg
45 | .eggs
46 | MANIFEST
47 |
48 | # Temporary files
49 | *.~*
50 | tmp/
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 |
64 | # Translations
65 | *.mo
66 |
67 | # Mr Developer
68 | .mr.developer.cfg
69 | .project
70 | .pydevproject
71 |
72 | # Rope
73 | .ropeproject
74 |
75 | # Django stuff:
76 | *.log
77 | *.pot
78 |
79 | # Windows
80 | *.bat
81 | Thumbs.db
82 | Desktop.ini
83 |
84 | # Mac OS X
85 | *.DS_Store
86 |
87 | # Komodo project files
88 | *.komodoproject
89 |
90 | # Other
91 | *.tgz
92 |
93 | # Pycharm project files
94 | *.idea
95 |
96 | # IPython
97 | .ipynb_checkpoints/
98 | *.ipynb
99 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v3.2.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: no-commit-to-branch
9 | args: [--branch, main]
10 | - id: end-of-file-fixer
11 | - id: check-yaml
12 | args: [--unsafe]
13 | - id: check-toml
14 | - id: check-json
15 | - id: check-symlinks
16 | - id: check-docstring-first
17 | - id: check-added-large-files
18 | - id: detect-private-key
19 | # - id: requirements-txt-fixer
20 |
21 | - repo: https://github.com/charliermarsh/ruff-pre-commit
22 | rev: v0.0.254
23 | hooks:
24 | - id: ruff
25 | args: [--fix, --exit-non-zero-on-fix]
26 |
27 | - repo: https://github.com/pre-commit/mirrors-mypy
28 | rev: v1.0.1
29 | hooks:
30 | - id: mypy
31 | additional_dependencies:
32 | - "types-requests"
33 | - "types-setuptools"
34 | - "pydantic"
35 |
36 | - repo: https://github.com/pycqa/isort
37 | rev: 5.12.0
38 | hooks:
39 | - id: isort
40 | name: isort (python)
41 | - id: isort
42 | name: isort (cython)
43 | types: [cython]
44 | - id: isort
45 | name: isort (pyi)
46 | types: [pyi]
47 |
48 | - repo: https://github.com/psf/black
49 | rev: 22.3.0
50 | hooks:
51 | - id: black
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (C) 2022 ExplosionAI GmbH
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Weasel: A small and easy workflow system
4 |
5 | Weasel lets you manage and share **end-to-end workflows** for
6 | different **use cases and domains**, and orchestrate training, packaging and
7 | serving your custom pipelines. You can start off by cloning a pre-defined
8 | project template, adjust it to fit your needs, load in your data, train a
9 | pipeline, export it as a Python package, upload your outputs to a remote storage
10 | and share your results with your team. Weasel can be used via the
11 | [`weasel`](https://github.com/explosion/weasel/blob/main/docs/cli.md) command and we provide templates in our
12 | [`projects`](https://github.com/explosion/projects) repo.
13 |
14 | 
15 |
16 | ## 💡 Example: Get started with a project template
17 |
18 | The easiest way to get started is to clone a project template and run it – for
19 | example, this [end-to-end template](https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud)
20 | that lets you train a spaCy **part-of-speech
21 | tagger** and **dependency parser** on a Universal Dependencies treebank.
22 |
23 | ```shell
24 | python -m weasel clone pipelines/tagger_parser_ud
25 | ```
26 |
27 | > **Note**
28 | >
29 | > Our [`projects`](https://github.com/explosion/projects) repo includes various
30 | > project templates for different NLP tasks, models, workflows and integrations
31 | > that you can clone and run. The easiest way to get started is to pick a
32 | > template, clone it and start modifying it!
33 |
34 | ## 📕 Documentation
35 |
36 | Get started with the documentation:
37 |
38 | - [Learn how to create a Weasel workflow](https://github.com/explosion/weasel/blob/main/docs/tutorial/workflow.md)
39 | - [Working with directory and assets](https://github.com/explosion/weasel/blob/main/docs/tutorial/directory-and-assets.md)
40 | - [Running custom scripts](https://github.com/explosion/weasel/blob/main/docs/tutorial/custom-scripts.md)
41 | - [Using remote storage](https://github.com/explosion/weasel/blob/main/docs/tutorial/remote-storage.md)
42 | - [Weasel integrations](https://github.com/explosion/weasel/blob/main/docs/tutorial/integrations.md)
43 | - [Command line interface description](https://github.com/explosion/weasel/blob/main/docs/cli.md)
44 |
45 | ## Migrating from spaCy Projects
46 |
47 | Weasel is a standalone replacement for spaCy Projects.
48 | There are a few backward incompatibilities that you should be aware of:
49 |
50 | - The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked.
51 | You can set configuration overrides using `WEASEL_CONFIG_OVERRIDES`.
52 | - Support for the `spacy_version` configuration key has been dropped.
53 | - Support for the `check_requirements` configuration key has been dropped.
54 | - Support for `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been dropped.
55 | - Error codes are now Weasel-specific, and do not follow spaCy error codes.
56 |
57 | Weasel checks for the first three incompatibilities and will issue a
58 | warning if you're using it with spaCy-specific configuration options.
59 |
--------------------------------------------------------------------------------
/bin/get-package.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | # Extract the bare package name from the "name = ..." line in setup.cfg.
6 | # BUG FIX: the previous version stripped "__title__ = " (leftover from a
7 | # copied script) instead of "name = ", so it printed "name = weasel".
8 | name=$(grep "name = " setup.cfg)
9 | name=${name/name = }
10 | # Remove any surrounding quotes around the value.
11 | name=${name//\'/}
12 | name=${name//\"/}
13 |
14 | echo "$name"
15 |
--------------------------------------------------------------------------------
/bin/get-version.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | version=$(grep "version = " setup.cfg)
6 | version=${version/version = }
7 | version=${version/\'/}
8 | version=${version/\'/}
9 | version=${version/\"/}
10 | version=${version/\"/}
11 |
12 | echo $version
13 |
--------------------------------------------------------------------------------
/bin/push-tag.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | # Insist repository is clean
6 | git diff-index --quiet HEAD
7 |
8 | git checkout $1
9 | git pull origin $1
10 | git push origin $1
11 |
12 | version=$(grep "version = " setup.cfg)
13 | version=${version/version = }
14 | version=${version/\'/}
15 | version=${version/\'/}
16 | version=${version/\"/}
17 | version=${version/\"/}
18 | git tag "v$version"
19 | git push origin "v$version"
20 |
--------------------------------------------------------------------------------
/docs/assets/images/prodigy_train_curve.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/prodigy_train_curve.jpg
--------------------------------------------------------------------------------
/docs/assets/images/project_document.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/project_document.jpg
--------------------------------------------------------------------------------
/docs/assets/images/projects.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/projects.png
--------------------------------------------------------------------------------
/docs/assets/images/spacy-streamlit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/spacy-streamlit.png
--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
1 | # Command Line Interface
2 |
3 | The `weasel` CLI includes subcommands for working with Weasel projects,
4 | end-to-end workflows for building and deploying custom pipelines.
5 |
6 | ## :clipboard: clone
7 |
8 | Clone a project template from a Git repository. Calls into `git` under the hood
9 | and can use the sparse checkout feature if available, so you're only downloading
10 | what you need. By default, Weasel's
11 | [project templates repo](https://github.com/explosion/projects) is used, but you
12 | can provide any other repo (public or private) that you have access to using the
13 | `--repo` option.
14 |
15 | ```bash
16 | python -m weasel clone [name] [dest] [--repo] [--branch] [--sparse]
17 | ```
18 |
19 | > :bulb: **Example usage**
20 | >
21 | > ```bash
22 | > $ python -m weasel clone pipelines/ner_wikiner
23 | > ```
24 |
25 | | Name | Description |
26 | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
27 | | `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ |
28 | | `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ |
29 | | `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ |
30 | | `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ |
31 | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
32 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
33 | | **CREATES** | The cloned [project directory](tutorial/directory-and-assets.md). |
34 |
35 | ## :open_file_folder: assets
36 |
37 | Fetch project assets like datasets and pretrained weights. Assets are defined in
38 | the `assets` section of the
39 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If a `checksum`
40 | is provided, the file is only downloaded if no local file with the same checksum
41 | exists and Weasel will show an error if the checksum of the downloaded file
42 | doesn't match. If assets don't specify a `url` they're considered "private" and
43 | you have to take care of putting them into the destination directory yourself.
44 | If a local path is provided, the asset is copied into the current project.
45 |
46 | ```bash
47 | python -m weasel assets [project_dir]
48 | ```
49 |
50 | | Name | Description |
51 | | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
52 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
53 | | `--extra`, `-e` | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
54 | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
55 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
56 | | **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
57 |
58 | ## :rocket: run
59 |
60 | Run a named command or workflow defined in the
61 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If a workflow
62 | name is specified, all commands in the workflow are run, in order. If commands
63 | define
64 | [dependencies or outputs](tutorial/directory-and-assets.md#dependencies-and-outputs),
65 | they will only be re-run if state has changed. For example, if the input dataset
66 | changes, a preprocessing command that depends on those files will be re-run.
67 |
68 | ```bash
69 | python -m weasel run [subcommand] [project_dir] [--force] [--dry]
70 | ```
71 |
72 | | Name | Description |
73 | | --------------- | --------------------------------------------------------------------------------------- |
74 | | `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ |
75 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
76 | | `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ |
77 | | `--dry`, `-D` | Perform a dry run and don't execute scripts. ~~bool (flag)~~ |
78 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
79 | | **EXECUTES** | The command defined in the `project.yml`. |
80 |
81 | ## :arrow_up: push
82 |
83 | Upload all available files or directories listed as in the `outputs` section of
84 | commands to a remote storage. Outputs are archived and compressed prior to
85 | upload, and addressed in the remote storage using the output's relative path
86 | (URL encoded), a hash of its command string and dependencies, and a hash of its
87 | file contents. This means `push` should **never overwrite** a file in your
88 | remote. If all the hashes match, the contents are the same and nothing happens.
89 | If the contents are different, the new version of the file is uploaded. Deleting
90 | obsolete files is left up to you.
91 |
92 | Remotes can be defined in the `remotes` section of the
93 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). Under the hood,
94 | Weasel uses [`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate
95 | with the remote storages, so you can use any protocol that `CloudPath` supports,
96 | including [S3](https://aws.amazon.com/s3/),
97 | [Google Cloud Storage](https://cloud.google.com/storage), and the local
98 | filesystem, although you may need to install extra dependencies to use certain
99 | protocols.
100 |
101 | ```bash
102 | python -m weasel push [remote] [project_dir]
103 | ```
104 |
105 | > :bulb: **Example**
106 | >
107 | > ```bash
108 | > $ python -m weasel push my_bucket
109 | > ```
110 | >
111 | > ```yaml title="project.yml"
112 | > remotes:
113 | > my_bucket: 's3://my-weasel-bucket'
114 | > ```
115 |
116 | | Name | Description |
117 | | -------------- | --------------------------------------------------------------------------------------- |
118 | | `remote` | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~ |
119 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
120 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
121 | | **UPLOADS** | All project outputs that exist and are not already stored in the remote. |
122 |
123 | ## :arrow_down: pull
124 |
125 | Download all files or directories listed as `outputs` for commands, unless they
126 | are already present locally. When searching for files in the remote, `pull`
127 | won't just look at the output path, but will also consider the **command
128 | string** and the **hashes of the dependencies**. For instance, let's say you've
129 | previously pushed a checkpoint to the remote, but now you've changed some
130 | hyper-parameters. Because you've changed the inputs to the command, if you run
131 | `pull`, you won't retrieve the stale result. If you train your pipeline and push
132 | the outputs to the remote, the outputs will be saved alongside the prior
133 | outputs, so if you change the config back, you'll be able to fetch back the
134 | result.
135 |
136 | Remotes can be defined in the `remotes` section of the
137 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). Under the hood,
138 | Weasel uses [`cloudpathlib`](https://cloudpathlib.drivendata.org/) to
139 | communicate with the remote storages, so you can use any protocol that
140 | `CloudPath` supports, including [S3](https://aws.amazon.com/s3/),
141 | [Google Cloud Storage](https://cloud.google.com/storage), and the local
142 | filesystem, although you may need to install extra dependencies to use certain
143 | protocols.
144 |
145 | ```bash
146 | python -m weasel pull [remote] [project_dir]
147 | ```
148 |
149 | > :bulb: **Example**
150 | >
151 | > ```bash
152 | > $ python -m weasel pull my_bucket
153 | > ```
154 | >
155 | > ```yaml title="project.yml"
156 | > remotes:
157 | > my_bucket: 's3://my-weasel-bucket'
158 | > ```
159 |
160 | | Name | Description |
161 | | -------------- | --------------------------------------------------------------------------------------- |
162 | | `remote` | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~ |
163 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
164 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
165 | | **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. |
166 |
167 | ## :closed_book: document
168 |
169 | Auto-generate a pretty Markdown-formatted `README` for your project, based on
170 | its [`project.yml`](tutorial/directory-and-assets.md#project-yml). Will create
171 | sections that document the available commands, workflows and assets. The
172 | auto-generated content will be placed between two hidden markers, so you can add
173 | your own custom content before or after the auto-generated documentation. When
174 | you re-run the `weasel document` command, only the auto-generated part is
175 | replaced.
176 |
177 | ```bash
178 | python -m weasel document [project_dir] [--output] [--no-emoji]
179 | ```
180 |
181 | > :bulb: **Example usage**
182 | >
183 | > ```bash
184 | > $ python -m weasel document --output README.md
185 | > ```
186 | >
187 | > For more examples, see the templates in our
188 | > [`projects`](https://github.com/explosion/projects) repo.
189 |
190 | | Name | Description |
191 | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
192 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
193 | | `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ |
194 | | `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ |
195 | | **CREATES** | The Markdown-formatted project documentation. |
196 |
197 | ## :repeat: dvc
198 |
199 | Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
200 | [`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
201 | the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
202 | so you need to specify one workflow defined in the
203 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If no workflow is
204 | specified, the first defined workflow is used. The DVC config will only be
205 | updated if the `project.yml` changed. For details, see the
206 | [DVC integration](tutorial/integrations.md#data-version-control-dvc) docs.
207 |
208 | > **Warning**
209 | >
210 | > This command requires DVC to be installed and initialized in the project
211 | > directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
212 | > You'll also need to add the assets you want to track with
213 | > [`dvc add`](https://dvc.org/doc/command-reference/add).
214 |
215 | ```bash
216 | python -m weasel dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
217 | ```
218 |
219 | > :bulb: **Example**
220 | >
221 | > ```bash
222 | > $ git init
223 | > $ dvc init
224 | > $ python -m weasel dvc all
225 | > ```
226 |
227 | | Name | Description |
228 | | ----------------- | ------------------------------------------------------------------------------------------------------------- |
229 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
230 | | `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
231 | | `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
232 | | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
233 | | `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
234 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
235 | | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
236 |
--------------------------------------------------------------------------------
/docs/tutorial/custom-scripts.md:
--------------------------------------------------------------------------------
1 | # Custom scripts and projects
2 |
3 | The `project.yml` lets you define any custom commands and run them as part of
4 | your training, evaluation or deployment workflows. The `script` section defines
5 | a list of commands that are called in a subprocess, in order. This lets you
6 | execute other Python scripts or command-line tools.
7 |
8 | Let's say you're training a spaCy pipeline, and you've written a
9 | few integration tests that load the best model produced by the training command
10 | and check that it works correctly. You can now define a `test` command that
11 | calls into [`pytest`](https://docs.pytest.org/en/latest/), runs your tests and
12 | uses [`pytest-html`](https://github.com/pytest-dev/pytest-html) to export a test
13 | report:
14 |
15 | > :bulb: **Example configuration**
16 | >
17 | > ```yaml title="project.yml"
18 | > commands:
19 | > - name: test
20 | > help: 'Test the trained pipeline'
21 | > script:
22 | > - 'pip install pytest pytest-html'
23 | > - 'python -m pytest ./scripts/tests --html=metrics/test-report.html'
24 | > deps:
25 | > - 'training/model-best'
26 | > outputs:
27 | > - 'metrics/test-report.html'
28 | > no_skip: true
29 | > ```
30 |
31 | Adding `training/model-best` to the command's `deps` lets you ensure that the
32 | file is available. If not, Weasel will show an error and the command won't run.
33 | Setting `no_skip: true` means that the command will always run, even if the
34 | dependencies (the trained pipeline) haven't changed. This makes sense here,
35 | because you typically don't want to skip your tests.
36 |
37 | ## Writing custom scripts
38 |
39 | Your project commands can include any custom scripts – essentially, anything you
40 | can run from the command line. Here's an example of a custom script that uses
41 | [`typer`](https://typer.tiangolo.com/) for quick and easy command-line arguments
42 | that you can define via your `project.yml`:
43 |
44 | ```python title="scripts/custom_evaluation.py"
45 | import typer
46 |
47 | def custom_evaluation(batch_size: int, model_path: str, data_path: str):
48 | # The arguments are now available as positional CLI arguments
49 | print(batch_size, model_path, data_path)
50 |
51 | if __name__ == "__main__":
52 | typer.run(custom_evaluation)
53 | ```
54 |
55 | > :information_source: **About Typer**
56 | >
57 | > [`typer`](https://typer.tiangolo.com/) is a modern library for building Python
58 | > CLIs using type hints. It's a dependency of Weasel, so it will already be
59 | > pre-installed in your environment. Function arguments automatically become
60 | > positional CLI arguments and using Python type hints, you can define the value
61 | > types. For instance, `batch_size: int` means that the value provided via the
62 | > command line is converted to an integer.
63 |
64 | In your `project.yml`, you can then run the script by calling
65 | `python scripts/custom_evaluation.py` with the function arguments. You can also
66 | use the `vars` section to define reusable variables that will be substituted in
67 | commands, paths and URLs. In the following example, the batch size is defined as a
68 | variable that will be substituted in place of `${vars.batch_size}` in the script.
69 |
70 | > :bulb: **Example usage of `vars`**
71 | >
72 | > ```yaml title="project.yml"
73 | > vars:
74 | > batch_size: 128
75 | >
76 | > commands:
77 | > - name: evaluate
78 | > script:
79 | > - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json'
80 | > deps:
81 | > - 'training/model-best'
82 | > - 'corpus/eval.json'
83 | > ```
84 |
85 | > :information_source: **Calling into Python**
86 | >
87 | > If any of your command scripts call into `python`, Weasel will take care of
88 | > replacing that with your `sys.executable`, to make sure you're executing
89 | > everything with the same Python (not some other Python installed on your
90 | > system). It also normalizes references to `python3`, `pip3` and `pip`.
91 |
92 | You can also use the `env` section to reference **environment variables** and
93 | make their values available to the commands. This can be useful for overriding
94 | settings on the command line and passing through system-level settings.
95 |
96 | > :bulb: **Example usage of EnvVars**
97 | >
98 | > ```bash
99 | > export GPU_ID=1
100 | > BATCH_SIZE=128 python -m weasel run evaluate
101 | > ```
102 | >
103 | > ```yaml title="project.yml"
104 | > env:
105 | > batch_size: BATCH_SIZE
106 | > gpu_id: GPU_ID
107 | >
108 | > commands:
109 | > - name: evaluate
110 | > script:
111 | > - 'python scripts/custom_evaluation.py ${env.batch_size}'
112 | > ```
113 |
114 | ## Documenting your project
115 |
116 | > :bulb: **Examples**
117 | >
118 | > For more examples, see the [`projects`](https://github.com/explosion/projects)
119 | > repo.
120 | >
121 | > 
122 |
123 | When your custom project is ready and you want to share it with others, you can
124 | use the [`weasel document`](../cli.md#closed_book-document) command to
125 | **auto-generate** a pretty, Markdown-formatted `README` file based on your
126 | project's `project.yml`. It will list all commands, workflows and assets defined
127 | in the project and include details on how to run the project, as well as links
128 | to the relevant Weasel documentation to make it easy for others to get started
129 | using your project.
130 |
131 | ```bash
132 | python -m weasel document --output README.md
133 | ```
134 |
135 | Under the hood, hidden markers are added to identify where the auto-generated
136 | content starts and ends. This means that you can add your own custom content
137 | before or after it and re-running the `document` command will **only
138 | update the auto-generated part**. This makes it easy to keep your documentation
139 | up to date.
140 |
141 | > **Warning**
142 | >
143 | > Note that the contents of an existing file will be **replaced** if no existing
144 | > auto-generated docs are found. If you want Weasel to ignore a file and not update
145 | > it, you can add the comment marker `{/* WEASEL: IGNORE */}` anywhere in
146 | > your markup.
147 |
148 | ## Cloning from your own repo
149 |
150 | The [`weasel clone`](../cli.md#clipboard-clone) command lets you customize
151 | the repo to clone from using the `--repo` option. It calls into `git`, so you'll
152 | be able to clone from any repo that you have access to, including private repos.
153 |
154 | ```bash
155 | python -m weasel clone your_project --repo https://github.com/you/repo
156 | ```
157 |
158 | At a minimum, a valid project template needs to contain a
159 | [`project.yml`](./directory-and-assets.md#projectyml). It can also include
160 | [other files](./directory-and-assets.md), like custom scripts, a
161 | `requirements.txt` listing additional dependencies,
162 | a machine learning model and meta templates, or Jupyter
163 | notebooks with usage examples.
164 |
165 | > :warning: **Important note about assets**
166 | >
167 | > It's typically not a good idea to check large data assets, trained pipelines or
168 | > other artifacts into a Git repo and you should exclude them from your project
169 | > template by adding a `.gitignore`. If you want to version your data and models,
170 | > check out [Data Version Control](./integrations.md#data-version-control-dvc) (DVC),
171 | > which integrates with Weasel.
172 |
--------------------------------------------------------------------------------
/docs/tutorial/directory-and-assets.md:
--------------------------------------------------------------------------------
1 | # Project directory and assets
2 |
3 | ## `project.yml`
4 |
5 | The `project.yml` defines the assets a project depends on, like datasets and
6 | pretrained weights, as well as a series of commands that can be run separately
7 | or as a workflow – for instance, to preprocess the data, convert it to Weasel's
8 | format, train a pipeline, evaluate it and export metrics, package it and spin up
9 | a quick web demo. It looks pretty similar to a config file used to define CI
10 | pipelines.
11 |
12 | > :boom: **Tip: Multi-line YAML**
13 | >
14 | > YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful
15 | > for readability with longer values such as project descriptions or commands
16 | > that take several arguments.
17 |
18 | > :boom: **Tip: Variable override**
19 | >
20 | > If you want to override one or more variables on the CLI and are not already
21 | > specifying a project directory, you need to add `.` as a placeholder:
22 | >
23 | > ```
24 | > python -m weasel run test . --vars.foo bar
25 | > ```
26 |
27 | > :boom: **Tip: Environment variables**
28 | >
29 | > Commands in a project file are not executed in a shell, so they don't have
30 | > direct access to environment variables. But you can insert environment
31 | > variables using the `env` dictionary to make values available for
32 | > interpolation, just like values in `vars`. Here's an example `env` dict that
33 | > makes `$PATH` available as `ENV_PATH`:
34 | >
35 | > ```yaml
36 | > env:
37 | > ENV_PATH: PATH
38 | > ```
39 | >
40 | > This can be used in a project command like so:
41 | >
42 | > ```yaml
43 | > - name: 'echo-path'
44 | > script:
45 | > - 'echo ${env.ENV_PATH}'
46 | > ```
47 |
48 | `project.yml` adheres to the following schema:
49 |
50 | | Section | Description |
51 | | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
52 | | `title` | An optional project title used in `--help` message and [auto-generated docs](../cli.md#closed_book-document). |
53 | | `description` | An optional project description used in [auto-generated docs](../cli.md#closed_book-document). |
54 | | `vars`               | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](https://spacy.io/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
55 | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
56 | | `directories` | An optional list of [directories](#data-assets) that should be created in the project for assets, training outputs, metrics etc. Weasel will make sure that these directories always exist. |
57 | | `assets` | A list of assets that can be fetched with the [`assets`](../cli.md#open_file_folder-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
58 | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`run`](../cli.md#rocket-run) command. |
59 | | `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets Weasel determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`run`](../cli.md#rocket-run) command. |
60 |
61 | ## Data assets
62 |
63 | Assets are any files that your project might need, like training and development
64 | corpora or pretrained weights for initializing your model. Assets are defined in
65 | the `assets` block of your `project.yml` and can be downloaded using the
66 | [`assets`](../cli.md#open_file_folder-assets) command. Defining checksums lets you
67 | verify that someone else running your project will use the same files you used.
68 | Asset URLs can be a number of different **protocols**: HTTP, HTTPS, FTP, SSH,
69 | and even **cloud storage** such as GCS and S3. You can also download assets from
70 | a **Git repo** instead.
71 |
72 | ### Downloading from a URL or cloud storage
73 |
74 | Under the hood, Weasel uses the
75 | [`smart_open`](https://github.com/RaRe-Technologies/smart_open) library so you
76 | can use any protocol it supports. Note that you may need to install extra
77 | dependencies to use certain protocols.
78 |
79 | > :bulb: **Example configuration**
80 | >
81 | > ```yaml title="project.yml"
82 | > assets:
83 | > # Download from public HTTPS URL
84 | > - dest: 'assets/training.spacy'
85 | > url: 'https://example.com/data.spacy'
86 | > checksum: '63373dd656daa1fd3043ce166a59474c'
87 | > # Optional download from Google Cloud Storage bucket
88 | > - dest: 'assets/development.spacy'
89 | > extra: True
90 | > url: 'gs://your-bucket/corpora'
91 | > checksum: '5113dc04e03f079525edd8df3f4f39e3'
92 | > ```
93 |
94 | | Name | Description |
95 | | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
96 | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
97 | | `extra` | Optional flag determining whether this asset is downloaded only if `weasel assets` is run with `--extra`. `False` by default. |
98 | | `url` | The URL to download from, using the respective protocol. |
99 | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
100 | | `description` | Optional asset description, used in [auto-generated docs](../cli.md#closed_book-document). |
101 |
102 | ### Downloading from a Git repo
103 |
104 | If a `git` block is provided, the asset is downloaded from the given Git
105 | repository. You can download from any repo that you have access to. Under the
106 | hood, this uses Git's "sparse checkout" feature, so you're only downloading the
107 | files you need and not the whole repo.
108 |
109 | > :bulb: **Example configuration**
110 | >
111 | > ```yaml title="project.yml"
112 | > assets:
113 | > - dest: 'assets/training.spacy'
114 | > git:
115 | > repo: 'https://github.com/example/repo'
116 | > branch: 'master'
117 | > path: 'path/training.spacy'
118 | > checksum: '63373dd656daa1fd3043ce166a59474c'
119 | > description: 'The training data (5000 examples)'
120 | > ```
121 |
122 | | Name | Description |
123 | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
124 | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
125 | | `git`         | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
126 | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
127 | | `description` | Optional asset description, used in [auto-generated docs](../cli.md#closed_book-document). |
128 |
129 | ### Working with private assets
130 |
131 | > :bulb: **Example configuration**
132 | >
133 | > ```yaml title="project.yml"
134 | > assets:
135 | > - dest: 'assets/private_training_data.json'
136 | > checksum: '63373dd656daa1fd3043ce166a59474c'
137 | > - dest: 'assets/private_vectors.bin'
138 | > checksum: '5113dc04e03f079525edd8df3f4f39e3'
139 | > ```
140 |
141 | For many projects, the datasets and weights you're working with might be
142 | company-internal and not available over the internet. In that case, you can
143 | specify the destination paths and a checksum, and leave out the URL. When your
144 | teammates clone and run your project, they can place the files in the respective
145 | directory themselves. The [`assets`](../cli.md#open_file_folder-assets) command
146 | will alert you about missing files and mismatched checksums, so you can ensure
147 | that others are running your project with the same data.
148 |
149 | ## Dependencies and outputs
150 |
151 | Each command defined in the `project.yml` can optionally define a list of
152 | dependencies and outputs. These are the files the command requires and creates.
153 | For example, a command for training a spaCy pipeline may depend on a
154 | [`config.cfg`](https://spacy.io/usage/training#config) and the training and evaluation data, and
155 | it will export a directory `model-best`, which you can then re-use in other
156 | commands.
157 |
158 | > :bulb: **Example configuration**
159 | >
160 | > ```yaml title="project.yml"
161 | > commands:
162 | > - name: train
163 | > help: 'Train a spaCy pipeline using the specified corpus and config'
164 | > script:
165 | > - 'python -m spacy train ./configs/config.cfg -o training/ --paths.train ./corpus/training.spacy --paths.dev ./corpus/evaluation.spacy'
166 | > deps:
167 | > - 'configs/config.cfg'
168 | > - 'corpus/training.spacy'
169 | > - 'corpus/evaluation.spacy'
170 | > outputs:
171 | > - 'training/model-best'
172 | > ```
173 |
174 | > :boom: **Tip: Re-running vs. skipping**
175 | >
176 | > Under the hood, Weasel uses a `project.lock` lockfile that stores the details
177 | > for each command, as well as its dependencies and outputs and their checksums.
178 | > It's updated on each run. If any of this information changes, the command will
179 | > be re-run. Otherwise, it will be skipped.
180 |
181 | If you're running a command and it depends on files that are missing, Weasel will
182 | show you an error. If a command defines dependencies and outputs that haven't
183 | changed since the last run, the command will be skipped. This means that you're
184 | only re-running commands if they need to be re-run. Commands can also set
185 | `no_skip: true` if they should never be skipped – for example commands that run
186 | tests. Commands without outputs are also never skipped. To force re-running a
187 | command or workflow, even if nothing changed, you can set the `--force` flag.
188 |
189 | Note that [`weasel`](../cli.md) doesn't compile any dependency
190 | graphs based on the dependencies and outputs, and won't re-run previous steps
191 | automatically. For instance, if you only run the command `train` that depends on
192 | data created by `preprocess` and those files are missing, Weasel will show an
193 | error – it won't just re-run `preprocess`. If you're looking for more advanced
194 | data management, check out the [Data Version Control (DVC) integration](./integrations.md#data-version-control-dvc).
195 | If you're planning on integrating your Weasel project with DVC, you can also use
196 | `outputs_no_cache` instead of `outputs` to define outputs that won't be cached
197 | or tracked.
198 |
199 | ## Files and directory structure
200 |
201 | The `project.yml` can define a list of `directories` that should be created
202 | within a project – for instance, `assets`, `training`, `corpus` and so on. Weasel
203 | will make sure that these directories are always available, so your commands can
204 | write to and read from them. Project directories will also include all files and
205 | directories copied from the project template with
206 | [`weasel clone`](../cli.md#clipboard-clone). Here's an example of a project
207 | directory:
208 |
209 | > :bulb: **Example configuration**
210 | >
211 | > ```yaml title="project.yml"
212 | > directories:
213 | > - 'assets'
214 | > - 'configs'
215 | > - 'corpus'
216 | > - 'metas'
217 | > - 'metrics'
218 | > - 'notebooks'
219 | > - 'packages'
220 | > - 'scripts'
221 | > - 'training'
222 | > ```
223 | >
224 | >``` title="Example directory structure"
225 | >├── project.yml # the project settings
226 | >├── project.lock # lockfile that tracks inputs/outputs
227 | >├── assets/ # downloaded data assets
228 | >├── configs/ # pipeline config.cfg files used for training
229 | >├── corpus/ # output directory for training corpus
230 | >├── metas/ # pipeline meta.json templates used for packaging
231 | >├── metrics/ # output directory for evaluation metrics
232 | >├── notebooks/ # directory for Jupyter notebooks
233 | >├── packages/ # output directory for pipeline Python packages
234 | >├── scripts/ # directory for scripts, e.g. referenced in commands
235 | >├── training/ # output directory for trained pipelines
236 | >└── ... # any other files, like a requirements.txt etc.
237 | >```
238 |
239 | If you don't want a project to create a directory, you can delete it and remove
240 | its entry from the `project.yml` – just make sure it's not required by any of
241 | the commands. [Custom templates](./custom-scripts.md) can use any directories they need –
242 | the only file that's required for a project is the `project.yml`.
243 |
--------------------------------------------------------------------------------
/docs/tutorial/integrations.md:
--------------------------------------------------------------------------------
1 | # Integrations
2 |
3 | ## Data Version Control (DVC)
4 |
5 | Data assets like training corpora or pretrained weights are at the core of any
6 | NLP project, but they're often difficult to manage: you can't just check them
7 | into your Git repo to version and keep track of them. And if you have multiple
8 | steps that depend on each other, like a preprocessing step that generates your
9 | training data, you need to make sure the data is always up-to-date, and re-run
10 | all steps of your process every time, just to be safe.
11 |
12 | [Data Version Control](https://dvc.org) (DVC) is a standalone open-source tool
13 | that integrates into your workflow like Git, builds a dependency graph for your
14 | data pipelines and tracks and caches your data files. If you're downloading data
15 | from an external source, like a storage bucket, DVC can tell whether the
16 | resource has changed. It can also determine whether to re-run a step, depending
17 | on whether its inputs have changed or not. All metadata can be checked into a Git
18 | repo, so you'll always be able to reproduce your experiments.
19 |
20 | To set up DVC, install the package and initialize your Weasel project as a Git
21 | and DVC repo. You can also
22 | [customize your DVC installation](https://dvc.org/doc/install/macos#install-with-pip)
23 | to include support for remote storage like Google Cloud Storage, S3, Azure, SSH
24 | and more.
25 |
26 | ```bash
27 | pip install dvc # Install DVC
28 | git init # Initialize a Git repo
29 | dvc init # Initialize a DVC project
30 | ```
31 |
32 | > :warning: **Important note on privacy**
33 | >
34 | > DVC enables usage analytics by default, so if you're working in a
35 | > privacy-sensitive environment, make sure to
36 | > [**opt-out manually**](https://dvc.org/doc/user-guide/analytics#opting-out).
37 |
38 | The [`weasel dvc`](../cli.md#repeat-dvc) command creates a `dvc.yaml`
39 | config file based on a workflow defined in your `project.yml`. Whenever you
40 | update your project, you can re-run the command to update your DVC config. You
41 | can then manage your Weasel project like any other DVC project, run
42 | [`dvc add`](https://dvc.org/doc/command-reference/add) to add and track assets
43 | and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the
44 | workflow or individual commands.
45 |
46 | ```bash
47 | python -m weasel dvc [project_dir] [workflow_name]
48 | ```
49 |
50 | > :warning: **Important note for multiple workflows**
51 | >
52 | > DVC currently expects a single workflow per project, so when creating the config
53 | > with [`weasel dvc`](../cli.md#repeat-dvc), you need to specify the name
54 | > of a workflow defined in your `project.yml`. You can still use multiple
55 | > workflows, but only one can be tracked by DVC.
56 |
--------------------------------------------------------------------------------
/docs/tutorial/remote-storage.md:
--------------------------------------------------------------------------------
1 | # Remote Storage
2 |
3 | You can persist your project outputs to a remote storage using the
4 | [`push`](../cli.md#arrow_up-push) command. This can help you **export** your
5 | pipeline packages, **share** work with your team, or **cache results** to avoid
6 | repeating work. The [`pull`](../cli.md#arrow_down-pull) command will download
7 | any outputs that are in the remote storage and aren't available locally.
8 |
9 | You can list one or more remotes in the `remotes` section of your
10 | [`project.yml`](./directory-and-assets.md#projectyml) by mapping a string name
11 | to the URL of the storage. Under the hood, Weasel uses
12 | [`cloudpathlib`](https://cloudpathlib.drivendata.org/) to communicate with the
13 | remote storages, so you can use any protocol that `CloudPath` supports,
14 | including [S3](https://aws.amazon.com/s3/),
15 | [Google Cloud Storage](https://cloud.google.com/storage), and the local
16 | filesystem, although you may need to install extra dependencies to use certain
17 | protocols.
18 |
19 | > :bulb: **Example using remote storage**
20 | >
21 | > ```bash
22 | > $ python -m weasel pull local
23 | > ```
24 | >
25 | > ```yaml title="project.yml"
26 | > remotes:
27 | > default: 's3://my-weasel-bucket'
28 | > local: '/mnt/scratch/cache'
29 | > ```
30 |
31 | > :information_source: **How it works**
32 | >
33 | > Inside the remote storage, Weasel uses a clever **directory structure** to
34 | > avoid overwriting files. The top level of the directory structure is a
35 | > URL-encoded version of the output's path. Within this directory are
36 | > subdirectories named according to a hash of the command string and the
37 | > command's dependencies. Finally, within those directories are files, named
38 | > according to an MD5 hash of their contents.
39 | >
40 | > ```
41 | > └── urlencoded_file_path # Path of original file
42 | > ├── some_command_hash # Hash of command you ran
43 | > │ ├── some_content_hash # Hash of file content
44 | > │ └── another_content_hash
45 | > └── another_command_hash
46 | > └── third_content_hash
47 | > ```
48 |
49 | For instance, let's say you had the following spaCy command in your
50 | `project.yml`:
51 |
52 | ```yaml title="project.yml"
53 | - name: train
54 | help: 'Train a spaCy pipeline using the specified corpus and config'
55 | script:
56 | - 'spacy train ./config.cfg --output training/'
57 | deps:
58 | - 'corpus/train'
59 | - 'corpus/dev'
60 | - 'config.cfg'
61 | outputs:
62 | - 'training/model-best'
63 | ```
64 |
65 | After you finish training, you run [`push`](../cli.md#arrow_up-push) to make
66 | sure the `training/model-best` output is saved to remote storage. Weasel will
67 | then construct a hash from your command script and the listed dependencies,
68 | `corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the
69 | execution context of your output. It would then compute an MD5 hash of the
70 | `training/model-best` directory, and use those three pieces of information to
71 | construct the storage URL.
72 |
73 | ```bash
74 | python -m weasel run train
75 | python -m weasel push
76 | ```
77 |
78 | ```title="Overview of the S3 bucket"
79 | └── s3://my-weasel-bucket/training%2Fmodel-best
80 | └── 1d8cb33a06cc345ad3761c6050934a1b
81 | └── d8e20c3537a084c5c10d95899fe0b1ff
82 | ```
83 |
84 | If you change the command or one of its dependencies (for instance, by editing
85 | the [`config.cfg`](https://spacy.io/usage/training#config) file to tune the
86 | hyperparameters), a different creation hash will be calculated, so when you use
87 | [`push`](../cli.md#arrow_up-push) you won't be overwriting your previous file.
88 | The system even supports multiple outputs for the same file and the same
89 | context, which can happen if your training process is not deterministic, or if
90 | you have dependencies that aren't represented in the command.
91 |
92 | In summary, the `weasel` remote storages are designed to make a particular set
93 | of trade-offs. Priority is placed on **convenience**, **correctness** and
94 | **avoiding data loss**. You can use [`push`](../cli.md#arrow_up-push) freely, as
95 | you'll never overwrite remote state, and you don't have to come up with names or
96 | version numbers. However, it's up to you to manage the size of your remote
97 | storage, and to remove files that are no longer relevant to you.
98 |
--------------------------------------------------------------------------------
/docs/tutorial/workflow.md:
--------------------------------------------------------------------------------
1 | # Workflow
2 |
3 | ## 1. Clone a project template
4 |
5 | > :information_source: **Cloning under the hood**
6 | >
7 | > To clone a project, Weasel calls into `git` and uses the "sparse checkout"
8 | > feature to only clone the relevant directory or directories.
9 |
10 | The [`weasel clone`](../cli.md#clipboard-clone) command clones an existing
11 | project template and copies the files to a local directory. You can then run the
12 | project, e.g. to train a pipeline and edit the commands and scripts to build
13 | fully custom workflows.
14 |
15 | ```bash
16 | python -m weasel clone pipelines/tagger_parser_ud
17 | ```
18 |
19 | By default, the project will be cloned into the current working directory. You
20 | can specify an optional second argument to define the output directory. The
21 | `--repo` option lets you define a custom repo to clone from if you don't want to
22 | use the default [`projects`](https://github.com/explosion/projects) repo. You can
23 | also use any private repo you have access to with Git.
24 |
25 | ## 2. Fetch the project assets
26 |
27 | Assets are data files your project needs – for example, the training and
28 | evaluation data or pretrained vectors and embeddings to initialize your model
29 | with. Each project template comes with a `project.yml` that defines the assets
30 | to download and where to put them. The [`weasel assets`](../cli.md#open_file_folder-assets)
31 | command will fetch the project assets for you.
32 |
33 | > :bulb: **Example usage**
34 | >
35 | > ```yaml title="project.yml"
36 | > assets:
37 | > - dest: 'assets/training.spacy'
38 | > url: 'https://example.com/data.spacy'
39 | > checksum: '63373dd656daa1fd3043ce166a59474c'
40 | > - dest: 'assets/development.spacy'
41 | > git:
42 | > repo: 'https://github.com/example/repo'
43 | > branch: 'master'
44 | > path: 'path/development.spacy'
45 | > checksum: '5113dc04e03f079525edd8df3f4f39e3'
46 | > ```
47 | >
48 | > Let Weasel fetch the assets:
49 | >
50 | > ```bash
51 | > python -m weasel assets
52 | > ```
53 |
54 | Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
55 | even cloud storage such as GCS and S3. You can also fetch assets using git, by
56 | replacing the `url` string with a `git` block. Weasel will use Git's "sparse
57 | checkout" feature to avoid downloading the whole repository.
58 |
59 | Sometimes your project configuration may include large assets that you don't
60 | necessarily want to download when you run `weasel assets`. That's why
61 | assets can be marked as [`extra`](./directory-and-assets.md#data-assets) - by default, these assets
62 | are not downloaded. If they should be, run `weasel assets --extra`.
63 |
64 | ## 3. Run a command
65 |
66 | Commands consist of one or more steps and can be run with
67 | [`weasel run`](../cli.md#rocket-run). The following will run the command
68 | `preprocess` defined in the `project.yml`:
69 |
70 | > :bulb: **Example usage**
71 | >
72 | > ```yaml title="project.yml"
73 | > commands:
74 | > - name: preprocess
75 | > help: "Convert the input data to spaCy's format"
76 | > script:
77 | > - 'python -m spacy convert assets/train.conllu corpus/'
78 | > - 'python -m spacy convert assets/eval.conllu corpus/'
79 | > deps:
80 | > - 'assets/train.conllu'
81 | > - 'assets/eval.conllu'
82 | > outputs:
83 | > - 'corpus/train.spacy'
84 | > - 'corpus/eval.spacy'
85 | > ```
86 | >
87 | > Run the command:
88 | >
89 | > ```bash
90 | > python -m weasel run preprocess
91 | > ```
92 |
93 | Commands can define their expected [dependencies and outputs](./directory-and-assets.md#dependencies-and-outputs)
94 | using the `deps` (files the commands require) and `outputs` (files the commands
95 | create) keys. This allows your project to track changes and determine whether a
96 | command needs to be re-run. For instance, if your input data changes, you want
97 | to re-run the `preprocess` command. But if nothing changed, this step can be
98 | skipped. You can also set `--force` to force re-running a command, or `--dry` to
99 | perform a "dry run" and see what would happen (without actually running the
100 | script).
101 |
102 | ## 4. Run a workflow
103 |
104 | Workflows are series of commands that are run in order and often depend on each
105 | other. For instance, to generate a spaCy pipeline package, you might start by
106 | converting your data, then run [`spacy train`](https://spacy.io/api/cli#train) to train your
107 | pipeline on the converted data and if that's successful, run
108 | [`spacy package`](https://spacy.io/api/cli#package) to turn the best trained artifact into an
109 | installable Python package. The following command runs the workflow named `all`
110 | defined in the `project.yml`, and executes the commands it specifies, in order:
111 |
112 | > :bulb: **Example usage**
113 | >
114 | > ```yaml title="project.yml"
115 | > workflows:
116 | > all:
117 | > - preprocess
118 | > - train
119 | > - package
120 | > ```
121 | >
122 | > ```bash
123 | > python -m weasel run all
124 | > ```
125 |
126 | Using the expected [dependencies and outputs](./directory-and-assets.md#dependencies-and-outputs)
127 | defined in the commands, Weasel can determine whether to re-run a command (if its inputs or
128 | outputs have changed) or whether to skip it. If you're looking to implement more
129 | advanced data pipelines and track your changes in Git, check out the
130 | [Data Version Control (DVC) integration](./integrations.md#data-version-control-dvc). The
131 | [`weasel dvc`](../cli.md#repeat-dvc) command generates a DVC config file
132 | from a workflow defined in your `project.yml` so you can manage your Weasel
133 | project as a DVC repo.
134 |
135 | ## 5. Optional: Push to remote storage
136 |
137 | After training a pipeline, you can optionally use the
138 | [`weasel push`](../cli.md#arrow_up-push) command to upload your outputs to
139 | a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
140 | [Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
141 | you **export** your pipeline packages, **share** work with your team, or **cache
142 | results** to avoid repeating work.
143 |
144 | > :bulb: **Example usage**
145 | >
146 | > ```yaml title="project.yml"
147 | > remotes:
148 | > default: 's3://my-weasel-bucket'
149 | > local: '/mnt/scratch/cache'
150 | > ```
151 | >
152 | > Push to remote:
153 | >
154 | > ```bash
155 | > python -m weasel push
156 | > ```
157 |
158 | The `remotes` section in your `project.yml` lets you assign names to the
159 | different storages. To download state from a remote storage, you can use the
160 | [`weasel pull`](../cli.md#arrow_down-pull) command. For more details, see the
161 | docs on [remote storage](./remote-storage.md).
162 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.ruff]
6 | ignore = [
7 | "E501",
8 | ]
9 | select = [
10 | "E", # pycodestyle errors
11 | "W", # pycodestyle warnings
12 | "F", # Pyflakes
13 | "Q", # flake8-quotes
14 | ]
15 |
16 | [tool.ruff.per-file-ignores]
17 | # Ignore unused imports in __init__ files
18 | "__init__.py" = ["F401"]
19 |
20 |
21 | [tool.isort]
22 | multi_line_output = 9
23 | profile = "black"
24 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Our libraries
2 | confection>=0.0.4,<0.2.0
3 | wasabi>=0.9.1,<1.2.0
4 | srsly>=2.4.3,<3.0.0
5 | typer>=0.3.0,<1.0.0
6 | cloudpathlib>=0.7.0,<1.0.0
7 | smart-open>=5.2.1,<8.0.0
8 | # Third party dependencies
9 | requests>=2.13.0,<3.0.0
10 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
11 | # Official Python utilities
12 | packaging>=20.0
13 | # Development dependencies
14 | black==22.3.0
15 | pytest>=5.2.0,!=7.1.0
16 | mypy>=1.5.0,<1.7.0; python_version >= "3.8"
17 | types-requests
18 | types-setuptools>=57.0.0
19 | ruff>=0.0.259
20 | isort>=5.12.0,<6.0; python_version > "3.7"
21 | pre-commit>=3.2.0,<4.0.0; python_version > "3.7"
22 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = weasel
3 | version = 0.4.1
4 | description = Weasel: A small and easy workflow system
5 | url = https://github.com/explosion/weasel/
6 | author = Explosion
7 | author_email = contact@explosion.ai
8 | license = MIT
9 | long_description = file: README.md
10 | long_description_content_type = text/markdown
11 | classifiers =
12 | Environment :: Console
13 | Intended Audience :: Developers
14 | Intended Audience :: Science/Research
15 | License :: OSI Approved :: MIT License
16 | Operating System :: POSIX :: Linux
17 | Operating System :: MacOS :: MacOS X
18 | Operating System :: Microsoft :: Windows
19 | Programming Language :: Python :: 3
20 | Programming Language :: Python :: 3.7
21 | Programming Language :: Python :: 3.8
22 | Programming Language :: Python :: 3.9
23 | Programming Language :: Python :: 3.10
24 | Programming Language :: Python :: 3.11
25 | Programming Language :: Python :: 3.12
26 | Topic :: Scientific/Engineering
27 | project_urls =
28 | Release notes = https://github.com/explosion/weasel/releases
29 | Source = https://github.com/explosion/weasel/
30 |
31 | [options]
32 | python_requires = >=3.7
33 | install_requires =
34 | confection>=0.0.4,<0.2.0
35 | packaging>=20.0
36 | wasabi>=0.9.1,<1.2.0
37 | srsly>=2.4.3,<3.0.0
38 | typer>=0.3.0,<1.0.0
39 | cloudpathlib>=0.7.0,<1.0.0
40 | smart-open>=5.2.1,<8.0.0
41 | requests>=2.13.0,<3.0.0
42 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
43 |
44 |
45 | [options.entry_points]
46 | console_scripts =
47 | weasel = weasel.cli:app
48 |
49 | [tool:pytest]
50 | markers =
51 | issue: references specific issue
52 |
53 | [mypy]
54 | ignore_missing_imports = True
55 | no_implicit_optional = True
56 | plugins = pydantic.mypy
57 | allow_redefinition = True
58 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
# Shim entry point: all package metadata lives in setup.cfg; this only exists
# so `pip install .` / `python setup.py` can discover the packages.
if __name__ == "__main__":
    from setuptools import find_packages, setup

    setup(packages=find_packages())
8 |
--------------------------------------------------------------------------------
/weasel/__init__.py:
--------------------------------------------------------------------------------
1 | from .cli import app
2 |
--------------------------------------------------------------------------------
/weasel/__main__.py:
--------------------------------------------------------------------------------
from .cli.main import COMMAND, app

# Entry point for `python -m weasel`; passing COMMAND as prog_name makes the
# help output show the invocation the user actually typed.
app(prog_name=COMMAND)
4 |
--------------------------------------------------------------------------------
/weasel/about.py:
--------------------------------------------------------------------------------
# Default repository of project templates used by `weasel clone`
__projects__ = "https://github.com/explosion/projects"
# Default branch of that templates repository
__projects_branch__ = "v3"
3 |
--------------------------------------------------------------------------------
/weasel/cli/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import app # isort: skip
2 |
3 | from .assets import project_assets
4 | from .clone import project_clone
5 | from .document import project_document
6 | from .dvc import project_update_dvc
7 | from .pull import project_pull
8 | from .push import project_push
9 | from .run import project_run
10 |
--------------------------------------------------------------------------------
/weasel/cli/assets.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | from pathlib import Path
5 | from typing import Any, Dict, Optional
6 |
7 | import requests
8 | import typer
9 | from wasabi import msg
10 |
11 | from ..util import SimpleFrozenDict, download_file, ensure_path, get_checksum
12 | from ..util import get_git_version, git_checkout, load_project_config
13 | from ..util import parse_config_overrides, working_dir
14 | from .main import PROJECT_FILE, Arg, Opt, app
15 |
# Whether assets are extra if `extra` is not set.
# ("extra" assets are only fetched when the --extra flag is passed)
EXTRA_DEFAULT = False
18 |
19 |
@app.command(
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md
    """
    # Unknown extra CLI args are interpreted as config overrides (--vars.x y)
    overrides = parse_config_overrides(ctx.args)
    project_assets(
        project_dir,
        overrides=overrides,
        sparse_checkout=sparse_checkout,
        extra=extra,
    )
46 |
47 |
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Optional config overrides.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
        needed.
    extra (bool): Whether to download all assets, including those marked as 'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    # Filter out "extra" assets unless explicitly requested
    assets = []
    for candidate in config.get("assets", []):
        if extra or not candidate.get("extra", EXTRA_DEFAULT):
            assets.append(candidate)
    if not assets:
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")

    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" not in asset:
            # URL-based (or user-provided/private) asset
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
            else:
                fetch_asset(project_path, url, dest, checksum)
            continue
        # Git-provided asset: requires a working `git` executable
        get_git_version(
            error=(
                "Cloning Weasel project templates requires Git and the 'git' command. "
                "Make sure it's installed and that the executable is available."
            )
        )
        if dest.exists():
            # Skip download when the local copy already matches the checksum
            if checksum and checksum == get_checksum(dest):
                msg.good(
                    f"Skipping download with matching checksum: {asset['dest']}"
                )
                continue
            # Otherwise remove the stale copy before re-cloning
            if dest.is_dir():
                shutil.rmtree(dest)
            else:
                dest.unlink()
        git_info = asset["git"]
        if git_info.get("repo") is None:
            msg.fail(
                "A git asset must include 'repo', the repository address.", exits=1
            )
        if git_info.get("path") is None:
            msg.fail(
                "A git asset must include 'path' - use \"\" to get the entire repository.",
                exits=1,
            )
        git_checkout(
            git_info["repo"],
            git_info["path"],
            dest,
            branch=git_info.get("branch"),
            sparse=sparse_checkout,
        )
        msg.good(f"Downloaded asset {dest}")
121 |
122 |
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        # Nothing we can verify: the user still has to supply the file
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    if not checksum:
        msg.good(f"Asset already exists: {dest}")
    elif checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
140 |
141 |
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, resolved relative to project_path.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (None): Failures are reported via msg.fail instead of being raised.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists():
        # If there's already a file, check for checksum
        if checksum:
            if checksum == get_checksum(dest_path):
                msg.good(f"Skipping download with matching checksum: {dest}")
                return
        else:
            # If there's not a checksum, make sure the file is a possibly valid size
            if os.path.getsize(dest_path) == 0:
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(dest_path)
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    if not dest_path.parent.exists():
        dest_path.parent.mkdir(parents=True)
    with working_dir(project_path):
        # Rewrite plain GitHub page URLs to raw-content URLs
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            # Download failed: the "URL" may actually be a local filesystem path
            if Path(url).exists() and Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
    # NOTE(review): on checksum mismatch the fetched file is reported but kept
    # on disk — presumably for inspection; confirm this is intended.
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
184 |
185 |
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake: it would
    # download the HTML page, not the file. Release downloads and /raw/ URLs
    # are fine as-is. The dots in the hostname are escaped so that e.g.
    # "githubXcom" is not accidentally matched (the original pattern used a
    # bare "." which matches any character).
    if (
        re.match(r"(http(s?))://github\.com", url)
        and "releases/download" not in url
        and "/raw/" not in url
    ):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
208 |
--------------------------------------------------------------------------------
/weasel/cli/clone.py:
--------------------------------------------------------------------------------
1 | import re
2 | import subprocess
3 | from pathlib import Path
4 | from typing import Optional
5 |
6 | import typer
7 | from wasabi import msg
8 |
9 | from .. import about
10 | from ..util import ensure_path, get_git_version, git_checkout, git_repo_branch_exists
11 | from .main import COMMAND, PROJECT_FILE, Arg, Opt, _get_parent_command, app
12 |
# Template repo/branch used when --repo / --branch aren't given
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
# Branches probed (in order) when cloning a custom repo without --branch
DEFAULT_BRANCHES = ["main", "master"]
16 |
17 |
@app.command("clone")
def project_clone_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read the parent command
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+."),
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official Weasel template repo, but can be customized
    (including using a private repo).

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone
    """
    if dest is None:
        # Default destination: subdirectory of CWD named after the template
        dest = Path.cwd() / Path(name).parts[-1]
    if repo == DEFAULT_REPO and branch is None:
        branch = DEFAULT_PROJECTS_BRANCH

    if branch is None:
        # Custom repo without an explicit branch: probe common defaults
        for default_branch in DEFAULT_BRANCHES:
            if git_repo_branch_exists(repo, default_branch):
                branch = default_branch
                break
        if branch is None:
            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
            msg.fail(
                "No branch provided and attempted default "
                f"branches {default_branches_msg} do not exist.",
                exits=1,
            )
    else:
        if not git_repo_branch_exists(repo, branch):
            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
    assert isinstance(branch, str)
    parent_command = _get_parent_command(ctx)
    project_clone(
        name,
        dest,
        repo=repo,
        branch=branch,
        sparse_checkout=sparse_checkout,
        parent_command=parent_command,
    )
66 |
67 |
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
    parent_command: str = COMMAND,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from.
    sparse_checkout (bool): Use sparse Git checkout when cloning.
    parent_command (str): Command prefix shown in the follow-up hint.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Strip the GitHub prefix so messages show "org/repo" rather than the URL
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        msg.fail(
            f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')",
            exits=1,
        )
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    if (project_dir / PROJECT_FILE).exists():
        msg.good("Your project is now ready!")
    else:
        msg.warn(f"No {PROJECT_FILE} found in directory")
    print(f"To fetch the assets, run:\n{parent_command} assets {dest}")
99 |
100 |
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    # Git itself must be available before any destination checks matter
    get_git_version(
        error=(
            f"Cloning Weasel project templates requires Git and the 'git' command. "
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually."
        )
    )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
            f"Create the necessary folder(s) first before continuing.",
            exits=1,
        )
127 |
--------------------------------------------------------------------------------
/weasel/cli/document.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from wasabi import MarkdownRenderer, msg
4 |
5 | from ..util import load_project_config, working_dir
6 | from .main import PROJECT_FILE, Arg, Opt, app
7 |
# Base URL used for links in the generated README
DOCS_URL = "https://github.com/explosion/weasel"
# Intro paragraphs inserted verbatim into the generated README, one per
# section. These are runtime strings: editing them changes the output of
# `weasel document`.
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[Weasel documentation]({DOCS_URL})."""
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`weasel run [name]`]({DOCS_URL}/tree/main/docs/cli.md#rocket-run).
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`weasel run [name]`]({DOCS_URL}/tree/main/docs/cli.md#rocket-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`weasel assets`]({DOCS_URL}/tree/main/docs/cli.md#open_file_folder-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_TAGS = ("WEASEL", "SPACY PROJECT")
# The markers are HTML comments so they're invisible in rendered Markdown.
# They must be non-empty: an empty MARKER_IGNORE would be a substring of every
# file, making project_document() skip updating any existing README. (The
# original marker text was lost — it was an HTML comment stripped by a tool.)
MARKER_START = "<!-- {tag}: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- {tag}: AUTO-GENERATED DOCS END (do not remove) -->"
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = "<!-- {tag}: IGNORE -->"
29 |
30 |
@app.command("document")
def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document
    """
    # Thin CLI shim: all logic lives in project_document()
    project_document(project_dir, output_file, no_emoji=no_emoji)
48 |
49 |
def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    """Render the project's README from the project.yml and write it out.

    project_dir (Path): Path to the project directory (must contain project.yml).
    output_file (Path): Output path for the Markdown, or "-" for stdout.
    no_emoji (bool): Don't use emoji in section headings.
    """
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    # Everything between the start/end markers is the auto-generated section
    md.add(MARKER_START.format(tag="WEASEL"))
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"Weasel Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", "⏯"))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs = config.get("workflows", {}).items()
    data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
    if data:
        md.add(md.title(3, "Workflows", "⏭"))
        md.add(INTRO_WORKFLOWS)
        md.add(md.table(data, ["Workflow", "Steps"]))
    # Assets
    assets = config.get("assets", [])
    data = []
    for a in assets:
        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
        dest_path = a["dest"]
        dest = md.code(dest_path)
        if source == "Local":
            # Only link assets if they're in the repo
            with working_dir(project_dir) as p:
                if (p / dest_path).exists():
                    dest = md.link(dest, dest_path)
        data.append((dest, source, a.get("description", "")))
    if data:
        md.add(md.title(3, "Assets", "🗂"))
        md.add(INTRO_ASSETS)
        md.add(md.table(data, ["File", "Source", "Description"]))
    md.add(MARKER_END.format(tag="WEASEL"))
    # Output result
    if is_stdout:
        print(md.text)
    else:
        content = md.text
        if output_file.exists():
            with output_file.open("r", encoding="utf8") as f:
                existing = f.read()

            # An ignore marker (any known tag) means: never touch this file
            for marker_tag in MARKER_TAGS:
                if MARKER_IGNORE.format(tag=marker_tag) in existing:
                    msg.warn(
                        "Found ignore marker in existing file: skipping", output_file
                    )
                    return

            # Splice the new auto-generated section between existing markers,
            # keeping any custom content before and after them
            marker_tag_found = False
            for marker_tag in MARKER_TAGS:
                markers = {
                    "start": MARKER_START.format(tag=marker_tag),
                    "end": MARKER_END.format(tag=marker_tag),
                }
                if markers["start"] in existing and markers["end"] in existing:
                    marker_tag_found = True
                    msg.info("Found existing file: only replacing auto-generated docs")
                    before = existing.split(markers["start"])[0]
                    after = existing.split(markers["end"])[1]
                    content = f"{before}{content}{after}"
                    break
            if not marker_tag_found:
                # No markers: the whole file is overwritten
                msg.warn("Replacing existing file")

        with output_file.open("w", encoding="utf8") as f:
            f.write(content)
        msg.good("Saved project documentation", output_file)
131 |
--------------------------------------------------------------------------------
/weasel/cli/dvc.py:
--------------------------------------------------------------------------------
1 | """This module contains helpers and subcommands for integrating Weasel
2 | with Data Version Control (DVC). https://dvc.org"""
3 | import subprocess
4 | from pathlib import Path
5 | from typing import Any, Dict, List, Optional
6 |
7 | from wasabi import msg
8 |
9 | from ..util import get_hash, join_command, load_project_config, run_command, working_dir
10 | from .main import COMMAND, NAME, PROJECT_FILE, Arg, Opt, app
11 |
DVC_CONFIG = "dvc.yaml"  # filename of the auto-generated DVC pipeline config
DVC_DIR = ".dvc"  # directory created by `dvc init`
UPDATE_COMMAND = "dvc"  # name under which the subcommand below is registered
# Header written to the top of the generated dvc.yaml (runtime string)
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by Weasel based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} {UPDATE_COMMAND}"""
18 |
19 |
@app.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc
    """
    # Thin CLI shim: all logic lives in project_update_dvc()
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
39 |
40 |
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    was_updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
    )
    hint = "To execute the workflow with DVC, run: dvc repro"
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", hint)
    else:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", hint)
69 |
70 |
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Name of the workflow to convert. Defaults to the
        first workflow defined in the config.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # some flags that apply to every command
    flags = []
    if verbose:
        flags.append("--verbose")
    if quiet:
        flags.append("--quiet")

    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        # Stages without deps/outputs can't be tracked by DVC; skip them
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there.
        # FIX: the weasel CLI registers `run` at the top level — there is no
        # "project" subcommand (that was the spaCy invocation), so the
        # generated stage must call `python -m weasel run <name>`.
        project_cmd = ["python", "-m", NAME, "run", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]

        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))

    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading the
        # config, since DVC wouldn't create it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        for c in dvc_commands:
            dvc_command = "dvc " + c
            run_command(dvc_command)
        # Prepend the config hash + explanatory header to the generated file
        with dvc_config_path.open("r+", encoding="utf8") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
155 |
156 |
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    # A DVC pipeline needs at least one workflow to convert
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        available = ", ".join(workflows)
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {available}",
            exits=1,
        )
    if not workflow:
        # Caller will fall back to the first workflow; let the user know
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )
181 |
182 |
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.

    project_dir (Path): The project directory to check for a ".dvc" folder.
    """
    # Probe for the dvc executable; any failure (not installed, not on PATH)
    # is treated the same way.
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        install_help = (
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install"
        )
        msg.fail(
            "To use Weasel with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            install_help,
            exits=1,
        )
    dvc_dir = project_dir / ".dvc"
    if not dvc_dir.exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
206 |
--------------------------------------------------------------------------------
/weasel/cli/main.py:
--------------------------------------------------------------------------------
1 | import typer
2 |
# How the CLI is invoked; used in help text and recorded in lockfile entries.
COMMAND = "python -m weasel"
NAME = "weasel"
# Top-level help string shown by the Typer app below.
HELP = """weasel Command-line Interface

DOCS: https://github.com/explosion/weasel
"""

# Filenames of the project config and the lockfile written after runs.
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"

# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
Arg = typer.Argument
Opt = typer.Option

# Shared Typer application that all weasel subcommands register against.
app = typer.Typer(name=NAME, help=HELP, no_args_is_help=True)
19 |
20 |
def _get_parent_command(ctx: typer.Context) -> str:
    """Reconstruct the invoking command prefix (e.g. "weasel") by walking up
    the Typer/Click context chain. If any ancestor context has no name, fall
    back to the canonical COMMAND string.
    """
    names = []
    node = ctx.parent
    while node:
        if not node.info_name:
            return COMMAND
        names.append(node.info_name)
        node = node.parent
    # names were collected child-to-root; the command reads root-to-child.
    return " ".join(reversed(names)).strip()
31 |
--------------------------------------------------------------------------------
/weasel/cli/pull.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from wasabi import msg
4 |
5 | from ..util import load_project_config, logger
6 | from .main import Arg, app
7 | from .remote_storage import RemoteStorage, get_command_hash
8 | from .run import update_lockfile
9 |
10 |
@app.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart_open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-pull
    """
    # Fixed DOCS anchor: this is the pull command, but the link pointed at the
    # push section ("#arrow_down-push"). push.py uses "#arrow_up-push".
    # Report each pulled output; a None URL means nothing matched in the cache.
    for url, output_path in project_pull(project_dir, remote):
        if url is not None:
            msg.good(f"Pulled {output_path} from {url}")
28 |
29 |
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Retrieve precomputed outputs from a remote storage for every command
    whose dependencies are present locally.

    project_dir (Path): The project directory.
    remote (str): Remote alias (resolved via the config's "remotes" section)
        or a storage path/URL.
    verbose (bool): Not referenced in this body — presumably kept for API
        compatibility; confirm before removing.
    YIELDS: (url, output_path) pairs, where url is None if no matching object
        was found in the remote storage for that output.
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        # Resolve a remote alias to its configured storage path/URL.
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list: each pass pulls every command whose deps already exist,
    # and the loop repeats until a full pass makes no progress.
    while commands:
        for i, cmd in enumerate(list(commands)):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    logger.debug(
                        "URL: %s for %s with command hash %s",
                        url,
                        output_path,
                        cmd_hash,
                    )
                    yield url, output_path

                # Only record the command in the lockfile once all of its
                # outputs actually exist on disk.
                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.pop(i)
                break
            else:
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
        else:
            # If we didn't break the for loop, break the while loop.
            break
69 |
--------------------------------------------------------------------------------
/weasel/cli/push.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from wasabi import msg
4 |
5 | from ..util import load_project_config, logger
6 | from .main import Arg, app
7 | from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
8 |
9 |
@app.command("push")
def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push
    """
    # Report each output as it is processed; a None URL means it was skipped.
    for output_path, url in project_push(project_dir, remote):
        if url is None:
            msg.info(f"Skipping {output_path}")
            continue
        msg.good(f"Pushed {output_path} to {url}")
29 |
30 |
def project_push(project_dir: Path, remote: str):
    """Persist outputs to a remote storage. You can alias remotes in your project.yml
    by mapping them to storage paths. A storage can be anything that the smart_open
    library can upload to, e.g. gcs, aws, ssh, local directories etc

    project_dir (Path): The project directory.
    remote (str): Remote alias (resolved via the config's "remotes" section)
        or a storage path/URL.
    YIELDS: (output_path, url) pairs for every output that was pushed.
    """
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        # Resolve a remote alias to its configured storage path/URL.
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if any(not dep.exists() for dep in deps):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        # Reuse the dependency paths computed above instead of rebuilding the
        # same list a second time (the original duplicated the comprehension).
        cmd_hash = get_command_hash("", "", deps, cmd["script"])
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            # Only push outputs that exist and are not empty directories.
            if output_loc.exists() and _is_not_empty_dir(output_loc):
                url = storage.push(
                    output_path,
                    command_hash=cmd_hash,
                    content_hash=get_content_hash(output_loc),
                )
                logger.debug(
                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                )
                yield output_path, url
62 |
63 |
def _is_not_empty_dir(loc: Path):
    """Return True for files, and for directories whose tree contains at least
    one file anywhere below them; False for (recursively) empty directories.
    """
    if not loc.is_dir():
        # A plain file always counts as non-empty content.
        return True
    return any(_is_not_empty_dir(child) for child in loc.iterdir())
71 |
--------------------------------------------------------------------------------
/weasel/cli/remote_storage.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import site
4 | import sys
5 | import tarfile
6 | import urllib.parse
7 | from pathlib import Path
8 | from typing import TYPE_CHECKING, Dict, List, Optional
9 |
10 | from wasabi import msg
11 |
12 | from ..errors import Errors
13 | from ..util import check_spacy_env_vars, download_file, ensure_pathy, get_checksum
14 | from ..util import get_hash, make_tempdir, upload_file
15 |
16 | if TYPE_CHECKING:
17 | from cloudpathlib import CloudPath
18 |
19 |
class RemoteStorage:
    """Push and pull outputs to and from a remote file storage.

    Remotes can be anything that `smart_open` can support: AWS, GCS, file system,
    ssh, etc.
    """

    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        """Create a storage wrapper.

        project_root (Path): Root of the local project directory.
        url (str): Base URL/path of the remote storage.
        compression (str): tarfile compression suffix ("gz" by default); a
            falsy value disables compression.
        """
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "CloudPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            # Identical (path, command, content) combination already uploaded.
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                # Archive under the project-relative path as arcname so that
                # pull() can extract directly into the project root.
                tar_file.add(str(loc), arcname=str(path))
            upload_file(tar_loc, url)
        return url

    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["CloudPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are returned. If no results are available, an error is raised.
        """
        dest = self.root / path
        if dest.exists():
            # Never overwrite an existing local copy.
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()

                    # Disallow paths outside the current directory for the tar
                    # file (CVE-2007-4559, directory traversal vulnerability)
                    def is_within_directory(directory, target):
                        abs_directory = os.path.abspath(directory)
                        abs_target = os.path.abspath(target)
                        prefix = os.path.commonprefix([abs_directory, abs_target])
                        return prefix == abs_directory

                    def safe_extract(tar, path):
                        for member in tar.getmembers():
                            member_path = os.path.join(path, member.name)
                            if not is_within_directory(path, member_path):
                                # Fixed: the file's Errors class defines the
                                # traversal message as E801, not E201 (which
                                # does not exist and would raise AttributeError).
                                raise ValueError(Errors.E801)
                        if sys.version_info >= (3, 12):
                            # Python 3.12+ supports an explicit extraction filter.
                            tar.extractall(path, filter="data")
                        else:
                            tar.extractall(path)

                    safe_extract(tar_file, self.root)
            return url

    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["CloudPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            # Fully specified: there is at most one exact match.
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
            if content_hash is not None:
                urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            # Prefer the most recently modified candidate; best-effort only,
            # since not every storage backend supports stat().
            try:
                urls.sort(key=lambda x: x.stat().st_mtime)
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "CloudPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)
152 |
153 |
def get_content_hash(loc: Path) -> str:
    """Return the checksum of the file or directory at *loc* (delegates to
    get_checksum)."""
    return get_checksum(loc)
156 |
157 |
def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command. This includes the
    currently installed packages, whatever environment variables have been marked
    as relevant, and the command.

    site_hash (str): Hash of the installed packages.
    env_hash (str): Hash of the relevant environment variables.
    deps (List[Path]): Dependency paths; checksummed in sorted order.
    cmd (List[str]): The command's script steps.
    RETURNS (str): An MD5 hex digest over all of the above.
    """
    check_spacy_env_vars()
    # Sort deps so the hash is independent of declaration order.
    hash_parts = [site_hash, env_hash]
    for dep in sorted(deps):
        hash_parts.append(get_checksum(dep))
    hash_parts.extend(cmd)
    return hashlib.md5("".join(hash_parts).encode("utf8")).hexdigest()
171 |
172 |
def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.

    RETURNS (str): MD5 hex digest over the sorted dist-info package names.
    """
    site_dirs = site.getsitepackages()
    if site.ENABLE_USER_SITE:
        # Fixed: getusersitepackages() returns a single path *string*, so it
        # must be appended — extend() would add one character per entry.
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        # Skip configured site dirs that don't exist on disk (e.g. an
        # uncreated user site directory) instead of crashing on iterdir().
        if not site_dir.exists():
            continue
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    # Fixed: hashlib has no "md5sum" attribute; the constructor is md5().
    return hashlib.md5(package_bytes).hexdigest()
189 |
190 |
def get_env_hash(env: Dict[str, str]) -> str:
    """Construct a hash of the environment variables that will be passed into
    the commands.

    Values in the env dict may be references to the current os.environ, using
    the syntax $ENV_VAR to mean os.environ[ENV_VAR]

    env (Dict[str, str]): The configured environment variable mapping.
    RETURNS (str): A hash of the resolved mapping.
    """
    resolved = {
        # "$NAME" means: substitute the current value of os.environ["NAME"]
        # (empty string if unset); anything else is taken literally.
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }
    return get_hash(resolved)
205 |
--------------------------------------------------------------------------------
/weasel/cli/run.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 | from typing import Any, Dict, Iterable, List, Optional, Sequence
4 |
5 | import srsly
6 | import typer
7 | from wasabi import msg
8 | from wasabi.util import locale_escape
9 |
10 | from ..util import SimpleFrozenDict, SimpleFrozenList, check_spacy_env_vars
11 | from ..util import get_checksum, get_hash, is_cwd, join_command, load_project_config
12 | from ..util import parse_config_overrides, run_command, split_command, working_dir
13 | from .main import COMMAND, PROJECT_FILE, PROJECT_LOCK, Arg, Opt, _get_parent_command
14 | from .main import app
15 |
16 |
@app.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#rocket-run
    """
    parent_command = _get_parent_command(ctx)
    if show_help or not subcommand:
        # --help, or no command given: show the simulated project help instead.
        print_run_help(project_dir, subcommand, parent_command)
        return
    # Any extra, unrecognized CLI arguments are treated as config overrides.
    project_run(
        project_dir,
        subcommand,
        overrides=parse_config_overrides(ctx.args),
        force=force,
        dry=dry,
        parent_command=parent_command,
    )
50 |
51 |
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
    parent_command: str = COMMAND,
) -> None:
    """Run a named command or workflow defined in the project.yml. For a
    workflow, each of its commands is run in order by calling this function
    recursively. For a single command, its dependencies are checked first and
    the command is skipped (unless forced) when the lockfile shows that
    nothing changed since the last run.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): No longer used, deprecated.
    parent_command (str): Accepted for API compatibility; not referenced in
        this body (see NOTE on the recursive call below).
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            # NOTE(review): parent_command is not forwarded here, so nested
            # commands fall back to the default COMMAND — confirm intended.
            project_run(
                project_dir,
                cmd,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
            )
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                # On a dry run, report the missing dependency but don't exit.
                err_exits = 1 if not dry else None
                msg.fail(err, err_help, exits=err_exits)
        check_spacy_env_vars()
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            # Skip the command if its recorded state matches the current one.
            rerun = check_rerun(current_dir, cmd)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)
114 |
115 |
def print_run_help(
    project_dir: Path, subcommand: Optional[str] = None, parent_command: str = COMMAND
) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    parent_command (str): The command prefix shown in the usage strings.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    # Omit the directory from usage strings when it's the current working dir.
    project_loc = "" if is_cwd(project_dir) else project_dir
    if not subcommand:
        # Top-level help: project title plus command and workflow tables.
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {parent_command} run [COMMAND] {project_loc}")
            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {parent_command} run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
        return
    # Subcommand help: either a single command's help text or the workflow steps.
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    print(f"Usage: {parent_command} run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = [
            (f"{i + 1}. {step}", commands[step].get("help", ""))
            for i, step in enumerate(steps)
        ]
        msg.table(steps_data)
        help_cmd = f"{parent_command} run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
161 |
162 |
def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (Iterable[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    for raw_command in commands:
        parts = split_command(raw_command)
        # Rewrite "python"/"pip" invocations so they always use the same
        # interpreter/environment that Weasel itself runs under. This also
        # smooths over systems where commands are written as "python3"/"pip3".
        if parts:
            executable = parts[0]
            if executable in ("python", "python3"):
                parts[0] = sys.executable
            elif executable in ("pip", "pip3"):
                parts = [sys.executable, "-m", "pip", *parts[1:]]
        if not silent:
            print(f"Running command: {join_command(parts)}")
        if not dry:
            run_command(parts, capture=capture)
197 |
198 |
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand to check.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    # Unknown name: assemble the most helpful hint we can before failing.
    help_msg = []
    if subcommand in ["assets", "asset"]:
        help_msg.append("Did you mean to run: python -m weasel assets?")
    if commands:
        help_msg.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        help_msg.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(help_msg),
        exits=1,
    )
222 |
223 |
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    # "no_skip" commands are always rerun.
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    # No lockfile yet: nothing has ever been recorded, so run.
    if not lock_path.exists():
        return True
    data = srsly.read_yaml(lock_path)
    name = command["name"]
    # No recorded state for this particular command.
    if name not in data:
        return True
    entry = data[name]
    # Commands without recorded outputs would otherwise be skipped forever.
    if not entry.get("outs", []):
        return True
    # Rerun only when the entry that would be generated now differs from the
    # recorded one, i.e. some input, output, hash or script step changed.
    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
254 |
255 |
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        # Write an empty lockfile first so the file exists even if the entry
        # generation below fails.
        srsly.write_yaml(lock_path, {})
        data = {}
    data[command["name"]] = get_lock_entry(project_dir, command)
    srsly.write_yaml(lock_path, data)
272 |
273 |
def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], *, parent_command: str = COMMAND
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    parent_command (str): The command prefix recorded in the "cmd" field.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    entry: Dict[str, Any] = {
        "cmd": f"{parent_command} run {command['name']}",
        "script": command["script"],
        "deps": get_fileinfo(project_dir, command.get("deps", [])),
    }
    # Cached and no-cache outputs are recorded together under "outs".
    outs = get_fileinfo(project_dir, command.get("outputs", []))
    outs += get_fileinfo(project_dir, command.get("outputs_no_cache", []))
    entry["outs"] = outs
    return entry
295 |
296 |
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    RETURNS (List[Dict[str, Optional[str]]]): One lockfile entry per path.
    """

    def _info(rel_path: str) -> Dict[str, Optional[str]]:
        # Missing files get a null checksum instead of raising, so entries
        # can be recorded before outputs exist.
        abs_path = project_dir / rel_path
        md5 = get_checksum(abs_path) if abs_path.exists() else None
        return {"path": rel_path, "md5": md5}

    return [_info(path) for path in paths]
311 |
--------------------------------------------------------------------------------
/weasel/compat.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
# Platform flags, computed once at import time.
is_windows = sys.platform.startswith("win")  # any Windows variant, e.g. "win32"
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"  # macOS
6 |
--------------------------------------------------------------------------------
/weasel/errors.py:
--------------------------------------------------------------------------------
class ErrorsWithCodes(type):
    """Metaclass that prefixes class-attribute messages with their code,
    e.g. accessing ``Errors.E001`` returns ``"[E001] <message>"``.
    """

    def __getattribute__(self, code):
        msg = super().__getattribute__(code)
        # Leave python system attributes like __class__ untouched.
        if code.startswith("__"):
            return msg
        return f"[{code}] {msg}"
8 |
9 |
class Warnings(metaclass=ErrorsWithCodes):
    """Warning message templates; the metaclass prefixes each with its code."""

    # File system
    W801 = "Could not clean/remove the temp directory at {dir}: {msg}."
    # Remote storage
    W802 = (
        "Remote storage is not yet supported for Python 3.12 with "
        "cloudpathlib. Please use Python 3.11 or earlier for remote storage."
    )
17 |
18 |
class Errors(metaclass=ErrorsWithCodes):
    """Error message templates; the metaclass prefixes each with its code."""

    # API - Datastructure
    E001 = (
        "Can't write to frozen dictionary. This is likely an internal "
        "error. Are you writing to a default function argument?"
    )
    E002 = (
        "Can't write to frozen list. Maybe you're trying to modify a computed "
        "property or default function argument?"
    )

    # Workflow
    E501 = "Can not execute command '{str_command}'. Do you have '{tool}' installed?"

    # File system
    E801 = "The tar file pulled from the remote attempted an unsafe path " "traversal."
35 |
--------------------------------------------------------------------------------
/weasel/schemas.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from typing import Any, Dict, List, Optional, Type, Union
3 |
4 | try:
5 | from pydantic.v1 import BaseModel, Field, StrictStr, ValidationError, root_validator
6 | except ImportError:
7 | from pydantic import BaseModel, Field, StrictStr, ValidationError, root_validator # type: ignore
8 |
9 | from wasabi import msg
10 |
11 |
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
    """Validate data against a given pydantic schema.

    obj (Dict[str, Any]): JSON-serializable data to validate.
    schema (pydantic.BaseModel): The schema to validate against.
    RETURNS (List[str]): A list of error messages, if available.
    """
    try:
        schema(**obj)
    except ValidationError as e:
        # Group messages by their "->"-joined location so multiple problems
        # on the same field are reported as a single entry.
        data = defaultdict(list)
        for error in e.errors():
            err_loc = " -> ".join(str(p) for p in error.get("loc", []))
            data[err_loc].append(error.get("msg"))
        return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]  # type: ignore[arg-type]
    return []
29 |
30 |
31 | # Project config Schema
32 |
33 |
class ProjectConfigAssetGitItem(BaseModel):
    # Git source information for an asset fetched via sparse checkout.
    # fmt: off
    repo: StrictStr = Field(..., title="URL of Git repo to download from")
    path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
    branch: StrictStr = Field("master", title="Branch to clone from")
    # fmt: on
40 |
41 |
class ProjectConfigAssetURL(BaseModel):
    # An asset downloaded from a plain URL.
    # fmt: off
    dest: StrictStr = Field(..., title="Destination of downloaded asset")
    url: Optional[StrictStr] = Field(None, title="URL of asset")
    # The regex accepts exactly 32 hex characters (an MD5 digest).
    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    description: StrictStr = Field("", title="Description of asset")
    # fmt: on
49 |
50 |
class ProjectConfigAssetGit(BaseModel):
    # An asset checked out from a Git repository.
    # fmt: off
    git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
    # The regex accepts exactly 32 hex characters (an MD5 digest).
    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    description: Optional[StrictStr] = Field(None, title="Description of asset")
    # fmt: on
57 |
58 |
class ProjectConfigCommand(BaseModel):
    # A single named command specified in a project config.
    # fmt: off
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
    # fmt: on

    class Config:
        title = "A single named command specified in a project config"
        # Unknown keys on a command are reported as validation errors.
        extra = "forbid"
73 |
74 |
class ProjectConfigSchema(BaseModel):
    # Top-level schema for a project.yml configuration file. Extra top-level
    # keys (e.g. "directories") are not forbidden here.
    # fmt: off
    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    # Fixed typo in title: "shortucts" -> "shortcuts".
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    title: Optional[str] = Field(None, title="Project title")
    # fmt: on

    class Config:
        title = "Schema for project configuration file"

    @root_validator(pre=True)
    def check_legacy_keys(cls, obj: Dict[str, Any]) -> Dict[str, Any]:
        # Warn about spaCy-projects-era keys that Weasel ignores; the raw
        # input dict is returned unchanged.
        if "spacy_version" in obj:
            msg.warn(
                "Your project configuration file includes a `spacy_version` key, "
                "which is now deprecated. Weasel will not validate your version of spaCy.",
            )
        if "check_requirements" in obj:
            msg.warn(
                "Your project configuration file includes a `check_requirements` key, "
                "which is now deprecated. Weasel will not validate your requirements.",
            )
        return obj
101 |
--------------------------------------------------------------------------------
/weasel/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/weasel/tests/__init__.py
--------------------------------------------------------------------------------
/weasel/tests/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/weasel/tests/cli/__init__.py
--------------------------------------------------------------------------------
/weasel/tests/cli/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import pytest
5 | import srsly
6 |
7 | from weasel.cli.remote_storage import RemoteStorage
8 | from weasel.schemas import ProjectConfigSchema, validate
9 | from weasel.util import git_checkout, is_subpath_of, load_project_config, make_tempdir
10 | from weasel.util import validate_project_commands
11 |
12 |
def test_issue11235():
    """
    Test that the cli handles interpolation in the directory names correctly when loading project config.
    """
    lang_code = "en"
    project = {
        "commands": [{"name": "x", "script": ["hello ${vars.lang}"]}],
        "vars": {"lang": lang_code},
        "directories": ["cfg", "${vars.lang}_model"],
    }
    with make_tempdir() as tmp:
        srsly.write_yaml(tmp / "project.yml", project)
        cfg = load_project_config(tmp)
        # Both the literal and the interpolated directory should now exist.
        assert os.path.exists(tmp / "cfg")
        assert os.path.exists(tmp / f"{lang_code}_model")
        # The variable should also be substituted inside command scripts.
        assert cfg["commands"][0]["script"][0] == f"hello {lang_code}"
29 |
30 |
def test_project_config_validation_full():
    """A fully-featured project config should validate without errors."""
    # NOTE(review): duplicated in weasel/tests/test_validation.py — consider
    # keeping only one copy.
    config = {
        "vars": {"some_var": 20},
        "directories": ["assets", "configs", "corpus", "scripts", "training"],
        "assets": [
            {
                "dest": "x",
                "extra": True,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
            {
                "dest": "y",
                "git": {
                    "repo": "https://github.com/example/repo",
                    "branch": "develop",
                    "path": "y",
                },
            },
            {
                "dest": "z",
                "extra": False,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
        ],
        "commands": [
            {
                "name": "train",
                "help": "Train a model",
                "script": ["python -m spacy train config.cfg -o training"],
                # NOTE(review): "training.spcy" looks like a typo for
                # ".spacy", but it is inert test data here.
                "deps": ["config.cfg", "corpus/training.spcy"],
                "outputs": ["training/model-best"],
            },
            {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
        ],
        "workflows": {"all": ["train", "test"], "train": ["train"]},
    }
    errors = validate(ProjectConfigSchema, config)
    assert not errors
71 |
72 |
@pytest.mark.parametrize(
    "config",
    [
        # Duplicate command names.
        {"commands": [{"name": "a"}, {"name": "a"}]},
        # Workflow name clashes with a command name.
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        # Workflow references an undefined command.
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    """Invalid command/workflow combinations cause the CLI to exit."""
    with pytest.raises(SystemExit):
        validate_project_commands(config)
84 |
85 |
@pytest.mark.parametrize(
    "config,n_errors",
    [
        # "commands" must be a list, not a mapping.
        ({"commands": {"a": []}}, 1),
        # Each command requires a "name".
        ({"commands": [{"help": "..."}]}, 1),
        # Unknown keys on a command are forbidden (extra = "forbid").
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        # Missing "name" plus an unknown key: two errors.
        ({"commands": [{"extra": "b"}]}, 2),
        # "deps" entries must be strings.
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    """Schema validation returns one message per distinct error."""
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
99 |
100 |
@pytest.mark.parametrize(
    "parent,child,expected",
    [
        ("/tmp", "/tmp", True),
        ("/tmp", "/", False),
        ("/tmp", "/tmp/subdir", True),
        # A sibling sharing the name prefix is NOT a subpath.
        ("/tmp", "/tmpdir", False),
        # ".." segments are resolved before the containment check.
        ("/tmp", "/tmp/subdir/..", True),
        ("/tmp", "/tmp/..", False),
    ],
)
def test_is_subpath_of(parent, child, expected):
    # NOTE(review): duplicated in weasel/tests/test_validation.py — consider
    # keeping only one copy.
    assert is_subpath_of(parent, child) == expected
114 |
115 |
def test_local_remote_storage():
    """Push/pull round-trips through a local directory acting as a remote."""
    with make_tempdir() as d:
        filename = "a.txt"

        # Write three successive versions of the same local file; the sleep
        # guarantees strictly increasing timestamps between versions.
        content_hashes = ("aaaa", "cccc", "bbbb")
        for i, content_hash in enumerate(content_hashes):
            # make sure that each subsequent file has a later timestamp
            if i > 0:
                time.sleep(1)
            content = f"{content_hash} content"
            loc_file = d / "root" / filename
            if not loc_file.parent.exists():
                loc_file.parent.mkdir(parents=True)
            with loc_file.open(mode="w") as file_:
                file_.write(content)

        # Push the most recently written version to remote storage.
        # (`loc_file`, `content` and `content_hash` still hold the loop's
        # final values — the original "push first version" comment was wrong.)
        remote = RemoteStorage(d / "root", str(d / "remote"))
        remote.push(filename, "aaaa", content_hash)

        # retrieve with full hashes
        loc_file.unlink()
        remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content

        # retrieve with command hash
        loc_file.unlink()
        remote.pull(filename, command_hash="aaaa")
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content

        # retrieve with content hash
        loc_file.unlink()
        remote.pull(filename, content_hash=content_hash)
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content

        # retrieve with no hashes
        loc_file.unlink()
        remote.pull(filename)
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content
159 |
160 |
def test_local_remote_storage_pull_missing():
    """Pulling from a non-existent remote should be a graceful no-op."""
    with make_tempdir() as d:
        remote = RemoteStorage(d / "root", str(d / "remote"))
        # Neither a scoped nor an unscoped pull should raise; both return None.
        assert remote.pull("a.txt", command_hash="aaaa") is None
        assert remote.pull("a.txt") is None
168 |
169 |
def test_project_git_dir_asset():
    """Checking out a directory from a git repo creates that directory."""
    with make_tempdir() as tmp:
        dest = tmp / "signpost"
        # os-signpost is used because it is a very small repository.
        git_checkout(
            "https://github.com/explosion/os-signpost.git",
            "os_signpost",
            dest,
            branch="v0.0.3",
        )
        assert os.path.isdir(dest)
180 |
181 |
@pytest.mark.issue(66)
def test_project_git_file_asset():
    """Checking out a single file from a git repo creates that file."""
    with make_tempdir() as tmp:
        dest = tmp / "readme.md"
        # os-signpost is used because it is a very small repository.
        git_checkout(
            "https://github.com/explosion/os-signpost.git",
            "README.md",
            dest,
            branch="v0.0.3",
        )
        assert os.path.isfile(dest)
193 |
--------------------------------------------------------------------------------
/weasel/tests/cli/test_cli_app.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Dict
3 |
4 | import pytest
5 | import srsly
6 | from typer.testing import CliRunner
7 |
8 | from weasel import app
9 | from weasel.cli.main import HELP
10 | from weasel.util import get_git_version
11 |
12 | runner = CliRunner()
13 |
14 |
@pytest.mark.parametrize("cmd", [None, "--help"])
def test_show_help(cmd):
    """Invoking with no args or --help should print the full help text."""
    cli_args = [] if cmd is None else [cmd]
    result = runner.invoke(app, cli_args or None)
    # Each line of the HELP string must appear in the output.
    for help_line in HELP.splitlines():
        assert help_line in result.stdout
21 |
22 |
def has_git():
    """Return True if a usable git executable is available."""
    try:
        get_git_version()
    except RuntimeError:
        # get_git_version raises RuntimeError when git is missing/broken.
        return False
    return True
29 |
30 |
# Minimal but complete project config used by the CLI tests below.
SAMPLE_PROJECT: Dict[str, Any] = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            "dest": "assets/weasel-readme.md",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/README.md",
            "checksum": "65f4c426a9b153b7683738c92d0d20f9",
        },
        {
            # Only downloaded when `weasel assets --extra` is used.
            "dest": "assets/pyproject.toml",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/pyproject.toml",
            "checksum": "1e2da3a3030d6611520952d5322cd94e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            "name": "create",
            "help": "make a file",
            "script": ["touch abc.txt"],
            "outputs": ["abc.txt"],
        },
        {
            "name": "clean",
            "help": "remove test file",
            "script": ["rm abc.txt"],
        },
    ],
}

# Pre-rendered YAML of the sample project, written to project.yml by fixtures.
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
68 |
69 |
@pytest.fixture
def project_dir(tmp_path: Path):
    """Create a project directory containing the sample project.yml."""
    project_path = tmp_path / "project"
    project_path.mkdir()
    (project_path / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
    yield project_path
76 |
77 |
def test_project_document(project_dir: Path):
    """`document` should render a README containing the project description."""
    readme_path = project_dir / "README.md"
    assert not readme_path.exists(), "README already exists"
    args = ["document", str(project_dir), "-o", str(readme_path)]
    result = CliRunner().invoke(app, args)
    assert result.exit_code == 0
    assert readme_path.is_file()
    content = readme_path.read_text("utf-8")
    assert SAMPLE_PROJECT["description"] in content
88 |
89 |
def test_project_assets(project_dir: Path):
    """Asset download, with and without the --extra flag."""
    asset_dir = project_dir / "assets"
    assert not asset_dir.exists(), "Assets dir is already present"
    result = CliRunner().invoke(app, ["assets", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "weasel-readme.md").is_file(), "Assets not downloaded"
    # check that extras work
    result = CliRunner().invoke(app, ["assets", "--extra", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "pyproject.toml").is_file(), "Extras not downloaded"
100 |
101 |
def test_project_run(project_dir: Path):
    """`run` respects --dry, creates declared outputs, and forwards stdout."""
    test_file = project_dir / "abc.txt"
    # A dry run must not actually create the output file.
    dry = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert dry.exit_code == 0
    assert not test_file.is_file()
    # A real run creates it.
    real = CliRunner().invoke(app, ["run", "create", str(project_dir)])
    assert real.exit_code == 0
    assert test_file.is_file()
    # Script output is forwarded to stdout.
    ok = CliRunner().invoke(app, ["run", "ok", str(project_dir)])
    assert ok.exit_code == 0
    assert "okokok" in ok.stdout
114 |
115 |
def test_check_spacy_env_vars(project_dir: Path, monkeypatch: pytest.MonkeyPatch):
    """Warnings about legacy SPACY_* env vars appear only when they are set."""
    # (Removed a dead `project_dir / "abc.txt"` expression statement and the
    # stale "make sure dry run works" comment copied from test_project_run.)
    result = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert result.exit_code == 0
    assert (
        "You've set a `SPACY_CONFIG_OVERRIDES` environment variable"
        not in result.output
    )
    assert (
        "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable"
        not in result.output
    )

    # Setting the legacy spaCy env vars should trigger the warnings.
    monkeypatch.setenv("SPACY_CONFIG_OVERRIDES", "test")
    monkeypatch.setenv("SPACY_PROJECT_USE_GIT_VERSION", "false")

    result = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert result.exit_code == 0

    assert "You've set a `SPACY_CONFIG_OVERRIDES` environment variable" in result.output
    assert (
        "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable"
        in result.output
    )
142 |
143 |
@pytest.mark.skipif(not has_git(), reason="git not installed")
@pytest.mark.parametrize(
    "options_string",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(tmp_path: Path, options_string: str):
    """Clone a template project with various repo/branch option combinations."""
    out = tmp_path / "project_clone"
    options = options_string.split() if options_string else []
    result = CliRunner().invoke(
        app, ["clone", "benchmarks/ner_conll03", *options, str(out)]
    )
    assert result.exit_code == 0
    # The CLI prints a hint about fetching assets after cloning.
    assert "weasel assets" in result.output
    assert (out / "README.md").is_file()
165 |
166 |
def test_project_push_pull(tmp_path: Path, project_dir: Path):
    """Round-trip a command output through a local remote via push and pull."""
    remote_name = "xyz"
    remote_dir = tmp_path / "remote"
    remote_dir.mkdir()

    # Register the local directory as a remote in the project config.
    proj = dict(SAMPLE_PROJECT)
    proj["remotes"] = {remote_name: str(remote_dir)}
    (project_dir / "project.yml").write_text(srsly.yaml_dumps(proj))

    test_file = project_dir / "abc.txt"
    result = CliRunner().invoke(app, ["run", "create", str(project_dir)])
    assert result.exit_code == 0
    assert test_file.is_file()
    result = CliRunner().invoke(app, ["push", remote_name, str(project_dir)])
    assert result.exit_code == 0
    result = CliRunner().invoke(app, ["run", "clean", str(project_dir)])
    assert result.exit_code == 0
    assert not test_file.exists()
    # Pull should restore the deleted output from the remote.
    result = CliRunner().invoke(app, ["pull", remote_name, str(project_dir)])
    assert result.exit_code == 0
    assert test_file.is_file()
190 |
--------------------------------------------------------------------------------
/weasel/tests/cli/test_document.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Dict
3 |
4 | import pytest
5 | import srsly
6 | from typer.testing import CliRunner
7 |
8 | from weasel import app
9 | from weasel.cli.document import MARKER_END, MARKER_IGNORE, MARKER_START, MARKER_TAGS
10 |
11 | runner = CliRunner()
12 |
# NOTE(review): this config duplicates SAMPLE_PROJECT in test_cli_app.py —
# consider sharing a single constant between the two test modules.
SAMPLE_PROJECT: Dict[str, Any] = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            "dest": "assets/weasel-readme.md",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/README.md",
            "checksum": "65f4c426a9b153b7683738c92d0d20f9",
        },
        {
            # Only downloaded when `weasel assets --extra` is used.
            "dest": "assets/pyproject.toml",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/pyproject.toml",
            "checksum": "1e2da3a3030d6611520952d5322cd94e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            "name": "create",
            "help": "make a file",
            "script": ["touch abc.txt"],
            "outputs": ["abc.txt"],
        },
        {
            "name": "clean",
            "help": "remove test file",
            "script": ["rm abc.txt"],
        },
    ],
}
48 |
49 |
@pytest.fixture(scope="function")
def project_yaml_file(
    tmp_path_factory: pytest.TempPathFactory,
):
    """Write the sample project config to a fresh temp dir; return its path."""
    config_path = tmp_path_factory.mktemp("project") / "project.yml"
    config_path.write_text(srsly.yaml_dumps(SAMPLE_PROJECT))
    return config_path
58 |
59 |
def test_create_docs(project_yaml_file: Path):
    """`document` output should include the project title from the config."""
    result = runner.invoke(app, ["document", str(project_yaml_file.parent)])
    assert result.exit_code == 0
    conf_data = srsly.read_yaml(project_yaml_file)
    assert conf_data["title"] in result.stdout
65 |
66 |
def test_raise_error_no_config():
    """`document` without a project config in cwd should exit with code 1."""
    assert runner.invoke(app, ["document"]).exit_code == 1
70 |
71 |
@pytest.mark.parametrize("marker", MARKER_TAGS)
def test_markers(tmp_path_factory: pytest.TempPathFactory, marker: str):
    """Weasel should be able to handle both 'SPACY PROJECT' and 'WEASEL' markers."""
    project: Dict[str, Any] = {
        "title": "Awesome project",
        "description": "Project using spacy projects and gets migrated to weasel.",
    }
    additional_text = (
        "\n\n## Some additional information\n\nHere is some additional information about this project "
        "that is not autogenerated from the [`project.yml`](project.yml)."
    )

    # Create project file.
    test_dir = tmp_path_factory.mktemp("project")
    path = test_dir / "project.yml"
    path.write_text(srsly.yaml_dumps(project))

    # Store readme with additional information.
    # runner.invoke(app, ["document", str(path.parent), "--output", test_dir / "readme.md"])
    with open(test_dir / "readme.md", "w+", encoding="utf-8") as file:
        readme = runner.invoke(app, ["document", str(path.parent)]).output
        # Rewrite the generated WEASEL markers to the marker under test, to
        # simulate a readme produced with the other tag variant.
        for to_replace in (MARKER_START, MARKER_END, MARKER_IGNORE):
            readme = readme.replace(
                to_replace.format(tag="WEASEL"), to_replace.format(tag=marker)
            )
        file.writelines(readme)
        file.writelines(additional_text)

    # Run `document` again on existing readme file. Ensure additional information is still there.
    runner.invoke(
        app, ["document", str(path.parent), "--output", str(test_dir / "readme.md")]
    )
    with open(test_dir / "readme.md", "r", encoding="utf-8") as file:
        assert additional_text in "".join(file.readlines())
106 |
--------------------------------------------------------------------------------
/weasel/tests/cli/test_remote.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 | from typer.testing import CliRunner
5 |
6 | from weasel import app
7 |
8 | from .test_cli_app import has_git
9 |
10 | runner = CliRunner()
11 |
12 |
@pytest.fixture
def project_dir(tmp_path_factory: pytest.TempPathFactory):
    """Return a (not yet created) project path inside a fresh temp dir."""
    return tmp_path_factory.mktemp("project") / "project"
18 |
19 |
@pytest.fixture
def remote_url(tmp_path_factory: pytest.TempPathFactory):
    """Return a local path that stands in for a remote storage location."""
    return tmp_path_factory.mktemp("remote") / "remote"
25 |
26 |
@pytest.fixture
def clone(project_dir: Path):
    """Clone the demo project from the main branch; cloning shouldn't fail."""
    cli_args = [
        "clone",
        "--repo",
        "https://github.com/explosion/weasel",
        "--branch",
        "main",
        "weasel/tests/demo_project",
        str(project_dir),
    ]
    result = runner.invoke(app, cli_args)
    assert result.exit_code == 0
    assert (project_dir / "project.yml").exists()
47 |
48 |
@pytest.fixture(autouse=True)
def assets(clone, project_dir: Path):
    """Fetch the demo project's assets after cloning (runs for every test)."""
    result = runner.invoke(app, ["assets", str(project_dir)])

    # Print output so failures in the asset download are easier to diagnose.
    print(result.stdout)
    assert result.exit_code == 0
    assert (project_dir / "assets/README.md").exists()
56 |
57 |
@pytest.mark.skipif(not has_git(), reason="git not installed")
def test_remote(project_dir: Path, remote_url: Path):
    """End-to-end push/pull of a command output against a local remote."""
    result = runner.invoke(app, ["assets", str(project_dir)])
    assert result.exit_code == 0
    assert (project_dir / "assets/README.md").exists()

    # `prep` writes corpus/stuff.txt (see demo_project/project.yml).
    result = runner.invoke(app, ["run", "prep", str(project_dir)])
    assert result.exit_code == 0

    # append remote to the file
    with open(project_dir / "project.yml", "a") as project_file:
        project_file.write(f"\nremotes:\n  default: {remote_url}\n")

    result = runner.invoke(app, ["push", "default", str(project_dir)])
    assert result.exit_code == 0

    # delete a file, and make sure pull restores it
    (project_dir / "corpus/stuff.txt").unlink()

    result = runner.invoke(app, ["pull", "default", str(project_dir)])
    assert result.exit_code == 0
    assert (project_dir / "corpus/stuff.txt").exists()
80 |
--------------------------------------------------------------------------------
/weasel/tests/demo_project/project.yml:
--------------------------------------------------------------------------------
1 | title: Weasel demo project (for tests)
2 | description: |
3 | This project is a minimal demo for the Weasel tests.
4 |
5 | directories: [assets, corpus, scripts]
6 |
7 | assets:
8 | - dest: assets/README.md
9 | url: https://raw.githubusercontent.com/explosion/weasel/main/README.md
10 |
11 | commands:
12 | - name: prep
13 | help: Make a file to test with push/pull
14 | script:
15 | - python scripts/check.py
16 | outputs:
17 | - corpus/stuff.txt
18 |
--------------------------------------------------------------------------------
/weasel/tests/demo_project/scripts/check.py:
--------------------------------------------------------------------------------
import pathlib

# The demo project root is the parent of this scripts/ directory.
workdir = pathlib.Path(__file__).parent.resolve().parent

# ASCII-art banner, written verbatim into the output file below.
text = """
 _
 __ _____ __ _ ___ ___| |
 \\ \\ /\\ / / _ \\/ _` / __|/ _ \\ |
 \\ V V / __/ (_| \\__ \\ __/ |
 \\_/\\_/ \\___|\\__,_|___/\\___|_|

"""

# project.yml declares corpus/stuff.txt as the output of the `prep` command.
with open(workdir / "corpus/stuff.txt", "w") as outfile:
    outfile.write(text)
16 |
--------------------------------------------------------------------------------
/weasel/tests/test_schemas.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 | import srsly
5 | from typer.testing import CliRunner
6 |
7 | from weasel import app
8 |
9 | EXAMPLES = [
10 | (dict(title="Test"), False),
11 | (dict(title="Test", spacy_version=""), True),
12 | (dict(title="Test", spacy_version="3.4.1"), True),
13 | ]
14 |
15 |
16 | @pytest.fixture
17 | def project_dir(tmp_path: Path):
18 | path = tmp_path / "project"
19 | path.mkdir()
20 | yield path
21 |
22 |
23 | @pytest.mark.parametrize("conf,should_warn", EXAMPLES)
24 | def test_project_document(project_dir: Path, conf, should_warn):
25 | config = srsly.yaml_dumps(conf)
26 |
27 | (project_dir / "project.yml").write_text(config)
28 |
29 | result = CliRunner().invoke(app, ["document", str(project_dir)])
30 | assert result.exit_code == 0
31 | assert (
32 | "Your project configuration file includes a `spacy_version` key, "
33 | in result.output
34 | ) is should_warn
35 |
--------------------------------------------------------------------------------
/weasel/tests/test_validation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import srsly
3 | from confection import ConfigValidationError
4 |
5 | from weasel.schemas import ProjectConfigSchema, validate
6 | from weasel.util import is_subpath_of, load_project_config, make_tempdir
7 | from weasel.util import substitute_project_variables, validate_project_commands
8 |
9 |
@pytest.mark.parametrize(
    "parent,child,expected",
    [
        ("/tmp", "/tmp", True),
        ("/tmp", "/", False),
        ("/tmp", "/tmp/subdir", True),
        # A sibling sharing the name prefix is NOT a subpath.
        ("/tmp", "/tmpdir", False),
        # ".." segments are resolved before the containment check.
        ("/tmp", "/tmp/subdir/..", True),
        ("/tmp", "/tmp/..", False),
    ],
)
def test_is_subpath_of(parent, child, expected):
    # NOTE(review): duplicated in weasel/tests/cli/test_cli.py — consider
    # keeping only one copy.
    assert is_subpath_of(parent, child) == expected
23 |
24 |
def test_project_config_validation_full():
    """A fully-featured project config should validate without errors."""
    # NOTE(review): duplicated in weasel/tests/cli/test_cli.py — consider
    # keeping only one copy.
    config = {
        "vars": {"some_var": 20},
        "directories": ["assets", "configs", "corpus", "scripts", "training"],
        "assets": [
            {
                "dest": "x",
                "extra": True,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
            {
                "dest": "y",
                "git": {
                    "repo": "https://github.com/example/repo",
                    "branch": "develop",
                    "path": "y",
                },
            },
            {
                "dest": "z",
                "extra": False,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
        ],
        "commands": [
            {
                "name": "train",
                "help": "Train a model",
                "script": ["python -m spacy train config.cfg -o training"],
                "deps": ["config.cfg", "corpus/training.spcy"],
                "outputs": ["training/model-best"],
            },
            {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
        ],
        "workflows": {"all": ["train", "test"], "train": ["train"]},
    }
    errors = validate(ProjectConfigSchema, config)
    assert not errors
65 |
66 |
@pytest.mark.parametrize(
    "config",
    [
        # Duplicate command names.
        {"commands": [{"name": "a"}, {"name": "a"}]},
        # Workflow name clashes with a command name.
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        # Workflow references an undefined command.
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    """Invalid command/workflow combinations cause the CLI to exit."""
    with pytest.raises(SystemExit):
        validate_project_commands(config)
78 |
79 |
@pytest.mark.parametrize(
    "config,n_errors",
    [
        # "commands" must be a list, not a mapping.
        ({"commands": {"a": []}}, 1),
        # Each command requires a "name".
        ({"commands": [{"help": "..."}]}, 1),
        # Unknown keys on a command are forbidden (extra = "forbid").
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        # Missing "name" plus an unknown key: two errors.
        ({"commands": [{"extra": "b"}]}, 2),
        # "deps" entries must be strings.
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    """Schema validation returns one message per distinct error."""
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
93 |
94 |
@pytest.mark.parametrize(
    "int_value",
    [10, pytest.param("10", marks=pytest.mark.xfail)],
)
def test_project_config_interpolation(int_value):
    """Nested ${vars.*} references are substituted into command scripts."""
    variables = {"a": int_value, "b": {"c": "foo", "d": True}}
    commands = [
        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
    ]
    project = {"commands": commands, "vars": variables}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        # isinstance instead of `type(x) == y` (flake8 E721 anti-pattern).
        assert isinstance(cfg, dict)
        assert isinstance(cfg["commands"], list)
        assert cfg["commands"][0]["script"][0] == "hello 10 foo"
        # The boolean True interpolates as lowercase "true".
        assert cfg["commands"][1]["script"][0] == "foo true"
    # Referencing an undefined variable (${vars.b.e}) must raise.
    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
    project = {"commands": commands, "vars": variables}
    with pytest.raises(ConfigValidationError):
        substitute_project_variables(project)
117 |
118 |
@pytest.mark.parametrize(
    "greeting",
    [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)],
)
def test_project_config_interpolation_override(greeting):
    """CLI-style overrides replace values declared in the vars section."""
    variables = {"a": "world"}
    commands = [
        {"name": "x", "script": ["hello ${vars.a}"]},
    ]
    overrides = {"vars.a": greeting}
    project = {"commands": commands, "vars": variables}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d, overrides=overrides)
        # isinstance instead of `type(x) == y` (flake8 E721 anti-pattern).
        assert isinstance(cfg, dict)
        assert isinstance(cfg["commands"], list)
        assert cfg["commands"][0]["script"][0] == f"hello {greeting}"
136 |
137 |
def test_project_config_interpolation_env(monkeypatch: pytest.MonkeyPatch):
    """${env.*} variables resolve to environment values (empty when unset)."""
    env_var = "SPACY_TEST_FOO"
    project = {
        "commands": [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}],
        "vars": {"a": 10},
        "env": {"foo": env_var},
    }

    # Unset: the env placeholder interpolates to an empty string.
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        assert cfg["commands"][0]["script"][0] == "hello 10 "

    monkeypatch.setenv(env_var, "123")

    # Set: the environment value is substituted.
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        assert cfg["commands"][0]["script"][0] == "hello 10 123"
157 |
--------------------------------------------------------------------------------
/weasel/tests/util.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import re
3 | import tempfile
4 |
5 | import srsly
6 |
7 |
@contextlib.contextmanager
def make_tempfile(mode="r"):
    """Context manager yielding an anonymous temporary file object.

    mode (str): The mode to open the file in.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Close even if the caller's block raised; previously an exception
        # left the handle open until garbage collection.
        f.close()
13 |
14 |
def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal."""
    unpacked1 = srsly.msgpack_loads(b1)
    unpacked2 = srsly.msgpack_loads(b2)
    assert sorted(unpacked1.keys()) == sorted(unpacked2.keys())
    # Compare key/value pairs in a deterministic (sorted) order.
    pairs = zip(sorted(unpacked1.items()), sorted(unpacked2.items()))
    for (key1, val1), (key2, val2) in pairs:
        assert key1 == key2
        assert val1 == val2
23 |
24 |
def normalize_whitespace(s):
    """Collapse each run of whitespace characters into a single space."""
    pattern = re.compile(r"\s+")
    return pattern.sub(" ", s)
27 |
--------------------------------------------------------------------------------
/weasel/util/__init__.py:
--------------------------------------------------------------------------------
1 | from .commands import join_command, run_command, split_command
2 | from .config import load_project_config, parse_config_overrides
3 | from .config import substitute_project_variables
4 | from .environment import ENV_VARS, check_bool_env_var, check_spacy_env_vars
5 | from .filesystem import ensure_path, ensure_pathy, is_cwd, is_subpath_of, make_tempdir
6 | from .filesystem import working_dir
7 | from .frozen import SimpleFrozenDict, SimpleFrozenList
8 | from .git import _http_to_git, get_git_version, git_checkout, git_repo_branch_exists
9 | from .git import git_sparse_checkout
10 | from .hashing import get_checksum, get_hash
11 | from .logging import logger
12 | from .modules import import_file
13 | from .remote import download_file, upload_file
14 | from .validation import validate_project_commands
15 | from .versions import get_minor_version, is_compatible_version, is_minor_version_match
16 |
--------------------------------------------------------------------------------
/weasel/util/commands.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shlex
3 | import subprocess
4 | import sys
5 | from typing import Any, List, Optional, Union
6 |
7 | from ..compat import is_windows
8 | from ..errors import Errors
9 |
10 |
def split_command(command: str) -> List[str]:
    """Split a string command using shlex. Handles platform compatibility.

    command (str): The command to split.
    RETURNS (List[str]): The split command.
    """
    # On Windows, POSIX splitting would mangle backslash paths.
    use_posix = not is_windows
    return shlex.split(command, posix=use_posix)
18 |
19 |
def join_command(command: List[str]) -> str:
    """Quote and join command tokens into one shell-safe string. Kept as a
    manual join for compatibility (shlex.join requires Python 3.8+).

    command (List[str]): The command to join.
    RETURNS (str): The joined command.
    """
    return " ".join(map(shlex.quote, command))
28 |
29 |
def run_command(
    command: Union[str, List[str]],
    *,
    stdin: Optional[Any] = None,
    capture: bool = False,
) -> subprocess.CompletedProcess:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.

    command (str / List[str]): The command. A string is split with shlex.
    stdin (Optional[Any]): stdin to read from or None.
    capture (bool): Whether to capture output/errors. With capture=False the
        child's streams are not redirected and a failure triggers sys.exit
        with the return code (use when handing control to the command). With
        capture=True output is collected and a failure raises a
        SubprocessError carrying the log (use when calling it like a function).
    RETURNS (CompletedProcess): The process object.
    """
    if isinstance(command, str):
        cmd_list, cmd_str = split_command(command), command
    else:
        cmd_list, cmd_str = command, " ".join(command)
    try:
        result = subprocess.run(
            cmd_list,
            env=os.environ.copy(),
            input=stdin,
            encoding="utf8",
            check=False,
            stdout=subprocess.PIPE if capture else None,
            stderr=subprocess.STDOUT if capture else None,
        )
    except FileNotFoundError:
        # The executable itself was not found -- the command never ran.
        raise FileNotFoundError(
            Errors.E501.format(str_command=cmd_str, tool=cmd_list[0])
        ) from None
    if result.returncode != 0:
        if capture:
            message = f"Error running command:\n\n{cmd_str}\n\n"
            message += f"Subprocess exited with status {result.returncode}"
            if result.stdout is not None:
                message += "\n\nProcess log (stdout and stderr):\n\n"
                message += result.stdout
            error = subprocess.SubprocessError(message)
            error.ret = result  # type: ignore[attr-defined]
            error.command = cmd_str  # type: ignore[attr-defined]
            raise error
        sys.exit(result.returncode)
    return result
84 |
--------------------------------------------------------------------------------
/weasel/util/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from pathlib import Path
4 | from typing import Any, Dict, List, Optional
5 |
6 | import srsly
7 | from click import NoSuchOption
8 | from click.parser import split_arg_string
9 | from confection import Config
10 | from wasabi import msg
11 |
12 | from ..cli.main import PROJECT_FILE
13 | from ..schemas import ProjectConfigSchema, validate
14 | from .environment import ENV_VARS
15 | from .frozen import SimpleFrozenDict
16 | from .logging import logger
17 | from .validation import show_validation_error, validate_project_commands
18 |
19 |
def parse_config_overrides(
    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
) -> Dict[str, Any]:
    """Generate a dictionary of config overrides based on the extra arguments
    provided on the CLI, e.g. --training.batch_size to override
    "training.batch_size". Arguments without a "." are considered invalid,
    since the config only allows top-level sections to exist.

    env_var (Optional[str]): Optional environment variable to read from.
    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
    """
    raw_env = os.environ.get(env_var, "") if env_var else ""
    from_env = _parse_overrides(split_arg_string(raw_env))
    from_cli = _parse_overrides(args, is_cli=True)
    if from_cli:
        cli_only = [name for name in from_cli if name not in from_env]
        logger.debug("Config overrides from CLI: %s", cli_only)
    if from_env:
        logger.debug("Config overrides from env variables: %s", list(from_env))
    # On conflict, the env-var overrides win over the CLI ones.
    return {**from_cli, **from_env}
40 |
41 |
def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
    """Parse a flat token list (e.g. ["--x.y", "1", "--x.z=2"]) into a dict.

    args (List[str]): The tokens; consumed destructively via pop(0).
    is_cli (bool): If True, invalid option names raise click's NoSuchOption;
        otherwise the process exits via wasabi.
    RETURNS (Dict[str, Any]): Mapping of dotted setting name to parsed value.
    """
    result = {}
    while args:
        opt = args.pop(0)
        err = f"Invalid config override '{opt}'"
        if opt.startswith("--"):  # new argument
            orig_opt = opt
            # NOTE(review): this removes *every* occurrence of "--", not just
            # the leading one -- confirm that's intended for odd names.
            opt = opt.replace("--", "")
            if "." not in opt:
                # Only dotted names are valid: an override must target a value
                # inside a top-level section, never a section itself.
                if is_cli:
                    raise NoSuchOption(orig_opt)
                else:
                    msg.fail(f"{err}: can't override top-level sections", exits=1)
            if "=" in opt:  # we have --opt=value
                opt, value = opt.split("=", 1)
                # NOTE(review): dash-to-underscore normalization only happens
                # in the "=value" form, not the space-separated form -- verify.
                opt = opt.replace("-", "_")
            else:
                if not args or args[0].startswith("--"):  # flag with no value
                    value = "true"
                else:
                    value = args.pop(0)
            result[opt] = _parse_override(value)
        else:
            msg.fail(f"{err}: name should start with --", exits=1)
    return result
67 |
68 |
def _parse_override(value: Any) -> Any:
    """Parse a single override value the way the config system does: as JSON.
    Since the values come from the CLI, forcing users to escape quotes around
    plain strings would be unintuitive, so anything that fails to parse is
    returned as a string instead.
    # TODO: improve logic to handle simple types like list of strings?
    """
    try:
        parsed = srsly.json_loads(value)
    except ValueError:
        parsed = str(value)
    return parsed
79 |
80 |
def load_project_config(
    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it. Also make
    sure that all directories defined in the config exist.

    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
    overrides (Dict[str, Any]): Optional config overrides.
    RETURNS (Dict[str, Any]): The loaded project.yml.

    Exits the process (wasabi / sys.exit) on a missing file, invalid YAML,
    schema errors, or inconsistent commands/workflows.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    # Schema validation: print every error before exiting, not just the first.
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err)
        print("\n".join(errors))
        sys.exit(1)
    validate_project_commands(config)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            # Substitute ${vars.*} / ${env.*} references and apply overrides.
            config = substitute_project_variables(config, overrides)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    return config
116 |
117 |
def substitute_project_variables(
    config: Dict[str, Any],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    key: str = "vars",
    env_key: str = "env",
) -> Dict[str, Any]:
    """Interpolate variables in the project file using the config system.

    The input config is NOT mutated: the previous version wrote the resolved
    environment values back into the caller's nested dict, so a second call
    (or the caller) would see env-var values instead of env-var names.

    config (Dict[str, Any]): The project config.
    overrides (Dict[str, Any]): Optional config overrides.
    key (str): Key containing variables in project config.
    env_key (str): Key containing environment variable mapping in project config.
    RETURNS (Dict[str, Any]): The interpolated project config.
    """
    variables = dict(config.get(key, {}))
    # Resolve each mapped environment variable to its (JSON-parsed) value;
    # unset variables resolve to the empty string.
    env_values = {
        name: _parse_override(os.environ.get(env_var, ""))
        for name, env_var in dict(config.get(env_key, {})).items()
    }
    project = {**config, key: variables, env_key: env_values}
    # Need to put variables in the top scope again so we can have a top-level
    # section "project" (otherwise, a list of commands in the top scope
    # wouldn't be allowed by the config system).
    cfg = Config({"project": project, key: variables, env_key: env_values})
    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
    interpolated = cfg.interpolate()
    return dict(interpolated["project"])
144 |
--------------------------------------------------------------------------------
/weasel/util/environment.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from wasabi import msg
4 |
5 |
class ENV_VARS:
    """Namespace of environment variable names recognized by Weasel."""

    # Read by parse_config_overrides() for config overrides.
    CONFIG_OVERRIDES = "WEASEL_CONFIG_OVERRIDES"
8 |
9 |
def check_spacy_env_vars():
    """Warn about deprecated spaCy project environment variables that Weasel
    no longer reads. Purely informational; never raises or exits."""
    environ = os.environ
    if "SPACY_CONFIG_OVERRIDES" in environ:
        msg.warn(
            "You've set a `SPACY_CONFIG_OVERRIDES` environment variable, "
            "which is now deprecated. Weasel will not use it. "
            "You can use `WEASEL_CONFIG_OVERRIDES` instead."
        )
    if "SPACY_PROJECT_USE_GIT_VERSION" in environ:
        msg.warn(
            "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable, "
            "which is now deprecated. Weasel will not use it."
        )
22 |
23 |
def check_bool_env_var(env_var: str) -> bool:
    """Convert the value of an environment variable to a boolean. The literal
    string "0" is falsy; any other set value is truthy; unset is False.

    env_var (str): The name of the environment variable to check.
    RETURNS (bool): Its boolean value.
    """
    raw = os.environ.get(env_var, False)
    return raw != "0" and bool(raw)
35 |
--------------------------------------------------------------------------------
/weasel/util/filesystem.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import stat
4 | import sys
5 | import tempfile
6 | import warnings
7 | from contextlib import contextmanager
8 | from pathlib import Path
9 | from typing import Any, Generator, Iterator, Union
10 |
11 | from ..errors import Warnings
12 |
13 |
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
    """Temporarily switch the process working directory, restoring the
    previous one on exit.

    path (str / Path): The directory to navigate to.
    YIELDS (Path): The absolute path of the new working directory. Use it for
        filesystem work inside the block to avoid relative-path mismatches.
    """
    previous = Path.cwd()
    target = Path(path).resolve()
    os.chdir(str(target))
    try:
        yield target
    finally:
        # Restore even when the block raised.
        os.chdir(str(previous))
30 |
31 |
@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block.

    YIELDS (Path): The path of the temp directory.

    Cleanup now runs in a finally block, so the directory is also removed
    when the with-body raises (the original leaked it in that case).
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # On Windows, git clones use read-only files, which cause permission
        # errors when being deleted. This forcibly fixes permissions.
        def force_remove(rmfunc, path, ex):
            os.chmod(path, stat.S_IWRITE)
            rmfunc(path)

        try:
            if sys.version_info >= (3, 12):
                # onerror is deprecated in 3.12+ in favor of onexc.
                shutil.rmtree(str(d), onexc=force_remove)
            else:
                shutil.rmtree(str(d), onerror=force_remove)
        except PermissionError as e:
            warnings.warn(Warnings.W801.format(dir=d, msg=e))
55 |
56 |
def is_cwd(path: Union[Path, str]) -> bool:
    """Check whether a path is the current working directory.

    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    # Lowercased comparison keeps the check case-insensitive (Windows paths).
    resolved = str(Path(path).resolve()).lower()
    return resolved == str(Path.cwd().resolve()).lower()
64 |
65 |
def ensure_path(path: Any) -> Any:
    """Convert a string to a Path; pass any other value through untouched.

    path (Any): Anything. If string, it's converted to Path.
    RETURNS: Path or original argument.
    """
    return Path(path) if isinstance(path, str) else path
76 |
77 |
def ensure_pathy(path):
    """Wrap `path` in cloudpathlib's AnyPath. The import is deliberately local
    to avoid importing cloudpathlib globally (originally added because of a
    slow and annoying Google Cloud warning with Pathy)."""
    from cloudpathlib import AnyPath  # noqa: F811

    return AnyPath(path)
85 |
86 |
def is_subpath_of(parent, child):
    """
    Check whether `child` is a path contained within `parent`.
    """
    # Based on https://stackoverflow.com/a/37095733 .
    # Sticks to os.path (rather than Path.is_relative_to) to keep support
    # for Python < 3.9.
    real_parent = os.path.realpath(parent)
    real_child = os.path.realpath(child)
    return real_parent == os.path.commonpath([real_parent, real_child])
98 |
--------------------------------------------------------------------------------
/weasel/util/frozen.py:
--------------------------------------------------------------------------------
1 | from ..errors import Errors
2 |
3 |
class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or Weasel attempts to add to dict.

    All mutating dict methods are blocked; the original version only blocked
    __setitem__/pop/update, leaving __delitem__, popitem, setdefault and
    clear as mutation loopholes.
    """

    def __init__(self, *args, error: str = Errors.E001, **kwargs) -> None:
        """Initialize the frozen dict. Can be initialized with pre-defined
        values.

        error (str): The error message when user tries to assign to dict.
        """
        super().__init__(*args, **kwargs)
        self.error = error

    def __setitem__(self, key, value):
        raise NotImplementedError(self.error)

    def __delitem__(self, key):
        raise NotImplementedError(self.error)

    def pop(self, key, default=None):
        raise NotImplementedError(self.error)

    def popitem(self):
        raise NotImplementedError(self.error)

    def setdefault(self, key, default=None):
        raise NotImplementedError(self.error)

    def clear(self):
        raise NotImplementedError(self.error)

    def update(self, other):
        raise NotImplementedError(self.error)
27 |
28 |
class SimpleFrozenList(list):
    """Wrapper class around a list that lets us raise custom errors if certain
    attributes/methods are accessed. Mostly used for properties like
    Language.pipeline that return an immutable list (and that we don't want to
    convert to a tuple to not break too much backwards compatibility). If a user
    accidentally calls nlp.pipeline.append(), we can raise a more helpful error.

    All mutating list operations are blocked; the original version left
    __setitem__, __delitem__, += and *= as mutation loopholes.
    """

    def __init__(self, *args, error: str = Errors.E002) -> None:
        """Initialize the frozen list.

        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def __setitem__(self, index, value):
        raise NotImplementedError(self.error)

    def __delitem__(self, index):
        raise NotImplementedError(self.error)

    def __iadd__(self, other):
        raise NotImplementedError(self.error)

    def __imul__(self, other):
        raise NotImplementedError(self.error)

    def append(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def clear(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def extend(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def pop(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def remove(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def reverse(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def sort(self, *args, **kwargs):
        raise NotImplementedError(self.error)
68 |
--------------------------------------------------------------------------------
/weasel/util/git.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | from pathlib import Path
4 | from typing import Tuple
5 |
6 | from wasabi import msg
7 |
8 | from .commands import run_command
9 | from .filesystem import is_subpath_of, make_tempdir
10 |
11 |
def git_checkout(
    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
):
    """Check out `subpath` of `repo` at `branch` into `dest`.

    Uses sparse checkout when requested and supported (Git >= 2.22); otherwise
    falls back to a full clone into a temp dir and copies `subpath` out of it.
    Exits the process (wasabi) on an invalid destination or missing subpath.
    """
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse and git_version >= (2, 22):
        return git_sparse_checkout(repo, subpath, dest, branch)
    elif sparse:
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        err_old = (
            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
            f"that doesn't fully support sparse checkout yet."
        )
        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
        msg.warn(
            f"{err_unk if git_version == (0, 0) else err_old} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )
    # Non-sparse fallback: clone the whole branch, then copy the requested
    # file or directory out of the temporary clone.
    with make_tempdir() as tmp_dir:
        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
        run_command(cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        try:
            source_path = tmp_dir / Path(subpath)
            if not is_subpath_of(tmp_dir, source_path):
                # Reject paths like "../../etc" that would escape the clone.
                err = f"'{subpath}' is a path outside of the cloned repository."
                msg.fail(err, repo, exits=1)
            if os.path.isdir(source_path):
                shutil.copytree(source_path, dest)
            else:
                shutil.copyfile(source_path, dest)
        except FileNotFoundError:
            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
            msg.fail(err, repo, exits=1)
52 |
53 |
def git_sparse_checkout(repo, subpath, dest, branch):
    """Check out only `subpath` of `repo` at `branch` into `dest`, using a
    partial clone (--filter=blob:none) plus a bulk fetch-pack so only the
    needed objects are transferred. Requires Git >= 2.22 (enforced by the
    caller, git_checkout). Exits the process (wasabi) on bad paths.
    """
    # We're using Git, partial clone and sparse checkout to
    # only clone the files we need
    # This ends up being RIDICULOUS. omg.
    # So, every tutorial and SO post talks about 'sparse checkout'...But they
    # go and *clone* the whole repo. Worthless. And cloning part of a repo
    # turns out to be completely broken. The only way to specify a "path" is..
    # a path *on the server*? The contents of which, specifies the paths. Wat.
    # Obviously this is hopelessly broken and insecure, because you can query
    # arbitrary paths on the server! So nobody enables this.
    # What we have to do is disable *all* files. We could then just checkout
    # the path, and it'd "work", but be hopelessly slow...Because it goes and
    # transfers every missing object one-by-one. So the final piece is that we
    # need to use some weird git internals to fetch the missings in bulk, and
    # *that* we can do by path.
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        # This is the "clone, but don't download anything" part.
        cmd = (
            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
            f"-b {branch} --filter=blob:none"
        )
        run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        ret = run_command(cmd, capture=True)
        git_repo = _http_to_git(repo)
        # Now pass those missings into another bit of git internals
        # ("?<hash>" lines mark objects the partial clone didn't download).
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
        if not missings:
            err = (
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?"
            )
            msg.fail(err, exits=1)
        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
        run_command(cmd, capture=True)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
        run_command(cmd, capture=True)

        # Get a subdirectory of the cloned path, if appropriate
        source_path = tmp_dir / Path(subpath)
        if not is_subpath_of(tmp_dir, source_path):
            # Reject paths like "../../etc" that would escape the clone.
            err = f"'{subpath}' is a path outside of the cloned repository."
            msg.fail(err, repo, exits=1)

        # Move out of the temp dir before it's removed by make_tempdir.
        shutil.move(str(source_path), str(dest))
104 |
105 |
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Uses 'git ls-remote' to check if a repository and branch exists

    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    get_git_version()  # ensure git itself is available first
    # We might be tempted to use `--exit-code` with `git ls-remote`, but
    # `run_command` handles the `returncode` for us, so we'll rely on
    # the fact that stdout returns '' if the requested branch doesn't exist
    ret = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return bool(ret.stdout)
121 |
122 |
def get_git_version(
    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
    """Get the version of git and raise an error if calling 'git --version' fails.

    error (str): The error message to show.
    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
    (0, 0) if the version couldn't be determined.
    """
    try:
        ret = run_command("git --version", capture=True)
    except Exception:
        raise RuntimeError(error)
    output = ret.stdout.strip()
    # Anything that isn't "git version X.Y..." (including empty output) is
    # treated as an unknown version.
    if not output.startswith("git version"):
        return 0, 0
    parts = output[len("git version") :].strip().split(".")
    return int(parts[0]), int(parts[1])
141 |
142 |
143 | def _http_to_git(repo: str) -> str:
144 | if repo.startswith("http://"):
145 | repo = repo.replace(r"http://", r"https://")
146 | if repo.startswith(r"https://"):
147 | repo = repo.replace("https://", "git@").replace("/", ":", 1)
148 | if repo.endswith("/"):
149 | repo = repo[:-1]
150 | repo = f"{repo}.git"
151 | return repo
152 |
--------------------------------------------------------------------------------
/weasel/util/hashing.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | from pathlib import Path
3 | from typing import Iterable, Union
4 |
5 | import srsly
6 | from wasabi import msg
7 |
8 |
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Get the hash for a JSON-serializable object.

    data: The data to hash.
    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
    RETURNS (str): The hash.
    """
    if isinstance(data, dict):
        data = {name: value for name, value in data.items() if name not in exclude}
    # Sorted keys make the serialization -- and therefore the hash -- stable.
    serialized = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(serialized).hexdigest()
20 |
21 |
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses all files in that directory.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    path = Path(path)
    if path.is_file():
        return hashlib.md5(path.read_bytes()).hexdigest()
    if path.is_dir():
        # TODO: this is currently pretty slow
        dir_checksum = hashlib.md5()
        # Sorted traversal keeps the digest deterministic across runs.
        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
            dir_checksum.update(sub_file.read_bytes())
        return dir_checksum.hexdigest()
    msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
40 |
--------------------------------------------------------------------------------
/weasel/util/logging.py:
--------------------------------------------------------------------------------
import logging

# Package-wide logger shared by the weasel.util modules (e.g. config.py's
# override logging).
logger = logging.getLogger("weasel")
# Attach a stream handler with a timestamped format so log records are
# readable without any further logging configuration by the caller.
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(
    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)
9 |
--------------------------------------------------------------------------------
/weasel/util/modules.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from pathlib import Path
3 | from types import ModuleType
4 | from typing import Union
5 |
6 |
def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
    """Import module from a file. Used to load models from a directory.

    name (str): Name of module to load.
    loc (str / Path): Path to the file.
    RETURNS: The loaded module.
    """
    # `import importlib` alone does not guarantee the `importlib.util`
    # submodule is loaded; import it explicitly so the attribute access
    # below can't fail with an AttributeError.
    import importlib.util

    spec = importlib.util.spec_from_file_location(name, str(loc))
    module = importlib.util.module_from_spec(spec)  # type: ignore
    spec.loader.exec_module(module)  # type: ignore[union-attr]
    return module
18 |
--------------------------------------------------------------------------------
/weasel/util/remote.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from pathlib import Path
3 | from typing import TYPE_CHECKING, Union
4 |
5 | if TYPE_CHECKING:
6 | from cloudpathlib import CloudPath
7 |
8 |
def upload_file(src: Path, dest: Union[str, "CloudPath"]) -> None:
    """Upload a file.

    src (Path): The source path.
    dest (str / CloudPath): The destination URL or path to upload to.
    """
    import smart_open

    # Local destinations need their parent directory to exist first.
    if isinstance(dest, Path) and not dest.parent.exists():
        dest.parent.mkdir(parents=True)

    with smart_open.open(str(dest), mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            output_file.write(input_file.read())
26 |
27 |
def download_file(
    src: Union[str, "CloudPath"], dest: Path, *, force: bool = False
) -> None:
    """Download a file using smart_open.

    src (str / CloudPath): The URL or path of the file.
    dest (Path): The destination path.
    force (bool): Whether to download even if the destination exists.
        If False, the download will be skipped.
    """
    import smart_open

    if dest.exists() and not force:
        return None
    with smart_open.open(str(src), mode="rb", compression="disable") as input_file:
        with dest.open(mode="wb") as output_file:
            shutil.copyfileobj(input_file, output_file)
46 |
--------------------------------------------------------------------------------
/weasel/util/validation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from configparser import InterpolationError
3 | from contextlib import contextmanager
4 | from pathlib import Path
5 | from typing import TYPE_CHECKING, Any, Dict, Optional, Union
6 |
7 | from confection import ConfigValidationError
8 | from wasabi import msg
9 |
10 | from ..cli.main import PROJECT_FILE
11 |
12 | if TYPE_CHECKING:
13 | pass
14 |
15 |
@contextmanager
def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: Optional[str] = None,
    desc: str = "",
    show_config: Optional[bool] = None,
    hint_fill: bool = True,
):
    """Helper to show custom config validation errors on the CLI.

    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Override title of custom formatted error.
    desc (str): Override description of custom formatted error.
    show_config (bool): Whether to output the config the error refers to.
    hint_fill (bool): Show hint about filling config.

    Exits the process with status 1 on ConfigValidationError or
    InterpolationError raised inside the with-block.
    """
    try:
        yield
    except ConfigValidationError as e:
        # Prefer the explicit title/desc overrides; fall back to the error's.
        title = title if title is not None else e.title
        if e.desc:
            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
        # Re-generate a new error object with overrides
        err = e.from_error(e, title="", desc=desc, show_config=show_config)
        msg.fail(title)
        print(err.text.strip())
        if hint_fill and "value_error.missing" in err.error_types:
            config_path = (
                file_path
                if file_path is not None and str(file_path) != "-"
                else "config.cfg"
            )
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "fill-config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            # NOTE(review): this hint still references the spaCy CLI -- confirm
            # it's the right advice for standalone Weasel projects.
            print(f"python -m spacy init fill-config {config_path} {config_path} \n")
        sys.exit(1)
    except InterpolationError as e:
        msg.fail("Config validation error", e, exits=1)
58 |
59 |
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    duplicates = {name for name in command_names if command_names.count(name) > 1}
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in config.get("workflows", {}).items():
        # A workflow must not shadow a command name.
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        # Every step must refer to a defined command.
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
84 |
--------------------------------------------------------------------------------
/weasel/util/versions.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from packaging.specifiers import InvalidSpecifier, SpecifierSet
4 | from packaging.version import InvalidVersion, Version
5 |
6 |
def is_compatible_version(
    version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint. The empty
    # check prevents an IndexError on constraint[0] for an empty string
    # (an empty SpecifierSet matches any version).
    if constraint and constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        parsed_version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return parsed_version in spec
31 |
32 |
def get_minor_version(version: str) -> Optional[str]:
    """Get the major + minor version (without patch or prerelease identifiers).

    version (str): The version.
    RETURNS (str): The major + minor version or None if version is invalid.
    """
    try:
        parsed = Version(version)
    except (TypeError, InvalidVersion):
        return None
    return f"{parsed.major}.{parsed.minor}"
44 |
45 |
def is_minor_version_match(version_a: str, version_b: str) -> bool:
    """Compare two versions and check if they match in major and minor, without
    patch or prerelease identifiers. Used internally for compatibility checks
    that should be insensitive to patch releases.

    version_a (str): The first version
    version_b (str): The second version.
    RETURNS (bool): Whether the versions match.
    """
    minor_a = get_minor_version(version_a)
    minor_b = get_minor_version(version_b)
    if minor_a is None or minor_b is None:
        return False
    return minor_a == minor_b
58 |
--------------------------------------------------------------------------------