├── .github
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── bin
├── get-package.sh
├── get-version.sh
└── push-tag.sh
├── docs
├── assets
│ └── images
│ │ ├── prodigy_train_curve.jpg
│ │ ├── project_document.jpg
│ │ ├── projects.png
│ │ ├── projects.svg
│ │ └── spacy-streamlit.png
├── cli.md
└── tutorial
│ ├── custom-scripts.md
│ ├── directory-and-assets.md
│ ├── integrations.md
│ ├── remote-storage.md
│ └── workflow.md
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
└── weasel
├── __init__.py
├── __main__.py
├── about.py
├── cli
├── __init__.py
├── assets.py
├── clone.py
├── document.py
├── dvc.py
├── main.py
├── pull.py
├── push.py
├── remote_storage.py
└── run.py
├── compat.py
├── errors.py
├── schemas.py
├── tests
├── __init__.py
├── cli
│ ├── __init__.py
│ ├── test_cli.py
│ ├── test_cli_app.py
│ ├── test_document.py
│ └── test_remote.py
├── demo_project
│ ├── project.yml
│ └── scripts
│ │ └── check.py
├── test_schemas.py
├── test_validation.py
└── util.py
└── util
├── __init__.py
├── commands.py
├── config.py
├── environment.py
├── filesystem.py
├── frozen.py
├── git.py
├── hashing.py
├── logging.py
├── modules.py
├── remote.py
├── validation.py
└── versions.py
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Description
4 |
5 |
10 |
11 | ### Types of change
12 |
13 |
15 |
16 | ## Checklist
17 |
18 |
20 |
21 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license.
22 | - [ ] I ran the test suite, and all new and existing tests passed.
23 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
24 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | paths-ignore:
6 | - "*.md"
7 | pull_request:
8 | types: [opened, synchronize, reopened, edited]
9 | paths-ignore:
10 | - "*.md"
11 |
12 | env:
13 | MODULE_NAME: "weasel"
14 | RUN_MYPY: "true"
15 |
16 | jobs:
17 | validate:
18 | name: Validate
19 | if: github.repository_owner == 'explosion'
20 | runs-on: ubuntu-latest
21 |
22 | steps:
23 | - name: Check out repo
24 | uses: actions/checkout@v4
25 |
26 | - name: Configure Python version
27 | uses: actions/setup-python@v5
28 | with:
29 | python-version: "3.11"
30 |
31 | - name: Set PY variable
32 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
33 |
34 | - uses: actions/cache@v3
35 | with:
36 | path: ~/.cache/pre-commit
37 | key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
38 |
39 | - name: Install pre-commit
40 | run: |
41 | pip install 'pre-commit>=3.2.0,<4.0.0'
42 | pre-commit install
43 |
44 | - name: Run pre-commit
45 | run: SKIP=no-commit-to-branch pre-commit run --all-files
46 |
47 | tests:
48 | name: Test
49 | needs: Validate
50 | if: github.repository_owner == 'explosion'
51 | strategy:
52 | fail-fast: true
53 | matrix:
54 | os: [ubuntu-latest, windows-latest, macos-latest]
55 | python_version: ["3.12"]
56 | include:
57 | - os: ubuntu-latest
58 | python_version: "3.7"
59 | - os: windows-latest
60 | python_version: "3.8"
61 | - os: macos-latest
62 | python_version: "3.9"
63 | - os: macos-latest
64 | python_version: "3.10"
65 | - os: windows-latest
66 | python_version: "3.11"
67 | runs-on: ${{ matrix.os }}
68 |
69 | steps:
70 | - name: Check out repo
71 | uses: actions/checkout@v4
72 |
73 | - name: Configure Python version
74 | uses: actions/setup-python@v5
75 | with:
76 | python-version: ${{ matrix.python_version }}
77 |
78 | - name: Build sdist
79 | run: |
80 | python -m pip install -U build pip setuptools
81 | python -m pip install -U -r requirements.txt
82 | python -m build --sdist
83 |
84 | - name: Delete source directory
85 | shell: bash
86 | run: |
87 | rm -rf $MODULE_NAME
88 |
89 | - name: Uninstall all packages
90 | run: |
91 | python -m pip freeze > installed.txt
92 | python -m pip uninstall -y -r installed.txt
93 |
94 | - name: Install from sdist
95 | shell: bash
96 | run: |
97 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
98 | python -m pip install dist/$SDIST
99 |
100 | - name: Test import
101 | shell: bash
102 | run: |
103 | python -Werror -c "import $MODULE_NAME"
104 |
105 | - name: Test CLI
106 | run: |
107 | python -m weasel --help
108 |
109 | - name: Install test requirements
110 | run: |
111 | python -m pip install -U -r requirements.txt
112 |
113 | - name: Run tests
114 | shell: bash
115 | run: |
116 | python -m pytest --pyargs $MODULE_NAME -Werror
117 |
118 | - name: Test 'spacy project' CLI help/info messages
119 | shell: bash
120 | run: |
121 | python -m pip install spacy
122 | python -m spacy project clone pipelines/ner_demo | grep -q "spacy project assets"
123 | cd ner_demo
124 | python -m spacy project run --help | grep -q "spacy project run"
125 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Vim / VSCode / editors
2 | *.swp
3 | *.sw*
4 | Profile.prof
5 | .vscode
6 | .sass-cache
7 |
8 | # Python
9 | .Python
10 | .python-version
11 | __pycache__/
12 | .pytest_cache
13 | *.py[cod]
14 | .env/
15 | .env*
16 | .~env/
17 | .venv
18 | env3.6/
19 | venv/
20 | env3.*/
21 | .dev
22 | .denv
23 | .pypyenv
24 | .pytest_cache/
25 | .mypy_cache/
26 | .hypothesis/
27 |
28 | # Distribution / packaging
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | wheelhouse/
40 | *.egg-info/
41 | pip-wheel-metadata/
42 | Pipfile.lock
43 | .installed.cfg
44 | *.egg
45 | .eggs
46 | MANIFEST
47 |
48 | # Temporary files
49 | *.~*
50 | tmp/
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 |
64 | # Translations
65 | *.mo
66 |
67 | # Mr Developer
68 | .mr.developer.cfg
69 | .project
70 | .pydevproject
71 |
72 | # Rope
73 | .ropeproject
74 |
75 | # Django stuff:
76 | *.log
77 | *.pot
78 |
79 | # Windows
80 | *.bat
81 | Thumbs.db
82 | Desktop.ini
83 |
84 | # Mac OS X
85 | *.DS_Store
86 |
87 | # Komodo project files
88 | *.komodoproject
89 |
90 | # Other
91 | *.tgz
92 |
93 | # Pycharm project files
94 | *.idea
95 |
96 | # IPython
97 | .ipynb_checkpoints/
98 | *.ipynb
99 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v3.2.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: no-commit-to-branch
9 | args: [--branch, main]
10 | - id: end-of-file-fixer
11 | - id: check-yaml
12 | args: [--unsafe]
13 | - id: check-toml
14 | - id: check-json
15 | - id: check-symlinks
16 | - id: check-docstring-first
17 | - id: check-added-large-files
18 | - id: detect-private-key
19 | # - id: requirements-txt-fixer
20 |
21 | - repo: https://github.com/charliermarsh/ruff-pre-commit
22 | rev: v0.0.254
23 | hooks:
24 | - id: ruff
25 | args: [--fix, --exit-non-zero-on-fix]
26 |
27 | - repo: https://github.com/pre-commit/mirrors-mypy
28 | rev: v1.0.1
29 | hooks:
30 | - id: mypy
31 | additional_dependencies:
32 | - "types-requests"
33 | - "types-setuptools"
34 | - "pydantic"
35 |
36 | - repo: https://github.com/pycqa/isort
37 | rev: 5.12.0
38 | hooks:
39 | - id: isort
40 | name: isort (python)
41 | - id: isort
42 | name: isort (cython)
43 | types: [cython]
44 | - id: isort
45 | name: isort (pyi)
46 | types: [pyi]
47 |
48 | - repo: https://github.com/psf/black
49 | rev: 22.3.0
50 | hooks:
51 | - id: black
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (C) 2022 ExplosionAI GmbH
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Weasel: A small and easy workflow system
4 |
5 | Weasel lets you manage and share **end-to-end workflows** for
6 | different **use cases and domains**, and orchestrate training, packaging and
7 | serving your custom pipelines. You can start off by cloning a pre-defined
8 | project template, adjust it to fit your needs, load in your data, train a
9 | pipeline, export it as a Python package, upload your outputs to a remote storage
10 | and share your results with your team. Weasel can be used via the
11 | [`weasel`](https://github.com/explosion/weasel/blob/main/docs/cli.md) command and we provide templates in our
12 | [`projects`](https://github.com/explosion/projects) repo.
13 |
14 | 
15 |
16 | ## 💡 Example: Get started with a project template
17 |
18 | The easiest way to get started is to clone a project template and run it – for
19 | example, this [end-to-end template](https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud)
20 | that lets you train a spaCy **part-of-speech
21 | tagger** and **dependency parser** on a Universal Dependencies treebank.
22 |
23 | ```shell
24 | python -m weasel clone pipelines/tagger_parser_ud
25 | ```
26 |
27 | > **Note**
28 | >
29 | > Our [`projects`](https://github.com/explosion/projects) repo includes various
30 | > project templates for different NLP tasks, models, workflows and integrations
31 | > that you can clone and run. The easiest way to get started is to pick a
32 | > template, clone it and start modifying it!
33 |
34 | ## 📕 Documentation
35 |
36 | Get started with the documentation:
37 |
38 | - [Learn how to create a Weasel workflow](https://github.com/explosion/weasel/blob/main/docs/tutorial/workflow.md)
39 | - [Working with directory and assets](https://github.com/explosion/weasel/blob/main/docs/tutorial/directory-and-assets.md)
40 | - [Running custom scripts](https://github.com/explosion/weasel/blob/main/docs/tutorial/custom-scripts.md)
41 | - [Using remote storage](https://github.com/explosion/weasel/blob/main/docs/tutorial/remote-storage.md)
42 | - [Weasel integrations](https://github.com/explosion/weasel/blob/main/docs/tutorial/integrations.md)
43 | - [Command line interface description](https://github.com/explosion/weasel/blob/main/docs/cli.md)
44 |
45 | ## Migrating from spaCy Projects
46 |
47 | Weasel is a standalone replacement for spaCy Projects.
48 | There are a few backward incompatibilities that you should be aware of:
49 |
50 | - The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked.
51 | You can set configuration overrides using `WEASEL_CONFIG_OVERRIDES`.
52 | - Support for the `spacy_version` configuration key has been dropped.
53 | - Support for the `check_requirements` configuration key has been dropped.
54 | - Support for `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been dropped.
55 | - Error codes are now Weasel-specific, and do not follow spaCy error codes.
56 |
57 | Weasel checks for the first three incompatibilities and will issue a
58 | warning if you're using it with spaCy-specific configuration options.
59 |
--------------------------------------------------------------------------------
/bin/get-package.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | # Extract the bare package name from the "name = ..." line in setup.cfg.
6 | # BUG FIX: the previous version stripped "__title__ = " (leftover from a
7 | # copied script) instead of "name = ", so it printed "name = weasel".
8 | name=$(grep "name = " setup.cfg)
9 | name=${name/name = }
10 | # Remove any surrounding quotes around the value.
11 | name=${name//\'/}
12 | name=${name//\"/}
13 |
14 | echo "$name"
15 |
--------------------------------------------------------------------------------
/bin/get-version.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | version=$(grep "version = " setup.cfg)
6 | version=${version/version = }
7 | version=${version/\'/}
8 | version=${version/\'/}
9 | version=${version/\"/}
10 | version=${version/\"/}
11 |
12 | echo $version
13 |
--------------------------------------------------------------------------------
/bin/push-tag.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | # Insist repository is clean
6 | git diff-index --quiet HEAD
7 |
8 | git checkout $1
9 | git pull origin $1
10 | git push origin $1
11 |
12 | version=$(grep "version = " setup.cfg)
13 | version=${version/version = }
14 | version=${version/\'/}
15 | version=${version/\'/}
16 | version=${version/\"/}
17 | version=${version/\"/}
18 | git tag "v$version"
19 | git push origin "v$version"
20 |
--------------------------------------------------------------------------------
/docs/assets/images/prodigy_train_curve.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/prodigy_train_curve.jpg
--------------------------------------------------------------------------------
/docs/assets/images/project_document.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/project_document.jpg
--------------------------------------------------------------------------------
/docs/assets/images/projects.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/projects.png
--------------------------------------------------------------------------------
/docs/assets/images/spacy-streamlit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/docs/assets/images/spacy-streamlit.png
--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
1 | # Command Line Interface
2 |
3 | The `weasel` CLI includes subcommands for working with Weasel projects,
4 | end-to-end workflows for building and deploying custom pipelines.
5 |
6 | ## :clipboard: clone
7 |
8 | Clone a project template from a Git repository. Calls into `git` under the hood
9 | and can use the sparse checkout feature if available, so you're only downloading
10 | what you need. By default, Weasel's
11 | [project templates repo](https://github.com/explosion/projects) is used, but you
12 | can provide any other repo (public or private) that you have access to using the
13 | `--repo` option.
14 |
15 | ```bash
16 | python -m weasel clone [name] [dest] [--repo] [--branch] [--sparse]
17 | ```
18 |
19 | > :bulb: **Example usage**
20 | >
21 | > ```bash
22 | > $ python -m weasel clone pipelines/ner_wikiner
23 | > ```
24 |
25 | | Name | Description |
26 | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
27 | | `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ |
28 | | `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ |
29 | | `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ |
30 | | `--branch`, `-b` | The branch to clone from. Defaults to `master`. ~~str (option)~~ |
31 | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
32 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
33 | | **CREATES** | The cloned [project directory](tutorial/directory-and-assets.md). |
34 |
35 | ## :open_file_folder: assets
36 |
37 | Fetch project assets like datasets and pretrained weights. Assets are defined in
38 | the `assets` section of the
39 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If a `checksum`
40 | is provided, the file is only downloaded if no local file with the same checksum
41 | exists and Weasel will show an error if the checksum of the downloaded file
42 | doesn't match. If assets don't specify a `url` they're considered "private" and
43 | you have to take care of putting them into the destination directory yourself.
44 | If a local path is provided, the asset is copied into the current project.
45 |
46 | ```bash
47 | python -m weasel assets [project_dir]
48 | ```
49 |
50 | | Name | Description |
51 | | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
52 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
53 | | `--extra`, `-e` | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
54 | | `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
55 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
56 | | **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
57 |
58 | ## :rocket: run
59 |
60 | Run a named command or workflow defined in the
61 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If a workflow
62 | name is specified, all commands in the workflow are run, in order. If commands
63 | define
64 | [dependencies or outputs](tutorial/directory-and-assets.md#dependencies-and-outputs),
65 | they will only be re-run if state has changed. For example, if the input dataset
66 | changes, a preprocessing command that depends on those files will be re-run.
67 |
68 | ```bash
69 | python -m weasel run [subcommand] [project_dir] [--force] [--dry]
70 | ```
71 |
72 | | Name | Description |
73 | | --------------- | --------------------------------------------------------------------------------------- |
74 | | `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ |
75 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
76 | | `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ |
77 | | `--dry`, `-D` | Perform a dry run and don't execute scripts. ~~bool (flag)~~ |
78 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
79 | | **EXECUTES** | The command defined in the `project.yml`. |
80 |
81 | ## :arrow_up: push
82 |
83 | Upload all available files or directories listed as in the `outputs` section of
84 | commands to a remote storage. Outputs are archived and compressed prior to
85 | upload, and addressed in the remote storage using the output's relative path
86 | (URL encoded), a hash of its command string and dependencies, and a hash of its
87 | file contents. This means `push` should **never overwrite** a file in your
88 | remote. If all the hashes match, the contents are the same and nothing happens.
89 | If the contents are different, the new version of the file is uploaded. Deleting
90 | obsolete files is left up to you.
91 |
92 | Remotes can be defined in the `remotes` section of the
93 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). Under the hood,
94 | Weasel uses [`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate
95 | with the remote storages, so you can use any protocol that `CloudPath` supports,
96 | including [S3](https://aws.amazon.com/s3/),
97 | [Google Cloud Storage](https://cloud.google.com/storage), and the local
98 | filesystem, although you may need to install extra dependencies to use certain
99 | protocols.
100 |
101 | ```bash
102 | python -m weasel push [remote] [project_dir]
103 | ```
104 |
105 | > :bulb: **Example**
106 | >
107 | > ```bash
108 | > $ python -m weasel push my_bucket
109 | > ```
110 | >
111 | > ```yaml title="project.yml"
112 | > remotes:
113 | > my_bucket: 's3://my-weasel-bucket'
114 | > ```
115 |
116 | | Name | Description |
117 | | -------------- | --------------------------------------------------------------------------------------- |
118 | | `remote` | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~ |
119 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
120 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
121 | | **UPLOADS** | All project outputs that exist and are not already stored in the remote. |
122 |
123 | ## :arrow_down: pull
124 |
125 | Download all files or directories listed as `outputs` for commands, unless they
126 | are already present locally. When searching for files in the remote, `pull`
127 | won't just look at the output path, but will also consider the **command
128 | string** and the **hashes of the dependencies**. For instance, let's say you've
129 | previously pushed a checkpoint to the remote, but now you've changed some
130 | hyper-parameters. Because you've changed the inputs to the command, if you run
131 | `pull`, you won't retrieve the stale result. If you train your pipeline and push
132 | the outputs to the remote, the outputs will be saved alongside the prior
133 | outputs, so if you change the config back, you'll be able to fetch back the
134 | result.
135 |
136 | Remotes can be defined in the `remotes` section of the
137 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). Under the hood,
138 | Weasel uses [`cloudpathlib`](https://cloudpathlib.drivendata.org/) to
139 | communicate with the remote storages, so you can use any protocol that
140 | `CloudPath` supports, including [S3](https://aws.amazon.com/s3/),
141 | [Google Cloud Storage](https://cloud.google.com/storage), and the local
142 | filesystem, although you may need to install extra dependencies to use certain
143 | protocols.
144 |
145 | ```bash
146 | python -m weasel pull [remote] [project_dir]
147 | ```
148 |
149 | > :bulb: **Example**
150 | >
151 | > ```bash
152 | > $ python -m weasel pull my_bucket
153 | > ```
154 | >
155 | > ```yaml title="project.yml"
156 | > remotes:
157 | > my_bucket: 's3://my-weasel-bucket'
158 | > ```
159 |
160 | | Name | Description |
161 | | -------------- | --------------------------------------------------------------------------------------- |
162 | | `remote` | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~ |
163 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
164 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
165 | | **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. |
166 |
167 | ## :closed_book: document
168 |
169 | Auto-generate a pretty Markdown-formatted `README` for your project, based on
170 | its [`project.yml`](tutorial/directory-and-assets.md#project-yml). Will create
171 | sections that document the available commands, workflows and assets. The
172 | auto-generated content will be placed between two hidden markers, so you can add
173 | your own custom content before or after the auto-generated documentation. When
174 | you re-run the `weasel document` command, only the auto-generated part is
175 | replaced.
176 |
177 | ```bash
178 | python -m weasel document [project_dir] [--output] [--no-emoji]
179 | ```
180 |
181 | > :bulb: **Example usage**
182 | >
183 | > ```bash
184 | > $ python -m weasel document --output README.md
185 | > ```
186 | >
187 | > For more examples, see the templates in our
188 | > [`projects`](https://github.com/explosion/projects) repo.
189 |
190 | | Name | Description |
191 | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
192 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
193 | | `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ |
194 | | `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ |
195 | | **CREATES** | The Markdown-formatted project documentation. |
196 |
197 | ## :repeat: dvc
198 |
199 | Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls
200 | [`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under
201 | the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline,
202 | so you need to specify one workflow defined in the
203 | [`project.yml`](tutorial/directory-and-assets.md#project-yml). If no workflow is
204 | specified, the first defined workflow is used. The DVC config will only be
205 | updated if the `project.yml` changed. For details, see the
206 | [DVC integration](tutorial/integrations.md#data-version-control-dvc) docs.
207 |
208 | > **Warning**
209 | >
210 | > This command requires DVC to be installed and initialized in the project
211 | > directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init).
212 | > You'll also need to add the assets you want to track with
213 | > [`dvc add`](https://dvc.org/doc/command-reference/add).
214 |
215 | ```bash
216 | python -m weasel dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
217 | ```
218 |
219 | > :bulb: **Example**
220 | >
221 | > ```bash
222 | > $ git init
223 | > $ dvc init
224 | > $ python -m weasel dvc all
225 | > ```
226 |
227 | | Name | Description |
228 | | ----------------- | ------------------------------------------------------------------------------------------------------------- |
229 | | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
230 | | `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
231 | | `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
232 | | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
233 | | `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
234 | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
235 | | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
236 |
--------------------------------------------------------------------------------
/docs/tutorial/custom-scripts.md:
--------------------------------------------------------------------------------
1 | # Custom scripts and projects
2 |
3 | The `project.yml` lets you define any custom commands and run them as part of
4 | your training, evaluation or deployment workflows. The `script` section defines
5 | a list of commands that are called in a subprocess, in order. This lets you
6 | execute other Python scripts or command-line tools.
7 |
8 | Let's say you're training a spaCy pipeline, and you've written a
9 | few integration tests that load the best model produced by the training command
10 | and check that it works correctly. You can now define a `test` command that
11 | calls into [`pytest`](https://docs.pytest.org/en/latest/), runs your tests and
12 | uses [`pytest-html`](https://github.com/pytest-dev/pytest-html) to export a test
13 | report:
14 |
15 | > :bulb: **Example configuration**
16 | >
17 | > ```yaml title="project.yml"
18 | > commands:
19 | > - name: test
20 | > help: 'Test the trained pipeline'
21 | > script:
22 | > - 'pip install pytest pytest-html'
23 | > - 'python -m pytest ./scripts/tests --html=metrics/test-report.html'
24 | > deps:
25 | > - 'training/model-best'
26 | > outputs:
27 | > - 'metrics/test-report.html'
28 | > no_skip: true
29 | > ```
30 |
31 | Adding `training/model-best` to the command's `deps` lets you ensure that the
32 | file is available. If not, Weasel will show an error and the command won't run.
33 | Setting `no_skip: true` means that the command will always run, even if the
34 | dependencies (the trained pipeline) haven't changed. This makes sense here,
35 | because you typically don't want to skip your tests.
36 |
37 | ## Writing custom scripts
38 |
39 | Your project commands can include any custom scripts – essentially, anything you
40 | can run from the command line. Here's an example of a custom script that uses
41 | [`typer`](https://typer.tiangolo.com/) for quick and easy command-line arguments
42 | that you can define via your `project.yml`:
43 |
44 | ```python title="scripts/custom_evaluation.py"
45 | import typer
46 |
47 | def custom_evaluation(batch_size: int, model_path: str, data_path: str):
48 | # The arguments are now available as positional CLI arguments
49 | print(batch_size, model_path, data_path)
50 |
51 | if __name__ == "__main__":
52 | typer.run(custom_evaluation)
53 | ```
54 |
55 | > :information_source: **About Typer**
56 | >
57 | > [`typer`](https://typer.tiangolo.com/) is a modern library for building Python
58 | > CLIs using type hints. It's a dependency of Weasel, so it will already be
59 | > pre-installed in your environment. Function arguments automatically become
60 | > positional CLI arguments and using Python type hints, you can define the value
61 | > types. For instance, `batch_size: int` means that the value provided via the
62 | > command line is converted to an integer.
63 |
64 | In your `project.yml`, you can then run the script by calling
65 | `python scripts/custom_evaluation.py` with the function arguments. You can also
66 | use the `vars` section to define reusable variables that will be substituted in
67 | commands, paths and URLs. In the following example, the batch size is defined as a
68 | variable that will be substituted in place of `${vars.batch_size}` in the script.
69 |
70 | > :bulb: **Example usage of `vars`**
71 | >
72 | > ```yaml title="project.yml"
73 | > vars:
74 | > batch_size: 128
75 | >
76 | > commands:
77 | > - name: evaluate
78 | > script:
79 | > - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json'
80 | > deps:
81 | > - 'training/model-best'
82 | > - 'corpus/eval.json'
83 | > ```
84 |
85 | > :information_source: **Calling into Python**
86 | >
87 | > If any of your command scripts call into `python`, Weasel will take care of
88 | > replacing that with your `sys.executable`, to make sure you're executing
89 | > everything with the same Python (not some other Python installed on your
90 | > system). It also normalizes references to `python3`, `pip3` and `pip`.
91 |
92 | You can also use the `env` section to reference **environment variables** and
93 | make their values available to the commands. This can be useful for overriding
94 | settings on the command line and passing through system-level settings.
95 |
96 | > :bulb: **Example usage of EnvVars**
97 | >
98 | > ```bash
99 | > export GPU_ID=1
100 | > BATCH_SIZE=128 python -m weasel run evaluate
101 | > ```
102 | >
103 | > ```yaml title="project.yml"
104 | > env:
105 | > batch_size: BATCH_SIZE
106 | > gpu_id: GPU_ID
107 | >
108 | > commands:
109 | > - name: evaluate
110 | > script:
111 | > - 'python scripts/custom_evaluation.py ${env.batch_size}'
112 | > ```
113 |
114 | ## Documenting your project
115 |
116 | > :bulb: **Examples**
117 | >
118 | > For more examples, see the [`projects`](https://github.com/explosion/projects)
119 | > repo.
120 | >
121 | > 
122 |
123 | When your custom project is ready and you want to share it with others, you can
124 | use the [`weasel document`](../cli.md#closed_book-document) command to
125 | **auto-generate** a pretty, Markdown-formatted `README` file based on your
126 | project's `project.yml`. It will list all commands, workflows and assets defined
127 | in the project and include details on how to run the project, as well as links
128 | to the relevant Weasel documentation to make it easy for others to get started
129 | using your project.
130 |
131 | ```bash
132 | python -m weasel document --output README.md
133 | ```
134 |
135 | Under the hood, hidden markers are added to identify where the auto-generated
136 | content starts and ends. This means that you can add your own custom content
137 | before or after it and re-running the `document` command will **only
138 | update the auto-generated part**. This makes it easy to keep your documentation
139 | up to date.
140 |
141 | > **Warning**
142 | >
143 | > Note that the contents of an existing file will be **replaced** if no existing
144 | > auto-generated docs are found. If you want Weasel to ignore a file and not update
145 | > it, you can add the comment marker `{/* WEASEL: IGNORE */}` anywhere in
146 | > your markup.
147 |
148 | ## Cloning from your own repo
149 |
150 | The [`weasel clone`](../cli.md#clipboard-clone) command lets you customize
151 | the repo to clone from using the `--repo` option. It calls into `git`, so you'll
152 | be able to clone from any repo that you have access to, including private repos.
153 |
154 | ```bash
155 | python -m weasel clone your_project --repo https://github.com/you/repo
156 | ```
157 |
158 | At a minimum, a valid project template needs to contain a
159 | [`project.yml`](./directory-and-assets.md#projectyml). It can also include
160 | [other files](./directory-and-assets.md), like custom scripts, a
161 | `requirements.txt` listing additional dependencies,
162 | a machine learning model and meta templates, or Jupyter
163 | notebooks with usage examples.
164 |
165 | > :warning: **Important note about assets**
166 | >
167 | > It's typically not a good idea to check large data assets, trained pipelines or
168 | > other artifacts into a Git repo and you should exclude them from your project
169 | > template by adding a `.gitignore`. If you want to version your data and models,
170 | > check out [Data Version Control](./integrations.md#data-version-control-dvc) (DVC),
171 | > which integrates with Weasel.
172 |
--------------------------------------------------------------------------------
/docs/tutorial/directory-and-assets.md:
--------------------------------------------------------------------------------
1 | # Project directory and assets
2 |
3 | ## `project.yml`
4 |
5 | The `project.yml` defines the assets a project depends on, like datasets and
6 | pretrained weights, as well as a series of commands that can be run separately
7 | or as a workflow – for instance, to preprocess the data, convert it to Weasel's
8 | format, train a pipeline, evaluate it and export metrics, package it and spin up
9 | a quick web demo. It looks pretty similar to a config file used to define CI
10 | pipelines.
11 |
12 | > :boom: **Tip: Multi-line YAML**
13 | >
14 | > YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful
15 | > for readability with longer values such as project descriptions or commands
16 | > that take several arguments.
17 |
18 | > :boom: **Tip: Variable override**
19 | >
20 | > If you want to override one or more variables on the CLI and are not already
21 | > specifying a project directory, you need to add `.` as a placeholder:
22 | >
23 | > ```
24 | > python -m weasel run test . --vars.foo bar
25 | > ```
26 |
27 | > :boom: **Tip: Environment variables**
28 | >
29 | > Commands in a project file are not executed in a shell, so they don't have
30 | > direct access to environment variables. But you can insert environment
31 | > variables using the `env` dictionary to make values available for
32 | > interpolation, just like values in `vars`. Here's an example `env` dict that
33 | > makes `$PATH` available as `ENV_PATH`:
34 | >
35 | > ```yaml
36 | > env:
37 | > ENV_PATH: PATH
38 | > ```
39 | >
40 | > This can be used in a project command like so:
41 | >
42 | > ```yaml
43 | > - name: 'echo-path'
44 | > script:
45 | > - 'echo ${env.ENV_PATH}'
46 | > ```
47 |
48 | `project.yml` adheres to the following schema:
49 |
50 | | Section | Description |
51 | | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
52 | | `title` | An optional project title used in `--help` message and [auto-generated docs](../cli.md#closed_book-document). |
53 | | `description` | An optional project description used in [auto-generated docs](../cli.md#closed_book-document). |
54 | | `vars`               | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](https://spacy.io/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
55 | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
56 | | `directories` | An optional list of [directories](#data-assets) that should be created in the project for assets, training outputs, metrics etc. Weasel will make sure that these directories always exist. |
57 | | `assets` | A list of assets that can be fetched with the [`assets`](../cli.md#open_file_folder-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
58 | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`run`](../cli.md#rocket-run) command. |
59 | | `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets Weasel determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`run`](../cli.md#rocket-run) command. |
60 |
61 | ## Data assets
62 |
63 | Assets are any files that your project might need, like training and development
64 | corpora or pretrained weights for initializing your model. Assets are defined in
65 | the `assets` block of your `project.yml` and can be downloaded using the
66 | [`assets`](../cli.md#open_file_folder-assets) command. Defining checksums lets you
67 | verify that someone else running your project will use the same files you used.
68 | Asset URLs can be a number of different **protocols**: HTTP, HTTPS, FTP, SSH,
69 | and even **cloud storage** such as GCS and S3. You can also download assets from
70 | a **Git repo** instead.
71 |
72 | ### Downloading from a URL or cloud storage
73 |
74 | Under the hood, Weasel uses the
75 | [`smart_open`](https://github.com/RaRe-Technologies/smart_open) library so you
76 | can use any protocol it supports. Note that you may need to install extra
77 | dependencies to use certain protocols.
78 |
79 | > :bulb: **Example configuration**
80 | >
81 | > ```yaml title="project.yml"
82 | > assets:
83 | > # Download from public HTTPS URL
84 | > - dest: 'assets/training.spacy'
85 | > url: 'https://example.com/data.spacy'
86 | > checksum: '63373dd656daa1fd3043ce166a59474c'
87 | > # Optional download from Google Cloud Storage bucket
88 | > - dest: 'assets/development.spacy'
89 | > extra: True
90 | > url: 'gs://your-bucket/corpora'
91 | > checksum: '5113dc04e03f079525edd8df3f4f39e3'
92 | > ```
93 |
94 | | Name | Description |
95 | | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
96 | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
97 | | `extra` | Optional flag determining whether this asset is downloaded only if `weasel assets` is run with `--extra`. `False` by default. |
98 | | `url` | The URL to download from, using the respective protocol. |
99 | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
100 | | `description` | Optional asset description, used in [auto-generated docs](../cli.md#closed_book-document). |
101 |
102 | ### Downloading from a Git repo
103 |
104 | If a `git` block is provided, the asset is downloaded from the given Git
105 | repository. You can download from any repo that you have access to. Under the
106 | hood, this uses Git's "sparse checkout" feature, so you're only downloading the
107 | files you need and not the whole repo.
108 |
109 | > :bulb: **Example configuration**
110 | >
111 | > ```yaml title="project.yml"
112 | > assets:
113 | > - dest: 'assets/training.spacy'
114 | > git:
115 | > repo: 'https://github.com/example/repo'
116 | > branch: 'master'
117 | > path: 'path/training.spacy'
118 | > checksum: '63373dd656daa1fd3043ce166a59474c'
119 | > description: 'The training data (5000 examples)'
120 | > ```
121 |
122 | | Name | Description |
123 | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
124 | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
125 | | `git`         | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
126 | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
127 | | `description` | Optional asset description, used in [auto-generated docs](../cli.md#closed_book-document). |
128 |
129 | ### Working with private assets
130 |
131 | > :bulb: **Example configuration**
132 | >
133 | > ```yaml title="project.yml"
134 | > assets:
135 | > - dest: 'assets/private_training_data.json'
136 | > checksum: '63373dd656daa1fd3043ce166a59474c'
137 | > - dest: 'assets/private_vectors.bin'
138 | > checksum: '5113dc04e03f079525edd8df3f4f39e3'
139 | > ```
140 |
141 | For many projects, the datasets and weights you're working with might be
142 | company-internal and not available over the internet. In that case, you can
143 | specify the destination paths and a checksum, and leave out the URL. When your
144 | teammates clone and run your project, they can place the files in the respective
145 | directory themselves. The [`assets`](../cli.md#open_file_folder-assets) command
146 | will alert you about missing files and mismatched checksums, so you can ensure
147 | that others are running your project with the same data.
148 |
149 | ## Dependencies and outputs
150 |
151 | Each command defined in the `project.yml` can optionally define a list of
152 | dependencies and outputs. These are the files the command requires and creates.
153 | For example, a command for training a spaCy pipeline may depend on a
154 | [`config.cfg`](https://spacy.io/usage/training#config) and the training and evaluation data, and
155 | it will export a directory `model-best`, which you can then re-use in other
156 | commands.
157 |
158 | > :bulb: **Example configuration**
159 | >
160 | > ```yaml title="project.yml"
161 | > commands:
162 | > - name: train
163 | > help: 'Train a spaCy pipeline using the specified corpus and config'
164 | > script:
165 | > - 'python -m spacy train ./configs/config.cfg -o training/ --paths.train ./corpus/training.spacy --paths.dev ./corpus/evaluation.spacy'
166 | > deps:
167 | > - 'configs/config.cfg'
168 | > - 'corpus/training.spacy'
169 | > - 'corpus/evaluation.spacy'
170 | > outputs:
171 | > - 'training/model-best'
172 | > ```
173 |
174 | > :boom: **Tip: Re-running vs. skipping**
175 | >
176 | > Under the hood, Weasel uses a `project.lock` lockfile that stores the details
177 | > for each command, as well as its dependencies and outputs and their checksums.
178 | > It's updated on each run. If any of this information changes, the command will
179 | > be re-run. Otherwise, it will be skipped.
180 |
181 | If you're running a command and it depends on files that are missing, Weasel will
182 | show you an error. If a command defines dependencies and outputs that haven't
183 | changed since the last run, the command will be skipped. This means that you're
184 | only re-running commands if they need to be re-run. Commands can also set
185 | `no_skip: true` if they should never be skipped – for example commands that run
186 | tests. Commands without outputs are also never skipped. To force re-running a
187 | command or workflow, even if nothing changed, you can set the `--force` flag.
188 |
189 | Note that [`weasel`](../cli.md) doesn't compile any dependency
190 | graphs based on the dependencies and outputs, and won't re-run previous steps
191 | automatically. For instance, if you only run the command `train` that depends on
192 | data created by `preprocess` and those files are missing, Weasel will show an
193 | error – it won't just re-run `preprocess`. If you're looking for more advanced
194 | data management, check out the [Data Version Control (DVC) integration](./integrations.md#data-version-control-dvc).
195 | If you're planning on integrating your Weasel project with DVC, you can also use
196 | `outputs_no_cache` instead of `outputs` to define outputs that won't be cached
197 | or tracked.
198 |
199 | ## Files and directory structure
200 |
201 | The `project.yml` can define a list of `directories` that should be created
202 | within a project – for instance, `assets`, `training`, `corpus` and so on. Weasel
203 | will make sure that these directories are always available, so your commands can
204 | write to and read from them. Project directories will also include all files and
205 | directories copied from the project template with
206 | [`weasel clone`](../cli.md#clipboard-clone). Here's an example of a project
207 | directory:
208 |
209 | > :bulb: **Example configuration**
210 | >
211 | > ```yaml title="project.yml"
212 | > directories:
213 | > - 'assets'
214 | > - 'configs'
215 | > - 'corpus'
216 | > - 'metas'
217 | > - 'metrics'
218 | > - 'notebooks'
219 | > - 'packages'
220 | > - 'scripts'
221 | > - 'training'
222 | > ```
223 | >
224 | >``` title="Example directory structure"
225 | >├── project.yml # the project settings
226 | >├── project.lock # lockfile that tracks inputs/outputs
227 | >├── assets/ # downloaded data assets
228 | >├── configs/ # pipeline config.cfg files used for training
229 | >├── corpus/ # output directory for training corpus
230 | >├── metas/ # pipeline meta.json templates used for packaging
231 | >├── metrics/ # output directory for evaluation metrics
232 | >├── notebooks/ # directory for Jupyter notebooks
233 | >├── packages/ # output directory for pipeline Python packages
234 | >├── scripts/ # directory for scripts, e.g. referenced in commands
235 | >├── training/ # output directory for trained pipelines
236 | >└── ... # any other files, like a requirements.txt etc.
237 | >```
238 |
239 | If you don't want a project to create a directory, you can delete it and remove
240 | its entry from the `project.yml` – just make sure it's not required by any of
241 | the commands. [Custom templates](./custom-scripts.md) can use any directories they need –
242 | the only file that's required for a project is the `project.yml`.
243 |
--------------------------------------------------------------------------------
/docs/tutorial/integrations.md:
--------------------------------------------------------------------------------
1 | # Integrations
2 |
3 | ## Data Version Control (DVC)
4 |
5 | Data assets like training corpora or pretrained weights are at the core of any
6 | NLP project, but they're often difficult to manage: you can't just check them
7 | into your Git repo to version and keep track of them. And if you have multiple
8 | steps that depend on each other, like a preprocessing step that generates your
9 | training data, you need to make sure the data is always up-to-date, and re-run
10 | all steps of your process every time, just to be safe.
11 |
12 | [Data Version Control](https://dvc.org) (DVC) is a standalone open-source tool
13 | that integrates into your workflow like Git, builds a dependency graph for your
14 | data pipelines and tracks and caches your data files. If you're downloading data
15 | from an external source, like a storage bucket, DVC can tell whether the
16 | resource has changed. It can also determine whether to re-run a step, depending
17 | on whether its inputs have changed or not. All metadata can be checked into a Git
18 | repo, so you'll always be able to reproduce your experiments.
19 |
20 | To set up DVC, install the package and initialize your Weasel project as a Git
21 | and DVC repo. You can also
22 | [customize your DVC installation](https://dvc.org/doc/install/macos#install-with-pip)
23 | to include support for remote storage like Google Cloud Storage, S3, Azure, SSH
24 | and more.
25 |
26 | ```bash
27 | pip install dvc # Install DVC
28 | git init # Initialize a Git repo
29 | dvc init # Initialize a DVC project
30 | ```
31 |
32 | > :warning: **Important note on privacy**
33 | >
34 | > DVC enables usage analytics by default, so if you're working in a
35 | > privacy-sensitive environment, make sure to
36 | > [**opt-out manually**](https://dvc.org/doc/user-guide/analytics#opting-out).
37 |
38 | The [`weasel dvc`](../cli.md#repeat-dvc) command creates a `dvc.yaml`
39 | config file based on a workflow defined in your `project.yml`. Whenever you
40 | update your project, you can re-run the command to update your DVC config. You
41 | can then manage your Weasel project like any other DVC project, run
42 | [`dvc add`](https://dvc.org/doc/command-reference/add) to add and track assets
43 | and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the
44 | workflow or individual commands.
45 |
46 | ```bash
47 | python -m weasel dvc [project_dir] [workflow_name]
48 | ```
49 |
50 | > :warning: **Important note for multiple workflows**
51 | >
52 | > DVC currently expects a single workflow per project, so when creating the config
53 | > with [`weasel dvc`](../cli.md#repeat-dvc), you need to specify the name
54 | > of a workflow defined in your `project.yml`. You can still use multiple
55 | > workflows, but only one can be tracked by DVC.
56 |
--------------------------------------------------------------------------------
/docs/tutorial/remote-storage.md:
--------------------------------------------------------------------------------
1 | # Remote Storage
2 |
3 | You can persist your project outputs to a remote storage using the
4 | [`push`](../cli.md#arrow_up-push) command. This can help you **export** your
5 | pipeline packages, **share** work with your team, or **cache results** to avoid
6 | repeating work. The [`pull`](../cli.md#arrow_down-pull) command will download
7 | any outputs that are in the remote storage and aren't available locally.
8 |
9 | You can list one or more remotes in the `remotes` section of your
10 | [`project.yml`](./directory-and-assets.md#projectyml) by mapping a string name
11 | to the URL of the storage. Under the hood, Weasel uses
12 | [`cloudpathlib`](https://cloudpathlib.drivendata.org/) to communicate with the
13 | remote storages, so you can use any protocol that `CloudPath` supports,
14 | including [S3](https://aws.amazon.com/s3/),
15 | [Google Cloud Storage](https://cloud.google.com/storage), and the local
16 | filesystem, although you may need to install extra dependencies to use certain
17 | protocols.
18 |
19 | > :bulb: **Example using remote storage**
20 | >
21 | > ```bash
22 | > $ python -m weasel pull local
23 | > ```
24 | >
25 | > ```yaml title="project.yml"
26 | > remotes:
27 | > default: 's3://my-weasel-bucket'
28 | > local: '/mnt/scratch/cache'
29 | > ```
30 |
31 | > :information_source: **How it works**
32 | >
33 | > Inside the remote storage, Weasel uses a clever **directory structure** to
34 | > avoid overwriting files. The top level of the directory structure is a
35 | > URL-encoded version of the output's path. Within this directory are
36 | > subdirectories named according to a hash of the command string and the
37 | > command's dependencies. Finally, within those directories are files, named
38 | > according to an MD5 hash of their contents.
39 | >
40 | > ```
41 | > └── urlencoded_file_path # Path of original file
42 | > ├── some_command_hash # Hash of command you ran
43 | > │ ├── some_content_hash # Hash of file content
44 | > │ └── another_content_hash
45 | > └── another_command_hash
46 | > └── third_content_hash
47 | > ```
48 |
49 | For instance, let's say you had the following spaCy command in your
50 | `project.yml`:
51 |
52 | ```yaml title="project.yml"
53 | - name: train
54 | help: 'Train a spaCy pipeline using the specified corpus and config'
55 | script:
56 | - 'spacy train ./config.cfg --output training/'
57 | deps:
58 | - 'corpus/train'
59 | - 'corpus/dev'
60 | - 'config.cfg'
61 | outputs:
62 | - 'training/model-best'
63 | ```
64 |
65 | After you finish training, you run [`push`](../cli.md#arrow_up-push) to make
66 | sure the `training/model-best` output is saved to remote storage. Weasel will
67 | then construct a hash from your command script and the listed dependencies,
68 | `corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the
69 | execution context of your output. It would then compute an MD5 hash of the
70 | `training/model-best` directory, and use those three pieces of information to
71 | construct the storage URL.
72 |
73 | ```bash
74 | python -m weasel run train
75 | python -m weasel push
76 | ```
77 |
78 | ```title="Overview of the S3 bucket"
79 | └── s3://my-weasel-bucket/training%2Fmodel-best
80 | └── 1d8cb33a06cc345ad3761c6050934a1b
81 | └── d8e20c3537a084c5c10d95899fe0b1ff
82 | ```
83 |
84 | If you change the command or one of its dependencies (for instance, by editing
85 | the [`config.cfg`](https://spacy.io/usage/training#config) file to tune the
86 | hyperparameters), a different creation hash will be calculated, so when you use
87 | [`push`](../cli.md#arrow_up-push) you won't be overwriting your previous file.
88 | The system even supports multiple outputs for the same file and the same
89 | context, which can happen if your training process is not deterministic, or if
90 | you have dependencies that aren't represented in the command.
91 |
92 | In summary, the `weasel` remote storages are designed to make a particular set
93 | of trade-offs. Priority is placed on **convenience**, **correctness** and
94 | **avoiding data loss**. You can use [`push`](../cli.md#arrow_up-push) freely, as
95 | you'll never overwrite remote state, and you don't have to come up with names or
96 | version numbers. However, it's up to you to manage the size of your remote
97 | storage, and to remove files that are no longer relevant to you.
98 |
--------------------------------------------------------------------------------
/docs/tutorial/workflow.md:
--------------------------------------------------------------------------------
1 | # Workflow
2 |
3 | ## 1. Clone a project template
4 |
5 | > :information_source: **Cloning under the hood**
6 | >
7 | > To clone a project, Weasel calls into `git` and uses the "sparse checkout"
8 | > feature to only clone the relevant directory or directories.
9 |
10 | The [`weasel clone`](../cli.md#clipboard-clone) command clones an existing
11 | project template and copies the files to a local directory. You can then run the
12 | project, e.g. to train a pipeline and edit the commands and scripts to build
13 | fully custom workflows.
14 |
15 | ```bash
16 | python -m weasel clone pipelines/tagger_parser_ud
17 | ```
18 |
19 | By default, the project will be cloned into the current working directory. You
20 | can specify an optional second argument to define the output directory. The
21 | `--repo` option lets you define a custom repo to clone from if you don't want to
22 | use the default [`projects`](https://github.com/explosion/projects) repo. You can
23 | also use any private repo you have access to with Git.
24 |
25 | ## 2. Fetch the project assets
26 |
27 | Assets are data files your project needs – for example, the training and
28 | evaluation data or pretrained vectors and embeddings to initialize your model
29 | with. Each project template comes with a `project.yml` that defines the assets
30 | to download and where to put them. The [`weasel assets`](../cli.md#open_file_folder-assets)
31 | command will fetch the project assets for you.
32 |
33 | > :bulb: **Example usage**
34 | >
35 | > ```yaml title="project.yml"
36 | > assets:
37 | > - dest: 'assets/training.spacy'
38 | > url: 'https://example.com/data.spacy'
39 | > checksum: '63373dd656daa1fd3043ce166a59474c'
40 | > - dest: 'assets/development.spacy'
41 | > git:
42 | > repo: 'https://github.com/example/repo'
43 | > branch: 'master'
44 | > path: 'path/development.spacy'
45 | > checksum: '5113dc04e03f079525edd8df3f4f39e3'
46 | > ```
47 | >
48 | > Let Weasel fetch the assets:
49 | >
50 | > ```bash
51 | > python -m weasel assets
52 | > ```
53 |
54 | Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
55 | even cloud storage such as GCS and S3. You can also fetch assets using git, by
56 | replacing the `url` string with a `git` block. Weasel will use Git's "sparse
57 | checkout" feature to avoid downloading the whole repository.
58 |
59 | Sometimes your project configuration may include large assets that you don't
60 | necessarily want to download when you run `weasel assets`. That's why
61 | assets can be marked as [`extra`](./directory-and-assets.md#data-assets) - by default, these assets
62 | are not downloaded. If they should be, run `weasel assets --extra`.
63 |
64 | ## 3. Run a command
65 |
66 | Commands consist of one or more steps and can be run with
67 | [`weasel run`](../cli.md#rocket-run). The following will run the command
68 | `preprocess` defined in the `project.yml`:
69 |
70 | > :bulb: **Example usage**
71 | >
72 | > ```yaml title="project.yml"
73 | > commands:
74 | > - name: preprocess
75 | > help: "Convert the input data to spaCy's format"
76 | > script:
77 | > - 'python -m spacy convert assets/train.conllu corpus/'
78 | > - 'python -m spacy convert assets/eval.conllu corpus/'
79 | > deps:
80 | > - 'assets/train.conllu'
81 | > - 'assets/eval.conllu'
82 | > outputs:
83 | > - 'corpus/train.spacy'
84 | > - 'corpus/eval.spacy'
85 | > ```
86 | >
87 | > Run the command:
88 | >
89 | > ```bash
90 | > python -m weasel run preprocess
91 | > ```
92 |
93 | Commands can define their expected [dependencies and outputs](./directory-and-assets.md#dependencies-and-outputs)
94 | using the `deps` (files the commands require) and `outputs` (files the commands
95 | create) keys. This allows your project to track changes and determine whether a
96 | command needs to be re-run. For instance, if your input data changes, you want
97 | to re-run the `preprocess` command. But if nothing changed, this step can be
98 | skipped. You can also set `--force` to force re-running a command, or `--dry` to
99 | perform a "dry run" and see what would happen (without actually running the
100 | script).
101 |
102 | ## 4. Run a workflow
103 |
104 | Workflows are series of commands that are run in order and often depend on each
105 | other. For instance, to generate a spaCy pipeline package, you might start by
106 | converting your data, then run [`spacy train`](https://spacy.io/api/cli#train) to train your
107 | pipeline on the converted data and if that's successful, run
108 | [`spacy package`](https://spacy.io/api/cli#package) to turn the best trained artifact into an
109 | installable Python package. The following command runs the workflow named `all`
110 | defined in the `project.yml`, and executes the commands it specifies, in order:
111 |
112 | > :bulb: **Example usage**
113 | >
114 | > ```yaml title="project.yml"
115 | > workflows:
116 | > all:
117 | > - preprocess
118 | > - train
119 | > - package
120 | > ```
121 | >
122 | > ```bash
123 | > python -m weasel run all
124 | > ```
125 |
126 | Using the expected [dependencies and outputs](./directory-and-assets.md#dependencies-and-outputs)
127 | defined in the commands, Weasel can determine whether to re-run a command (if its inputs or
128 | outputs have changed) or whether to skip it. If you're looking to implement more
129 | advanced data pipelines and track your changes in Git, check out the
130 | [Data Version Control (DVC) integration](./integrations.md#data-version-control-dvc). The
131 | [`weasel dvc`](../cli.md#repeat-dvc) command generates a DVC config file
132 | from a workflow defined in your `project.yml` so you can manage your Weasel
133 | project as a DVC repo.
134 |
135 | ## 5. Optional: Push to remote storage
136 |
137 | After training a pipeline, you can optionally use the
138 | [`weasel push`](../cli.md#arrow_up-push) command to upload your outputs to
139 | a remote storage, using protocols like [S3](https://aws.amazon.com/s3/),
140 | [Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help
141 | you **export** your pipeline packages, **share** work with your team, or **cache
142 | results** to avoid repeating work.
143 |
144 | > :bulb: **Example usage**
145 | >
146 | > ```yaml title="project.yml"
147 | > remotes:
148 | > default: 's3://my-weasel-bucket'
149 | > local: '/mnt/scratch/cache'
150 | > ```
151 | >
152 | > Push to remote:
153 | >
154 | > ```bash
155 | > python -m weasel push
156 | > ```
157 |
158 | The `remotes` section in your `project.yml` lets you assign names to the
159 | different storages. To download state from a remote storage, you can use the
160 | [`weasel pull`](../cli.md#arrow_down-pull) command. For more details, see the
161 | docs on [remote storage](./remote-storage.md).
162 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.ruff]
6 | ignore = [
7 | "E501",
8 | ]
9 | select = [
10 | "E", # pycodestyle errors
11 | "W", # pycodestyle warnings
12 | "F", # Pyflakes
13 | "Q", # flake8-quotes
14 | ]
15 |
16 | [tool.ruff.per-file-ignores]
17 | # Ignore unused imports in __init__ files
18 | "__init__.py" = ["F401"]
19 |
20 |
21 | [tool.isort]
22 | multi_line_output = 9
23 | profile = "black"
24 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Our libraries
2 | confection>=0.0.4,<0.2.0
3 | wasabi>=0.9.1,<1.2.0
4 | srsly>=2.4.3,<3.0.0
5 | typer>=0.3.0,<1.0.0
6 | cloudpathlib>=0.7.0,<1.0.0
7 | smart-open>=5.2.1,<8.0.0
8 | # Third party dependencies
9 | requests>=2.13.0,<3.0.0
10 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
11 | # Official Python utilities
12 | packaging>=20.0
13 | # Development dependencies
14 | black==22.3.0
15 | pytest>=5.2.0,!=7.1.0
16 | mypy>=1.5.0,<1.7.0; python_version >= "3.8"
17 | types-requests
18 | types-setuptools>=57.0.0
19 | ruff>=0.0.259
20 | isort>=5.12.0,<6.0; python_version > "3.7"
21 | pre-commit>=3.2.0,<4.0.0; python_version > "3.7"
22 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = weasel
3 | version = 0.4.1
4 | description = Weasel: A small and easy workflow system
5 | url = https://github.com/explosion/weasel/
6 | author = Explosion
7 | author_email = contact@explosion.ai
8 | license = MIT
9 | long_description = file: README.md
10 | long_description_content_type = text/markdown
11 | classifiers =
12 | Environment :: Console
13 | Intended Audience :: Developers
14 | Intended Audience :: Science/Research
15 | License :: OSI Approved :: MIT License
16 | Operating System :: POSIX :: Linux
17 | Operating System :: MacOS :: MacOS X
18 | Operating System :: Microsoft :: Windows
19 | Programming Language :: Python :: 3
20 | Programming Language :: Python :: 3.7
21 | Programming Language :: Python :: 3.8
22 | Programming Language :: Python :: 3.9
23 | Programming Language :: Python :: 3.10
24 | Programming Language :: Python :: 3.11
25 | Programming Language :: Python :: 3.12
26 | Topic :: Scientific/Engineering
27 | project_urls =
28 | Release notes = https://github.com/explosion/weasel/releases
29 | Source = https://github.com/explosion/weasel/
30 |
31 | [options]
32 | python_requires = >=3.7
33 | install_requires =
34 | confection>=0.0.4,<0.2.0
35 | packaging>=20.0
36 | wasabi>=0.9.1,<1.2.0
37 | srsly>=2.4.3,<3.0.0
38 | typer>=0.3.0,<1.0.0
39 | cloudpathlib>=0.7.0,<1.0.0
40 | smart-open>=5.2.1,<8.0.0
41 | requests>=2.13.0,<3.0.0
42 | pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
43 |
44 |
45 | [options.entry_points]
46 | console_scripts =
47 | weasel = weasel.cli:app
48 |
49 | [tool:pytest]
50 | markers =
51 | issue: references specific issue
52 |
53 | [mypy]
54 | ignore_missing_imports = True
55 | no_implicit_optional = True
56 | plugins = pydantic.mypy
57 | allow_redefinition = True
58 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
# Shim entry point: all package metadata lives in setup.cfg; this only exists
# so `pip install .` / `python setup.py` can discover the packages.
if __name__ == "__main__":
    from setuptools import find_packages, setup

    setup(packages=find_packages())
8 |
--------------------------------------------------------------------------------
/weasel/__init__.py:
--------------------------------------------------------------------------------
1 | from .cli import app
2 |
--------------------------------------------------------------------------------
/weasel/__main__.py:
--------------------------------------------------------------------------------
from .cli.main import COMMAND, app

# Entry point for `python -m weasel`; passing COMMAND as prog_name makes the
# help output show the invocation the user actually typed.
app(prog_name=COMMAND)
4 |
--------------------------------------------------------------------------------
/weasel/about.py:
--------------------------------------------------------------------------------
# Default repository of project templates used by `weasel clone`
__projects__ = "https://github.com/explosion/projects"
# Default branch of that templates repository
__projects_branch__ = "v3"
3 |
--------------------------------------------------------------------------------
/weasel/cli/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import app # isort: skip
2 |
3 | from .assets import project_assets
4 | from .clone import project_clone
5 | from .document import project_document
6 | from .dvc import project_update_dvc
7 | from .pull import project_pull
8 | from .push import project_push
9 | from .run import project_run
10 |
--------------------------------------------------------------------------------
/weasel/cli/assets.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | from pathlib import Path
5 | from typing import Any, Dict, Optional
6 |
7 | import requests
8 | import typer
9 | from wasabi import msg
10 |
11 | from ..util import SimpleFrozenDict, download_file, ensure_path, get_checksum
12 | from ..util import get_git_version, git_checkout, load_project_config
13 | from ..util import parse_config_overrides, working_dir
14 | from .main import PROJECT_FILE, Arg, Opt, app
15 |
# Whether assets are extra if `extra` is not set.
# ("extra" assets are only fetched when the --extra flag is passed)
EXTRA_DEFAULT = False
18 |
19 |
@app.command(
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-and-assets.md
    """
    # Unknown extra CLI args are interpreted as config overrides (--vars.x y)
    overrides = parse_config_overrides(ctx.args)
    project_assets(
        project_dir,
        overrides=overrides,
        sparse_checkout=sparse_checkout,
        extra=extra,
    )
46 |
47 |
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Optional config overrides.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
        needed.
    extra (bool): Whether to download all assets, including those marked as 'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    # Filter out "extra" assets unless explicitly requested
    assets = []
    for candidate in config.get("assets", []):
        if extra or not candidate.get("extra", EXTRA_DEFAULT):
            assets.append(candidate)
    if not assets:
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")

    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" not in asset:
            # URL-based (or user-provided/private) asset
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
            else:
                fetch_asset(project_path, url, dest, checksum)
            continue
        # Git-provided asset: requires a working `git` executable
        get_git_version(
            error=(
                "Cloning Weasel project templates requires Git and the 'git' command. "
                "Make sure it's installed and that the executable is available."
            )
        )
        if dest.exists():
            # Skip download when the local copy already matches the checksum
            if checksum and checksum == get_checksum(dest):
                msg.good(
                    f"Skipping download with matching checksum: {asset['dest']}"
                )
                continue
            # Otherwise remove the stale copy before re-cloning
            if dest.is_dir():
                shutil.rmtree(dest)
            else:
                dest.unlink()
        git_info = asset["git"]
        if git_info.get("repo") is None:
            msg.fail(
                "A git asset must include 'repo', the repository address.", exits=1
            )
        if git_info.get("path") is None:
            msg.fail(
                "A git asset must include 'path' - use \"\" to get the entire repository.",
                exits=1,
            )
        git_checkout(
            git_info["repo"],
            git_info["path"],
            dest,
            branch=git_info.get("branch"),
            sparse=sparse_checkout,
        )
        msg.good(f"Downloaded asset {dest}")
121 |
122 |
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Check and validate assets without a URL (private assets that the user
    has to provide themselves) and give feedback about the checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        # Nothing we can verify: the user still has to supply the file
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    if not checksum:
        msg.good(f"Asset already exists: {dest}")
    elif checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
140 |
141 |
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, resolved relative to project_path.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (None): Failures are reported via msg.fail instead of being raised.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists():
        # If there's already a file, check for checksum
        if checksum:
            if checksum == get_checksum(dest_path):
                msg.good(f"Skipping download with matching checksum: {dest}")
                return
        else:
            # If there's not a checksum, make sure the file is a possibly valid size
            if os.path.getsize(dest_path) == 0:
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(dest_path)
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    if not dest_path.parent.exists():
        dest_path.parent.mkdir(parents=True)
    with working_dir(project_path):
        # Rewrite plain GitHub page URLs to raw-content URLs
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            # Download failed: the "URL" may actually be a local filesystem path
            if Path(url).exists() and Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
    # NOTE(review): on checksum mismatch the fetched file is reported but kept
    # on disk — presumably for inspection; confirm this is intended.
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
184 |
185 |
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake: it would
    # download the HTML page, not the file. Release downloads and /raw/ URLs
    # are fine as-is. The dots in the hostname are escaped so that e.g.
    # "githubXcom" is not accidentally matched (the original pattern used a
    # bare "." which matches any character).
    if (
        re.match(r"(http(s?))://github\.com", url)
        and "releases/download" not in url
        and "/raw/" not in url
    ):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
208 |
--------------------------------------------------------------------------------
/weasel/cli/clone.py:
--------------------------------------------------------------------------------
1 | import re
2 | import subprocess
3 | from pathlib import Path
4 | from typing import Optional
5 |
6 | import typer
7 | from wasabi import msg
8 |
9 | from .. import about
10 | from ..util import ensure_path, get_git_version, git_checkout, git_repo_branch_exists
11 | from .main import COMMAND, PROJECT_FILE, Arg, Opt, _get_parent_command, app
12 |
# Template repo/branch used when --repo / --branch aren't given
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
# Branches probed (in order) when cloning a custom repo without --branch
DEFAULT_BRANCHES = ["main", "master"]
16 |
17 |
@app.command("clone")
def project_clone_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read the parent command
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+."),
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official Weasel template repo, but can be customized
    (including using a private repo).

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-clone
    """
    if dest is None:
        # Default destination: subdirectory of CWD named after the template
        dest = Path.cwd() / Path(name).parts[-1]
    if repo == DEFAULT_REPO and branch is None:
        branch = DEFAULT_PROJECTS_BRANCH

    if branch is None:
        # Custom repo without an explicit branch: probe common defaults
        for default_branch in DEFAULT_BRANCHES:
            if git_repo_branch_exists(repo, default_branch):
                branch = default_branch
                break
        if branch is None:
            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
            msg.fail(
                "No branch provided and attempted default "
                f"branches {default_branches_msg} do not exist.",
                exits=1,
            )
    else:
        if not git_repo_branch_exists(repo, branch):
            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
    assert isinstance(branch, str)
    parent_command = _get_parent_command(ctx)
    project_clone(
        name,
        dest,
        repo=repo,
        branch=branch,
        sparse_checkout=sparse_checkout,
        parent_command=parent_command,
    )
66 |
67 |
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
    parent_command: str = COMMAND,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from.
    sparse_checkout (bool): Use sparse Git checkout when cloning.
    parent_command (str): Command prefix shown in the follow-up hint.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Strip the GitHub prefix so messages show "org/repo" rather than the URL
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        msg.fail(
            f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')",
            exits=1,
        )
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    if (project_dir / PROJECT_FILE).exists():
        msg.good("Your project is now ready!")
    else:
        msg.warn(f"No {PROJECT_FILE} found in directory")
    print(f"To fetch the assets, run:\n{parent_command} assets {dest}")
99 |
100 |
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    # Git itself must be available before any destination checks matter
    get_git_version(
        error=(
            f"Cloning Weasel project templates requires Git and the 'git' command. "
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually."
        )
    )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
            f"Create the necessary folder(s) first before continuing.",
            exits=1,
        )
127 |
--------------------------------------------------------------------------------
/weasel/cli/document.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from wasabi import MarkdownRenderer, msg
4 |
5 | from ..util import load_project_config, working_dir
6 | from .main import PROJECT_FILE, Arg, Opt, app
7 |
# Base URL used for links in the generated README
DOCS_URL = "https://github.com/explosion/weasel"
# Intro paragraphs inserted verbatim into the generated README, one per
# section. These are runtime strings: editing them changes the output of
# `weasel document`.
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[Weasel documentation]({DOCS_URL})."""
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`weasel run [name]`]({DOCS_URL}/tree/main/docs/cli.md#rocket-run).
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`weasel run [name]`]({DOCS_URL}/tree/main/docs/cli.md#rocket-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`weasel assets`]({DOCS_URL}/tree/main/docs/cli.md#open_file_folder-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_TAGS = ("WEASEL", "SPACY PROJECT")
# The markers are HTML comments so they're invisible in rendered Markdown.
# They must be non-empty: an empty MARKER_IGNORE would be a substring of every
# file, making project_document() skip updating any existing README. (The
# original marker text was lost — it was an HTML comment stripped by a tool.)
MARKER_START = "<!-- {tag}: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- {tag}: AUTO-GENERATED DOCS END (do not remove) -->"
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = "<!-- {tag}: IGNORE -->"
29 |
30 |
@app.command("document")
def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-document
    """
    # Thin CLI shim: all logic lives in project_document()
    project_document(project_dir, output_file, no_emoji=no_emoji)
48 |
49 |
def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    """Render the project's README from the project.yml and write it out.

    project_dir (Path): Path to the project directory (must contain project.yml).
    output_file (Path): Output path for the Markdown, or "-" for stdout.
    no_emoji (bool): Don't use emoji in section headings.
    """
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    # Everything between the start/end markers is the auto-generated section
    md.add(MARKER_START.format(tag="WEASEL"))
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"Weasel Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", "⏯"))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs = config.get("workflows", {}).items()
    data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
    if data:
        md.add(md.title(3, "Workflows", "⏭"))
        md.add(INTRO_WORKFLOWS)
        md.add(md.table(data, ["Workflow", "Steps"]))
    # Assets
    assets = config.get("assets", [])
    data = []
    for a in assets:
        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
        dest_path = a["dest"]
        dest = md.code(dest_path)
        if source == "Local":
            # Only link assets if they're in the repo
            with working_dir(project_dir) as p:
                if (p / dest_path).exists():
                    dest = md.link(dest, dest_path)
        data.append((dest, source, a.get("description", "")))
    if data:
        md.add(md.title(3, "Assets", "🗂"))
        md.add(INTRO_ASSETS)
        md.add(md.table(data, ["File", "Source", "Description"]))
    md.add(MARKER_END.format(tag="WEASEL"))
    # Output result
    if is_stdout:
        print(md.text)
    else:
        content = md.text
        if output_file.exists():
            with output_file.open("r", encoding="utf8") as f:
                existing = f.read()

            # An ignore marker (any known tag) means: never touch this file
            for marker_tag in MARKER_TAGS:
                if MARKER_IGNORE.format(tag=marker_tag) in existing:
                    msg.warn(
                        "Found ignore marker in existing file: skipping", output_file
                    )
                    return

            # Splice the new auto-generated section between existing markers,
            # keeping any custom content before and after them
            marker_tag_found = False
            for marker_tag in MARKER_TAGS:
                markers = {
                    "start": MARKER_START.format(tag=marker_tag),
                    "end": MARKER_END.format(tag=marker_tag),
                }
                if markers["start"] in existing and markers["end"] in existing:
                    marker_tag_found = True
                    msg.info("Found existing file: only replacing auto-generated docs")
                    before = existing.split(markers["start"])[0]
                    after = existing.split(markers["end"])[1]
                    content = f"{before}{content}{after}"
                    break
            if not marker_tag_found:
                # No markers: the whole file is overwritten
                msg.warn("Replacing existing file")

        with output_file.open("w", encoding="utf8") as f:
            f.write(content)
        msg.good("Saved project documentation", output_file)
131 |
--------------------------------------------------------------------------------
/weasel/cli/dvc.py:
--------------------------------------------------------------------------------
1 | """This module contains helpers and subcommands for integrating Weasel
2 | with Data Version Control (DVC). https://dvc.org"""
3 | import subprocess
4 | from pathlib import Path
5 | from typing import Any, Dict, List, Optional
6 |
7 | from wasabi import msg
8 |
9 | from ..util import get_hash, join_command, load_project_config, run_command, working_dir
10 | from .main import COMMAND, NAME, PROJECT_FILE, Arg, Opt, app
11 |
DVC_CONFIG = "dvc.yaml"  # filename of the auto-generated DVC pipeline config
DVC_DIR = ".dvc"  # directory created by `dvc init`
UPDATE_COMMAND = "dvc"  # name under which the subcommand below is registered
# Header written to the top of the generated dvc.yaml (runtime string)
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by Weasel based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} {UPDATE_COMMAND}"""
18 |
19 |
@app.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc
    """
    # Thin CLI shim: all logic lives in project_update_dvc()
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
39 |
40 |
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. Will only update the file if the checksum changed.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    was_updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
    )
    hint = "To execute the workflow with DVC, run: dvc repro"
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", hint)
    else:
        msg.good(f"Updated DVC config from {PROJECT_FILE}", hint)
69 |
70 |
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Name of the workflow to convert. Defaults to the
        first workflow defined in the config.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        dvc_config_path.unlink()
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # some flags that apply to every command
    flags = []
    if verbose:
        flags.append("--verbose")
    if quiet:
        flags.append("--quiet")

    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        # Stages without deps/outputs can't be tracked by DVC; skip them
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there.
        # FIX: the weasel CLI registers `run` at the top level — there is no
        # "project" subcommand (that was the spaCy invocation), so the
        # generated stage must call `python -m weasel run <name>`.
        project_cmd = ["python", "-m", NAME, "run", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]

        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))

    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading the
        # config, since DVC wouldn't create it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        for c in dvc_commands:
            dvc_command = "dvc " + c
            run_command(dvc_command)
        # Prepend the config hash + explanatory header to the generated file
        with dvc_config_path.open("r+", encoding="utf8") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
155 |
156 |
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    # A DVC pipeline needs at least one workflow to convert
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    if workflow is not None and workflow not in workflows:
        available = ", ".join(workflows)
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {available}",
            exits=1,
        )
    if not workflow:
        # Caller will fall back to the first workflow; let the user know
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )
181 |
182 |
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project.

    project_dir (Path): The project directory to check for a ".dvc" folder.
    """
    # Probe for the dvc executable; any failure (not installed, not on PATH)
    # is treated the same way.
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        install_help = (
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install"
        )
        msg.fail(
            "To use Weasel with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            install_help,
            exits=1,
        )
    dvc_dir = project_dir / ".dvc"
    if not dvc_dir.exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
206 |
--------------------------------------------------------------------------------
/weasel/cli/main.py:
--------------------------------------------------------------------------------
1 | import typer
2 |
# How the CLI is invoked; used in help text and recorded in lockfile entries.
COMMAND = "python -m weasel"
NAME = "weasel"
# Top-level help string shown by the Typer app below.
HELP = """weasel Command-line Interface

DOCS: https://github.com/explosion/weasel
"""

# Filenames of the project config and the lockfile written after runs.
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"

# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
Arg = typer.Argument
Opt = typer.Option

# Shared Typer application that all weasel subcommands register against.
app = typer.Typer(name=NAME, help=HELP, no_args_is_help=True)
19 |
20 |
def _get_parent_command(ctx: typer.Context) -> str:
    """Reconstruct the invoking command prefix (e.g. "weasel") by walking up
    the Typer/Click context chain. If any ancestor context has no name, fall
    back to the canonical COMMAND string.
    """
    names = []
    node = ctx.parent
    while node:
        if not node.info_name:
            return COMMAND
        names.append(node.info_name)
        node = node.parent
    # names were collected child-to-root; the command reads root-to-child.
    return " ".join(reversed(names)).strip()
31 |
--------------------------------------------------------------------------------
/weasel/cli/pull.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from wasabi import msg
4 |
5 | from ..util import load_project_config, logger
6 | from .main import Arg, app
7 | from .remote_storage import RemoteStorage, get_command_hash
8 | from .run import update_lockfile
9 |
10 |
@app.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart_open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-pull
    """
    # Fixed DOCS anchor: this is the pull command, but the link pointed at the
    # push section ("#arrow_down-push"). push.py uses "#arrow_up-push".
    # Report each pulled output; a None URL means nothing matched in the cache.
    for url, output_path in project_pull(project_dir, remote):
        if url is not None:
            msg.good(f"Pulled {output_path} from {url}")
28 |
29 |
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Retrieve precomputed outputs from a remote storage for every command
    whose dependencies are present locally.

    project_dir (Path): The project directory.
    remote (str): Remote alias (resolved via the config's "remotes" section)
        or a storage path/URL.
    verbose (bool): Not referenced in this body — presumably kept for API
        compatibility; confirm before removing.
    YIELDS: (url, output_path) pairs, where url is None if no matching object
        was found in the remote storage for that output.
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        # Resolve a remote alias to its configured storage path/URL.
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list: each pass pulls every command whose deps already exist,
    # and the loop repeats until a full pass makes no progress.
    while commands:
        for i, cmd in enumerate(list(commands)):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    logger.debug(
                        "URL: %s for %s with command hash %s",
                        url,
                        output_path,
                        cmd_hash,
                    )
                    yield url, output_path

                # Only record the command in the lockfile once all of its
                # outputs actually exist on disk.
                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.pop(i)
                break
            else:
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
        else:
            # If we didn't break the for loop, break the while loop.
            break
69 |
--------------------------------------------------------------------------------
/weasel/cli/push.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from wasabi import msg
4 |
5 | from ..util import load_project_config, logger
6 | from .main import Arg, app
7 | from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
8 |
9 |
@app.command("push")
def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push
    """
    # Report each output as it is processed; a None URL means it was skipped.
    for output_path, url in project_push(project_dir, remote):
        if url is None:
            msg.info(f"Skipping {output_path}")
            continue
        msg.good(f"Pushed {output_path} to {url}")
29 |
30 |
def project_push(project_dir: Path, remote: str):
    """Persist outputs to a remote storage. You can alias remotes in your project.yml
    by mapping them to storage paths. A storage can be anything that the smart_open
    library can upload to, e.g. gcs, aws, ssh, local directories etc

    project_dir (Path): The project directory.
    remote (str): Remote alias (resolved via the config's "remotes" section)
        or a storage path/URL.
    YIELDS: (output_path, url) pairs for every output that was pushed.
    """
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        # Resolve a remote alias to its configured storage path/URL.
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if any(not dep.exists() for dep in deps):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        # Reuse the dependency paths computed above instead of rebuilding the
        # same list a second time (the original duplicated the comprehension).
        cmd_hash = get_command_hash("", "", deps, cmd["script"])
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            # Only push outputs that exist and are not empty directories.
            if output_loc.exists() and _is_not_empty_dir(output_loc):
                url = storage.push(
                    output_path,
                    command_hash=cmd_hash,
                    content_hash=get_content_hash(output_loc),
                )
                logger.debug(
                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                )
                yield output_path, url
62 |
63 |
def _is_not_empty_dir(loc: Path):
    """Return True for files, and for directories whose tree contains at least
    one file anywhere below them; False for (recursively) empty directories.
    """
    if not loc.is_dir():
        # A plain file always counts as non-empty content.
        return True
    return any(_is_not_empty_dir(child) for child in loc.iterdir())
71 |
--------------------------------------------------------------------------------
/weasel/cli/remote_storage.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import site
4 | import sys
5 | import tarfile
6 | import urllib.parse
7 | from pathlib import Path
8 | from typing import TYPE_CHECKING, Dict, List, Optional
9 |
10 | from wasabi import msg
11 |
12 | from ..errors import Errors
13 | from ..util import check_spacy_env_vars, download_file, ensure_pathy, get_checksum
14 | from ..util import get_hash, make_tempdir, upload_file
15 |
16 | if TYPE_CHECKING:
17 | from cloudpathlib import CloudPath
18 |
19 |
class RemoteStorage:
    """Push and pull outputs to and from a remote file storage.

    Remotes can be anything that `smart_open` can support: AWS, GCS, file system,
    ssh, etc.
    """

    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        """Create a storage wrapper.

        project_root (Path): Root of the local project directory.
        url (str): Base URL/path of the remote storage.
        compression (str): tarfile compression suffix ("gz" by default); a
            falsy value disables compression.
        """
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "CloudPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            # Identical (path, command, content) combination already uploaded.
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                # Archive under the project-relative path as arcname so that
                # pull() can extract directly into the project root.
                tar_file.add(str(loc), arcname=str(path))
            upload_file(tar_loc, url)
        return url

    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["CloudPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are returned. If no results are available, an error is raised.
        """
        dest = self.root / path
        if dest.exists():
            # Never overwrite an existing local copy.
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()

                    # Disallow paths outside the current directory for the tar
                    # file (CVE-2007-4559, directory traversal vulnerability)
                    def is_within_directory(directory, target):
                        abs_directory = os.path.abspath(directory)
                        abs_target = os.path.abspath(target)
                        prefix = os.path.commonprefix([abs_directory, abs_target])
                        return prefix == abs_directory

                    def safe_extract(tar, path):
                        for member in tar.getmembers():
                            member_path = os.path.join(path, member.name)
                            if not is_within_directory(path, member_path):
                                # Fixed: the file's Errors class defines the
                                # traversal message as E801, not E201 (which
                                # does not exist and would raise AttributeError).
                                raise ValueError(Errors.E801)
                        if sys.version_info >= (3, 12):
                            # Python 3.12+ supports an explicit extraction filter.
                            tar.extractall(path, filter="data")
                        else:
                            tar.extractall(path)

                    safe_extract(tar_file, self.root)
            return url

    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["CloudPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            # Fully specified: there is at most one exact match.
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
            if content_hash is not None:
                urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            # Prefer the most recently modified candidate; best-effort only,
            # since not every storage backend supports stat().
            try:
                urls.sort(key=lambda x: x.stat().st_mtime)
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "CloudPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)
152 |
153 |
def get_content_hash(loc: Path) -> str:
    """Return the checksum of the file or directory at *loc* (delegates to
    get_checksum)."""
    return get_checksum(loc)
156 |
157 |
def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command. This includes the
    currently installed packages, whatever environment variables have been marked
    as relevant, and the command.

    site_hash (str): Hash of the installed packages.
    env_hash (str): Hash of the relevant environment variables.
    deps (List[Path]): Dependency paths; checksummed in sorted order.
    cmd (List[str]): The command's script steps.
    RETURNS (str): An MD5 hex digest over all of the above.
    """
    check_spacy_env_vars()
    # Sort deps so the hash is independent of declaration order.
    hash_parts = [site_hash, env_hash]
    for dep in sorted(deps):
        hash_parts.append(get_checksum(dep))
    hash_parts.extend(cmd)
    return hashlib.md5("".join(hash_parts).encode("utf8")).hexdigest()
171 |
172 |
def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.

    RETURNS (str): MD5 hex digest over the sorted dist-info package names.
    """
    site_dirs = site.getsitepackages()
    if site.ENABLE_USER_SITE:
        # Fixed: getusersitepackages() returns a single path *string*, so it
        # must be appended — extend() would add one character per entry.
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        # Skip configured site dirs that don't exist on disk (e.g. an
        # uncreated user site directory) instead of crashing on iterdir().
        if not site_dir.exists():
            continue
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    # Fixed: hashlib has no "md5sum" attribute; the constructor is md5().
    return hashlib.md5(package_bytes).hexdigest()
189 |
190 |
def get_env_hash(env: Dict[str, str]) -> str:
    """Construct a hash of the environment variables that will be passed into
    the commands.

    Values in the env dict may be references to the current os.environ, using
    the syntax $ENV_VAR to mean os.environ[ENV_VAR]

    env (Dict[str, str]): The configured environment variable mapping.
    RETURNS (str): A hash of the resolved mapping.
    """
    resolved = {
        # "$NAME" means: substitute the current value of os.environ["NAME"]
        # (empty string if unset); anything else is taken literally.
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }
    return get_hash(resolved)
205 |
--------------------------------------------------------------------------------
/weasel/cli/run.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 | from typing import Any, Dict, Iterable, List, Optional, Sequence
4 |
5 | import srsly
6 | import typer
7 | from wasabi import msg
8 | from wasabi.util import locale_escape
9 |
10 | from ..util import SimpleFrozenDict, SimpleFrozenList, check_spacy_env_vars
11 | from ..util import get_checksum, get_hash, is_cwd, join_command, load_project_config
12 | from ..util import parse_config_overrides, run_command, split_command, working_dir
13 | from .main import COMMAND, PROJECT_FILE, PROJECT_LOCK, Arg, Opt, _get_parent_command
14 | from .main import app
15 |
16 |
@app.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#rocket-run
    """
    parent_command = _get_parent_command(ctx)
    if show_help or not subcommand:
        # --help, or no command given: show the simulated project help instead.
        print_run_help(project_dir, subcommand, parent_command)
        return
    # Any extra, unrecognized CLI arguments are treated as config overrides.
    project_run(
        project_dir,
        subcommand,
        overrides=parse_config_overrides(ctx.args),
        force=force,
        dry=dry,
        parent_command=parent_command,
    )
50 |
51 |
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
    parent_command: str = COMMAND,
) -> None:
    """Run a named command or workflow defined in the project.yml. For a
    workflow, each of its commands is run in order by calling this function
    recursively. For a single command, its dependencies are checked first and
    the command is skipped (unless forced) when the lockfile shows that
    nothing changed since the last run.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): No longer used, deprecated.
    parent_command (str): Accepted for API compatibility; not referenced in
        this body (see NOTE on the recursive call below).
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            # NOTE(review): parent_command is not forwarded here, so nested
            # commands fall back to the default COMMAND — confirm intended.
            project_run(
                project_dir,
                cmd,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
            )
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                # On a dry run, report the missing dependency but don't exit.
                err_exits = 1 if not dry else None
                msg.fail(err, err_help, exits=err_exits)
        check_spacy_env_vars()
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            # Skip the command if its recorded state matches the current one.
            rerun = check_rerun(current_dir, cmd)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)
114 |
115 |
def print_run_help(
    project_dir: Path, subcommand: Optional[str] = None, parent_command: str = COMMAND
) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    parent_command (str): The command prefix shown in the usage strings.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    # Omit the directory from usage strings when it's the current working dir.
    project_loc = "" if is_cwd(project_dir) else project_dir
    if not subcommand:
        # Top-level help: project title plus command and workflow tables.
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {parent_command} run [COMMAND] {project_loc}")
            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {parent_command} run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
        return
    # Subcommand help: either a single command's help text or the workflow steps.
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    print(f"Usage: {parent_command} run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = [
            (f"{i + 1}. {step}", commands[step].get("help", ""))
            for i, step in enumerate(steps)
        ]
        msg.table(steps_data)
        help_cmd = f"{parent_command} run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
161 |
162 |
def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (Iterable[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    for raw_command in commands:
        parts = split_command(raw_command)
        # Rewrite "python"/"pip" invocations so they always use the same
        # interpreter/environment that Weasel itself runs under. This also
        # smooths over systems where commands are written as "python3"/"pip3".
        if parts:
            executable = parts[0]
            if executable in ("python", "python3"):
                parts[0] = sys.executable
            elif executable in ("pip", "pip3"):
                parts = [sys.executable, "-m", "pip", *parts[1:]]
        if not silent:
            print(f"Running command: {join_command(parts)}")
        if not dry:
            run_command(parts, capture=capture)
197 |
198 |
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand to check.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    # Unknown name: assemble the most helpful hint we can before failing.
    help_msg = []
    if subcommand in ["assets", "asset"]:
        help_msg.append("Did you mean to run: python -m weasel assets?")
    if commands:
        help_msg.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        help_msg.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(help_msg),
        exits=1,
    )
222 |
223 |
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (bool): Whether to re-run the command.
    """
    # "no_skip" commands are always rerun.
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    # No lockfile yet: nothing has ever been recorded, so run.
    if not lock_path.exists():
        return True
    data = srsly.read_yaml(lock_path)
    name = command["name"]
    # No recorded state for this particular command.
    if name not in data:
        return True
    entry = data[name]
    # Commands without recorded outputs would otherwise be skipped forever.
    if not entry.get("outs", []):
        return True
    # Rerun only when the entry that would be generated now differs from the
    # recorded one, i.e. some input, output, hash or script step changed.
    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
254 |
255 |
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        # Write an empty lockfile first so the file exists even if the entry
        # generation below fails.
        srsly.write_yaml(lock_path, {})
        data = {}
    data[command["name"]] = get_lock_entry(project_dir, command)
    srsly.write_yaml(lock_path, data)
272 |
273 |
def get_lock_entry(
    project_dir: Path, command: Dict[str, Any], *, parent_command: str = COMMAND
) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    parent_command (str): The command prefix recorded in the "cmd" field.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    entry: Dict[str, Any] = {
        "cmd": f"{parent_command} run {command['name']}",
        "script": command["script"],
        "deps": get_fileinfo(project_dir, command.get("deps", [])),
    }
    # Cached and no-cache outputs are recorded together under "outs".
    outs = get_fileinfo(project_dir, command.get("outputs", []))
    outs += get_fileinfo(project_dir, command.get("outputs_no_cache", []))
    entry["outs"] = outs
    return entry
295 |
296 |
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    RETURNS (List[Dict[str, Optional[str]]]): One lockfile entry per path.
    """

    def _info(rel_path: str) -> Dict[str, Optional[str]]:
        # Missing files get a null checksum instead of raising, so entries
        # can be recorded before outputs exist.
        abs_path = project_dir / rel_path
        md5 = get_checksum(abs_path) if abs_path.exists() else None
        return {"path": rel_path, "md5": md5}

    return [_info(path) for path in paths]
311 |
--------------------------------------------------------------------------------
/weasel/compat.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
# Platform flags, computed once at import time.
is_windows = sys.platform.startswith("win")  # any Windows variant, e.g. "win32"
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"  # macOS
6 |
--------------------------------------------------------------------------------
/weasel/errors.py:
--------------------------------------------------------------------------------
class ErrorsWithCodes(type):
    """Metaclass that prefixes class-attribute messages with their code,
    e.g. accessing ``Errors.E001`` returns ``"[E001] <message>"``.
    """

    def __getattribute__(self, code):
        msg = super().__getattribute__(code)
        # Leave python system attributes like __class__ untouched.
        if code.startswith("__"):
            return msg
        return f"[{code}] {msg}"
8 |
9 |
class Warnings(metaclass=ErrorsWithCodes):
    """Warning message templates; the metaclass prefixes each with its code."""

    # File system
    W801 = "Could not clean/remove the temp directory at {dir}: {msg}."
    # Remote storage
    W802 = (
        "Remote storage is not yet supported for Python 3.12 with "
        "cloudpathlib. Please use Python 3.11 or earlier for remote storage."
    )
17 |
18 |
class Errors(metaclass=ErrorsWithCodes):
    """Error message templates; the metaclass prefixes each with its code."""

    # API - Datastructure
    E001 = (
        "Can't write to frozen dictionary. This is likely an internal "
        "error. Are you writing to a default function argument?"
    )
    E002 = (
        "Can't write to frozen list. Maybe you're trying to modify a computed "
        "property or default function argument?"
    )

    # Workflow
    E501 = "Can not execute command '{str_command}'. Do you have '{tool}' installed?"

    # File system
    E801 = "The tar file pulled from the remote attempted an unsafe path " "traversal."
35 |
--------------------------------------------------------------------------------
/weasel/schemas.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from typing import Any, Dict, List, Optional, Type, Union
3 |
4 | try:
5 | from pydantic.v1 import BaseModel, Field, StrictStr, ValidationError, root_validator
6 | except ImportError:
7 | from pydantic import BaseModel, Field, StrictStr, ValidationError, root_validator # type: ignore
8 |
9 | from wasabi import msg
10 |
11 |
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
    """Validate data against a given pydantic schema.

    obj (Dict[str, Any]): JSON-serializable data to validate.
    schema (pydantic.BaseModel): The schema to validate against.
    RETURNS (List[str]): A list of error messages, if available.
    """
    try:
        schema(**obj)
    except ValidationError as e:
        # Group messages by their "->"-joined location so multiple problems
        # on the same field are reported as a single entry.
        data = defaultdict(list)
        for error in e.errors():
            err_loc = " -> ".join(str(p) for p in error.get("loc", []))
            data[err_loc].append(error.get("msg"))
        return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()]  # type: ignore[arg-type]
    return []
29 |
30 |
31 | # Project config Schema
32 |
33 |
class ProjectConfigAssetGitItem(BaseModel):
    # Git source information for an asset fetched via sparse checkout.
    # fmt: off
    repo: StrictStr = Field(..., title="URL of Git repo to download from")
    path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
    branch: StrictStr = Field("master", title="Branch to clone from")
    # fmt: on
40 |
41 |
class ProjectConfigAssetURL(BaseModel):
    # An asset downloaded from a plain URL.
    # fmt: off
    dest: StrictStr = Field(..., title="Destination of downloaded asset")
    url: Optional[StrictStr] = Field(None, title="URL of asset")
    # The regex accepts exactly 32 hex characters (an MD5 digest).
    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    description: StrictStr = Field("", title="Description of asset")
    # fmt: on
49 |
50 |
class ProjectConfigAssetGit(BaseModel):
    # An asset checked out from a Git repository.
    # fmt: off
    git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
    # The regex accepts exactly 32 hex characters (an MD5 digest).
    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
    description: Optional[StrictStr] = Field(None, title="Description of asset")
    # fmt: on
57 |
58 |
class ProjectConfigCommand(BaseModel):
    # A single named command specified in a project config.
    # fmt: off
    name: StrictStr = Field(..., title="Name of command")
    help: Optional[StrictStr] = Field(None, title="Command description")
    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
    # fmt: on

    class Config:
        title = "A single named command specified in a project config"
        # Unknown keys on a command are reported as validation errors.
        extra = "forbid"
73 |
74 |
class ProjectConfigSchema(BaseModel):
    # Top-level schema for a project.yml configuration file. Extra top-level
    # keys (e.g. "directories") are not forbidden here.
    # fmt: off
    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    # Fixed typo in title: "shortucts" -> "shortcuts".
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    title: Optional[str] = Field(None, title="Project title")
    # fmt: on

    class Config:
        title = "Schema for project configuration file"

    @root_validator(pre=True)
    def check_legacy_keys(cls, obj: Dict[str, Any]) -> Dict[str, Any]:
        # Warn about spaCy-projects-era keys that Weasel ignores; the raw
        # input dict is returned unchanged.
        if "spacy_version" in obj:
            msg.warn(
                "Your project configuration file includes a `spacy_version` key, "
                "which is now deprecated. Weasel will not validate your version of spaCy.",
            )
        if "check_requirements" in obj:
            msg.warn(
                "Your project configuration file includes a `check_requirements` key, "
                "which is now deprecated. Weasel will not validate your requirements.",
            )
        return obj
101 |
--------------------------------------------------------------------------------
/weasel/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/weasel/tests/__init__.py
--------------------------------------------------------------------------------
/weasel/tests/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/weasel/9a0724d4b012ec42552f9463d6ebf56a5460c152/weasel/tests/cli/__init__.py
--------------------------------------------------------------------------------
/weasel/tests/cli/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import pytest
5 | import srsly
6 |
7 | from weasel.cli.remote_storage import RemoteStorage
8 | from weasel.schemas import ProjectConfigSchema, validate
9 | from weasel.util import git_checkout, is_subpath_of, load_project_config, make_tempdir
10 | from weasel.util import validate_project_commands
11 |
12 |
def test_issue11235():
    """
    Test that the cli handles interpolation in the directory names correctly when loading project config.
    """
    lang_code = "en"
    project = {
        "commands": [{"name": "x", "script": ["hello ${vars.lang}"]}],
        "vars": {"lang": lang_code},
        "directories": ["cfg", "${vars.lang}_model"],
    }
    with make_tempdir() as tmp:
        srsly.write_yaml(tmp / "project.yml", project)
        cfg = load_project_config(tmp)
        # Both the literal and the interpolated directory should now exist.
        assert os.path.exists(tmp / "cfg")
        assert os.path.exists(tmp / f"{lang_code}_model")
        # The variable should also be substituted inside command scripts.
        assert cfg["commands"][0]["script"][0] == f"hello {lang_code}"
29 |
30 |
def test_project_config_validation_full():
    """A fully-featured project config should validate without errors."""
    # NOTE(review): duplicated in weasel/tests/test_validation.py — consider
    # keeping only one copy.
    config = {
        "vars": {"some_var": 20},
        "directories": ["assets", "configs", "corpus", "scripts", "training"],
        "assets": [
            {
                "dest": "x",
                "extra": True,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
            {
                "dest": "y",
                "git": {
                    "repo": "https://github.com/example/repo",
                    "branch": "develop",
                    "path": "y",
                },
            },
            {
                "dest": "z",
                "extra": False,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
        ],
        "commands": [
            {
                "name": "train",
                "help": "Train a model",
                "script": ["python -m spacy train config.cfg -o training"],
                # NOTE(review): "training.spcy" looks like a typo for
                # ".spacy", but it is inert test data here.
                "deps": ["config.cfg", "corpus/training.spcy"],
                "outputs": ["training/model-best"],
            },
            {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
        ],
        "workflows": {"all": ["train", "test"], "train": ["train"]},
    }
    errors = validate(ProjectConfigSchema, config)
    assert not errors
71 |
72 |
@pytest.mark.parametrize(
    "config",
    [
        # Duplicate command names.
        {"commands": [{"name": "a"}, {"name": "a"}]},
        # Workflow name clashes with a command name.
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        # Workflow references an undefined command.
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    """Invalid command/workflow combinations cause the CLI to exit."""
    with pytest.raises(SystemExit):
        validate_project_commands(config)
84 |
85 |
@pytest.mark.parametrize(
    "config,n_errors",
    [
        # "commands" must be a list, not a mapping.
        ({"commands": {"a": []}}, 1),
        # Each command requires a "name".
        ({"commands": [{"help": "..."}]}, 1),
        # Unknown keys on a command are forbidden (extra = "forbid").
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        # Missing "name" plus an unknown key: two errors.
        ({"commands": [{"extra": "b"}]}, 2),
        # "deps" entries must be strings.
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    """Schema validation returns one message per distinct error."""
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
99 |
100 |
@pytest.mark.parametrize(
    "parent,child,expected",
    [
        ("/tmp", "/tmp", True),
        ("/tmp", "/", False),
        ("/tmp", "/tmp/subdir", True),
        # A sibling sharing the name prefix is NOT a subpath.
        ("/tmp", "/tmpdir", False),
        # ".." segments are resolved before the containment check.
        ("/tmp", "/tmp/subdir/..", True),
        ("/tmp", "/tmp/..", False),
    ],
)
def test_is_subpath_of(parent, child, expected):
    # NOTE(review): duplicated in weasel/tests/test_validation.py — consider
    # keeping only one copy.
    assert is_subpath_of(parent, child) == expected
114 |
115 |
def test_local_remote_storage():
    """Push/pull round-trips through a local directory acting as a remote."""
    with make_tempdir() as d:
        filename = "a.txt"

        # Write three successive versions of the same local file; the sleep
        # guarantees strictly increasing timestamps between versions.
        content_hashes = ("aaaa", "cccc", "bbbb")
        for i, content_hash in enumerate(content_hashes):
            # make sure that each subsequent file has a later timestamp
            if i > 0:
                time.sleep(1)
            content = f"{content_hash} content"
            loc_file = d / "root" / filename
            if not loc_file.parent.exists():
                loc_file.parent.mkdir(parents=True)
            with loc_file.open(mode="w") as file_:
                file_.write(content)

        # Push the most recently written version to remote storage.
        # (`loc_file`, `content` and `content_hash` still hold the loop's
        # final values — the original "push first version" comment was wrong.)
        remote = RemoteStorage(d / "root", str(d / "remote"))
        remote.push(filename, "aaaa", content_hash)

        # retrieve with full hashes
        loc_file.unlink()
        remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content

        # retrieve with command hash
        loc_file.unlink()
        remote.pull(filename, command_hash="aaaa")
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content

        # retrieve with content hash
        loc_file.unlink()
        remote.pull(filename, content_hash=content_hash)
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content

        # retrieve with no hashes
        loc_file.unlink()
        remote.pull(filename)
        with loc_file.open(mode="r") as file_:
            assert file_.read() == content
159 |
160 |
def test_local_remote_storage_pull_missing():
    """Pulling from a non-existent remote should be a graceful no-op."""
    with make_tempdir() as d:
        remote = RemoteStorage(d / "root", str(d / "remote"))
        # Neither a scoped nor an unscoped pull should raise; both return None.
        assert remote.pull("a.txt", command_hash="aaaa") is None
        assert remote.pull("a.txt") is None
168 |
169 |
def test_project_git_dir_asset():
    """Checking out a directory from a git repo creates that directory."""
    with make_tempdir() as tmp:
        dest = tmp / "signpost"
        # os-signpost is used because it is a very small repository.
        git_checkout(
            "https://github.com/explosion/os-signpost.git",
            "os_signpost",
            dest,
            branch="v0.0.3",
        )
        assert os.path.isdir(dest)
180 |
181 |
@pytest.mark.issue(66)
def test_project_git_file_asset():
    """Checking out a single file from a git repo creates that file."""
    with make_tempdir() as tmp:
        dest = tmp / "readme.md"
        # os-signpost is used because it is a very small repository.
        git_checkout(
            "https://github.com/explosion/os-signpost.git",
            "README.md",
            dest,
            branch="v0.0.3",
        )
        assert os.path.isfile(dest)
193 |
--------------------------------------------------------------------------------
/weasel/tests/cli/test_cli_app.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Dict
3 |
4 | import pytest
5 | import srsly
6 | from typer.testing import CliRunner
7 |
8 | from weasel import app
9 | from weasel.cli.main import HELP
10 | from weasel.util import get_git_version
11 |
12 | runner = CliRunner()
13 |
14 |
@pytest.mark.parametrize("cmd", [None, "--help"])
def test_show_help(cmd):
    """Invoking with no args or --help should print the full help text."""
    cli_args = [] if cmd is None else [cmd]
    result = runner.invoke(app, cli_args or None)
    # Each line of the HELP string must appear in the output.
    for help_line in HELP.splitlines():
        assert help_line in result.stdout
21 |
22 |
def has_git():
    """Return True if a usable git executable is available."""
    try:
        get_git_version()
    except RuntimeError:
        # get_git_version raises RuntimeError when git is missing/broken.
        return False
    return True
29 |
30 |
# Minimal but complete project config used by the CLI tests below.
SAMPLE_PROJECT: Dict[str, Any] = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            "dest": "assets/weasel-readme.md",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/README.md",
            "checksum": "65f4c426a9b153b7683738c92d0d20f9",
        },
        {
            # Only downloaded when `weasel assets --extra` is used.
            "dest": "assets/pyproject.toml",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/pyproject.toml",
            "checksum": "1e2da3a3030d6611520952d5322cd94e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            "name": "create",
            "help": "make a file",
            "script": ["touch abc.txt"],
            "outputs": ["abc.txt"],
        },
        {
            "name": "clean",
            "help": "remove test file",
            "script": ["rm abc.txt"],
        },
    ],
}

# Pre-rendered YAML of the sample project, written to project.yml by fixtures.
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
68 |
69 |
@pytest.fixture
def project_dir(tmp_path: Path):
    """Create a project directory containing the sample project.yml."""
    project_path = tmp_path / "project"
    project_path.mkdir()
    (project_path / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
    yield project_path
76 |
77 |
def test_project_document(project_dir: Path):
    """`document` should render a README containing the project description."""
    readme_path = project_dir / "README.md"
    assert not readme_path.exists(), "README already exists"
    args = ["document", str(project_dir), "-o", str(readme_path)]
    result = CliRunner().invoke(app, args)
    assert result.exit_code == 0
    assert readme_path.is_file()
    content = readme_path.read_text("utf-8")
    assert SAMPLE_PROJECT["description"] in content
88 |
89 |
def test_project_assets(project_dir: Path):
    """Asset download, with and without the --extra flag."""
    asset_dir = project_dir / "assets"
    assert not asset_dir.exists(), "Assets dir is already present"
    result = CliRunner().invoke(app, ["assets", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "weasel-readme.md").is_file(), "Assets not downloaded"
    # check that extras work
    result = CliRunner().invoke(app, ["assets", "--extra", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "pyproject.toml").is_file(), "Extras not downloaded"
100 |
101 |
def test_project_run(project_dir: Path):
    """`run` respects --dry, creates declared outputs, and forwards stdout."""
    test_file = project_dir / "abc.txt"
    # A dry run must not actually create the output file.
    dry = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert dry.exit_code == 0
    assert not test_file.is_file()
    # A real run creates it.
    real = CliRunner().invoke(app, ["run", "create", str(project_dir)])
    assert real.exit_code == 0
    assert test_file.is_file()
    # Script output is forwarded to stdout.
    ok = CliRunner().invoke(app, ["run", "ok", str(project_dir)])
    assert ok.exit_code == 0
    assert "okokok" in ok.stdout
114 |
115 |
def test_check_spacy_env_vars(project_dir: Path, monkeypatch: pytest.MonkeyPatch):
    """Warnings about legacy SPACY_* env vars appear only when they are set."""
    # (Removed a dead `project_dir / "abc.txt"` expression statement and the
    # stale "make sure dry run works" comment copied from test_project_run.)
    result = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert result.exit_code == 0
    assert (
        "You've set a `SPACY_CONFIG_OVERRIDES` environment variable"
        not in result.output
    )
    assert (
        "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable"
        not in result.output
    )

    # Setting the legacy spaCy env vars should trigger the warnings.
    monkeypatch.setenv("SPACY_CONFIG_OVERRIDES", "test")
    monkeypatch.setenv("SPACY_PROJECT_USE_GIT_VERSION", "false")

    result = CliRunner().invoke(app, ["run", "--dry", "create", str(project_dir)])
    assert result.exit_code == 0

    assert "You've set a `SPACY_CONFIG_OVERRIDES` environment variable" in result.output
    assert (
        "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable"
        in result.output
    )
142 |
143 |
@pytest.mark.skipif(not has_git(), reason="git not installed")
@pytest.mark.parametrize(
    "options_string",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(tmp_path: Path, options_string: str):
    """Clone a template project with various repo/branch option combinations."""
    out = tmp_path / "project_clone"
    options = options_string.split() if options_string else []
    result = CliRunner().invoke(
        app, ["clone", "benchmarks/ner_conll03", *options, str(out)]
    )
    assert result.exit_code == 0
    # The CLI prints a hint about fetching assets after cloning.
    assert "weasel assets" in result.output
    assert (out / "README.md").is_file()
165 |
166 |
def test_project_push_pull(tmp_path: Path, project_dir: Path):
    """Round-trip a command output through a local remote via push and pull."""
    remote_name = "xyz"
    remote_dir = tmp_path / "remote"
    remote_dir.mkdir()

    # Register the local directory as a remote in the project config.
    proj = dict(SAMPLE_PROJECT)
    proj["remotes"] = {remote_name: str(remote_dir)}
    (project_dir / "project.yml").write_text(srsly.yaml_dumps(proj))

    test_file = project_dir / "abc.txt"
    result = CliRunner().invoke(app, ["run", "create", str(project_dir)])
    assert result.exit_code == 0
    assert test_file.is_file()
    result = CliRunner().invoke(app, ["push", remote_name, str(project_dir)])
    assert result.exit_code == 0
    result = CliRunner().invoke(app, ["run", "clean", str(project_dir)])
    assert result.exit_code == 0
    assert not test_file.exists()
    # Pull should restore the deleted output from the remote.
    result = CliRunner().invoke(app, ["pull", remote_name, str(project_dir)])
    assert result.exit_code == 0
    assert test_file.is_file()
190 |
--------------------------------------------------------------------------------
/weasel/tests/cli/test_document.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Dict
3 |
4 | import pytest
5 | import srsly
6 | from typer.testing import CliRunner
7 |
8 | from weasel import app
9 | from weasel.cli.document import MARKER_END, MARKER_IGNORE, MARKER_START, MARKER_TAGS
10 |
11 | runner = CliRunner()
12 |
# NOTE(review): this config duplicates SAMPLE_PROJECT in test_cli_app.py —
# consider sharing a single constant between the two test modules.
SAMPLE_PROJECT: Dict[str, Any] = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            "dest": "assets/weasel-readme.md",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/README.md",
            "checksum": "65f4c426a9b153b7683738c92d0d20f9",
        },
        {
            # Only downloaded when `weasel assets --extra` is used.
            "dest": "assets/pyproject.toml",
            "url": "https://github.com/explosion/weasel/raw/9a3632862b47069d2f9033b773e814d4c4e09c83/pyproject.toml",
            "checksum": "1e2da3a3030d6611520952d5322cd94e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            "name": "create",
            "help": "make a file",
            "script": ["touch abc.txt"],
            "outputs": ["abc.txt"],
        },
        {
            "name": "clean",
            "help": "remove test file",
            "script": ["rm abc.txt"],
        },
    ],
}
48 |
49 |
@pytest.fixture(scope="function")
def project_yaml_file(
    tmp_path_factory: pytest.TempPathFactory,
):
    """Write the sample project config to a fresh temp dir; return its path."""
    config_path = tmp_path_factory.mktemp("project") / "project.yml"
    config_path.write_text(srsly.yaml_dumps(SAMPLE_PROJECT))
    return config_path
58 |
59 |
def test_create_docs(project_yaml_file: Path):
    """`document` output should include the project title from the config."""
    result = runner.invoke(app, ["document", str(project_yaml_file.parent)])
    assert result.exit_code == 0
    conf_data = srsly.read_yaml(project_yaml_file)
    assert conf_data["title"] in result.stdout
65 |
66 |
def test_raise_error_no_config():
    """`document` without a project config in cwd should exit with code 1."""
    assert runner.invoke(app, ["document"]).exit_code == 1
70 |
71 |
@pytest.mark.parametrize("marker", MARKER_TAGS)
def test_markers(tmp_path_factory: pytest.TempPathFactory, marker: str):
    """Weasel should be able to handle both 'SPACY PROJECT' and 'WEASEL' markers."""
    project: Dict[str, Any] = {
        "title": "Awesome project",
        "description": "Project using spacy projects and gets migrated to weasel.",
    }
    additional_text = (
        "\n\n## Some additional information\n\nHere is some additional information about this project "
        "that is not autogenerated from the [`project.yml`](project.yml)."
    )

    # Create project file.
    test_dir = tmp_path_factory.mktemp("project")
    path = test_dir / "project.yml"
    path.write_text(srsly.yaml_dumps(project))

    # Store readme with additional information.
    # runner.invoke(app, ["document", str(path.parent), "--output", test_dir / "readme.md"])
    with open(test_dir / "readme.md", "w+", encoding="utf-8") as file:
        readme = runner.invoke(app, ["document", str(path.parent)]).output
        # Rewrite the generated WEASEL markers to the marker under test, to
        # simulate a readme produced with the other tag variant.
        for to_replace in (MARKER_START, MARKER_END, MARKER_IGNORE):
            readme = readme.replace(
                to_replace.format(tag="WEASEL"), to_replace.format(tag=marker)
            )
        file.writelines(readme)
        file.writelines(additional_text)

    # Run `document` again on existing readme file. Ensure additional information is still there.
    runner.invoke(
        app, ["document", str(path.parent), "--output", str(test_dir / "readme.md")]
    )
    with open(test_dir / "readme.md", "r", encoding="utf-8") as file:
        assert additional_text in "".join(file.readlines())
106 |
--------------------------------------------------------------------------------
/weasel/tests/cli/test_remote.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 | from typer.testing import CliRunner
5 |
6 | from weasel import app
7 |
8 | from .test_cli_app import has_git
9 |
10 | runner = CliRunner()
11 |
12 |
@pytest.fixture
def project_dir(tmp_path_factory: pytest.TempPathFactory):
    """Return a (not yet created) project path inside a fresh temp dir."""
    return tmp_path_factory.mktemp("project") / "project"
18 |
19 |
@pytest.fixture
def remote_url(tmp_path_factory: pytest.TempPathFactory):
    """Return a local path that stands in for a remote storage location."""
    return tmp_path_factory.mktemp("remote") / "remote"
25 |
26 |
@pytest.fixture
def clone(project_dir: Path):
    """Clone the demo project from the main branch; cloning shouldn't fail."""
    cli_args = [
        "clone",
        "--repo",
        "https://github.com/explosion/weasel",
        "--branch",
        "main",
        "weasel/tests/demo_project",
        str(project_dir),
    ]
    result = runner.invoke(app, cli_args)
    assert result.exit_code == 0
    assert (project_dir / "project.yml").exists()
47 |
48 |
@pytest.fixture(autouse=True)
def assets(clone, project_dir: Path):
    """Fetch the demo project's assets after cloning (runs for every test)."""
    result = runner.invoke(app, ["assets", str(project_dir)])

    # Print output so failures in the asset download are easier to diagnose.
    print(result.stdout)
    assert result.exit_code == 0
    assert (project_dir / "assets/README.md").exists()
56 |
57 |
@pytest.mark.skipif(not has_git(), reason="git not installed")
def test_remote(project_dir: Path, remote_url: Path):
    """End-to-end push/pull of a command output against a local remote."""
    result = runner.invoke(app, ["assets", str(project_dir)])
    assert result.exit_code == 0
    assert (project_dir / "assets/README.md").exists()

    # `prep` writes corpus/stuff.txt (see demo_project/project.yml).
    result = runner.invoke(app, ["run", "prep", str(project_dir)])
    assert result.exit_code == 0

    # append remote to the file
    with open(project_dir / "project.yml", "a") as project_file:
        project_file.write(f"\nremotes:\n  default: {remote_url}\n")

    result = runner.invoke(app, ["push", "default", str(project_dir)])
    assert result.exit_code == 0

    # delete a file, and make sure pull restores it
    (project_dir / "corpus/stuff.txt").unlink()

    result = runner.invoke(app, ["pull", "default", str(project_dir)])
    assert result.exit_code == 0
    assert (project_dir / "corpus/stuff.txt").exists()
80 |
--------------------------------------------------------------------------------
/weasel/tests/demo_project/project.yml:
--------------------------------------------------------------------------------
1 | title: Weasel demo project (for tests)
2 | description: |
3 | This project is a minimal demo for the Weasel tests.
4 |
5 | directories: [assets, corpus, scripts]
6 |
7 | assets:
8 | - dest: assets/README.md
9 | url: https://raw.githubusercontent.com/explosion/weasel/main/README.md
10 |
11 | commands:
12 | - name: prep
13 | help: Make a file to test with push/pull
14 | script:
15 | - python scripts/check.py
16 | outputs:
17 | - corpus/stuff.txt
18 |
--------------------------------------------------------------------------------
/weasel/tests/demo_project/scripts/check.py:
--------------------------------------------------------------------------------
import pathlib

# The demo project root is the parent of this scripts/ directory.
workdir = pathlib.Path(__file__).parent.resolve().parent

# ASCII-art banner, written verbatim into the output file below.
text = """
 _
 __ _____ __ _ ___ ___| |
 \\ \\ /\\ / / _ \\/ _` / __|/ _ \\ |
 \\ V V / __/ (_| \\__ \\ __/ |
 \\_/\\_/ \\___|\\__,_|___/\\___|_|

"""

# project.yml declares corpus/stuff.txt as the output of the `prep` command.
with open(workdir / "corpus/stuff.txt", "w") as outfile:
    outfile.write(text)
16 |
--------------------------------------------------------------------------------
/weasel/tests/test_schemas.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 | import srsly
5 | from typer.testing import CliRunner
6 |
7 | from weasel import app
8 |
9 | EXAMPLES = [
10 | (dict(title="Test"), False),
11 | (dict(title="Test", spacy_version=""), True),
12 | (dict(title="Test", spacy_version="3.4.1"), True),
13 | ]
14 |
15 |
16 | @pytest.fixture
17 | def project_dir(tmp_path: Path):
18 | path = tmp_path / "project"
19 | path.mkdir()
20 | yield path
21 |
22 |
23 | @pytest.mark.parametrize("conf,should_warn", EXAMPLES)
24 | def test_project_document(project_dir: Path, conf, should_warn):
25 | config = srsly.yaml_dumps(conf)
26 |
27 | (project_dir / "project.yml").write_text(config)
28 |
29 | result = CliRunner().invoke(app, ["document", str(project_dir)])
30 | assert result.exit_code == 0
31 | assert (
32 | "Your project configuration file includes a `spacy_version` key, "
33 | in result.output
34 | ) is should_warn
35 |
--------------------------------------------------------------------------------
/weasel/tests/test_validation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import srsly
3 | from confection import ConfigValidationError
4 |
5 | from weasel.schemas import ProjectConfigSchema, validate
6 | from weasel.util import is_subpath_of, load_project_config, make_tempdir
7 | from weasel.util import substitute_project_variables, validate_project_commands
8 |
9 |
@pytest.mark.parametrize(
    "parent,child,expected",
    [
        ("/tmp", "/tmp", True),
        ("/tmp", "/", False),
        ("/tmp", "/tmp/subdir", True),
        # A sibling sharing the name prefix is NOT a subpath.
        ("/tmp", "/tmpdir", False),
        # ".." segments are resolved before the containment check.
        ("/tmp", "/tmp/subdir/..", True),
        ("/tmp", "/tmp/..", False),
    ],
)
def test_is_subpath_of(parent, child, expected):
    # NOTE(review): duplicated in weasel/tests/cli/test_cli.py — consider
    # keeping only one copy.
    assert is_subpath_of(parent, child) == expected
23 |
24 |
def test_project_config_validation_full():
    """A fully-featured project config should validate without errors."""
    # NOTE(review): duplicated in weasel/tests/cli/test_cli.py — consider
    # keeping only one copy.
    config = {
        "vars": {"some_var": 20},
        "directories": ["assets", "configs", "corpus", "scripts", "training"],
        "assets": [
            {
                "dest": "x",
                "extra": True,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
            {
                "dest": "y",
                "git": {
                    "repo": "https://github.com/example/repo",
                    "branch": "develop",
                    "path": "y",
                },
            },
            {
                "dest": "z",
                "extra": False,
                "url": "https://example.com",
                "checksum": "63373dd656daa1fd3043ce166a59474c",
            },
        ],
        "commands": [
            {
                "name": "train",
                "help": "Train a model",
                "script": ["python -m spacy train config.cfg -o training"],
                "deps": ["config.cfg", "corpus/training.spcy"],
                "outputs": ["training/model-best"],
            },
            {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
        ],
        "workflows": {"all": ["train", "test"], "train": ["train"]},
    }
    errors = validate(ProjectConfigSchema, config)
    assert not errors
65 |
66 |
@pytest.mark.parametrize(
    "config",
    [
        # Duplicate command names.
        {"commands": [{"name": "a"}, {"name": "a"}]},
        # Workflow name clashes with a command name.
        {"commands": [{"name": "a"}], "workflows": {"a": []}},
        # Workflow references an undefined command.
        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
    ],
)
def test_project_config_validation1(config):
    """Invalid command/workflow combinations cause the CLI to exit."""
    with pytest.raises(SystemExit):
        validate_project_commands(config)
78 |
79 |
@pytest.mark.parametrize(
    "config,n_errors",
    [
        # "commands" must be a list, not a mapping.
        ({"commands": {"a": []}}, 1),
        # Each command requires a "name".
        ({"commands": [{"help": "..."}]}, 1),
        # Unknown keys on a command are forbidden (extra = "forbid").
        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
        # Missing "name" plus an unknown key: two errors.
        ({"commands": [{"extra": "b"}]}, 2),
        # "deps" entries must be strings.
        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
    ],
)
def test_project_config_validation2(config, n_errors):
    """Schema validation returns one message per distinct error."""
    errors = validate(ProjectConfigSchema, config)
    assert len(errors) == n_errors
93 |
94 |
@pytest.mark.parametrize(
    "int_value",
    [10, pytest.param("10", marks=pytest.mark.xfail)],
)
def test_project_config_interpolation(int_value):
    """Nested ${vars.*} references are substituted into command scripts."""
    variables = {"a": int_value, "b": {"c": "foo", "d": True}}
    commands = [
        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
    ]
    project = {"commands": commands, "vars": variables}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        # isinstance instead of `type(x) == y` (flake8 E721 anti-pattern).
        assert isinstance(cfg, dict)
        assert isinstance(cfg["commands"], list)
        assert cfg["commands"][0]["script"][0] == "hello 10 foo"
        # The boolean True interpolates as lowercase "true".
        assert cfg["commands"][1]["script"][0] == "foo true"
    # Referencing an undefined variable (${vars.b.e}) must raise.
    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
    project = {"commands": commands, "vars": variables}
    with pytest.raises(ConfigValidationError):
        substitute_project_variables(project)
117 |
118 |
@pytest.mark.parametrize(
    "greeting",
    [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)],
)
def test_project_config_interpolation_override(greeting):
    """CLI-style overrides replace values declared in the vars section."""
    variables = {"a": "world"}
    commands = [
        {"name": "x", "script": ["hello ${vars.a}"]},
    ]
    overrides = {"vars.a": greeting}
    project = {"commands": commands, "vars": variables}
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d, overrides=overrides)
        # isinstance instead of `type(x) == y` (flake8 E721 anti-pattern).
        assert isinstance(cfg, dict)
        assert isinstance(cfg["commands"], list)
        assert cfg["commands"][0]["script"][0] == f"hello {greeting}"
136 |
137 |
def test_project_config_interpolation_env(monkeypatch: pytest.MonkeyPatch):
    """${env.*} variables resolve to environment values (empty when unset)."""
    env_var = "SPACY_TEST_FOO"
    project = {
        "commands": [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}],
        "vars": {"a": 10},
        "env": {"foo": env_var},
    }

    # Unset: the env placeholder interpolates to an empty string.
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        assert cfg["commands"][0]["script"][0] == "hello 10 "

    monkeypatch.setenv(env_var, "123")

    # Set: the environment value is substituted.
    with make_tempdir() as d:
        srsly.write_yaml(d / "project.yml", project)
        cfg = load_project_config(d)
        assert cfg["commands"][0]["script"][0] == "hello 10 123"
157 |
--------------------------------------------------------------------------------
/weasel/tests/util.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import re
3 | import tempfile
4 |
5 | import srsly
6 |
7 |
@contextlib.contextmanager
def make_tempfile(mode="r"):
    """Context manager yielding an anonymous temporary file object.

    mode (str): The mode to open the file in.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Close even if the caller's block raised; previously an exception
        # left the handle open until garbage collection.
        f.close()
13 |
14 |
def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal."""
    unpacked1 = srsly.msgpack_loads(b1)
    unpacked2 = srsly.msgpack_loads(b2)
    assert sorted(unpacked1.keys()) == sorted(unpacked2.keys())
    # Compare key/value pairs in a deterministic (sorted) order.
    pairs = zip(sorted(unpacked1.items()), sorted(unpacked2.items()))
    for (key1, val1), (key2, val2) in pairs:
        assert key1 == key2
        assert val1 == val2
23 |
24 |
def normalize_whitespace(s):
    """Collapse each run of whitespace characters into a single space."""
    pattern = re.compile(r"\s+")
    return pattern.sub(" ", s)
27 |
--------------------------------------------------------------------------------
/weasel/util/__init__.py:
--------------------------------------------------------------------------------
1 | from .commands import join_command, run_command, split_command
2 | from .config import load_project_config, parse_config_overrides
3 | from .config import substitute_project_variables
4 | from .environment import ENV_VARS, check_bool_env_var, check_spacy_env_vars
5 | from .filesystem import ensure_path, ensure_pathy, is_cwd, is_subpath_of, make_tempdir
6 | from .filesystem import working_dir
7 | from .frozen import SimpleFrozenDict, SimpleFrozenList
8 | from .git import _http_to_git, get_git_version, git_checkout, git_repo_branch_exists
9 | from .git import git_sparse_checkout
10 | from .hashing import get_checksum, get_hash
11 | from .logging import logger
12 | from .modules import import_file
13 | from .remote import download_file, upload_file
14 | from .validation import validate_project_commands
15 | from .versions import get_minor_version, is_compatible_version, is_minor_version_match
16 |
--------------------------------------------------------------------------------
/weasel/util/commands.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shlex
3 | import subprocess
4 | import sys
5 | from typing import Any, List, Optional, Union
6 |
7 | from ..compat import is_windows
8 | from ..errors import Errors
9 |
10 |
def split_command(command: str) -> List[str]:
    """Split a string command using shlex. Handles platform compatibility.

    command (str): The command to split.
    RETURNS (List[str]): The split command.
    """
    # On Windows, POSIX splitting would mangle backslash paths.
    use_posix = not is_windows
    return shlex.split(command, posix=use_posix)
18 |
19 |
def join_command(command: List[str]) -> str:
    """Quote and join command tokens into one shell-safe string. Kept as a
    manual join for compatibility (shlex.join requires Python 3.8+).

    command (List[str]): The command to join.
    RETURNS (str): The joined command.
    """
    return " ".join(map(shlex.quote, command))
28 |
29 |
def run_command(
    command: Union[str, List[str]],
    *,
    stdin: Optional[Any] = None,
    capture: bool = False,
) -> subprocess.CompletedProcess:
    """Run a command on the command line as a subprocess. If the subprocess
    returns a non-zero exit code, a system exit is performed.

    command (str / List[str]): The command. A string is split with shlex.
    stdin (Optional[Any]): stdin to read from or None.
    capture (bool): Whether to capture output/errors. With capture=False the
        child's streams are not redirected and a failure triggers sys.exit
        with the return code (use when handing control to the command). With
        capture=True output is collected and a failure raises a
        SubprocessError carrying the log (use when calling it like a function).
    RETURNS (CompletedProcess): The process object.
    """
    if isinstance(command, str):
        cmd_list, cmd_str = split_command(command), command
    else:
        cmd_list, cmd_str = command, " ".join(command)
    try:
        result = subprocess.run(
            cmd_list,
            env=os.environ.copy(),
            input=stdin,
            encoding="utf8",
            check=False,
            stdout=subprocess.PIPE if capture else None,
            stderr=subprocess.STDOUT if capture else None,
        )
    except FileNotFoundError:
        # The executable itself was not found -- the command never ran.
        raise FileNotFoundError(
            Errors.E501.format(str_command=cmd_str, tool=cmd_list[0])
        ) from None
    if result.returncode != 0:
        if capture:
            message = f"Error running command:\n\n{cmd_str}\n\n"
            message += f"Subprocess exited with status {result.returncode}"
            if result.stdout is not None:
                message += "\n\nProcess log (stdout and stderr):\n\n"
                message += result.stdout
            error = subprocess.SubprocessError(message)
            error.ret = result  # type: ignore[attr-defined]
            error.command = cmd_str  # type: ignore[attr-defined]
            raise error
        sys.exit(result.returncode)
    return result
84 |
--------------------------------------------------------------------------------
/weasel/util/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from pathlib import Path
4 | from typing import Any, Dict, List, Optional
5 |
6 | import srsly
7 | from click import NoSuchOption
8 | from click.parser import split_arg_string
9 | from confection import Config
10 | from wasabi import msg
11 |
12 | from ..cli.main import PROJECT_FILE
13 | from ..schemas import ProjectConfigSchema, validate
14 | from .environment import ENV_VARS
15 | from .frozen import SimpleFrozenDict
16 | from .logging import logger
17 | from .validation import show_validation_error, validate_project_commands
18 |
19 |
def parse_config_overrides(
    args: List[str], env_var: Optional[str] = ENV_VARS.CONFIG_OVERRIDES
) -> Dict[str, Any]:
    """Generate a dictionary of config overrides based on the extra arguments
    provided on the CLI, e.g. --training.batch_size to override
    "training.batch_size". Arguments without a "." are considered invalid,
    since the config only allows top-level sections to exist.

    env_var (Optional[str]): Optional environment variable to read from.
    RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
    """
    raw_env = os.environ.get(env_var, "") if env_var else ""
    from_env = _parse_overrides(split_arg_string(raw_env))
    from_cli = _parse_overrides(args, is_cli=True)
    if from_cli:
        cli_only = [name for name in from_cli if name not in from_env]
        logger.debug("Config overrides from CLI: %s", cli_only)
    if from_env:
        logger.debug("Config overrides from env variables: %s", list(from_env))
    # On conflict, the env-var overrides win over the CLI ones.
    return {**from_cli, **from_env}
40 |
41 |
def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]:
    """Parse a flat token list (e.g. ["--x.y", "1", "--x.z=2"]) into a dict.

    args (List[str]): The tokens; consumed destructively via pop(0).
    is_cli (bool): If True, invalid option names raise click's NoSuchOption;
        otherwise the process exits via wasabi.
    RETURNS (Dict[str, Any]): Mapping of dotted setting name to parsed value.
    """
    result = {}
    while args:
        opt = args.pop(0)
        err = f"Invalid config override '{opt}'"
        if opt.startswith("--"):  # new argument
            orig_opt = opt
            # NOTE(review): this removes *every* occurrence of "--", not just
            # the leading one -- confirm that's intended for odd names.
            opt = opt.replace("--", "")
            if "." not in opt:
                # Only dotted names are valid: an override must target a value
                # inside a top-level section, never a section itself.
                if is_cli:
                    raise NoSuchOption(orig_opt)
                else:
                    msg.fail(f"{err}: can't override top-level sections", exits=1)
            if "=" in opt:  # we have --opt=value
                opt, value = opt.split("=", 1)
                # NOTE(review): dash-to-underscore normalization only happens
                # in the "=value" form, not the space-separated form -- verify.
                opt = opt.replace("-", "_")
            else:
                if not args or args[0].startswith("--"):  # flag with no value
                    value = "true"
                else:
                    value = args.pop(0)
            result[opt] = _parse_override(value)
        else:
            msg.fail(f"{err}: name should start with --", exits=1)
    return result
67 |
68 |
def _parse_override(value: Any) -> Any:
    """Parse a single override value the way the config system does: as JSON.
    Since the values come from the CLI, forcing users to escape quotes around
    plain strings would be unintuitive, so anything that fails to parse is
    returned as a string instead.
    # TODO: improve logic to handle simple types like list of strings?
    """
    try:
        parsed = srsly.json_loads(value)
    except ValueError:
        parsed = str(value)
    return parsed
79 |
80 |
def load_project_config(
    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
    """Load the project.yml file from a directory and validate it. Also make
    sure that all directories defined in the config exist.

    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
    overrides (Dict[str, Any]): Optional config overrides.
    RETURNS (Dict[str, Any]): The loaded project.yml.

    Exits the process (wasabi / sys.exit) on a missing file, invalid YAML,
    schema errors, or inconsistent commands/workflows.
    """
    config_path = path / PROJECT_FILE
    if not config_path.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    # Schema validation: print every error before exiting, not just the first.
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err)
        print("\n".join(errors))
        sys.exit(1)
    validate_project_commands(config)
    if interpolate:
        err = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=err, hint_fill=False):
            # Substitute ${vars.*} / ${env.*} references and apply overrides.
            config = substitute_project_variables(config, overrides)
    # Make sure directories defined in config exist
    for subdir in config.get("directories", []):
        dir_path = path / subdir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    return config
116 |
117 |
def substitute_project_variables(
    config: Dict[str, Any],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    key: str = "vars",
    env_key: str = "env",
) -> Dict[str, Any]:
    """Interpolate variables in the project file using the config system.

    The input config is NOT mutated: the previous version wrote the resolved
    environment values back into the caller's nested dict, so a second call
    (or the caller) would see env-var values instead of env-var names.

    config (Dict[str, Any]): The project config.
    overrides (Dict[str, Any]): Optional config overrides.
    key (str): Key containing variables in project config.
    env_key (str): Key containing environment variable mapping in project config.
    RETURNS (Dict[str, Any]): The interpolated project config.
    """
    variables = dict(config.get(key, {}))
    # Resolve each mapped environment variable to its (JSON-parsed) value;
    # unset variables resolve to the empty string.
    env_values = {
        name: _parse_override(os.environ.get(env_var, ""))
        for name, env_var in dict(config.get(env_key, {})).items()
    }
    project = {**config, key: variables, env_key: env_values}
    # Need to put variables in the top scope again so we can have a top-level
    # section "project" (otherwise, a list of commands in the top scope
    # wouldn't be allowed by the config system).
    cfg = Config({"project": project, key: variables, env_key: env_values})
    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
    interpolated = cfg.interpolate()
    return dict(interpolated["project"])
144 |
--------------------------------------------------------------------------------
/weasel/util/environment.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from wasabi import msg
4 |
5 |
class ENV_VARS:
    """Namespace of environment variable names recognized by Weasel."""

    # Read by parse_config_overrides() for config overrides.
    CONFIG_OVERRIDES = "WEASEL_CONFIG_OVERRIDES"
8 |
9 |
def check_spacy_env_vars():
    """Warn about deprecated spaCy project environment variables that Weasel
    no longer reads. Purely informational; never raises or exits."""
    environ = os.environ
    if "SPACY_CONFIG_OVERRIDES" in environ:
        msg.warn(
            "You've set a `SPACY_CONFIG_OVERRIDES` environment variable, "
            "which is now deprecated. Weasel will not use it. "
            "You can use `WEASEL_CONFIG_OVERRIDES` instead."
        )
    if "SPACY_PROJECT_USE_GIT_VERSION" in environ:
        msg.warn(
            "You've set a `SPACY_PROJECT_USE_GIT_VERSION` environment variable, "
            "which is now deprecated. Weasel will not use it."
        )
22 |
23 |
def check_bool_env_var(env_var: str) -> bool:
    """Convert the value of an environment variable to a boolean. The literal
    string "0" is falsy; any other set value is truthy; unset is False.

    env_var (str): The name of the environment variable to check.
    RETURNS (bool): Its boolean value.
    """
    raw = os.environ.get(env_var, False)
    return raw != "0" and bool(raw)
35 |
--------------------------------------------------------------------------------
/weasel/util/filesystem.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import stat
4 | import sys
5 | import tempfile
6 | import warnings
7 | from contextlib import contextmanager
8 | from pathlib import Path
9 | from typing import Any, Generator, Iterator, Union
10 |
11 | from ..errors import Warnings
12 |
13 |
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
    """Temporarily switch the process working directory, restoring the
    previous one on exit.

    path (str / Path): The directory to navigate to.
    YIELDS (Path): The absolute path of the new working directory. Use it for
        filesystem work inside the block to avoid relative-path mismatches.
    """
    previous = Path.cwd()
    target = Path(path).resolve()
    os.chdir(str(target))
    try:
        yield target
    finally:
        # Restore even when the block raised.
        os.chdir(str(previous))
30 |
31 |
@contextmanager
def make_tempdir() -> Generator[Path, None, None]:
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block.

    YIELDS (Path): The path of the temp directory.

    Cleanup now runs in a finally block, so the directory is also removed
    when the with-body raises (the original leaked it in that case).
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # On Windows, git clones use read-only files, which cause permission
        # errors when being deleted. This forcibly fixes permissions.
        def force_remove(rmfunc, path, ex):
            os.chmod(path, stat.S_IWRITE)
            rmfunc(path)

        try:
            if sys.version_info >= (3, 12):
                # onerror is deprecated in 3.12+ in favor of onexc.
                shutil.rmtree(str(d), onexc=force_remove)
            else:
                shutil.rmtree(str(d), onerror=force_remove)
        except PermissionError as e:
            warnings.warn(Warnings.W801.format(dir=d, msg=e))
55 |
56 |
def is_cwd(path: Union[Path, str]) -> bool:
    """Check whether a path is the current working directory.

    path (Union[Path, str]): The directory path.
    RETURNS (bool): Whether the path is the current working directory.
    """
    # Lowercased comparison keeps the check case-insensitive (Windows paths).
    resolved = str(Path(path).resolve()).lower()
    return resolved == str(Path.cwd().resolve()).lower()
64 |
65 |
def ensure_path(path: Any) -> Any:
    """Convert a string to a Path; pass any other value through untouched.

    path (Any): Anything. If string, it's converted to Path.
    RETURNS: Path or original argument.
    """
    return Path(path) if isinstance(path, str) else path
76 |
77 |
def ensure_pathy(path):
    """Wrap `path` in cloudpathlib's AnyPath. The import is deliberately local
    to avoid importing cloudpathlib globally (originally added because of a
    slow and annoying Google Cloud warning with Pathy)."""
    from cloudpathlib import AnyPath  # noqa: F811

    return AnyPath(path)
85 |
86 |
def is_subpath_of(parent, child):
    """
    Check whether `child` is a path contained within `parent`.
    """
    # Based on https://stackoverflow.com/a/37095733 .
    # Sticks to os.path (rather than Path.is_relative_to) to keep support
    # for Python < 3.9.
    real_parent = os.path.realpath(parent)
    real_child = os.path.realpath(child)
    return real_parent == os.path.commonpath([real_parent, real_child])
98 |
--------------------------------------------------------------------------------
/weasel/util/frozen.py:
--------------------------------------------------------------------------------
1 | from ..errors import Errors
2 |
3 |
class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Will raise an error if user or Weasel attempts to add to dict.

    All mutating dict methods are blocked; the original version only blocked
    __setitem__/pop/update, leaving __delitem__, popitem, setdefault and
    clear as mutation loopholes.
    """

    def __init__(self, *args, error: str = Errors.E001, **kwargs) -> None:
        """Initialize the frozen dict. Can be initialized with pre-defined
        values.

        error (str): The error message when user tries to assign to dict.
        """
        super().__init__(*args, **kwargs)
        self.error = error

    def __setitem__(self, key, value):
        raise NotImplementedError(self.error)

    def __delitem__(self, key):
        raise NotImplementedError(self.error)

    def pop(self, key, default=None):
        raise NotImplementedError(self.error)

    def popitem(self):
        raise NotImplementedError(self.error)

    def setdefault(self, key, default=None):
        raise NotImplementedError(self.error)

    def clear(self):
        raise NotImplementedError(self.error)

    def update(self, other):
        raise NotImplementedError(self.error)
27 |
28 |
class SimpleFrozenList(list):
    """Wrapper class around a list that lets us raise custom errors if certain
    attributes/methods are accessed. Mostly used for properties like
    Language.pipeline that return an immutable list (and that we don't want to
    convert to a tuple to not break too much backwards compatibility). If a user
    accidentally calls nlp.pipeline.append(), we can raise a more helpful error.

    All mutating list operations are blocked; the original version left
    __setitem__, __delitem__, += and *= as mutation loopholes.
    """

    def __init__(self, *args, error: str = Errors.E002) -> None:
        """Initialize the frozen list.

        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def __setitem__(self, index, value):
        raise NotImplementedError(self.error)

    def __delitem__(self, index):
        raise NotImplementedError(self.error)

    def __iadd__(self, other):
        raise NotImplementedError(self.error)

    def __imul__(self, other):
        raise NotImplementedError(self.error)

    def append(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def clear(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def extend(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def pop(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def remove(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def reverse(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def sort(self, *args, **kwargs):
        raise NotImplementedError(self.error)
68 |
--------------------------------------------------------------------------------
/weasel/util/git.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | from pathlib import Path
4 | from typing import Tuple
5 |
6 | from wasabi import msg
7 |
8 | from .commands import run_command
9 | from .filesystem import is_subpath_of, make_tempdir
10 |
11 |
def git_checkout(
    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
):
    """Check out `subpath` of `repo` at `branch` into `dest`.

    Uses sparse checkout when requested and supported (Git >= 2.22); otherwise
    falls back to a full clone into a temp dir and copies `subpath` out of it.
    Exits the process (wasabi) on an invalid destination or missing subpath.
    """
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse and git_version >= (2, 22):
        return git_sparse_checkout(repo, subpath, dest, branch)
    elif sparse:
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        err_old = (
            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
            f"that doesn't fully support sparse checkout yet."
        )
        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
        msg.warn(
            f"{err_unk if git_version == (0, 0) else err_old} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )
    # Non-sparse fallback: clone the whole branch, then copy the requested
    # file or directory out of the temporary clone.
    with make_tempdir() as tmp_dir:
        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
        run_command(cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        try:
            source_path = tmp_dir / Path(subpath)
            if not is_subpath_of(tmp_dir, source_path):
                # Reject paths like "../../etc" that would escape the clone.
                err = f"'{subpath}' is a path outside of the cloned repository."
                msg.fail(err, repo, exits=1)
            if os.path.isdir(source_path):
                shutil.copytree(source_path, dest)
            else:
                shutil.copyfile(source_path, dest)
        except FileNotFoundError:
            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
            msg.fail(err, repo, exits=1)
52 |
53 |
def git_sparse_checkout(repo, subpath, dest, branch):
    """Check out only `subpath` of `repo` at `branch` into `dest`, using a
    partial clone (--filter=blob:none) plus a bulk fetch-pack so only the
    needed objects are transferred. Requires Git >= 2.22 (enforced by the
    caller, git_checkout). Exits the process (wasabi) on bad paths.
    """
    # We're using Git, partial clone and sparse checkout to
    # only clone the files we need
    # This ends up being RIDICULOUS. omg.
    # So, every tutorial and SO post talks about 'sparse checkout'...But they
    # go and *clone* the whole repo. Worthless. And cloning part of a repo
    # turns out to be completely broken. The only way to specify a "path" is..
    # a path *on the server*? The contents of which, specifies the paths. Wat.
    # Obviously this is hopelessly broken and insecure, because you can query
    # arbitrary paths on the server! So nobody enables this.
    # What we have to do is disable *all* files. We could then just checkout
    # the path, and it'd "work", but be hopelessly slow...Because it goes and
    # transfers every missing object one-by-one. So the final piece is that we
    # need to use some weird git internals to fetch the missings in bulk, and
    # *that* we can do by path.
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        # This is the "clone, but don't download anything" part.
        cmd = (
            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
            f"-b {branch} --filter=blob:none"
        )
        run_command(cmd)
        # Now we need to find the missing filenames for the subpath we want.
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        ret = run_command(cmd, capture=True)
        git_repo = _http_to_git(repo)
        # Now pass those missings into another bit of git internals
        # ("?<hash>" lines mark objects the partial clone didn't download).
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
        if not missings:
            err = (
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?"
            )
            msg.fail(err, exits=1)
        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
        run_command(cmd, capture=True)
        # And finally, we can checkout our subpath
        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
        run_command(cmd, capture=True)

        # Get a subdirectory of the cloned path, if appropriate
        source_path = tmp_dir / Path(subpath)
        if not is_subpath_of(tmp_dir, source_path):
            # Reject paths like "../../etc" that would escape the clone.
            err = f"'{subpath}' is a path outside of the cloned repository."
            msg.fail(err, repo, exits=1)

        # Move out of the temp dir before it's removed by make_tempdir.
        shutil.move(str(source_path), str(dest))
104 |
105 |
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Uses 'git ls-remote' to check if a repository and branch exists

    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    get_git_version()  # ensure git itself is available first
    # We might be tempted to use `--exit-code` with `git ls-remote`, but
    # `run_command` handles the `returncode` for us, so we'll rely on
    # the fact that stdout returns '' if the requested branch doesn't exist
    ret = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return bool(ret.stdout)
121 |
122 |
def get_git_version(
    error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
    """Get the version of git and raise an error if calling 'git --version' fails.

    error (str): The error message to show.
    RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
    (0, 0) if the version couldn't be determined.
    """
    try:
        ret = run_command("git --version", capture=True)
    except Exception:
        raise RuntimeError(error)
    output = ret.stdout.strip()
    # Anything that isn't "git version X.Y..." (including empty output) is
    # treated as an unknown version.
    if not output.startswith("git version"):
        return 0, 0
    parts = output[len("git version") :].strip().split(".")
    return int(parts[0]), int(parts[1])
141 |
142 |
143 | def _http_to_git(repo: str) -> str:
144 | if repo.startswith("http://"):
145 | repo = repo.replace(r"http://", r"https://")
146 | if repo.startswith(r"https://"):
147 | repo = repo.replace("https://", "git@").replace("/", ":", 1)
148 | if repo.endswith("/"):
149 | repo = repo[:-1]
150 | repo = f"{repo}.git"
151 | return repo
152 |
--------------------------------------------------------------------------------
/weasel/util/hashing.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | from pathlib import Path
3 | from typing import Iterable, Union
4 |
5 | import srsly
6 | from wasabi import msg
7 |
8 |
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Get the hash for a JSON-serializable object.

    data: The data to hash.
    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
    RETURNS (str): The hash.
    """
    if isinstance(data, dict):
        data = {name: value for name, value in data.items() if name not in exclude}
    # Sorted keys make the serialization -- and therefore the hash -- stable.
    serialized = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(serialized).hexdigest()
20 |
21 |
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses all files in that directory.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    path = Path(path)
    if path.is_file():
        return hashlib.md5(path.read_bytes()).hexdigest()
    if path.is_dir():
        # TODO: this is currently pretty slow
        dir_checksum = hashlib.md5()
        # Sorted traversal keeps the digest deterministic across runs.
        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
            dir_checksum.update(sub_file.read_bytes())
        return dir_checksum.hexdigest()
    msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
40 |
--------------------------------------------------------------------------------
/weasel/util/logging.py:
--------------------------------------------------------------------------------
import logging

# Package-wide logger shared by the weasel.util modules (e.g. config.py's
# override logging).
logger = logging.getLogger("weasel")
# Attach a stream handler with a timestamped format so log records are
# readable without any further logging configuration by the caller.
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(
    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
)
logger.addHandler(logger_stream_handler)
9 |
--------------------------------------------------------------------------------
/weasel/util/modules.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from pathlib import Path
3 | from types import ModuleType
4 | from typing import Union
5 |
6 |
def import_file(name: str, loc: Union[str, Path]) -> ModuleType:
    """Import module from a file. Used to load models from a directory.

    name (str): Name of module to load.
    loc (str / Path): Path to the file.
    RETURNS: The loaded module.
    """
    # `import importlib` alone does not guarantee the `importlib.util`
    # submodule is loaded; import it explicitly so the attribute access
    # below can't fail with an AttributeError.
    import importlib.util

    spec = importlib.util.spec_from_file_location(name, str(loc))
    module = importlib.util.module_from_spec(spec)  # type: ignore
    spec.loader.exec_module(module)  # type: ignore[union-attr]
    return module
18 |
--------------------------------------------------------------------------------
/weasel/util/remote.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from pathlib import Path
3 | from typing import TYPE_CHECKING, Union
4 |
5 | if TYPE_CHECKING:
6 | from cloudpathlib import CloudPath
7 |
8 |
def upload_file(src: Path, dest: Union[str, "CloudPath"]) -> None:
    """Upload a file.

    src (Path): The source path.
    dest (str / CloudPath): The destination URL or path to upload to.
    """
    import smart_open

    # Local destinations need their parent directory to exist first.
    if isinstance(dest, Path) and not dest.parent.exists():
        dest.parent.mkdir(parents=True)

    with smart_open.open(str(dest), mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            output_file.write(input_file.read())
26 |
27 |
def download_file(
    src: Union[str, "CloudPath"], dest: Path, *, force: bool = False
) -> None:
    """Download a file using smart_open.

    src (str / CloudPath): The URL or path of the file.
    dest (Path): The destination path.
    force (bool): Whether to download even if the destination exists.
        If False, the download will be skipped.
    """
    import smart_open

    if dest.exists() and not force:
        return None
    with smart_open.open(str(src), mode="rb", compression="disable") as input_file:
        with dest.open(mode="wb") as output_file:
            shutil.copyfileobj(input_file, output_file)
46 |
--------------------------------------------------------------------------------
/weasel/util/validation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from configparser import InterpolationError
3 | from contextlib import contextmanager
4 | from pathlib import Path
5 | from typing import TYPE_CHECKING, Any, Dict, Optional, Union
6 |
7 | from confection import ConfigValidationError
8 | from wasabi import msg
9 |
10 | from ..cli.main import PROJECT_FILE
11 |
12 | if TYPE_CHECKING:
13 | pass
14 |
15 |
@contextmanager
def show_validation_error(
    file_path: Optional[Union[str, Path]] = None,
    *,
    title: Optional[str] = None,
    desc: str = "",
    show_config: Optional[bool] = None,
    hint_fill: bool = True,
):
    """Helper to show custom config validation errors on the CLI.

    file_path (str / Path): Optional file path of config file, used in hints.
    title (str): Override title of custom formatted error.
    desc (str): Override description of custom formatted error.
    show_config (bool): Whether to output the config the error refers to.
    hint_fill (bool): Show hint about filling config.

    Exits the process with status 1 on ConfigValidationError or
    InterpolationError raised inside the with-block.
    """
    try:
        yield
    except ConfigValidationError as e:
        # Prefer the explicit title/desc overrides; fall back to the error's.
        title = title if title is not None else e.title
        if e.desc:
            desc = f"{e.desc}" if not desc else f"{e.desc}\n\n{desc}"
        # Re-generate a new error object with overrides
        err = e.from_error(e, title="", desc=desc, show_config=show_config)
        msg.fail(title)
        print(err.text.strip())
        if hint_fill and "value_error.missing" in err.error_types:
            config_path = (
                file_path
                if file_path is not None and str(file_path) != "-"
                else "config.cfg"
            )
            msg.text(
                "If your config contains missing values, you can run the 'init "
                "fill-config' command to fill in all the defaults, if possible:",
                spaced=True,
            )
            # NOTE(review): this hint still references the spaCy CLI -- confirm
            # it's the right advice for standalone Weasel projects.
            print(f"python -m spacy init fill-config {config_path} {config_path} \n")
        sys.exit(1)
    except InterpolationError as e:
        msg.fail("Config validation error", e, exits=1)
58 |
59 |
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    duplicates = {name for name in command_names if command_names.count(name) > 1}
    if duplicates:
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in config.get("workflows", {}).items():
        # A workflow must not shadow a command name.
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        # Every step must refer to a defined command.
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
84 |
--------------------------------------------------------------------------------
/weasel/util/versions.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from packaging.specifiers import InvalidSpecifier, SpecifierSet
4 | from packaging.version import InvalidVersion, Version
5 |
6 |
def is_compatible_version(
    version: str, constraint: str, prereleases: bool = True
) -> Optional[bool]:
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint. The empty
    # check prevents an IndexError on constraint[0] for an empty string
    # (an empty SpecifierSet matches any version).
    if constraint and constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        parsed_version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return parsed_version in spec
31 |
32 |
def get_minor_version(version: str) -> Optional[str]:
    """Get the major + minor version (without patch or prerelease identifiers).

    version (str): The version.
    RETURNS (str): The major + minor version or None if version is invalid.
    """
    try:
        parsed = Version(version)
    except (TypeError, InvalidVersion):
        return None
    return f"{parsed.major}.{parsed.minor}"
44 |
45 |
def is_minor_version_match(version_a: str, version_b: str) -> bool:
    """Compare two versions and check if they match in major and minor, without
    patch or prerelease identifiers. Used internally for compatibility checks
    that should be insensitive to patch releases.

    version_a (str): The first version
    version_b (str): The second version.
    RETURNS (bool): Whether the versions match.
    """
    minor_a = get_minor_version(version_a)
    minor_b = get_minor_version(version_b)
    if minor_a is None or minor_b is None:
        return False
    return minor_a == minor_b
58 |
--------------------------------------------------------------------------------