├── .devcontainer.json ├── .github └── workflows │ ├── example-get-started-deploy.yaml │ ├── example-get-started-experiments-deploy.yaml │ ├── example-get-started-experiments-test.yaml │ └── example-get-started-test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── example-get-started-experiments ├── .gitignore ├── README.md ├── code │ ├── .devcontainer.json │ ├── .gitattributes │ ├── .github │ │ └── workflows │ │ │ ├── deploy-model-sagemaker.yml │ │ │ ├── deploy-model-template.yml │ │ │ └── dvc-studio.yml │ ├── .gitignore │ ├── .gitlab-ci.yml │ ├── LICENSE │ ├── README.md │ ├── data │ │ ├── .gitignore │ │ └── pool_data.dvc │ ├── gitlab-workflows │ │ └── cloud-experiment.gitlab-ci.yml │ ├── notebooks │ │ └── TrainSegModel.ipynb │ ├── params.yaml │ ├── requirements.txt │ ├── sagemaker │ │ ├── code │ │ │ ├── inference.py │ │ │ └── requirements.txt │ │ └── deploy_model.py │ └── src │ │ ├── data_split.py │ │ ├── endpoint_prediction.py │ │ ├── evaluate.py │ │ └── train.py └── generate.sh ├── example-get-started ├── .gitignore ├── README.md ├── code │ ├── .devcontainer.json │ ├── .gitattributes │ ├── .github │ │ └── workflows │ │ │ └── cml.yaml │ ├── .gitlab-ci.yml │ ├── README.md │ ├── params.yaml │ └── src │ │ ├── evaluate.py │ │ ├── featurization.py │ │ ├── prepare.py │ │ ├── requirements.txt │ │ └── train.py ├── deploy.sh ├── generate.sh └── generate_data.py └── example-gto ├── code ├── .github │ └── workflows │ │ └── gto-act-on-tags.yml ├── .gitignore ├── README.md ├── mlem │ ├── .github │ │ └── workflows │ │ │ └── deploy-model-with-mlem.yml │ ├── .mlem.yaml │ ├── deploy │ │ ├── dev.mlem │ │ ├── prod.mlem │ │ └── staging.mlem │ ├── requirements.txt │ └── train.py └── requirements.txt └── generate.sh /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example-repos-dev", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.10", 4 | "runArgs": ["--ipc=host"], 5 | "extensions": ["Iterative.dvc", "ms-python.python", "redhat.vscode-yaml"], 6 | "features": { 7 | "ghcr.io/devcontainers/features/nvidia-cuda:1": { 8 | "installCudnn": true 9 | }, 10 | "ghcr.io/saml-to/devcontainer-features/assume-aws-role:1": { 11 | "role": "arn:aws:iam::342840881361:role/iterative-saml-codespaces" 12 | }, 13 | "ghcr.io/devcontainers/features/aws-cli:1": {}, 14 | "ghcr.io/devcontainers/features/github-cli:1": {} 15 | }, 16 | "customizations": { 17 | "codespaces": { 18 | "repositories": { 19 | "iterative/example-get-started": { 20 | "permissions": "write-all" 21 | }, 22 | "iterative/example-get-started-experiments": { 23 | "permissions": "write-all" 24 | } 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-deploy.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started deploy 2 | on: 3 | push: 4 | paths: 5 | - example-get-started/** 6 | branches: 7 | - master 8 | workflow_dispatch: 9 | permissions: write-all 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | environment: aws 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.11' 21 | cache: 'pip' 22 | - uses: aws-actions/configure-aws-credentials@v2 23 | with: 24 | aws-region: us-east-2 25 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 26 | role-duration-seconds: 43200 27 | - uses: iterative/setup-dvc@v1 
28 | - name: Generate repo 29 | run: | 30 | pip install virtualenv 31 | cd example-get-started 32 | ./generate.sh prod 33 | - name: Deploy repo 34 | env: 35 | GH_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 36 | run: | 37 | cd example-get-started/build/example-get-started 38 | . .venv/bin/activate 39 | # add remote 40 | git remote add origin https://${{ secrets.PERSONAL_ACCESS_TOKEN }}@github.com/iterative/example-get-started.git 41 | # close open PRs 42 | gh pr close try-large-dataset 43 | gh pr close tune-hyperparams 44 | # drop existing refs 45 | git ls-remote origin | awk '{print $2}' | xargs -n 1 git push --delete origin || true 46 | # force push branches 47 | git push --force origin main 48 | git push --force origin try-large-dataset 49 | git push --force origin tune-hyperparams 50 | # we push git tags one by one for Studio to receive webhooks: 51 | git tag --sort=creatordate | xargs -n 1 git push --force origin 52 | # push exp refs 53 | dvc exp push origin -A 54 | # create PRs 55 | gh pr create -t "Try 40K dataset (4x data)" \ 56 | -b "We are trying here a large dataset, since the smaller one looks unstable" \ 57 | -B main -H try-large-dataset 58 | gh pr create -t "Run experiments tuning random forest params" \ 59 | -b "Better RF split and number of estimators based on small grid search." \ 60 | -B main -H tune-hyperparams 61 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-experiments-deploy.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started-experiments deploy 2 | on: 3 | push: 4 | paths: 5 | - example-get-started-experiments/** 6 | branches: 7 | - master 8 | workflow_dispatch: 9 | permissions: write-all 10 | jobs: 11 | deploy-runner: 12 | environment: aws 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: iterative/setup-cml@v2 17 | - uses: aws-actions/configure-aws-credentials@v1 18 | with: 19 | aws-region: us-east-2 20 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 21 | role-duration-seconds: 43200 22 | - name: Create Runner 23 | env: 24 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 25 | run: | 26 | cml runner launch --single \ 27 | --labels=cml \ 28 | --cloud=aws \ 29 | --cloud-region=us-east \ 30 | --cloud-hdd-size=40 \ 31 | --cloud-type=g5.2xlarge \ 32 | --idle-timeout=3600 \ 33 | test: 34 | needs: deploy-runner 35 | runs-on: [ self-hosted, cml ] 36 | environment: aws 37 | container: 38 | image: iterativeai/cml:0-dvc2-base1-gpu 39 | options: --gpus all --ipc host 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v3 43 | - name: Set up Python 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: '3.11' 47 | cache: 'pip' 48 | - uses: aws-actions/configure-aws-credentials@v2 49 | with: 50 | aws-region: us-east-2 51 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 52 | role-duration-seconds: 43200 53 | - name: Generate repo 54 | env: 55 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 56 | run: | 57 | pip install virtualenv 58 | cd example-get-started-experiments 59 | ./generate.sh 60 | - name: Deploy repo 61 | env: 62 | GH_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 63 | run: | 64 | cd example-get-started-experiments/build/example-get-started-experiments 65 | . 
.venv/bin/activate 66 | # add remote 67 | git remote add origin https://${{ secrets.PERSONAL_ACCESS_TOKEN }}@github.com/iterative/example-get-started-experiments.git 68 | # drop existing refs 69 | git ls-remote origin | awk '{print $2}' | xargs -n 1 git push --delete origin || true 70 | # push updated refs 71 | git push --force origin main 72 | dvc exp push origin -A 73 | # we push git tags one by one for Studio to receive webhooks: 74 | git tag --sort=creatordate | xargs -n 1 git push --force origin 75 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-experiments-test.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started-experiments test 2 | on: 3 | push: 4 | paths: 5 | - example-get-started-experiments/** 6 | branches: 7 | - '**' # matches every branch 8 | - '!master' # excludes master 9 | workflow_dispatch: 10 | schedule: 11 | - cron: '0 0 * * 1' 12 | permissions: 13 | contents: read 14 | id-token: write 15 | jobs: 16 | deploy-runner: 17 | environment: aws 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v3 21 | - uses: iterative/setup-cml@v2 22 | - uses: aws-actions/configure-aws-credentials@v1 23 | with: 24 | aws-region: us-east-2 25 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 26 | role-duration-seconds: 43200 27 | - name: Create Runner 28 | env: 29 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 30 | run: | 31 | cml runner launch --single \ 32 | --labels=cml \ 33 | --cloud=aws \ 34 | --cloud-region=us-east \ 35 | --cloud-hdd-size=40 \ 36 | --cloud-type=g5.2xlarge \ 37 | --idle-timeout=3600 \ 38 | test: 39 | needs: deploy-runner 40 | runs-on: [ self-hosted, cml ] 41 | environment: aws 42 | container: 43 | image: iterativeai/cml:0-dvc2-base1-gpu 44 | options: --gpus all --ipc host 45 | steps: 46 | - name: Checkout repo 47 | uses: actions/checkout@v3 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | with: 51 | python-version: '3.11' 52 | cache: 'pip' 53 | - uses: aws-actions/configure-aws-credentials@v2 54 | with: 55 | aws-region: us-east-2 56 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 57 | role-duration-seconds: 43200 58 | - name: Generate repo 59 | env: 60 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 61 | run: | 62 | pip install virtualenv 63 | cd example-get-started-experiments 64 | ./generate.sh 65 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-test.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started test 2 | on: 3 | push: 4 | paths: 5 | - example-get-started/** 6 | branches: 7 | - '**' # matches every branch 8 | - '!master' # excludes master 9 | workflow_dispatch: 10 | schedule: 11 | - cron: '0 0 * * 1' 12 | permissions: 13 | contents: read 14 | id-token: write 15 | jobs: 16 | test: 17 | runs-on: ubuntu-latest 18 | environment: aws 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v3 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.11' 26 | cache: 'pip' 27 | - uses: aws-actions/configure-aws-credentials@v2 28 | with: 29 | aws-region: us-east-2 30 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 31 | role-duration-seconds: 43200 32 | - name: Generate repo 33 | run: | 34 | pip install virtualenv 35 | cd example-get-started 36 | ./generate.sh 37 | 
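For reference, the two test workflows above do nothing beyond regenerating the corresponding project on a fresh runner. A minimal local equivalent is sketched below (it assumes Git, Python 3, and virtualenv are available, and that any S3 access the generation scripts need is already configured):

```console
$ pip install virtualenv
$ cd example-get-started
$ ./generate.sh    # the generated project lands under build/example-get-started
$ cd ..
```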
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | MANIFEST 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | .pytest_cache/ 37 | 38 | # Sphinx documentation 39 | docs/_build/ 40 | 41 | # PyBuilder 42 | target/ 43 | 44 | # Environments 45 | .env 46 | .venv 47 | env/ 48 | venv/ 49 | ENV/ 50 | env.bak/ 51 | venv.bak/ 52 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.4.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-executables-have-shebangs 11 | - id: check-merge-conflict 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: end-of-file-fixer 15 | - id: mixed-line-ending 16 | - id: sort-simple-yaml 17 | - id: trailing-whitespace 18 | - repo: local 19 | hooks: 20 | - id: todo 21 | name: Check TODO 22 | language: pygrep 23 | entry: WIP 24 | args: [-i] 25 | types: [text] 26 | exclude: ^.pre-commit-config.yaml$ 27 | - repo: https://github.com/lovesegfault/beautysh 28 | rev: v6.1.0 29 | hooks: 30 | - id: beautysh 31 | args: [-i, '2'] 32 | - repo: https://gitlab.com/pycqa/flake8 33 | rev: 3.9.2 34 | hooks: 35 | - id: flake8 36 | args: [-j8, --max-line-length=99, --extend-ignore=P1] 37 | additional_dependencies: 38 | - flake8-bugbear 39 | - flake8-comprehensions 40 | - flake8-debugger 41 | - flake8-string-format 42 | - repo: https://github.com/PyCQA/isort 43 | rev: 5.8.0 44 | hooks: 45 | - id: isort 46 | args: [--profile=black, -l=99] 47 | - repo: https://github.com/ambv/black 48 | rev: 22.3.0 49 | hooks: 50 | - id: black 51 | args: [-l, '99'] 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Get Started Tutorial (sources) 2 | 3 | Contains source code and [Shell](https://www.shellscript.sh/) scripts to 4 | generate and deploy example DVC repositories used in the [Get 5 | Started](https://dvc.org/doc/get-started) and other sections of the DVC docs. 
6 | 7 | ## Requirements 8 | 9 | Please make sure you have these available on the environment where these scripts 10 | will run: 11 | 12 | - [Git](https://git-scm.com/) 13 | - [Python](https://www.python.org/) 3 (with `python3` and [pip](https://pypi.org/project/pip/) commands) 14 | - [Virtualenv](https://virtualenv.pypa.io/en/stable/) 15 | 16 | ## Naming Convention for Example Repositories 17 | 18 | In order to have a consistent naming scheme across all example repositories, the 19 | new repositories should be named as: 20 | 21 | ``` 22 | example-PROD-FEATURE 23 | ``` 24 | 25 | where `PROD` is one of the products like `dvc`, `cml`, `studio`, or `dvclive`, and `FEATURE` is 26 | the feature that the repository focuses on, like `experiments` or `pipelines`. 27 | You can also use additional keywords as a suffix to differentiate it from the others. 28 | 29 | ⚠️ Please create all new repositories with the prefix `example-`. 30 | 31 | ## Scripts 32 | 33 | Each example DVC project lives in one of the root directories (below). `cd` into 34 | the directory before running the desired script, for example: 35 | 36 | ```console 37 | $ cd example-get-started 38 | $ ./deploy.sh 39 | ``` 40 | 41 | ### example-get-started 42 | 43 | There are 2 GitHub Actions set up to test and deploy the project: 44 | 45 | - [test](.github/workflows/example-get-started-test.yaml) 46 | - [deploy](.github/workflows/example-get-started-deploy.yaml) 47 | 48 | These will automatically test and deploy the project. If you need to run the project 49 | locally/manually, you only directly need `generate.sh`. `deploy.sh` is a helper script 50 | run within `generate.sh`. 51 | 52 | - `generate.sh`: Generates the `example-get-started` DVC project from 53 | scratch. 54 | 55 | By default, the source code archive is derived from the local workspace for 56 | development purposes. 57 | 58 | For deployment, use `generate.sh prod` to upload/download a source code 59 | archive from S3 the same way as in [Connect Code and 60 | Data](https://dvc.org/doc/get-started/connect-code-and-data). 61 | 62 | - `deploy.sh`: Makes and deploys the code archive from 63 | [example-get-started/code](example-get-started/code) to use for `generate.sh`. 64 | 65 | By default, it makes a local code archive at example-get-started/code.zip. 66 | 67 | For deployment, use `deploy.sh prod` to upload it to S3. 68 | 69 | > Requires AWS CLI and write access to `s3://dvc-public/code/get-started/`. 70 | 71 | ### example-get-started-experiments 72 | 73 | There are 2 GitHub Actions set up to test and deploy the project: 74 | 75 | - [test](.github/workflows/example-get-started-experiments-test.yaml) 76 | - [deploy](.github/workflows/example-get-started-experiments-deploy.yaml) 77 | 78 | These will automatically test and deploy the project. If you need to run the project locally/manually, run `generate.sh`. 79 | 80 | Even after automatic deployment, you still need to follow the 81 | [instructions](example-get-started-experiments/README.md) to: 82 | - Update Studio to create a PR from the best generated experiment. 83 | - Push to GitLab if you want to update the repo there. 
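As a concrete example, the production refresh of `example-get-started` performed by its deploy workflow reduces roughly to the following sketch (it assumes AWS credentials with write access to `s3://dvc-public/code/get-started/` are already configured):

```console
$ pip install virtualenv
$ cd example-get-started
$ ./generate.sh prod    # uses the S3 code archive; deploy.sh is invoked internally
```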
84 | -------------------------------------------------------------------------------- /example-get-started-experiments/.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | *.zip 3 | /tmp 4 | build/ 5 | -------------------------------------------------------------------------------- /example-get-started-experiments/README.md: -------------------------------------------------------------------------------- 1 | Generate the actual repo by running: 2 | 3 | ```shell 4 | bash generate.sh 5 | ``` 6 | 7 | The repo generated in `build/example-get-started-experiments` is intended to be 8 | published on https://github.com/iterative/example-get-started-experiments. 9 | Make sure the GitHub repo exists first and that you have appropriate write 10 | permissions. 11 | 12 | Run the commands below to force push it. 13 | Modify `ORIGIN` as needed; for example, use 14 | `git@gitlab.com:iterative.ai/example-get-started-experiments.git` to force 15 | push a copy to GitLab. 16 | 17 | ```shell 18 | cd build/example-get-started-experiments 19 | export ORIGIN=https://github.com/iterative/example-get-started-experiments.git 20 | git remote add origin ${ORIGIN} 21 | git push --force origin main 22 | # we push git tags one by one for Studio to receive webhooks: 23 | git tag --sort=creatordate | xargs -n 1 git push --force origin 24 | ``` 25 | 26 | Run these to drop and then rewrite the experiment references on the repo: 27 | 28 | ```shell 29 | source .venv/bin/activate 30 | dvc exp remove -A -g origin 31 | dvc exp push origin -A 32 | ``` 33 | 34 | Finally, return to the directory where you started: 35 | 36 | ```shell 37 | cd ../.. 38 | ``` 39 | 40 | You may remove the generated repo with: 41 | 42 | ```shell 43 | rm -fR build 44 | ``` 45 | 46 | To update the project in Studio, follow the instructions at: 47 | 48 | https://github.com/iterative/studio/wiki/Updating-and-synchronizing-demo-project 49 | 50 | 51 | Pay attention to whether the experiments shown in the experiments table are "detached" 52 | and whether the experiments you just pushed show up in the Project table. 53 | 54 | Manual Studio PR: 55 | 56 | Once the repo has been generated and pushed, go to the 57 | [corresponding Studio project](https://studio.iterative.ai/team/Iterative/projects/example-get-started-experiments-y8toqd433r) 58 | and create a PR from the best of the 3 experiments that are found in the latest 59 | commit of the `main` branch. 
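Before opening the Studio PR, it can help to confirm what actually reached the remote. The following sketch assumes `origin` is the remote added above and relies on DVC storing experiment refs under `refs/exps/`:

```console
$ git ls-remote origin 'refs/tags/*'    # tags pushed one by one above
$ git ls-remote origin 'refs/exps/*'    # experiment refs from `dvc exp push`
$ dvc exp list origin --all-commits     # experiments as reported by DVC
```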
60 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example-cv", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.10", 4 | "runArgs": ["--ipc=host"], 5 | "features": { 6 | "ghcr.io/devcontainers/features/nvidia-cuda:1": { 7 | "installCudnn": true 8 | }, 9 | "ghcr.io/iterative/features/nvtop:1": {} 10 | }, 11 | "extensions": [ 12 | "Iterative.dvc", 13 | "ms-python.python", 14 | "redhat.vscode-yaml" 15 | ], 16 | "postCreateCommand": "pip install --user -r requirements.txt" 17 | } 18 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.gitattributes: -------------------------------------------------------------------------------- 1 | *.dvc linguist-language=YAML 2 | dvc.lock linguist-language=YAML 3 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.github/workflows/deploy-model-sagemaker.yml: -------------------------------------------------------------------------------- 1 | name: Deploy model (Sagemaker) 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | permissions: 9 | contents: write 10 | id-token: write 11 | 12 | jobs: 13 | parse: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: "Parse GTO tag" 18 | id: gto 19 | uses: iterative/gto-action@v2 20 | outputs: 21 | event: ${{ steps.gto.outputs.event }} 22 | name: ${{ steps.gto.outputs.name }} 23 | stage: ${{ steps.gto.outputs.stage }} 24 | version: ${{ steps.gto.outputs.version }} 25 | 26 | deploy-model: 27 | needs: parse 28 | if: "${{ needs.parse.outputs.event == 'assignment' }}" 29 | environment: cloud 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v3 33 | with: 34 | fetch-depth: 0 35 | 36 | - uses: aws-actions/configure-aws-credentials@v4 37 | with: 38 | aws-region: us-east-2 39 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 40 | role-duration-seconds: 43200 41 | 42 | - name: Set up Python 43 | uses: actions/setup-python@v4 44 | with: 45 | python-version: '3.8' 46 | cache: 'pip' 47 | cache-dependency-path: requirements.txt 48 | 49 | - run: pip install -r requirements.txt 50 | 51 | - run: dvc remote add -d --local storage s3://dvc-public/remote/get-started-pools 52 | 53 | - run: | 54 | MODEL_DATA=$(dvc get --show-url . 
model.tar.gz) 55 | python sagemaker/deploy_model.py \ 56 | --name ${{ needs.parse.outputs.name }} \ 57 | --stage ${{ needs.parse.outputs.stage }} \ 58 | --version ${{ needs.parse.outputs.version }} \ 59 | --model_data $MODEL_DATA \ 60 | --role ${{ vars.AWS_SANDBOX_ROLE }} 61 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.github/workflows/deploy-model-template.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Model (Template) 2 | 3 | on: 4 | # the workflow is triggered whenever a tag is pushed to the repository 5 | push: 6 | tags: 7 | - "*" 8 | jobs: 9 | 10 | # This job parses the git tag with the GTO GitHub Action to identify model registry actions 11 | parse: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: "Parse GTO tag" 16 | id: gto 17 | uses: iterative/gto-action@v2 18 | outputs: 19 | event: ${{ steps.gto.outputs.event }} 20 | name: ${{ steps.gto.outputs.name }} 21 | stage: ${{ steps.gto.outputs.stage }} 22 | version: ${{ steps.gto.outputs.version }} 23 | 24 | deploy-model: 25 | needs: parse 26 | # using the outputs from the "parse" job, we run this job only for actions 27 | # in the model registry and only when the model was assigned to a stage called "prod" 28 | if: ${{ needs.parse.outputs.event == 'assignment' && needs.parse.outputs.stage == 'prod' }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: iterative/setup-dvc@v1 32 | # this step uses DVC to download the model from our remote repository and deploys the model 33 | # Model deployment is mocked here as it is specific to each deployment environment 34 | # The DVC Studio token is used to avoid having to store specific remote storage credentials on GitHub 35 | - name: Get Model For Deployment 36 | run: | 37 | dvc config --global studio.token ${{ secrets.DVC_STUDIO_TOKEN }} 38 | dvc artifacts get ${{ github.server_url }}/${{ github.repository }} ${{ needs.parse.outputs.name }} --rev ${{ needs.parse.outputs.version }} 39 | echo "The right model is available and you can use the rest of this command to deploy it. Good job!" 
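The artifact lookup in the final step above can be reproduced locally to sanity-check a registered model version before pushing a tag. In this sketch the token and version are placeholders, and `pool-segmentation` is the artifact name logged from the training notebook:

```console
$ dvc config --global studio.token <DVC_STUDIO_TOKEN>
$ dvc artifacts get https://github.com/iterative/example-get-started-experiments \
    pool-segmentation --rev <version>
```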
40 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.github/workflows/dvc-studio.yml: -------------------------------------------------------------------------------- 1 | name: DVC Studio Experiment 2 | 3 | on: 4 | 5 | push: 6 | tags-ignore: 7 | - '**' 8 | 9 | workflow_dispatch: 10 | inputs: 11 | exp-run-args: 12 | description: 'Args to be passed to dvc exp run call' 13 | required: false 14 | type: string 15 | default: '' 16 | parent-sha: 17 | description: 'SHA of the commit to start the experiment from' 18 | required: false 19 | type: string 20 | default: '' 21 | cloud: 22 | description: 'Cloud compute provider to host the runner' 23 | required: false 24 | default: 'aws' 25 | type: choice 26 | options: 27 | - aws 28 | - azure 29 | - gcp 30 | type: 31 | description: 'https://registry.terraform.io/providers/iterative/iterative/latest/docs/resources/task#machine-type' 32 | required: false 33 | default: 'g5.2xlarge' 34 | region: 35 | description: 'https://registry.terraform.io/providers/iterative/iterative/latest/docs/resources/task#cloud-region' 36 | required: false 37 | default: 'us-east' 38 | spot: 39 | description: 'Request a spot instance' 40 | required: false 41 | default: false 42 | type: boolean 43 | storage: 44 | description: 'Disk size in GB' 45 | required: false 46 | default: 40 47 | type: number 48 | timeout: 49 | description: 'Timeout in seconds' 50 | required: false 51 | default: 3600 52 | type: number 53 | 54 | permissions: 55 | contents: write 56 | id-token: write 57 | pull-requests: write 58 | 59 | jobs: 60 | 61 | deploy-runner: 62 | if: ${{ (github.actor == 'iterative-studio[bot]') || (github.event_name == 'workflow_dispatch') }} 63 | environment: cloud 64 | runs-on: ubuntu-latest 65 | 66 | steps: 67 | - uses: actions/checkout@v3 68 | with: 69 | ref: ${{ inputs.parent-sha || '' }} 70 | - uses: iterative/setup-cml@v2 71 | - uses: aws-actions/configure-aws-credentials@v4 72 | with: 73 | aws-region: us-east-2 74 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 75 | role-duration-seconds: 43200 76 | - name: Create Runner 77 | env: 78 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 79 | run: | 80 | cml runner launch --single \ 81 | --labels=cml \ 82 | --cloud=${{ inputs.cloud || 'aws' }} \ 83 | --cloud-region=${{ inputs.region || 'us-east' }} \ 84 | --cloud-hdd-size=${{ inputs.storage || '40' }} \ 85 | --cloud-type=${{ inputs.type || 'g5.2xlarge' }} \ 86 | --idle-timeout=${{ inputs.timeout || '3600' }} \ 87 | ${{ (inputs.spot == 'true' && '--cloud-spot') || '' }} 88 | 89 | runner-job: 90 | needs: deploy-runner 91 | runs-on: [ self-hosted, cml ] 92 | environment: cloud 93 | container: 94 | image: iterativeai/cml:latest-gpu 95 | options: --gpus all --ipc host 96 | 97 | steps: 98 | - uses: actions/checkout@v3 99 | with: 100 | ref: ${{ inputs.parent-sha || '' }} 101 | - uses: aws-actions/configure-aws-credentials@v4 102 | with: 103 | aws-region: us-east-2 104 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 105 | role-duration-seconds: 43200 106 | 107 | - run: pip install -r requirements.txt 108 | 109 | - name: Train 110 | env: 111 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 112 | DVC_STUDIO_TOKEN: ${{ secrets.DVC_STUDIO_TOKEN }} 113 | DVCLIVE_LOGLEVEL: DEBUG 114 | run: | 115 | cml ci --fetch-depth 0 116 | dvc exp run --pull --allow-missing ${{ github.event.inputs.exp-run-args }} 117 | dvc remote add --local push_remote s3://dvc-public/remote/get-started-pools 118 | 119 | - name: Workflow Dispatch Sharing 
120 | if: github.event_name == 'workflow_dispatch' 121 | env: 122 | DVC_STUDIO_TOKEN: ${{ secrets.DVC_STUDIO_TOKEN }} 123 | run: | 124 | dvc exp push origin -r push_remote 125 | 126 | - name: Commit-based Sharing 127 | if: github.actor == 'iterative-studio[bot]' 128 | env: 129 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 130 | run: | 131 | dvc push -r push_remote 132 | cml pr --squash --skip-ci . 133 | echo "## Metrics" > report.md 134 | dvc metrics diff main --md >> report.md 135 | echo "## Params" >> report.md 136 | dvc params diff main --md >> report.md 137 | cml comment create --pr report.md 138 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # DVCLive report 132 | dvclive/report.md 133 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Deploy Model (Template) 2 | 3 | workflow: 4 | rules: 5 | # Run the pipeline whenever a tag is pushed to the repository 6 | - if: $CI_COMMIT_TAG 7 | 8 | parse: 9 | # This job parses the model tag to identify model registry actions 10 | image: python:3.11-slim 11 | script: 12 | # Install GTO to parse model tags 13 | - pip install gto 14 | # This job parses the model tags to identify model registry actions 15 | - echo "CI_COMMIT_TAG - ${CI_COMMIT_TAG}" 16 | - echo MODEL_NAME="$(gto check-ref ${CI_COMMIT_TAG} --name)" >> parse.env 17 | - echo MODEL_VERSION="$(gto check-ref ${CI_COMMIT_TAG} --version)" >> parse.env 18 | - echo MODEL_EVENT="$(gto check-ref ${CI_COMMIT_TAG} --event)" >> parse.env 19 | - echo MODEL_STAGE="$(gto check-ref ${CI_COMMIT_TAG} --stage)" >> parse.env 20 | # Print variables saved to parse.env 21 | - cat parse.env 22 | artifacts: 23 | reports: 24 | dotenv: parse.env 25 | 26 | deploy-model: 27 | needs: 28 | - job: parse 29 | artifacts: true 30 | image: python:3.11-slim 31 | script: 32 | # Check if the model is assigned to prod (variables from parse.env are only available in the 'script' section) 33 | - if [[ $MODEL_EVENT == 'assignment' && $MODEL_STAGE == 'prod' ]]; then echo "Deploy model"; else exit 1; fi 34 | # Install DVC 35 | - pip install dvc 36 | # Build commands to download and deploy the model 37 | - dvc config --global studio.token ${DVC_STUDIO_TOKEN} 38 | - dvc artifacts get ${CI_REPOSITORY_URL} ${MODEL_NAME} --rev ${MODEL_VERSION} 39 | - echo "The right model is available and you can use the rest of this command to deploy it. Good job!" -------------------------------------------------------------------------------- /example-get-started-experiments/code/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Iterative 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/README.md: -------------------------------------------------------------------------------- 1 | [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/projects/example-get-started-experiments-y8toqd433r) 2 | [![DVC-metrics](https://img.shields.io/badge/dynamic/json?style=flat-square&colorA=grey&colorB=F46737&label=Dice%20Metric&url=https://github.com/iterative/example-get-started-experiments/raw/main/results/evaluate/metrics.json&query=dice_multi)](https://github.com/iterative/example-get-started-experiments/raw/main/results/evaluate/metrics.json) 3 | 4 | [Train Report](./results/train/report.md) - [Evaluation Report](./results/evaluate/report.md) 5 | 6 | # DVC Get Started: Experiments 7 | 8 | This is an auto-generated repository for use in [DVC](https://dvc.org) 9 | [Get Started: Experiments](https://dvc.org/doc/start/experiment-management). 10 | 11 | This is a Computer Vision (CV) project that solves the problem of segmenting out 12 | swimming pools from satellite images. 13 | 14 | [Example results](./results/evaluate/plots/images/) 15 | 16 | We use a slightly modified version of the [BH-Pools dataset](http://patreo.dcc.ufmg.br/2020/07/29/bh-pools-watertanks-datasets/): 17 | we split the original 4k images into tiles of 1024x1024 pixels. 18 | 19 | 20 | 🐛 Please report any issues found in this project here: 21 | [example-repos-dev](https://github.com/iterative/example-repos-dev). 22 | 23 | ## Installation 24 | 25 | Python 3.8+ is required to run code from this repo. 26 | 27 | ```console 28 | $ git clone https://github.com/iterative/example-get-started-experiments 29 | $ cd example-get-started-experiments 30 | ``` 31 | 32 | Now let's install the requirements. But before we do that, we **strongly** 33 | recommend creating a virtual environment with a tool such as 34 | [virtualenv](https://virtualenv.pypa.io/en/stable/): 35 | 36 | ```console 37 | $ python -m venv .venv 38 | $ source .venv/bin/activate 39 | $ pip install -r requirements.txt 40 | ``` 41 | 42 | This DVC project comes with a preconfigured DVC 43 | [remote storage](https://dvc.org/doc/command-reference/remote) that holds the raw 44 | input data as well as the intermediate and final results it produces. This is a 45 | read-only HTTP remote. 46 | 47 | ```console 48 | $ dvc remote list 49 | storage https://remote.dvc.org/get-started-pools 50 | ``` 51 | 52 | You can run [`dvc pull`](https://man.dvc.org/pull) to download the data: 53 | 54 | ```console 55 | $ dvc pull 56 | ``` 57 | 58 | ## Running in your environment 59 | 60 | Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the 61 | [pipeline](https://dvc.org/doc/user-guide/pipelines/defining-pipelines): 62 | 63 | ```console 64 | $ dvc exp run 65 | Data and pipelines are up to date. 
66 | ``` 67 | 68 | If you'd like to test commands like [`dvc push`](https://man.dvc.org/push) 69 | that require write access to the remote storage, the easiest way is to set 70 | up a "local remote" on your file system: 71 | 72 | > This kind of remote is located in the local file system, but is external to 73 | > the DVC project. 74 | 75 | ```console 76 | $ mkdir -p /tmp/dvc-storage 77 | $ dvc remote add local /tmp/dvc-storage 78 | ``` 79 | 80 | You should now be able to run: 81 | 82 | ```console 83 | $ dvc push -r local 84 | ``` 85 | 86 | ## Existing stages 87 | 88 | There are a couple of Git tags in this project: 89 | 90 | ### [1-notebook-dvclive](https://github.com/iterative/example-get-started-experiments/tree/1-notebook-dvclive) 91 | 92 | Contains an end-to-end Jupyter notebook that loads data, trains a model, and 93 | reports model performance. 94 | [DVCLive](https://dvc.org/doc/dvclive) is used for experiment tracking. 95 | See this [blog post](https://iterative.ai/blog/exp-tracking-dvc-python) for more 96 | details. 97 | 98 | ### [2-dvc-pipeline](https://github.com/iterative/example-get-started-experiments/tree/2-dvc-pipeline) 99 | 100 | Contains a DVC pipeline `dvc.yaml` that was created by refactoring the above 101 | notebook into individual pipeline stages. 102 | 103 | The pipeline artifacts (processed data, model file, etc.) are automatically 104 | versioned. 105 | 106 | This tag also contains a GitHub Actions workflow that reruns the pipeline if any 107 | changes are introduced to the pipeline-related files. 108 | [CML](https://cml.dev/) is used in this workflow to provision a cloud-based GPU 109 | machine as well as report model performance results in Pull Requests. 110 | 111 | ## Model Deployment 112 | 113 | Check out the [GitHub Workflow](https://github.com/iterative/example-get-started-experiments/blob/main/.github/workflows/deploy-model-sagemaker.yml) 114 | that uses the [Iterative Studio Model Registry](https://dvc.org/doc/studio/user-guide/model-registry/what-is-a-model-registry) 115 | to deploy the model to [AWS SageMaker](https://aws.amazon.com/sagemaker/) whenever a new [version is registered](https://dvc.org/doc/studio/user-guide/model-registry/register-version). 116 | 117 | ## Project structure 118 | 119 | The data files, DVC files, and results change as stages are created one by one. 120 | After cloning and using [`dvc pull`](https://man.dvc.org/pull) to download 121 | data, models, and plots tracked by DVC, the workspace should look like this: 122 | 123 | ```console 124 | $ tree -L 2 125 | . 126 | ├── LICENSE 127 | ├── README.md 128 | ├── data 
# <-- Directory with raw and intermediate data 129 | │ ├── pool_data # <-- Raw image data 130 | │ ├── pool_data.dvc # <-- .dvc file - a placeholder/pointer to raw data 131 | │ ├── test_data # <-- Processed test data 132 | │ └── train_data # <-- Processed train data 133 | ├── dvc.lock 134 | ├── dvc.yaml # <-- DVC pipeline file 135 | ├── models 136 | │ └── model.pkl # <-- Trained model file 137 | ├── notebooks 138 | │ └── TrainSegModel.ipynb # <-- Initial notebook (refactored into `dvc.yaml`) 139 | ├── params.yaml # <-- Parameters file 140 | ├── requirements.txt # <-- Python dependencies needed in the project 141 | ├── results # <-- DVCLive reports and plots 142 | │ ├── evaluate 143 | │ └── train 144 | └── src # <-- Source code to run the pipeline stages 145 | ├── data_split.py 146 | ├── evaluate.py 147 | └── train.py 148 | ``` 149 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/data/.gitignore: -------------------------------------------------------------------------------- 1 | /pool_data 2 | /test_data 3 | /train_data 4 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/data/pool_data.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 14d187e749ee5614e105741c719fa185.dir 3 | size: 18999874 4 | nfiles: 183 5 | path: pool_data 6 | hash: md5 7 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/gitlab-workflows/cloud-experiment.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | EXP_RUN_ARGS: "" 3 | deploy-runner: 4 | image: iterativeai/cml:0-dvc2-base1 5 | script: 6 | - pip install awscli 7 | - > 8 | CREDENTIALS=($(aws sts assume-role-with-web-identity 9 | --region=us-east-1 10 | --role-arn=arn:aws:iam::342840881361:role/SandboxUser 11 | --role-session-name=GitLab 12 | --duration-seconds=3600 13 | --web-identity-token="$CI_JOB_JWT_V2" 14 | --query="Credentials.[AccessKeyId,SecretAccessKey,SessionToken]" 15 | --output=text)) 16 | - export AWS_ACCESS_KEY_ID="${CREDENTIALS[0]}" 17 | - export AWS_SECRET_ACCESS_KEY="${CREDENTIALS[1]}" 18 | - export AWS_SESSION_TOKEN="${CREDENTIALS[2]}" 19 | - aws sts get-caller-identity 20 | - > 21 | cml runner launch --single \ 22 | --labels=cml \ 23 | --cloud=aws \ 24 | --cloud-region=us-east \ 25 | --cloud-hdd-size=40 \ 26 | --cloud-type=g5.2xlarge 27 | runner-job: 28 | needs: 29 | - deploy-runner 30 | tags: 31 | - cml 32 | image: iterativeai/cml:0-dvc2-base1 33 | script: 34 | - pip install awscli 35 | - > 36 | CREDENTIALS=($(aws sts assume-role-with-web-identity 37 | --region=us-east-1 38 | --role-arn=arn:aws:iam::342840881361:role/SandboxUser 39 | --role-session-name=GitLab 40 | --duration-seconds=3600 41 | --web-identity-token="$CI_JOB_JWT_V2" 42 | --query="Credentials.[AccessKeyId,SecretAccessKey,SessionToken]" 43 | --output=text)) 44 | - export AWS_ACCESS_KEY_ID="${CREDENTIALS[0]}" 45 | - export AWS_SECRET_ACCESS_KEY="${CREDENTIALS[1]}" 46 | - export AWS_SESSION_TOKEN="${CREDENTIALS[2]}" 47 | - aws sts get-caller-identity 48 | - pip install -r requirements.txt 49 | - cml ci 50 | - dvc exp run --pull --allow-missing $EXP_RUN_ARGS 51 | - dvc remote add --local push_remote s3://dvc-public/remote/get-started-pools 52 | - dvc exp push origin -r push_remote 53 | 
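Outside GitLab CI, the experiment step of the pipeline above corresponds roughly to the following sketch (the `--set-param` override is illustrative only, and the `push_remote` remote is assumed to be configured as in the job above):

```console
$ pip install -r requirements.txt
$ dvc exp run --pull --allow-missing -S train.fine_tune_args.base_lr=0.005
$ dvc exp push origin -r push_remote
```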
-------------------------------------------------------------------------------- /example-get-started-experiments/code/notebooks/TrainSegModel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import shutil\n", 11 | "from functools import partial\n", 12 | "from pathlib import Path\n", 13 | "import warnings\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import torch\n", 17 | "from box import ConfigBox\n", 18 | "from dvclive import Live\n", 19 | "from dvclive.fastai import DVCLiveCallback\n", 20 | "from fastai.data.all import Normalize, get_files\n", 21 | "from fastai.metrics import DiceMulti\n", 22 | "from fastai.vision.all import (Resize, SegmentationDataLoaders,\n", 23 | " imagenet_stats, models, unet_learner)\n", 24 | "from ruamel.yaml import YAML\n", 25 | "from PIL import Image\n", 26 | "\n", 27 | "os.chdir(\"..\")\n", 28 | "warnings.filterwarnings(\"ignore\")" 29 | ] 30 | }, 31 | { 32 | "attachments": {}, 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### Load data and split it into train/test\n", 37 | "\n", 38 | "We have some [data in DVC](https://dvc.org/doc/start/data-management/data-versioning) that we can pull. \n", 39 | "\n", 40 | "This data includes:\n", 41 | "* satellite images\n", 42 | "* masks of the swimming pools in each satellite image\n", 43 | "\n", 44 | "DVC can help connect your data to your repo, but it isn't necessary to have your data in DVC to start tracking experiments with DVC and DVCLive." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "!dvc pull" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "test_regions = [\"REGION_1-\"]\n", 63 | "\n", 64 | "img_fpaths = get_files(Path(\"data\") / \"pool_data\" / \"images\", extensions=\".jpg\")\n", 65 | "\n", 66 | "train_data_dir = Path(\"data\") / \"train_data\"\n", 67 | "train_data_dir.mkdir(exist_ok=True)\n", 68 | "test_data_dir = Path(\"data\") / \"test_data\"\n", 69 | "test_data_dir.mkdir(exist_ok=True)\n", 70 | "for img_path in img_fpaths:\n", 71 | " msk_path = Path(\"data\") / \"pool_data\" / \"masks\" / f\"{img_path.stem}.png\"\n", 72 | " if any(region in str(img_path) for region in test_regions):\n", 73 | " shutil.copy(img_path, test_data_dir)\n", 74 | " shutil.copy(msk_path, test_data_dir)\n", 75 | " else:\n", 76 | " shutil.copy(img_path, train_data_dir)\n", 77 | " shutil.copy(msk_path, train_data_dir)" 78 | ] 79 | }, 80 | { 81 | "attachments": {}, 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Create a data loader\n", 86 | "\n", 87 | "Load and prepare the images and masks by creating a data loader." 
88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "def get_mask_path(x, train_data_dir):\n", 97 | " return Path(train_data_dir) / f\"{Path(x).stem}.png\"" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "bs = 8\n", 107 | "valid_pct = 0.20\n", 108 | "img_size = 256\n", 109 | "\n", 110 | "data_loader = SegmentationDataLoaders.from_label_func(\n", 111 | " path=train_data_dir,\n", 112 | " fnames=get_files(train_data_dir, extensions=\".jpg\"),\n", 113 | " label_func=partial(get_mask_path, train_data_dir=train_data_dir),\n", 114 | " codes=[\"not-pool\", \"pool\"],\n", 115 | " bs=bs,\n", 116 | " valid_pct=valid_pct,\n", 117 | " item_tfms=Resize(img_size),\n", 118 | " batch_tfms=[\n", 119 | " Normalize.from_stats(*imagenet_stats),\n", 120 | " ],\n", 121 | " )" 122 | ] 123 | }, 124 | { 125 | "attachments": {}, 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### Review a sample batch of data\n", 130 | "\n", 131 | "Below are some examples of the images overlaid with their masks." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "data_loader.show_batch(alpha=0.7)" 141 | ] 142 | }, 143 | { 144 | "attachments": {}, 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### Train multiple models with different learning rates using `DVCLiveCallback`\n", 149 | "\n", 150 | "Set up model training, using DVCLive to capture the results of each experiment." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6):\n", 160 | " dice_list = []\n", 161 | " for c in classes:\n", 162 | " y_true = mask_true == c\n", 163 | " y_pred = mask_pred == c\n", 164 | " intersection = 2.0 * np.sum(y_true * y_pred)\n", 165 | " dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps)\n", 166 | " dice_list.append(dice)\n", 167 | " return np.mean(dice_list)\n", 168 | "\n", 169 | "\n", 170 | "def evaluate(learn):\n", 171 | " test_img_fpaths = sorted(get_files(Path(\"data\") / \"test_data\", extensions=\".jpg\"))\n", 172 | " test_dl = learn.dls.test_dl(test_img_fpaths)\n", 173 | " preds, _ = learn.get_preds(dl=test_dl)\n", 174 | " masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8)\n", 175 | " test_mask_fpaths = [\n", 176 | " get_mask_path(fpath, Path(\"data\") / \"test_data\") for fpath in test_img_fpaths\n", 177 | " ]\n", 178 | " masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths]\n", 179 | "\n", 180 | " dice_multi = 0.0\n", 181 | " for ii in range(len(masks_true)):\n", 182 | " mask_pred, mask_true = masks_pred[ii], masks_true[ii]\n", 183 | " mask_pred = np.array(\n", 184 | " Image.fromarray(mask_pred).resize((mask_true.shape[1], mask_true.shape[0])),\n", 185 | " dtype=int\n", 186 | " )\n", 187 | " mask_true = np.array(mask_true, dtype=int)\n", 188 | " dice_multi += dice(mask_true, mask_pred) / len(masks_true)\n", 189 | "\n", 190 | " return dice_multi" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "train_arch = 'shufflenet_v2_x2_0'\n", 200 | "\n", 201 | "for base_lr in [0.001, 0.005, 0.01]:\n", 202 | " # 
initialize dvclive, optionally provide output path, and show report in notebook\n", 203 | " # don't save dvc experiment until post-training metrics below\n", 204 | " with Live(\"results/train\", report=\"notebook\", save_dvc_exp=False) as live:\n", 205 | " # log a parameter\n", 206 | " live.log_param(\"train_arch\", train_arch)\n", 207 | " fine_tune_args = {\n", 208 | " 'epochs': 8,\n", 209 | " 'base_lr': base_lr\n", 210 | " }\n", 211 | " # log a dict of parameters\n", 212 | " live.log_params(fine_tune_args)\n", 213 | "\n", 214 | " learn = unet_learner(data_loader, \n", 215 | " arch=getattr(models, train_arch), \n", 216 | " metrics=DiceMulti)\n", 217 | " # train model and automatically capture metrics with DVCLiveCallback\n", 218 | " learn.fine_tune(\n", 219 | " **fine_tune_args,\n", 220 | " cbs=[DVCLiveCallback(live=live)])\n", 221 | "\n", 222 | " # save model artifact to dvc\n", 223 | " models_dir = Path(\"models\")\n", 224 | " models_dir.mkdir(exist_ok=True)\n", 225 | " learn.export(fname=(models_dir / \"model.pkl\").absolute())\n", 226 | " torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n", 227 | " live.log_artifact(\n", 228 | " str(models_dir / \"model.pkl\"),\n", 229 | " type=\"model\",\n", 230 | " name=\"pool-segmentation\",\n", 231 | " desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n", 232 | " labels=[\"cv\", \"segmentation\", \"satellite-images\", \"unet\"],\n", 233 | " )\n", 234 | "\n", 235 | " # add additional post-training summary metrics.\n", 236 | " with Live(\"results/evaluate\") as live:\n", 237 | " live.summary[\"dice_multi\"] = evaluate(learn)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# Compare experiments\n", 247 | "!dvc exp show --only-changed" 248 | ] 249 | }, 250 | { 251 | "attachments": {}, 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Review sample preditions vs ground truth\n", 256 | "\n", 257 | "Below are some example of the predicted masks." 
258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "learn.show_results(max_n=6, alpha=0.7)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Python 3 (ipykernel)", 280 | "language": "python", 281 | "name": "python3" 282 | }, 283 | "language_info": { 284 | "codemirror_mode": { 285 | "name": "ipython", 286 | "version": 3 287 | }, 288 | "file_extension": ".py", 289 | "mimetype": "text/x-python", 290 | "name": "python", 291 | "nbconvert_exporter": "python", 292 | "pygments_lexer": "ipython3", 293 | "version": "3.11.6" 294 | }, 295 | "vscode": { 296 | "interpreter": { 297 | "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" 298 | } 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 4 303 | } 304 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/params.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | random_seed: 42 3 | 4 | data_split: 5 | test_regions: 6 | - REGION_1 7 | 8 | train: 9 | valid_pct: 0.1 10 | arch: shufflenet_v2_x2_0 11 | img_size: 256 12 | batch_size: 8 13 | fine_tune_args: 14 | epochs: 8 15 | base_lr: 0.01 16 | 17 | evaluate: 18 | n_samples_to_save: 10 19 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/requirements.txt: -------------------------------------------------------------------------------- 1 | dvc[s3]>=3.29.0 2 | dvclive>=3.0.1 3 | fastai 4 | python-box 5 | sagemaker 6 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/sagemaker/code/inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: 3 | https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#id4 4 | """ 5 | import io 6 | import os 7 | 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | from torchvision.transforms import Compose, Normalize, Resize, ToTensor 12 | 13 | 14 | def model_fn(model_dir, context): 15 | kwargs = { 16 | "f": os.path.join(model_dir, "code/model.pth") 17 | } 18 | if not torch.cuda.is_available(): 19 | kwargs["map_location"] = torch.device("cpu") 20 | model = torch.load(**kwargs) 21 | return model 22 | 23 | 24 | def input_fn(request_body, request_content_type, context): 25 | if request_content_type: 26 | img_pil = Image.open(io.BytesIO(request_body)) 27 | img_transform = Compose([Resize(512), ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 28 | img_tensor = img_transform(img_pil).unsqueeze_(0) 29 | return img_tensor 30 | else: 31 | raise ValueError(f"Unsupported request_content_type {request_content_type}") 32 | 33 | 34 | def predict_fn(input_object, model, context): 35 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 36 | model.to(device) 37 | with torch.no_grad(): 38 | result = model(input_object) 39 | return result 40 | 41 | 42 | def output_fn(prediction_output, content_type): 43 | output = np.array( 44 | prediction_output[:, 1, :] > 0.5, dtype=np.uint8 45 | ) 46 | if torch.cuda.is_available(): 47 | output = output.cpu() 48 | buffer = io.BytesIO() 49 | np.save(buffer, 
output) 50 | return buffer.getvalue() 51 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/sagemaker/code/requirements.txt: -------------------------------------------------------------------------------- 1 | fastai 2 | pillow 3 | torch 4 | torchvision -------------------------------------------------------------------------------- /example-get-started-experiments/code/sagemaker/deploy_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import sys 4 | 5 | import boto3 6 | import botocore 7 | 8 | from sagemaker.deserializers import JSONDeserializer 9 | from sagemaker.pytorch import PyTorchModel 10 | from sagemaker.serverless import ServerlessInferenceConfig 11 | 12 | 13 | memory_size = { 14 | "dev": 4096 , 15 | "staging": 4096, 16 | "prod": 6144 , 17 | "default": 4096, 18 | } 19 | max_concurrency = { 20 | "dev": 5, 21 | "staging": 5, 22 | "prod": 10, 23 | "default": 5, 24 | } 25 | 26 | 27 | def deploy( 28 | name: str, 29 | stage: str, 30 | version: str, 31 | model_data: str, 32 | role: str, 33 | ): 34 | sagemaker_logger = logging.getLogger("sagemaker") 35 | sagemaker_logger.setLevel(logging.DEBUG) 36 | sagemaker_logger.addHandler(logging.StreamHandler(sys.stdout)) 37 | 38 | version_name = re.sub( 39 | r"[^a-zA-Z0-9\-]", "-", f"{name}-{version}") 40 | 41 | model = PyTorchModel( 42 | name=version_name, 43 | model_data=model_data, 44 | framework_version="1.12", 45 | py_version="py38", 46 | role=role, 47 | env={ 48 | "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", 49 | "TS_MAX_RESPONSE_SIZE": "2000000000", 50 | "TS_MAX_REQUEST_SIZE": "2000000000", 51 | "MMS_MAX_RESPONSE_SIZE": "2000000000", 52 | "MMS_MAX_REQUEST_SIZE": "2000000000", 53 | }, 54 | ) 55 | 56 | stage_name = re.sub( 57 | r"[^a-zA-Z0-9\-]", "-", f"{name}-{stage}") 58 | try: 59 | boto3.client("sagemaker").delete_endpoint(EndpointName=stage_name) 60 | except botocore.exceptions.ClientError as e: 61 | sagemaker_logger.warn(e) 62 | try: 63 | boto3.client("sagemaker").delete_endpoint_config(EndpointConfigName=stage_name) 64 | except botocore.exceptions.ClientError as e: 65 | sagemaker_logger.warn(e) 66 | 67 | return model.deploy( 68 | initial_instance_count=1, 69 | deserializer=JSONDeserializer(), 70 | endpoint_name=stage_name, 71 | serverless_inference_config=ServerlessInferenceConfig( 72 | memory_size_in_mb=memory_size[stage], 73 | max_concurrency=max_concurrency[stage] 74 | ) 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | import argparse 80 | 81 | parser = argparse.ArgumentParser(description="Deploy a model to Amazon SageMaker") 82 | 83 | parser.add_argument("--name", type=str, required=True, help="Name of the model") 84 | parser.add_argument("--stage", type=str, required=True, help="Stage of the model") 85 | parser.add_argument("--version", type=str, required=True, help="Version of the model") 86 | parser.add_argument("--model_data", type=str, required=True, help="S3 location of the model data") 87 | parser.add_argument("--role", type=str, required=True, help="ARN of the IAM role to use") 88 | 89 | args = parser.parse_args() 90 | 91 | deploy(name=args.name, stage=args.stage, version=args.version, model_data=args.model_data, role=args.role) 92 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/data_split.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from 
pathlib import Path 3 | 4 | import numpy as np 5 | from box import ConfigBox 6 | from fastai.vision.all import get_files 7 | from ruamel.yaml import YAML 8 | 9 | 10 | yaml = YAML(typ="safe") 11 | 12 | 13 | def data_split(): 14 | params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) 15 | np.random.seed(params.base.random_seed) 16 | img_fpaths = get_files(Path("data") / "pool_data" / "images", extensions=".jpg") 17 | 18 | train_data_dir = Path("data") / "train_data" 19 | train_data_dir.mkdir(exist_ok=True) 20 | test_data_dir = Path("data") / "test_data" 21 | test_data_dir.mkdir(exist_ok=True) 22 | for img_path in img_fpaths: 23 | msk_path = Path("data") / "pool_data" / "masks" / f"{img_path.stem}.png" 24 | if any(region in str(img_path) for region in params.data_split.test_regions): 25 | shutil.copy(img_path, test_data_dir) 26 | shutil.copy(msk_path, test_data_dir) 27 | else: 28 | shutil.copy(img_path, train_data_dir) 29 | shutil.copy(msk_path, train_data_dir) 30 | 31 | 32 | if __name__ == "__main__": 33 | data_split() 34 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/endpoint_prediction.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from pathlib import Path 3 | 4 | import dvc.api 5 | import numpy as np 6 | from PIL import Image 7 | from sagemaker.deserializers import NumpyDeserializer 8 | from sagemaker.pytorch import PyTorchPredictor 9 | from sagemaker.serializers import IdentitySerializer 10 | 11 | 12 | def paint_mask(mask, color_map={0: (0, 0, 0), 1: (0, 0, 255)}): 13 | vis_shape = mask.shape + (3,) 14 | vis = np.zeros(vis_shape) 15 | for i, c in color_map.items(): 16 | vis[mask == i] = color_map[i] 17 | return Image.fromarray(vis.astype(np.uint8)) 18 | 19 | 20 | def endpoint_prediction( 21 | img_path: str, 22 | endpoint_name: str, 23 | output_path: str = "predictions", 24 | ): 25 | params = dvc.api.params_show() 26 | img_size = params["train"]["img_size"] 27 | predictor = PyTorchPredictor(endpoint_name, serializer=IdentitySerializer(), deserializer=NumpyDeserializer()) 28 | name = endpoint_name 29 | 30 | output_file = Path(output_path) / name / Path(img_path).name 31 | output_file.parent.mkdir(exist_ok=True, parents=True) 32 | 33 | io = BytesIO() 34 | Image.open(img_path).resize((img_size, img_size)).save(io, format="PNG") 35 | result = predictor.predict(io.getvalue())[0] 36 | 37 | img_pil = Image.open(img_path) 38 | overlay_img_pil = Image.blend( 39 | img_pil.convert("RGBA"), 40 | paint_mask(result).convert("RGBA").resize(img_pil.size), 41 | 0.5 42 | ) 43 | overlay_img_pil.save(str(output_file.with_suffix(".png"))) 44 | 45 | 46 | if __name__ == "__main__": 47 | import argparse 48 | 49 | parser = argparse.ArgumentParser(description='Run inference on an image using a SageMaker endpoint') 50 | parser.add_argument('--img_path', type=str, help='path to the input image') 51 | parser.add_argument('--endpoint_name', type=str, help='name of the SageMaker endpoint to use') 52 | parser.add_argument('--output_path', type=str, default='predictions', help='path to save the output predictions') 53 | 54 | args = parser.parse_args() 55 | 56 | endpoint_prediction(args.img_path, args.endpoint_name, args.output_path) 57 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/evaluate.py: -------------------------------------------------------------------------------- 1 | from 
pathlib import Path 2 | 3 | import numpy as np 4 | from box import ConfigBox 5 | from dvclive import Live 6 | from fastai.vision.all import get_files, load_learner 7 | from PIL import Image 8 | from ruamel.yaml import YAML 9 | 10 | 11 | yaml = YAML(typ="safe") 12 | 13 | 14 | def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6): 15 | dice_list = [] 16 | for c in classes: 17 | y_true = mask_true == c 18 | y_pred = mask_pred == c 19 | intersection = 2.0 * np.sum(y_true * y_pred) 20 | dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps) 21 | dice_list.append(dice) 22 | return np.mean(dice_list) 23 | 24 | 25 | def paint_mask(mask, color_map={0: (0, 0, 0), 1: (0, 0, 255)}): 26 | vis_shape = mask.shape + (3,) 27 | vis = np.zeros(vis_shape) 28 | for i, c in color_map.items(): 29 | vis[mask == i] = color_map[i] 30 | return Image.fromarray(vis.astype(np.uint8)) 31 | 32 | 33 | def stack_images(im1, im2): 34 | dst = Image.new("RGB", (im1.width + im2.width, im1.height)) 35 | dst.paste(im1, (0, 0)) 36 | dst.paste(im2, (im1.width, 0)) 37 | return dst 38 | 39 | 40 | def get_overlay_image(img_fpath, mask_true, mask_pred): 41 | img_pil = Image.open(img_fpath) 42 | overlay_img_true = Image.blend( 43 | img_pil.convert("RGBA"), paint_mask(mask_true).convert("RGBA"), 0.5 44 | ) 45 | 46 | new_color_map = { 47 | 0: (0, 0, 0), # no color - TN 48 | 1: (255, 0, 255), # purple - FN 49 | 2: (255, 255, 0), # yellow - FP 50 | 3: (0, 0, 255), # blue - TP 51 | } 52 | combined_mask = mask_true + 2 * mask_pred 53 | 54 | overlay_img_pred = Image.blend( 55 | img_pil.convert("RGBA"), 56 | paint_mask(combined_mask, color_map=new_color_map).convert("RGBA"), 57 | 0.5, 58 | ) 59 | stacked_image = stack_images(overlay_img_true, overlay_img_pred) 60 | return stacked_image 61 | 62 | 63 | def get_mask_path(x, train_data_dir): 64 | return Path(train_data_dir) / f"{Path(x).stem}.png" 65 | 66 | 67 | def evaluate(): 68 | params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) 69 | model_fpath = Path("models") / "model.pkl" 70 | learn = load_learner(model_fpath, cpu=False) 71 | test_img_fpaths = sorted(get_files(Path("data") / "test_data", extensions=".jpg")) 72 | test_dl = learn.dls.test_dl(test_img_fpaths) 73 | preds, _ = learn.get_preds(dl=test_dl) 74 | masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8) 75 | test_mask_fpaths = [ 76 | get_mask_path(fpath, Path("data") / "test_data") for fpath in test_img_fpaths 77 | ] 78 | masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths] 79 | with Live("results/evaluate") as live: 80 | dice_multi = 0.0 81 | for ii in range(len(masks_true)): 82 | mask_pred, mask_true = masks_pred[ii], masks_true[ii] 83 | mask_pred = np.array( 84 | Image.fromarray(mask_pred).resize((mask_true.shape[1], mask_true.shape[0])), 85 | dtype=int 86 | ) 87 | mask_true = np.array(mask_true, dtype=int) 88 | dice_multi += dice(mask_true, mask_pred) / len(masks_true) 89 | 90 | if ii < params.evaluate.n_samples_to_save: 91 | stacked_image = get_overlay_image( 92 | test_img_fpaths[ii], mask_true, mask_pred 93 | ) 94 | stacked_image = stacked_image.resize((512, 256)) 95 | live.log_image(f"{Path(test_img_fpaths[ii]).stem}.png", stacked_image) 96 | 97 | live.summary["dice_multi"] = dice_multi 98 | 99 | 100 | if __name__ == "__main__": 101 | evaluate() 102 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/train.py: -------------------------------------------------------------------------------- 1 | import 
random 2 | from functools import partial 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from box import ConfigBox 8 | from dvclive import Live 9 | from dvclive.fastai import DVCLiveCallback 10 | from fastai.data.all import Normalize, get_files 11 | from fastai.metrics import DiceMulti 12 | from fastai.vision.all import ( 13 | Resize, 14 | SegmentationDataLoaders, 15 | imagenet_stats, 16 | models, 17 | unet_learner, 18 | ) 19 | from ruamel.yaml import YAML 20 | 21 | yaml = YAML(typ="safe") 22 | 23 | 24 | def get_mask_path(x, train_data_dir): 25 | return Path(train_data_dir) / f"{Path(x).stem}.png" 26 | 27 | 28 | def train(): 29 | params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) 30 | 31 | np.random.seed(params.base.random_seed) 32 | torch.manual_seed(params.base.random_seed) 33 | random.seed(params.base.random_seed) 34 | train_data_dir = Path("data") / "train_data" 35 | 36 | data_loader = SegmentationDataLoaders.from_label_func( 37 | path=train_data_dir, 38 | fnames=get_files(train_data_dir, extensions=".jpg"), 39 | label_func=partial(get_mask_path, train_data_dir=train_data_dir), 40 | codes=["not-pool", "pool"], 41 | bs=params.train.batch_size, 42 | valid_pct=params.train.valid_pct, 43 | item_tfms=Resize(params.train.img_size), 44 | batch_tfms=[ 45 | Normalize.from_stats(*imagenet_stats), 46 | ], 47 | ) 48 | 49 | model_names = [ 50 | name 51 | for name in dir(models) 52 | if not name.startswith("_") 53 | and name.islower() 54 | and name not in ("all", "tvm", "unet", "xresnet") 55 | ] 56 | if params.train.arch not in model_names: 57 | raise ValueError(f"Unsupported model, must be one of:\n{model_names}") 58 | 59 | with Live("results/train") as live: 60 | learn = unet_learner( 61 | data_loader, arch=getattr(models, params.train.arch), metrics=DiceMulti 62 | ) 63 | 64 | learn.fine_tune( 65 | **params.train.fine_tune_args, 66 | cbs=[DVCLiveCallback(live=live)], 67 | ) 68 | models_dir = Path("models") 69 | models_dir.mkdir(exist_ok=True) 70 | learn.export(fname=(models_dir / "model.pkl").absolute()) 71 | torch.save(learn.model, (models_dir / "model.pth").absolute()) 72 | live.log_artifact( 73 | str(models_dir / "model.pkl"), 74 | type="model", 75 | name="pool-segmentation", 76 | desc="This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.", 77 | labels=["cv", "segmentation", "satellite-images", params.train.arch], 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | train() 83 | -------------------------------------------------------------------------------- /example-get-started-experiments/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script env: 4 | # e Exit immediately if a command exits with a non-zero exit status. 5 | # u Treat unset variables as an error when substituting. 6 | # x Print commands and their arguments as they are executed. 7 | set -eux 8 | HERE="$( cd "$(dirname "$0")" ; pwd -P )" 9 | REPO_NAME="example-get-started-experiments" 10 | REPO_PATH="$HERE/build/$REPO_NAME" 11 | PROD=${1:-false} 12 | 13 | if [ -d "$REPO_PATH" ]; then 14 | echo "Repo $REPO_PATH already exists, please remove it first." 
15 | exit 1 16 | fi 17 | 18 | TOTAL_TAGS=8 19 | STEP_TIME=100000 20 | BEGIN_TIME=$(( $(date +%s) - ( ${TOTAL_TAGS} * ${STEP_TIME}) )) 21 | export TAG_TIME=${BEGIN_TIME} 22 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 23 | tick(){ 24 | export TAG_TIME=$(( ${TAG_TIME} + ${STEP_TIME} )) 25 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 26 | } 27 | 28 | export GIT_AUTHOR_NAME="Alex Kim" 29 | export GIT_AUTHOR_EMAIL="alex000kim@gmail.com" 30 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 31 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 32 | 33 | mkdir -p $REPO_PATH 34 | pushd $REPO_PATH 35 | 36 | virtualenv -p python3 .venv 37 | export VIRTUAL_ENV_DISABLE_PROMPT=true 38 | source .venv/bin/activate 39 | echo '.venv/' > .gitignore 40 | 41 | # Installing from main since we'd like to update repo before 42 | # the release 43 | pip install "git+https://github.com/iterative/dvc#egg=dvc[s3]" gto 44 | 45 | git init 46 | cp $HERE/code/README.md . 47 | cp $HERE/code/.devcontainer.json . 48 | cp $HERE/code/.gitattributes . 49 | cp $HERE/code/.gitlab-ci.yml . 50 | cp $HERE/code/requirements.txt . 51 | cp -r $HERE/code/.github . 52 | git add . 53 | tick 54 | git commit -m "Initialize Git repository" 55 | git branch -M main 56 | 57 | 58 | dvc init 59 | # Remote active on this env only, for writing to HTTP redirect below. 60 | dvc remote add -d --local storage s3://dvc-public/remote/get-started-pools 61 | # Actual remote for generated project (read-only). Redirect of S3 bucket above. 62 | dvc remote add -d storage https://remote.dvc.org/get-started-pools 63 | git add . 64 | tick 65 | git commit -m "Initialize DVC project" 66 | 67 | 68 | cp -r $HERE/code/data . 69 | git add data/.gitignore data/pool_data.dvc 70 | tick 71 | git commit -m "Add data" 72 | dvc pull 73 | 74 | 75 | cp -r $HERE/code/notebooks . 76 | git add . 77 | git commit -m "Add notebook using DVCLive" 78 | 79 | pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu118 80 | pip install jupyter 81 | jupyter nbconvert --execute 'notebooks/TrainSegModel.ipynb' --inplace 82 | # Apply best experiment 83 | BEST_EXP_ROW=$(dvc exp show --drop '.*' --keep 'Experiment|results/evaluate/metrics.json:dice_multi|base_lr' --csv --sort-by 'results/evaluate/metrics.json:dice_multi' | tail -n 1) 84 | BEST_EXP_NAME=$(echo $BEST_EXP_ROW | cut -d, -f 1) 85 | BEST_EXP_BASE_LR=$(echo $BEST_EXP_ROW | cut -d, -f 3) 86 | dvc exp apply $BEST_EXP_NAME 87 | git add . 88 | tick 89 | git commit -m "Run notebook and apply best experiment" 90 | git tag -a "1-notebook-dvclive" -m "Experiment using Notebook" 91 | 92 | 93 | cp -r $HERE/code/src . 94 | cp -r $HERE/code/sagemaker . 95 | cp $HERE/code/params.yaml . 
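# Carry the best base_lr found by the notebook experiments above into the freshly
# copied params.yaml, so the dvc.yaml pipeline built below starts from the applied
# experiment's value.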
96 | sed -e "s/base_lr: 0.01/base_lr: $BEST_EXP_BASE_LR/" -i".bkp" params.yaml 97 | rm params.yaml.bkp 98 | 99 | git rm -r --cached 'results' 'models' 100 | git commit -m "stop tracking results" 101 | 102 | dvc stage add -n data_split \ 103 | -p base,data_split \ 104 | -d src/data_split.py -d data/pool_data \ 105 | -o data/train_data -o data/test_data \ 106 | python src/data_split.py 107 | 108 | dvc remove models/model.pkl.dvc 109 | dvc stage add -n train \ 110 | -p base,train \ 111 | -d src/train.py -d data/train_data \ 112 | -o models/model.pkl -o models/model.pth \ 113 | -o results/train python src/train.py 114 | 115 | dvc stage add -n evaluate \ 116 | -p base,evaluate \ 117 | -d src/evaluate.py -d models/model.pkl -d data/test_data \ 118 | -o results/evaluate python src/evaluate.py 119 | 120 | dvc stage add -n sagemaker \ 121 | -d models/model.pth -o model.tar.gz \ 122 | 'cp models/model.pth sagemaker/code/model.pth && cd sagemaker && tar -cpzf model.tar.gz code/ && cd .. && mv sagemaker/model.tar.gz . && rm sagemaker/code/model.pth' 123 | 124 | git add . 125 | tick 126 | git commit -m "Convert Notebook to dvc.yaml pipeline" 127 | 128 | dvc exp run 129 | git add . 130 | tick 131 | git commit -m "Run dvc.yaml pipeline" 132 | git tag -a "2-dvc-pipeline" -m "Experiment run using dvc pipeline" 133 | tick 134 | gto register pool-segmentation --version v1.0.0 135 | gto assign pool-segmentation --version v1.0.0 --stage dev 136 | tick 137 | gto assign pool-segmentation --version v1.0.0 --stage prod 138 | gto deprecate pool-segmentation v1.0.0 dev 139 | 140 | 141 | export GIT_AUTHOR_NAME="David de la Iglesia" 142 | export GIT_AUTHOR_EMAIL="daviddelaiglesiacastro@gmail.com" 143 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 144 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 145 | 146 | dvc exp run --queue --set-param 'train.arch=alexnet,resnet34,squeezenet1_1' --message 'Tune train.arch' 147 | dvc exp run --run-all 148 | 149 | dvc push -A 150 | 151 | popd 152 | 153 | unset TAG_TIME 154 | unset GIT_AUTHOR_DATE 155 | unset GIT_COMMITTER_DATE 156 | unset GIT_AUTHOR_NAME 157 | unset GIT_AUTHOR_EMAIL 158 | unset GIT_COMMITTER_NAME 159 | unset GIT_COMMITTER_EMAIL 160 | 161 | cat README.md 162 | -------------------------------------------------------------------------------- /example-get-started/.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | *.zip 3 | /tmp 4 | build/ 5 | -------------------------------------------------------------------------------- /example-get-started/README.md: -------------------------------------------------------------------------------- 1 | A set of scripts to generate an NLP DVC Studio project with multiple branches, 2 | commit history, experiments, metrics, plots, etc. It used in the DVC docs and in 3 | Studio as a demo project. 4 | 5 | This script can be also used in an advanced scenario to generate a nested 6 | mono-repositories that are used as fixtures in Studio testing, or testing 7 | different remote types. See the section below for the advanced settings. 8 | 9 | ## Demo project 10 | 11 | Note! In some cases, before rebuilding the project you might want to delete the 12 | existing remote tags if you change the order, or names. 13 | 14 | ```shell 15 | git clone git@github.com:/example-get-started.git 16 | cd example-get-started 17 | git tag -l | xargs -n 1 git push --delete origin 18 | ``` 19 | 20 | For the basic use case (docs and Studio demo), use the command below. 
21 | 22 | ```shell 23 | ./generate.sh 24 | ``` 25 | 26 | If change source code, to publish it on S3 (needed for the get started tutorial) 27 | pass `prod` to the command. It's needed when you ready to publish it. 28 | 29 | ```shell 30 | ./generate.sh prod 31 | ``` 32 | 33 | The repo generated in `build/example-get-started` is intended to be published on 34 | to the https://github.com/iterative/example-get-started. Make sure the Github 35 | repo exists first and that you have appropriate write permissions. 36 | 37 | To create it with https://cli.github.com/, run: 38 | 39 | ```shell 40 | gh repo create iterative/example-get-started --public \ 41 | -d "Get Started DVC project" -h "https://dvc.org/doc/get-started" 42 | ``` 43 | 44 | Run these commands to force push it: 45 | 46 | ```shell 47 | cd build/example-get-started 48 | git remote add origin git@github.com:/example-get-started.git 49 | # close open PRs 50 | gh pr close try-large-dataset 51 | gh pr close tune-hyperparams 52 | # remove existing tags, branches, experiments 53 | git ls-remote origin | awk '{print $2}' | xargs -n 1 git push --delete origin || true 54 | # force push branches 55 | git push --force origin main 56 | git push --force origin try-large-dataset 57 | git push --force origin tune-hyperparams 58 | # we push git tags one by one for Studio to receive webhooks: 59 | git tag --sort=creatordate | xargs -n 1 git push --force origin 60 | ``` 61 | 62 | Run these to drop and then rewrite the experiment references on the repo: 63 | 64 | ```shell 65 | source .venv/bin/activate 66 | dvc exp remove -A -g origin 67 | dvc exp push origin -A 68 | ``` 69 | 70 | To create a PR from the `try-large-dataset` branch: 71 | 72 | ```shell 73 | gh pr create -t "Try 40K dataset (4x data)" \ 74 | -b "We are trying here a large dataset, since the smaller one looks unstable" \ 75 | -B main -H try-large-dataset 76 | ``` 77 | 78 | To create a PR from the `tune-hyperparams` branch: 79 | 80 | ```shell 81 | gh pr create -t "Run experiments tuning random forest params" \ 82 | -b "Better RF split and number of estimators based on small grid search." \ 83 | -B main -H tune-hyperparams 84 | ``` 85 | 86 | Finally, return to the directory where you started: 87 | 88 | ```shell 89 | cd ../.. 90 | ``` 91 | 92 | You may remove the generated repo with: 93 | 94 | ```shell 95 | rm -fR build/example-get-started 96 | ``` 97 | 98 | To update the project in Studio, follow the instructions at: 99 | 100 | https://github.com/iterative/studio/wiki/Updating-and-synchronizing-demo-project 101 | 102 | 103 | ## Advanced usage 104 | 105 | Inside the script there a few options that could help generating advanced nested 106 | repositories and/or use different remote types. 107 | 108 | - `OPT_TESTING_REPO='false'` - (default `false`). Set to true to generate a 109 | fixture repo or a testing repo. It generates a `README` in those repos that 110 | has the dump of all the settings that were used to generate them. This way it 111 | can be reproduced next time. 112 | - `OPT_SUBDIR=''` - (default `''`). No leading or trailing slashes. If specified 113 | the new repo will be created inside the 114 | `build/example-get-started/$OPT_SUBDIR` path. 115 | - `OPT_INIT_GIT='true'` - (default `true`). Whether to run or not `git init`. If 116 | there is already initialized Git repo in place we don't need to run it again. 117 | Usually needed if you are generating a nested repo. 118 | - `OPT_INIT_DVC='true'` - (default `true`). Whether to run or not 119 | `dvc init` in the generated directory. 
If it's nested directory `--subdir` is 120 | added. 121 | - `OPT_NON_DVC='false'` - (default `false`). To generate a non DVC repo with 122 | some sources, basic params, and metrics. To test non DVC root, or custom 123 | metrics, etc. 124 | - `OPT_BRANCHES='true'` - (default `true`). Whether we need to generate 125 | branches (bigger dataset, etc). It supports nested repos - branch names will 126 | have prefixes or suffixes to distinguish them. 127 | - `OPT_REMOTE="public-s3"` - (default `private-s3`). Other options: `public-s3`, 128 | `private-http`, `private-ssh`, `private-gdrive`, etc. 129 | - `OPT_DVC_TRACKED_METRICS='true'` - (default `true`). Either we should use 130 | DVC to also track all metric and plot files (e.g. to test that Studio can get 131 | plots from the remote storage). 132 | - `OPT_REGISTER_MODELS='false'` - (default `true`). Use the `gto` to register 133 | models. It supports nested repos. 134 | - `OPT_TAGS='true'` - (default `true`). Generate Git tags for commits. 135 | Independent of `OPT_REGISTER_MODELS` and `OPT_TAG_MODELS`. 136 | - `OPT_SQUASH_COMMITS='false'` - (default `false`). Squash commits into one 137 | after generating a repo or a sub repo. It speedups parsing in tests. Be 138 | careful with Git tags (disable them for example). 139 | - `OPT_TAG_MODELS='true'` - (default `true`). Creates Git tags using GTO. 140 | Independent of `OPT_REGISTER_MODELS` and `OPT_TAGS`. 141 | - `OPT_MODEL_NAME='text-classification'` - (default `text-classification`). 142 | Model name to register. 143 | 144 | ## Remotes 145 | 146 | A variety of remotes could be used to generated different repositories to test 147 | private storage credentials in Studio (manually or via CI). 148 | 149 | `OPT_REMOTE` takes different values (see above or in the `generate.sh`). 150 | 151 | For SSH and HTTP remotes we use a machine that is deployed in GCP with IP 152 | address http://35.194.53.251. Credentials for both could be found in this Slack 153 | [thread](https://iterativeai.slack.com/archives/CUSNDR35K/p1595393188054200). 154 | You might need to change a path to SSH key in the script. HTTP remote doesn't 155 | support PUT/POST so we use SSH to upload data there. 
156 | -------------------------------------------------------------------------------- /example-get-started/code/.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example-get-started", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.10", 4 | "extensions": ["Iterative.dvc", "ms-python.python", "redhat.vscode-yaml"], 5 | "features": { 6 | "ghcr.io/iterative/features/dvc:1": {} 7 | }, 8 | "postCreateCommand": "pip3 install --user -r src/requirements.txt" 9 | } 10 | -------------------------------------------------------------------------------- /example-get-started/code/.gitattributes: -------------------------------------------------------------------------------- 1 | *.dvc linguist-language=YAML 2 | dvc.lock linguist-language=YAML 3 | -------------------------------------------------------------------------------- /example-get-started/code/.github/workflows/cml.yaml: -------------------------------------------------------------------------------- 1 | name: CML Report 2 | on: pull_request 3 | jobs: 4 | run: 5 | runs-on: [ubuntu-latest] 6 | steps: 7 | - uses: iterative/setup-cml@v2 8 | - uses: iterative/setup-dvc@v1 9 | - uses: actions/checkout@v3 10 | with: 11 | fetch-depth: 2 12 | # Needed for https://github.com/iterative/example-repos-dev/issues/225 13 | - name: Installs JSON5 14 | run: npm install -g json5 15 | - name: Generate metrics report 16 | env: 17 | REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | run: | 19 | cml ci 20 | if [ $GITHUB_REF = refs/heads/main ]; then 21 | PREVIOUS_REF=HEAD~1 22 | else 23 | PREVIOUS_REF=main 24 | git fetch origin main:main 25 | fi 26 | 27 | dvc pull eval 28 | dvc plots diff $PREVIOUS_REF workspace \ 29 | --show-vega --targets ROC | json5 > vega.json 30 | vl2svg vega.json roc.svg 31 | 32 | dvc plots diff $PREVIOUS_REF workspace \ 33 | --show-vega --targets Precision-Recall | json5 > vega.json 34 | vl2svg vega.json prc.svg 35 | 36 | dvc plots diff $PREVIOUS_REF workspace \ 37 | --show-vega --targets Confusion-Matrix | json5 > vega.json 38 | vl2svg vega.json confusion.svg 39 | 40 | cp eval/plots/images/importance.png importance_workspace.png 41 | 42 | git checkout $PREVIOUS_REF -- dvc.lock 43 | cp eval/plots/images/importance.png importance_previous.png 44 | 45 | dvc_report=$(dvc exp diff $PREVIOUS_REF --md) 46 | 47 | cat < report.md 48 | # CML Report 49 | ## Plots 50 | ![ROC](./roc.svg) 51 | ![Precision-Recall](./prc.svg) 52 | ![Confusion Matrix](./confusion.svg) 53 | #### Feature Importance: ${PREVIOUS_REF} 54 | ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png) 55 | #### Feature Importance: workspace 56 | ![Feature Importance: workspace](./importance_workspace.png) 57 | 58 | ## Metrics and Params 59 | ### ${PREVIOUS_REF} → workspace 60 | ${dvc_report} 61 | EOF 62 | 63 | cml comment create --publish --pr=false report.md 64 | -------------------------------------------------------------------------------- /example-get-started/code/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | report: 2 | rules: 3 | - if: $CI_PIPELINE_SOURCE == 'merge_request_event' 4 | - if: $CI_COMMIT_BRANCH == 'main' 5 | image: dvcorg/cml:0-dvc3-base1 6 | before_script: 7 | - cml ci && cml --version 8 | - npm install -g json5 9 | script: | 10 | if [ $CI_COMMIT_REF_NAME = main ]; then 11 | PREVIOUS_REF=HEAD~1 12 | COMMIT_HASH1=$CI_COMMIT_BEFORE_SHA 13 | COMMIT_HASH2=$CI_COMMIT_SHA 14 | else 15 | PREVIOUS_REF=main 16 | git fetch --depth=1 origin 
main:main 17 | COMMIT_HASH1=$CI_MERGE_REQUEST_DIFF_BASE_SHA 18 | COMMIT_HASH2=$CI_COMMIT_SHA 19 | fi 20 | 21 | dvc pull eval 22 | dvc plots diff $PREVIOUS_REF workspace \ 23 | --show-vega --targets ROC | json5 > vega.json 24 | vl2svg vega.json roc.svg 25 | 26 | dvc plots diff $PREVIOUS_REF workspace \ 27 | --show-vega --targets Precision-Recall | json5 > vega.json 28 | vl2svg vega.json prc.svg 29 | 30 | dvc plots diff $PREVIOUS_REF workspace \ 31 | --show-vega --targets Confusion-Matrix | json5 > vega.json 32 | vl2svg vega.json confusion.svg 33 | 34 | cp eval/plots/images/importance.png importance_workspace.png 35 | 36 | git checkout $PREVIOUS_REF -- dvc.lock 37 | cp eval/plots/images/importance.png importance_previous.png 38 | 39 | dvc_report=$(dvc exp diff $PREVIOUS_REF --md) 40 | 41 | cat < report.md 42 | # CML Report 43 | [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/views/example-get-started-2gpv7kdqx2?panels=plots%2C%3Bcompare%2C&commits=${COMMIT_HASH2}%3B${COMMIT_HASH1}&activeCommits=${COMMIT_HASH1}%3Aprimary%3B${COMMIT_HASH2}%3Apurple) 44 | ## Plots 45 | ![ROC](./roc.svg) 46 | ![Precision-Recall](./prc.svg) 47 | ![Confusion Matrix](./confusion.svg) 48 | #### Feature Importance: ${PREVIOUS_REF} 49 | ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png) 50 | #### Feature Importance: workspace 51 | ![Feature Importance: workspace](./importance_workspace.png) 52 | 53 | ## Metrics and Params 54 | ### ${PREVIOUS_REF} → workspace 55 | ${dvc_report} 56 | EOF 57 | 58 | if [ $CI_COMMIT_REF_NAME = main ]; then 59 | cml comment create --target=commit report.md 60 | else 61 | cml comment update --target=pr report.md 62 | fi 63 | -------------------------------------------------------------------------------- /example-get-started/code/README.md: -------------------------------------------------------------------------------- 1 | [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/views/example-get-started-zde16i6c4g) 2 | 3 | # DVC Get Started 4 | 5 | This is an auto-generated repository for use in [DVC](https://dvc.org) 6 | [Get Started](https://dvc.org/doc/get-started). It is a step-by-step quick 7 | introduction into basic DVC concepts. 8 | 9 | ![](https://static.iterative.ai/img/example-get-started/readme-head.png) 10 | 11 | The project is a natural language processing (NLP) binary classifier problem of 12 | predicting tags for a given StackOverflow question. For example, we want one 13 | classifier which can predict a post that is about the R language by tagging it 14 | `R`. 15 | 16 | 🐛 Please report any issues found in this project here - 17 | [example-repos-dev](https://github.com/iterative/example-repos-dev). 18 | 19 | ## Installation 20 | 21 | Python 3.9+ is required to run code from this repo. 22 | 23 | ```console 24 | $ git clone https://github.com/iterative/example-get-started 25 | $ cd example-get-started 26 | ``` 27 | 28 | Now let's install the requirements. But before we do that, we **strongly** 29 | recommend creating a virtual environment with a tool such as 30 | [virtualenv](https://virtualenv.pypa.io/en/stable/): 31 | 32 | ```console 33 | $ virtualenv -p python3 .venv 34 | $ source .venv/bin/activate 35 | $ pip install -r src/requirements.txt 36 | ``` 37 | 38 | > This instruction assumes that DVC is already installed, as it is frequently 39 | > used as a global tool like Git. 
If DVC is not installed, see the 40 | > [DVC installation guide](https://dvc.org/doc/install) on how to install DVC. 41 | 42 | This DVC project comes with a preconfigured DVC 43 | [remote storage](https://dvc.org/doc/commands-reference/remote) that holds raw 44 | data (input), intermediate, and final results that are produced. This is a 45 | read-only HTTP remote. 46 | 47 | ```console 48 | $ dvc remote list 49 | storage https://remote.dvc.org/get-started 50 | ``` 51 | 52 | You can run [`dvc pull`](https://man.dvc.org/pull) to download the data: 53 | 54 | ```console 55 | $ dvc pull 56 | ``` 57 | 58 | ## Running in your environment 59 | 60 | Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the 61 | [pipeline](https://dvc.org/doc/user-guide/pipelines) and create a new 62 | [experiment](https://dvc.org/doc/user-guide/experiment-management). 63 | 64 | ```console 65 | $ dvc exp run 66 | Ran experiment(s): rapid-cane 67 | Experiment results have been applied to your workspace. 68 | ``` 69 | 70 | If you'd like to test commands like [`dvc push`](https://man.dvc.org/push), 71 | that require write access to the remote storage, the easiest way would be to set 72 | up a "local remote" on your file system: 73 | 74 | > This kind of remote is located in the local file system, but is external to 75 | > the DVC project. 76 | 77 | ```console 78 | $ mkdir -p /tmp/dvc-storage 79 | $ dvc remote add local /tmp/dvc-storage 80 | ``` 81 | 82 | You should now be able to run: 83 | 84 | ```console 85 | $ dvc push -r local 86 | ``` 87 | 88 | ## Existing stages 89 | 90 | This project with the help of the Git tags reflects the sequence of actions that 91 | are run in the DVC [get started](https://dvc.org/doc/get-started) guide. Feel 92 | free to checkout one of them and play with the DVC commands having the 93 | playground ready. 94 | 95 | - `0-git-init`: Empty Git repository initialized. 96 | - `1-dvc-init`: DVC has been initialized. `.dvc/` with the cache directory 97 | created. 98 | - `2-track-data`: Raw data file `data.xml` downloaded and tracked with DVC using 99 | [`dvc add`](https://man.dvc.org/add). First `.dvc` file created. 100 | - `3-config-remote`: Remote HTTP storage initialized. It's a shared read only 101 | storage that contains all data artifacts produced during next steps. 102 | - `4-import-data`: Use `dvc import` to get the same `data.xml` from the DVC data 103 | registry. 104 | - `5-source-code`: Source code downloaded and put into Git. 105 | - `6-prepare-stage`: Create `dvc.yaml` and the first pipeline stage with 106 | [`dvc run`](https://man.dvc.org/run). It transforms XML data into TSV. 107 | - `7-ml-pipeline`: Feature extraction and train stages created. It takes data in 108 | TSV format and produces two `.pkl` files that contain serialized feature 109 | matrices. Train runs random forest classifier and creates the `model.pkl` file. 110 | - `8-evaluation`: Evaluation stage. Runs the model on a test dataset to produce 111 | its performance AUC value. The result is dumped into a DVC metric file so that 112 | we can compare it with other experiments later. 113 | - `9-bigrams-model`: Bigrams experiment, code has been modified to extract more 114 | features. We run [`dvc repro`](https://man.dvc.org/repro) for the first time 115 | to illustrate how DVC can reuse cached files and detect changes along the 116 | computational graph, regenerating the model with the updated data. 117 | - `10-bigrams-experiment`: Reproduce the evaluation stage with the bigrams based 118 | model. 
119 | - `11-random-forest-experiments`: Reproduce experiments to tune the random 120 | forest classifier parameters and select the best experiment. 121 | 122 | There are three additional tags: 123 | 124 | - `baseline-experiment`: First end-to-end result that we have performance metric 125 | for. 126 | - `bigrams-experiment`: Second experiment (model trained using bigrams 127 | features). 128 | - `random-forest-experiments`: Best of additional experiments tuning random 129 | forest parameters. 130 | 131 | These tags can be used to illustrate `-a` or `-T` options across different 132 | [DVC commands](https://man.dvc.org/). 133 | 134 | ## Project structure 135 | 136 | The data files, DVC files, and results change as stages are created one by one. 137 | After cloning and using [`dvc pull`](https://man.dvc.org/pull) to download 138 | data, models, and plots tracked by DVC, the workspace should look like this: 139 | 140 | ```console 141 | $ tree 142 | . 143 | ├── README.md 144 | ├── data # <-- Directory with raw and intermediate data 145 | │   ├── data.xml # <-- Initial XML StackOverflow dataset (raw data) 146 | │   ├── data.xml.dvc # <-- .dvc file - a placeholder/pointer to raw data 147 | │   ├── features # <-- Extracted feature matrices 148 | │   │   ├── test.pkl 149 | │   │   └── train.pkl 150 | │   └── prepared # <-- Processed dataset (split and TSV formatted) 151 | │   ├── test.tsv 152 | │   └── train.tsv 153 | ├── dvc.lock 154 | ├── dvc.yaml # <-- DVC pipeline file 155 | ├── eval 156 | │   ├── metrics.json # <-- Binary classifier final metrics (e.g. AUC) 157 | │   └── plots 158 | │   ├── images 159 | │   │   └── importance.png # <-- Feature importance plot 160 | │   └── sklearn # <-- Data points for ROC, confusion matrix 161 | │   ├── cm 162 | │   │   ├── test.json 163 | │   │   └── train.json 164 | │   ├── prc 165 | │   │   ├── test.json 166 | │   │   └── train.json 167 | │   └── roc 168 | │   ├── test.json 169 | │   └── train.json 170 | ├── model.pkl # <-- Trained model file 171 | ├── params.yaml # <-- Parameters file 172 | └── src # <-- Source code to run the pipeline stages 173 | ├── evaluate.py 174 | ├── featurization.py 175 | ├── prepare.py 176 | ├── requirements.txt # <-- Python dependencies needed in the project 177 | └── train.py 178 | ``` 179 | -------------------------------------------------------------------------------- /example-get-started/code/params.yaml: -------------------------------------------------------------------------------- 1 | prepare: 2 | split: 0.20 3 | seed: 20170428 4 | 5 | featurize: 6 | max_features: 100 7 | ngrams: 1 8 | 9 | train: 10 | seed: 20170428 11 | n_est: 50 12 | min_split: 0.01 13 | 14 | -------------------------------------------------------------------------------- /example-get-started/code/src/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import pickle 5 | import sys 6 | 7 | import pandas as pd 8 | from sklearn import metrics 9 | from sklearn import tree 10 | from dvclive import Live 11 | from matplotlib import pyplot as plt 12 | 13 | 14 | def evaluate(model, matrix, split, live, save_path): 15 | """ 16 | Dump all evaluation metrics and plots for given datasets. 17 | 18 | Args: 19 | model (sklearn.ensemble.RandomForestClassifier): Trained classifier. 20 | matrix (scipy.sparse.csr_matrix): Input matrix. 21 | split (str): Dataset name. 22 | live (dvclive.Live): Dvclive instance. 23 | save_path (str): Path to save the metrics. 
24 | """ 25 | labels = matrix[:, 1].toarray().astype(int) 26 | x = matrix[:, 2:] 27 | 28 | predictions_by_class = model.predict_proba(x) 29 | predictions = predictions_by_class[:, 1] 30 | 31 | # Use dvclive to log a few simple metrics... 32 | avg_prec = metrics.average_precision_score(labels, predictions) 33 | roc_auc = metrics.roc_auc_score(labels, predictions) 34 | if not live.summary: 35 | live.summary = {"avg_prec": {}, "roc_auc": {}} 36 | live.summary["avg_prec"][split] = avg_prec 37 | live.summary["roc_auc"][split] = roc_auc 38 | 39 | # ... and plots... 40 | # ... like an roc plot... 41 | live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}") 42 | # ... and precision recall plot... 43 | # ... which passes `drop_intermediate=True` to the sklearn method... 44 | live.log_sklearn_plot( 45 | "precision_recall", 46 | labels, 47 | predictions, 48 | name=f"prc/{split}", 49 | drop_intermediate=True, 50 | ) 51 | # ... and confusion matrix plot 52 | live.log_sklearn_plot( 53 | "confusion_matrix", 54 | labels.squeeze(), 55 | predictions_by_class.argmax(-1), 56 | name=f"cm/{split}", 57 | ) 58 | 59 | 60 | def save_importance_plot(live, model, feature_names): 61 | """ 62 | Save feature importance plot. 63 | 64 | Args: 65 | live (dvclive.Live): DVCLive instance. 66 | model (sklearn.ensemble.RandomForestClassifier): Trained classifier. 67 | feature_names (list): List of feature names. 68 | """ 69 | fig, axes = plt.subplots(dpi=100) 70 | fig.subplots_adjust(bottom=0.2, top=0.95) 71 | axes.set_ylabel("Mean decrease in impurity") 72 | 73 | importances = model.feature_importances_ 74 | forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30) 75 | forest_importances.plot.bar(ax=axes) 76 | 77 | live.log_image("importance.png", fig) 78 | 79 | 80 | def main(): 81 | EVAL_PATH = "eval" 82 | 83 | if len(sys.argv) != 3: 84 | sys.stderr.write("Arguments error. Usage:\n") 85 | sys.stderr.write("\tpython evaluate.py model features\n") 86 | sys.exit(1) 87 | 88 | model_file = sys.argv[1] 89 | train_file = os.path.join(sys.argv[2], "train.pkl") 90 | test_file = os.path.join(sys.argv[2], "test.pkl") 91 | 92 | # Load model and data. 93 | with open(model_file, "rb") as fd: 94 | model = pickle.load(fd) 95 | 96 | with open(train_file, "rb") as fd: 97 | train, feature_names = pickle.load(fd) 98 | 99 | with open(test_file, "rb") as fd: 100 | test, _ = pickle.load(fd) 101 | 102 | # Evaluate train and test datasets. 103 | with Live(EVAL_PATH) as live: 104 | evaluate(model, train, "train", live, save_path=EVAL_PATH) 105 | evaluate(model, test, "test", live, save_path=EVAL_PATH) 106 | 107 | # Dump feature importance plot. 
108 | save_importance_plot(live, model, feature_names) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /example-get-started/code/src/featurization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.sparse as sparse 8 | import yaml 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 10 | 11 | 12 | def get_df(data): 13 | """Read the input data file and return a data frame.""" 14 | df = pd.read_csv( 15 | data, 16 | encoding="utf-8", 17 | header=None, 18 | delimiter="\t", 19 | names=["id", "label", "text"], 20 | ) 21 | sys.stderr.write(f"The input data frame {data} size is {df.shape}\n") 22 | return df 23 | 24 | 25 | def save_matrix(df, matrix, names, output): 26 | """ 27 | Save the matrix to a pickle file. 28 | 29 | Args: 30 | df (pandas.DataFrame): Input data frame. 31 | matrix (scipy.sparse.csr_matrix): Input matrix. 32 | names (list): List of feature names. 33 | output (str): Output file name. 34 | """ 35 | id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T 36 | label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T 37 | 38 | result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr") 39 | 40 | msg = "The output matrix {} size is {} and data type is {}\n" 41 | sys.stderr.write(msg.format(output, result.shape, result.dtype)) 42 | 43 | with open(output, "wb") as fd: 44 | pickle.dump((result, names), fd) 45 | pass 46 | 47 | 48 | def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf): 49 | """ 50 | Generate train feature matrix. 51 | 52 | Args: 53 | train_input (str): Train input file name. 54 | train_output (str): Train output file name. 55 | bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words. 56 | tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer. 57 | """ 58 | df_train = get_df(train_input) 59 | train_words = np.array(df_train.text.str.lower().values) 60 | 61 | bag_of_words.fit(train_words) 62 | 63 | train_words_binary_matrix = bag_of_words.transform(train_words) 64 | feature_names = bag_of_words.get_feature_names_out() 65 | 66 | tfidf.fit(train_words_binary_matrix) 67 | train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix) 68 | 69 | save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output) 70 | 71 | 72 | def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf): 73 | """ 74 | Generate test feature matrix. 75 | 76 | Args: 77 | test_input (str): Test input file name. 78 | test_output (str): Test output file name. 79 | bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words. 80 | tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer. 
81 | """ 82 | df_test = get_df(test_input) 83 | test_words = np.array(df_test.text.str.lower().values) 84 | 85 | test_words_binary_matrix = bag_of_words.transform(test_words) 86 | test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix) 87 | feature_names = bag_of_words.get_feature_names_out() 88 | 89 | save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output) 90 | 91 | 92 | def main(): 93 | params = yaml.safe_load(open("params.yaml"))["featurize"] 94 | 95 | np.set_printoptions(suppress=True) 96 | 97 | if len(sys.argv) != 3 and len(sys.argv) != 5: 98 | sys.stderr.write("Arguments error. Usage:\n") 99 | sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n") 100 | sys.exit(1) 101 | 102 | in_path = sys.argv[1] 103 | out_path = sys.argv[2] 104 | 105 | train_input = os.path.join(in_path, "train.tsv") 106 | test_input = os.path.join(in_path, "test.tsv") 107 | train_output = os.path.join(out_path, "train.pkl") 108 | test_output = os.path.join(out_path, "test.pkl") 109 | 110 | max_features = params["max_features"] 111 | ngrams = params["ngrams"] 112 | 113 | os.makedirs(out_path, exist_ok=True) 114 | 115 | bag_of_words = CountVectorizer( 116 | stop_words="english", max_features=max_features, ngram_range=(1, ngrams) 117 | ) 118 | tfidf = TfidfTransformer(smooth_idf=False) 119 | 120 | generate_and_save_train_features( 121 | train_input=train_input, 122 | train_output=train_output, 123 | bag_of_words=bag_of_words, 124 | tfidf=tfidf, 125 | ) 126 | 127 | generate_and_save_test_features( 128 | test_input=test_input, 129 | test_output=test_output, 130 | bag_of_words=bag_of_words, 131 | tfidf=tfidf, 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /example-get-started/code/src/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import re 4 | import sys 5 | import xml.etree.ElementTree 6 | 7 | import yaml 8 | 9 | 10 | def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split): 11 | """ 12 | Process the input lines and write the output to the output files. 13 | 14 | Args: 15 | input_lines (list): List of input lines. 16 | fd_out_train (file): Output file for the training data set. 17 | fd_out_test (file): Output file for the test data set. 18 | target_tag (str): Target tag. 19 | split (float): Test data set split ratio. 20 | """ 21 | num = 1 22 | for line in input_lines: 23 | try: 24 | fd_out = fd_out_train if random.random() > split else fd_out_test 25 | attr = xml.etree.ElementTree.fromstring(line).attrib 26 | 27 | pid = attr.get("Id", "") 28 | label = 1 if target_tag in attr.get("Tags", "") else 0 29 | title = re.sub(r"\s+", " ", attr.get("Title", "")).strip() 30 | body = re.sub(r"\s+", " ", attr.get("Body", "")).strip() 31 | text = title + " " + body 32 | 33 | fd_out.write("{}\t{}\t{}\n".format(pid, label, text)) 34 | 35 | num += 1 36 | except Exception as ex: 37 | sys.stderr.write(f"Skipping the broken line {num}: {ex}\n") 38 | 39 | 40 | def main(): 41 | params = yaml.safe_load(open("params.yaml"))["prepare"] 42 | 43 | if len(sys.argv) != 2: 44 | sys.stderr.write("Arguments error. 
Usage:\n") 45 | sys.stderr.write("\tpython prepare.py data-file\n") 46 | sys.exit(1) 47 | 48 | # Test data set split ratio 49 | split = params["split"] 50 | random.seed(params["seed"]) 51 | 52 | input = sys.argv[1] 53 | output_train = os.path.join("data", "prepared", "train.tsv") 54 | output_test = os.path.join("data", "prepared", "test.tsv") 55 | 56 | os.makedirs(os.path.join("data", "prepared"), exist_ok=True) 57 | 58 | input_lines = [] 59 | with open(input) as fd_in: 60 | input_lines = fd_in.readlines() 61 | 62 | fd_out_train = open(output_train, "w", encoding="utf-8") 63 | fd_out_test = open(output_test, "w", encoding="utf-8") 64 | 65 | process_posts( 66 | input_lines=input_lines, 67 | fd_out_train=fd_out_train, 68 | fd_out_test=fd_out_test, 69 | target_tag="", 70 | split=split, 71 | ) 72 | 73 | fd_out_train.close() 74 | fd_out_test.close() 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /example-get-started/code/src/requirements.txt: -------------------------------------------------------------------------------- 1 | dvclive>=3.0 2 | pandas 3 | pyaml 4 | scikit-learn>=1.3 5 | scipy 6 | matplotlib 7 | -------------------------------------------------------------------------------- /example-get-started/code/src/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import numpy as np 6 | import yaml 7 | from sklearn.ensemble import RandomForestClassifier 8 | 9 | 10 | def train(seed, n_est, min_split, matrix): 11 | """ 12 | Train a random forest classifier. 13 | 14 | Args: 15 | seed (int): Random seed. 16 | n_est (int): Number of trees in the forest. 17 | min_split (int): Minimum number of samples required to split an internal node. 18 | matrix (scipy.sparse.csr_matrix): Input matrix. 19 | 20 | Returns: 21 | sklearn.ensemble.RandomForestClassifier: Trained classifier. 22 | """ 23 | labels = np.squeeze(matrix[:, 1].toarray()) 24 | x = matrix[:, 2:] 25 | 26 | sys.stderr.write("Input matrix size {}\n".format(matrix.shape)) 27 | sys.stderr.write("X matrix size {}\n".format(x.shape)) 28 | sys.stderr.write("Y matrix size {}\n".format(labels.shape)) 29 | 30 | clf = RandomForestClassifier( 31 | n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed 32 | ) 33 | 34 | clf.fit(x, labels) 35 | 36 | return clf 37 | 38 | 39 | def main(): 40 | params = yaml.safe_load(open("params.yaml"))["train"] 41 | 42 | if len(sys.argv) != 3: 43 | sys.stderr.write("Arguments error. 
Usage:\n") 44 | sys.stderr.write("\tpython train.py features model\n") 45 | sys.exit(1) 46 | 47 | input = sys.argv[1] 48 | output = sys.argv[2] 49 | seed = params["seed"] 50 | n_est = params["n_est"] 51 | min_split = params["min_split"] 52 | 53 | # Load the data 54 | with open(os.path.join(input, "train.pkl"), "rb") as fd: 55 | matrix, _ = pickle.load(fd) 56 | 57 | clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix) 58 | 59 | # Save the model 60 | with open(output, "wb") as fd: 61 | pickle.dump(clf, fd) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /example-get-started/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eux 4 | 5 | PACKAGE_DIR=code 6 | PACKAGE="code.zip" 7 | TEST_DIR=tmp 8 | TEST_PACKAGE=$TEST_DIR/$PACKAGE 9 | PROD=${1:-false} 10 | 11 | rm -f $PACKAGE 12 | rm -rf $TEST_DIR 13 | mkdir $TEST_DIR 14 | 15 | pushd $PACKAGE_DIR 16 | zip -r $PACKAGE params.yaml src/* .github/* 17 | popd 18 | 19 | # Requires AWS CLI and write access to `s3://dvc-public/code/get-started/`. 20 | mv $PACKAGE_DIR/$PACKAGE . 21 | if [ $PROD == 'prod' ]; then 22 | 23 | aws s3 cp $PACKAGE s3://dvc-public/code/get-started/$PACKAGE 24 | 25 | # Sanity check 26 | wget https://code.dvc.org/get-started/$PACKAGE -O $TEST_PACKAGE 27 | unzip $TEST_PACKAGE -d $TEST_DIR 28 | 29 | echo "\nNo output should be produced by the following cmp and diff commands:\n" 30 | 31 | cmp $PACKAGE $TEST_PACKAGE # Expected output: nothing 32 | rm -f $TEST_PACKAGE 33 | cp -f $PACKAGE_DIR/README.md $TEST_DIR 34 | cp -f $PACKAGE_DIR/.devcontainer.json $TEST_DIR 35 | cp -f $PACKAGE_DIR/.gitlab-ci.yml $TEST_DIR 36 | cp -f $PACKAGE_DIR/.gitattributes $TEST_DIR 37 | diff -r $PACKAGE_DIR $TEST_DIR # Expected output: nothing 38 | rm -fR $TEST_DIR 39 | 40 | fi 41 | -------------------------------------------------------------------------------- /example-get-started/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # See https://dvc.org/get-started 3 | 4 | set -eux 5 | 6 | HERE=$( cd "$(dirname "$0")" ; pwd -P ) 7 | REPO_NAME="example-get-started" 8 | REPO_PATH_BASE="$HERE/build/$REPO_NAME" 9 | PROD=${1:-false} 10 | 11 | # Some additional options to tune the exact repo structure that we generate. 12 | # It useful to generate nested (monorepo), private storages, a mix of those 13 | # cases to be used in Studio fixtures or QA. 14 | OPT_TESTING_REPO='false' # Default false. 15 | OPT_SUBDIR='' # No leading or trailing slashes. Default "". 16 | OPT_INIT_GIT='true' # Default true. 17 | OPT_INIT_DVC='true' # Default true. 18 | OPT_NON_DVC='false' # Default false. 19 | OPT_BRANCHES='true' # Default true. 20 | OPT_TAGS='true' # Default true. 21 | # Default "public-s3". Other options: "public-s3", "private-http", "private-ssh", etc. 22 | # See the details below in the `init_remote_storage` and in the README. 23 | OPT_REMOTE='public-s3' 24 | OPT_DVC_TRACKED_METRICS='true' # Default true. 25 | OPT_REGISTER_MODELS='true' # Default true. 26 | OPT_MODEL_NAME='text-classification' # Default "text-classification". 27 | OPT_TAG_MODELS='true' # Default true. 28 | OPT_SQUASH_COMMITS='false' # Default false. 
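# The OPT_* flags above are set directly in this script; see "Advanced usage" in
# this directory's README.md for what each option controls.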
29 | 30 | 31 | if [ -z $OPT_SUBDIR ]; then 32 | COMMIT_PREFIX="" 33 | GIT_TAG_SUFFIX="" 34 | GTO_PREFIX="" 35 | MAIN_REPO_README="" 36 | else 37 | [ -d "$REPO_PATH_BASE" ] && cp -r "$REPO_PATH_BASE" "${REPO_PATH_BASE}-backup-$(date +%s)" 38 | MODIFIER=$(echo ${OPT_SUBDIR} | tr / -) 39 | COMMIT_PREFIX="[$MODIFIER] " 40 | GIT_TAG_SUFFIX="-$MODIFIER" 41 | # In GTO we use : as a separator to get the full model name 42 | GTO_PREFIX="${OPT_SUBDIR}:" 43 | MAIN_REPO_README="${REPO_PATH_BASE}/README.md" 44 | fi 45 | 46 | REPO_PATH="${REPO_PATH_BASE}/${OPT_SUBDIR}" 47 | if [ -d "$REPO_PATH" ]; then 48 | echo "Repo $REPO_PATH already exists, please remove it first." 49 | exit 1 50 | fi 51 | 52 | create_tag() { 53 | if [ $OPT_TAGS == 'true' ]; then 54 | git tag -a "$1" -m "$2" 55 | fi 56 | } 57 | 58 | init_remote_storage() { 59 | if [ $OPT_REMOTE == 'public-s3' ]; then 60 | # Remote active on this env only, for writing. 61 | dvc remote add -f -d --local $OPT_REMOTE s3://dvc-public/remote/get-started 62 | # Actual remote for generated project (read-only). Redirect of S3 bucket above. 63 | dvc remote add -f -d $OPT_REMOTE https://remote.dvc.org/get-started 64 | fi 65 | 66 | if [ $OPT_REMOTE == 'private-gdrive' ]; then 67 | # This corresponds to the Iterative shared GDrive disk. 68 | dvc remote add -f -d $OPT_REMOTE gdrive://1x2tUWiCqcHxmUli7BP6qOrrlmp-12DYY 69 | # In Studio we have to use a custom app for GDrive. This one is created in 70 | # the pydrive-test project and in Studio we provide service account credentials. 71 | # When testing in Studio get a service credentials JSON file from the Google 72 | # Cloud Console. 73 | dvc remote modify $OPT_REMOTE gdrive_client_id "47794215776-cd9ssb6a4vv5otkq6n0iadpgc4efgjb1.apps.googleusercontent.com" 74 | dvc remote modify $OPT_REMOTE gdrive_client_secret 'i2gerGA7uBjZbR08HqSOSt9Z' 75 | fi 76 | 77 | if [ $OPT_REMOTE == 'private-s3' ]; then 78 | dvc remote add -f -d $OPT_REMOTE s3://dvc-private/remote/get-started 79 | fi 80 | 81 | if [ $OPT_REMOTE == 'private-http' ]; then 82 | dvc remote add -f -d --local storage ssh://dvc@35.194.53.251/home/dvc/storage 83 | dvc remote modify --local storage keyfile /Users/ivan/.ssh/dvc_gcp_remotes_rsa 84 | dvc remote add -f -d $OPT_REMOTE http://35.194.53.251 85 | fi 86 | 87 | if [ $OPT_REMOTE == 'private-ssh' ]; then 88 | dvc remote add -f -d $OPT_REMOTE ssh://dvc@35.194.53.251/home/dvc/storage 89 | dvc remote modify $OPT_REMOTE keyfile /Users/ivan/.ssh/dvc_gcp_remotes_rsa 90 | fi 91 | 92 | if [ $OPT_REMOTE == 'private-azure' ]; then 93 | # Make sure that you have connection string in your env or some other way 94 | # provide credentials for the `dvcprivate` storage account. 
Copy the connection 95 | # string from the Azure portal and export it with 96 | # `AZURE_STORAGE_CONNECTION_STRING` 97 | dvc remote add -f -d $OPT_REMOTE azure://nlp 98 | fi 99 | } 100 | 101 | mkdir -p $REPO_PATH 102 | pushd $REPO_PATH 103 | 104 | TOTAL_TAGS=50 105 | STEP_TIME=500000 106 | 107 | if [ $(git rev-parse --show-toplevel) == $REPO_PATH_BASE ]; then 108 | BEGIN_TIME=$(git log -1 --format=%ct) 109 | else 110 | BEGIN_TIME=$(( $(date +%s) - (${TOTAL_TAGS} * ${STEP_TIME}) )) 111 | fi 112 | 113 | export TAG_TIME=${BEGIN_TIME} 114 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 115 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 116 | 117 | tick(){ 118 | TICK_DELTA=$(python3 -c "print(int(${STEP_TIME} * ($RANDOM+1)/32767))") 119 | export TAG_TIME=$(( ${TAG_TIME} + ${TICK_DELTA} )) 120 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 121 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 122 | } 123 | 124 | if [ $OPT_TESTING_REPO == 'true' ]; then 125 | export GIT_AUTHOR_NAME="R. Daneel Olivaw" 126 | export GIT_AUTHOR_EMAIL="olivaw@iterative.ai" 127 | else 128 | export GIT_AUTHOR_NAME="Ivan Shcheklein" 129 | export GIT_AUTHOR_EMAIL="shcheklein@gmail.com" 130 | fi 131 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 132 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 133 | 134 | virtualenv -p python3 .venv 135 | export VIRTUAL_ENV_DISABLE_PROMPT=true 136 | source .venv/bin/activate 137 | echo '.venv/' > .gitignore 138 | 139 | # Installing from main since we'd like to update repo before 140 | # the release 141 | pip install "git+https://github.com/iterative/dvc#egg=dvc[all]" gto 142 | 143 | 144 | if [ $OPT_INIT_GIT == 'true' ]; then 145 | git init 146 | git checkout -b main 147 | cp $HERE/code/README.md . 148 | cp $HERE/code/.devcontainer.json . 149 | cp $HERE/code/.gitlab-ci.yml . 150 | cp $HERE/code/.gitattributes . 151 | git add . 152 | else 153 | git checkout main 154 | fi 155 | 156 | # Dump the config for the repo into README if we are generating a testing repo. 157 | if [ $OPT_TESTING_REPO == 'true' ]; then 158 | echo -e "This is a [DVC Studio](https://studio.iterative.ai) testing (fixture) repository." > README.md 159 | echo -e "\n## \`/${OPT_SUBDIR}\` config\n\n\`\`\`bash" | tee -a README.md $MAIN_REPO_README 160 | while read var; do 161 | echo "$var='$(eval "echo \"\$$var\"")'" | tee -a README.md $MAIN_REPO_README 162 | done < <( declare -p | cut -d " " -f 2 | grep = | grep "^OPT_" | cut -f 1 -d '=') 163 | echo '```' | tee -a README.md $MAIN_REPO_README 164 | git add $REPO_PATH_BASE/. 165 | fi 166 | 167 | if [ $OPT_INIT_GIT == 'true' ] || [ $OPT_TESTING_REPO == 'true' ]; then 168 | if [ $OPT_INIT_GIT == 'true' ]; then 169 | tick 170 | git commit -m "${COMMIT_PREFIX}Initialize Git repository" 171 | create_tag "0-git-init${GIT_TAG_SUFFIX}" "Git initialized." 172 | else 173 | tick 174 | git commit -m "${COMMIT_PREFIX}Add testing repo" 175 | create_tag "0-git-init${GIT_TAG_SUFFIX}" "Testing repo initialized." 176 | fi 177 | fi 178 | 179 | BASE_COMMT=$(git rev-parse HEAD) 180 | 181 | if [ $OPT_INIT_DVC == 'true' ]; then 182 | dvc init --subdir 183 | tick 184 | git commit -m "${COMMIT_PREFIX}Initialize DVC project" 185 | create_tag "1-dvc-init${GIT_TAG_SUFFIX}" "DVC initialized." 
186 | fi
187 |
188 |
189 | mkdir data
190 | dvc get https://github.com/iterative/dataset-registry \
191 |   get-started/data.xml -o data/data.xml
192 |
193 | if [ $OPT_NON_DVC == 'false' ]; then
194 |   if [ $OPT_REGISTER_MODELS == "true" ]; then
195 |     echo "artifacts:
196 |   stackoverflow-dataset:
197 |     path: data/data.xml
198 |     type: dataset
199 |     desc: Initial XML StackOverflow dataset (raw data)" >> dvc.yaml
200 |   fi
201 |   dvc add data/data.xml
202 |   git add data/data.xml.dvc
203 | else
204 |   echo "data.xml" > data/.gitignore
205 | fi
206 | git add data/.gitignore
207 | tick
208 | git commit -m "${COMMIT_PREFIX}Add raw data"
209 | create_tag "2-track-data${GIT_TAG_SUFFIX}" "Data file added."
210 |
211 |
212 | if [ $OPT_NON_DVC == 'false' ]; then
213 |   init_remote_storage
214 |
215 |   git add $REPO_PATH_BASE/.
216 |   tick
217 |   git commit -m "${COMMIT_PREFIX}Configure default remote"
218 |   create_tag "3-config-remote${GIT_TAG_SUFFIX}" "Remote storage configured."
219 |   dvc push
220 | fi
221 |
222 | if [ $OPT_NON_DVC == 'false' ]; then
223 |   rm data/data.xml data/data.xml.dvc
224 |   dvc import https://github.com/iterative/dataset-registry \
225 |     get-started/data.xml -o data/data.xml
226 |   git add data/data.xml.dvc
227 |   tick
228 |   git commit -m "${COMMIT_PREFIX}Import raw data (overwrite)"
229 |   create_tag "4-import-data${GIT_TAG_SUFFIX}" "Data file overwritten with an import."
230 |   dvc push
231 | fi
232 |
233 | # Deploy code
234 | pushd $HERE
235 | source deploy.sh $PROD
236 | popd
237 |
238 | # Get deployed code
239 | if [ $PROD == 'prod' ]; then
240 |   wget https://code.dvc.org/get-started/code.zip
241 | else
242 |   mv $HERE/code.zip code.zip
243 | fi
244 |
245 | unzip code.zip
246 | rm -f code.zip
247 | pip install -r src/requirements.txt
248 | git add .
249 | if [ $OPT_NON_DVC == 'true' ]; then
250 |   cat <<EOF >> metrics.json
251 | {
252 |     "avg_prec": {
253 |         "train": 0.9743681430252835,
254 |         "test": 0.9249974999612706
255 |     },
256 |     "roc_auc": {
257 |         "train": 0.9866678562450621,
258 |         "test": 0.9460213440787918
259 |     }
260 | }
261 | EOF
262 | fi
263 | tick
264 | git commit -m "${COMMIT_PREFIX}Add source code files to repo"
265 | create_tag "5-source-code${GIT_TAG_SUFFIX}" "Source code added."
266 |
267 | if [ $OPT_NON_DVC == 'false' ]; then
268 |   dvc stage add -n prepare \
269 |     -p prepare.seed,prepare.split \
270 |     -d src/prepare.py -d data/data.xml \
271 |     -o data/prepared \
272 |     python src/prepare.py data/data.xml
273 |   dvc repro
274 |   git add data/.gitignore dvc.yaml dvc.lock
275 |   tick
276 |   git commit -m "${COMMIT_PREFIX}Create data preparation stage"
277 |   create_tag "6-prepare-stage${GIT_TAG_SUFFIX}" "First pipeline stage (data preparation) created."
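# For reference, the `dvc stage add -n prepare ...` call above results in a
# dvc.yaml entry roughly like this (a sketch; the exact layout is up to DVC):
#
#   stages:
#     prepare:
#       cmd: python src/prepare.py data/data.xml
#       deps:
#         - data/data.xml
#         - src/prepare.py
#       params:
#         - prepare.seed
#         - prepare.split
#       outs:
#         - data/prepared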
278 | dvc push 279 | 280 | dvc stage add -n featurize \ 281 | -p featurize.max_features,featurize.ngrams \ 282 | -d src/featurization.py -d data/prepared \ 283 | -o data/features \ 284 | python src/featurization.py \ 285 | data/prepared data/features 286 | dvc stage add -n train \ 287 | -p train.seed,train.n_est,train.min_split \ 288 | -d src/train.py -d data/features \ 289 | -o model.pkl \ 290 | python src/train.py data/features model.pkl 291 | dvc repro 292 | 293 | if [ $OPT_REGISTER_MODELS == "true" ]; then 294 | python <> dvc.yaml 349 | 350 | dvc repro 351 | if [ $OPT_DVC_TRACKED_METRICS == "true" ]; then 352 | git add .gitignore dvc.yaml dvc.lock 353 | else 354 | git add .gitignore dvc.yaml dvc.lock eval 355 | fi 356 | tick 357 | git commit -am "${COMMIT_PREFIX}Create evaluation stage" 358 | create_tag "8-dvclive-eval${GIT_TAG_SUFFIX}" "DVCLive evaluation stage created." 359 | create_tag "baseline-experiment${GIT_TAG_SUFFIX}" "Baseline experiment evaluation" 360 | if [ $OPT_TAG_MODELS == "true" ]; then 361 | gto register "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.0.0 362 | gto assign "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.0.0 --stage prod 363 | fi 364 | dvc push 365 | 366 | 367 | sed -e "s/max_features: 100/max_features: 200/" -i".bck" params.yaml 368 | sed -e "s/ngrams: 1/ngrams: 2/" -i".bck" params.yaml 369 | rm -f params.yaml.bck 370 | dvc repro train 371 | tick 372 | git commit -am "${COMMIT_PREFIX}Reproduce model using bigrams" 373 | create_tag "9-bigrams-model${GIT_TAG_SUFFIX}" "Model retrained using bigrams." 374 | if [ $OPT_TAG_MODELS == "true" ]; then 375 | gto register "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.1.0 376 | gto assign "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.1.0 --stage stage 377 | fi 378 | dvc push 379 | 380 | 381 | dvc repro evaluate 382 | tick 383 | git commit -am "${COMMIT_PREFIX}Evaluate bigrams model" 384 | create_tag "bigrams-experiment${GIT_TAG_SUFFIX}" "Bigrams experiment evaluation" 385 | create_tag "10-bigrams-experiment${GIT_TAG_SUFFIX}" "Evaluated bigrams model." 
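# The `gto register` / `gto assign` calls in this script act purely through
# annotated Git tags: registering creates a tag like
# "${GTO_PREFIX}${OPT_MODEL_NAME}@v1.2.0" and assigning a stage creates one like
# "${GTO_PREFIX}${OPT_MODEL_NAME}#dev#<n>" (a sketch; the counter values are
# assigned by GTO). No tracked files change as part of these calls.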
386 | if [ $OPT_TAG_MODELS == "true" ]; then 387 | gto register "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.2.0 388 | gto assign "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.2.0 --stage dev 389 | fi 390 | dvc push 391 | fi 392 | 393 | if [ $OPT_SQUASH_COMMITS == 'true' ]; then 394 | git reset --soft $BASE_COMMT 395 | git commit --amend --no-edit 396 | fi 397 | 398 | if [ $OPT_NON_DVC == 'false' ] && [ $OPT_BRANCHES == 'true' ]; then 399 | export GIT_AUTHOR_NAME="Dave Berenbaum" 400 | export GIT_AUTHOR_EMAIL="dave.berenbaum@gmail.com" 401 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 402 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 403 | 404 | git checkout -b "tune-hyperparams${GIT_TAG_SUFFIX}" 405 | 406 | unset GIT_AUTHOR_DATE 407 | unset GIT_COMMITTER_DATE 408 | 409 | dvc exp run --queue --set-param train.min_split=8 410 | dvc exp run --queue --set-param train.min_split=64 411 | dvc exp run --queue --set-param train.min_split=2 --set-param train.n_est=100 412 | dvc exp run --queue --set-param train.min_split=8 --set-param train.n_est=100 413 | dvc exp run --queue --set-param train.min_split=64 --set-param train.n_est=100 414 | dvc exp run --run-all -j 2 415 | # Apply best experiment 416 | EXP=$(dvc exp show --csv --sort-by avg_prec.test | tail -n 1 | cut -d , -f 1) 417 | dvc exp apply $EXP 418 | tick 419 | git commit -am "${COMMIT_PREFIX}Run experiments tuning random forest params" 420 | create_tag "random-forest-experiments${GIT_TAG_SUFFIX}" "Run experiments to tune random forest params" 421 | create_tag "11-random-forest-experiments${GIT_TAG_SUFFIX}" "Tuned random forest classifier." 422 | dvc push 423 | 424 | git checkout main 425 | 426 | export GIT_AUTHOR_NAME="Dmitry Petrov" 427 | export GIT_AUTHOR_EMAIL="dmitry.petrov@nevesomo.com" 428 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 429 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 430 | 431 | git checkout -b "try-large-dataset${GIT_TAG_SUFFIX}" 432 | 433 | dvc update data/data.xml.dvc --rev get-started-40K 434 | sed -e "s/max_features: 200/max_features: 500/" -i".bck" params.yaml 435 | rm -f params.yaml.bck 436 | dvc repro 437 | dvc push 438 | git commit -am "${COMMIT_PREFIX}Try a 40K dataset (4x data)" 439 | fi 440 | 441 | popd 442 | 443 | unset TAG_TIME 444 | unset GIT_AUTHOR_DATE 445 | unset GIT_COMMITTER_DATE 446 | unset GIT_AUTHOR_NAME 447 | unset GIT_AUTHOR_EMAIL 448 | unset GIT_COMMITTER_NAME 449 | unset GIT_COMMITTER_EMAIL 450 | 451 | set +eux 452 | echo 453 | echo "==========================================" 454 | echo "Done! Read README for the next steps." 455 | echo "==========================================" 456 | echo 457 | -------------------------------------------------------------------------------- /example-get-started/generate_data.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import random 4 | import sys 5 | import xml.etree.ElementTree 6 | 7 | # This file is not part of the project but is used to generate a slice of 8 | # data from the full SO dump https://archive.org/details/stackexchange 9 | 10 | 11 | if len(sys.argv) != 3: 12 | sys.stderr.write("Arguments error. 
Usage:\n") 13 | sys.stderr.write("\tpython analyze.py data-file output-file\n") 14 | sys.exit(1) 15 | 16 | 17 | target = 40000 18 | split = 0.3 19 | 20 | 21 | def lines_matched_test(fd, test): 22 | for line in fd: 23 | try: 24 | attr = xml.etree.ElementTree.fromstring(line).attrib 25 | if test(attr.get("Tags", "")): 26 | yield line 27 | except Exception as ex: 28 | sys.stderr.write(f"Skipping the broken line: {ex}\n") 29 | 30 | 31 | def process_posts(fd_in, fd_not, fd_out): 32 | count = 0 33 | in_lines = lines_matched_test(fd_in, lambda x: "" in x) 34 | not_lines = lines_matched_test(fd_not, lambda x: "" not in x) 35 | while count < target: 36 | line = next(not_lines) if random.random() > split else next(in_lines) 37 | fd_out.write(line) 38 | count += 1 39 | 40 | 41 | with io.open(sys.argv[1], encoding="utf8") as fd_in: 42 | with io.open(sys.argv[1], encoding="utf8") as fd_not: 43 | with io.open(sys.argv[2], "w", encoding="utf8") as fd_out: 44 | process_posts(fd_in, fd_not, fd_out) 45 | -------------------------------------------------------------------------------- /example-gto/code/.github/workflows/gto-act-on-tags.yml: -------------------------------------------------------------------------------- 1 | name: Act on artifact registrations and promotions 2 | on: 3 | push: 4 | tags: 5 | - "*" 6 | 7 | jobs: 8 | act: 9 | name: Figure out what was registered/promoted and act on it 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: "GTO: figure out what was registered/promoted and show the Registry state" 14 | id: gto 15 | uses: iterative/gto-action@v2 16 | - uses: actions/setup-python@v2 17 | - name: Install dependencies 18 | run: | 19 | pip install --upgrade pip setuptools wheel 20 | pip install -r requirements.txt 21 | - name: "Publish (act on registering a new version)" 22 | if: steps.gto.outputs.event == 'registration' 23 | run: | 24 | echo "[$GITHUB_REF] You got version '${{ steps.gto.outputs.version }}' registered for model '${{ steps.gto.outputs.name }}' " 25 | echo "[$GITHUB_REF] It is about time to publish it somewhere so others could use it!" 26 | - name: "Deploy (act on assigning a new stage)" 27 | if: steps.gto.outputs.event == 'assignment' 28 | run: | 29 | echo "[$GITHUB_REF] You got model '${{ steps.gto.outputs.name }}' of version '${{ steps.gto.outputs.version }}' promoted to stage '${{ steps.gto.outputs.stage }}'" 30 | echo "[$GITHUB_REF] It is about time to deploy it somewhere!" 31 | -------------------------------------------------------------------------------- /example-gto/code/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /example-gto/code/README.md: -------------------------------------------------------------------------------- 1 | # Example GTO Model Registry 2 | 3 | A [model registry] is a tool to catalog ML models and their versions. Models from 4 | your data science projects can be discovered, tested, shared, deployed, and 5 | audited from there. [DVC] and [GTO] enable these capabilities on top of 6 | Git, so you can stick to an existing software engineering stack. 7 | 8 | This repo is an example of Model Registry built with these tools. The model 9 | dashboard: 10 | 11 |
12 | $ gto show
13 | ╒══════════╤══════════╤═════════╤═════════╤════════════╕
14 | │ name     │ latest   │ #dev    │ #prod   │ #staging   │
15 | ╞══════════╪══════════╪═════════╪═════════╪════════════╡
16 | │ churn    │ [v3.1.1](https://github.com/iterative/example-gto/releases/tag/churn@v3.1.1)   │ [v3.1.0](https://github.com/iterative/example-gto/releases/tag/churn%23dev%234)  │ [v3.0.0](https://github.com/iterative/example-gto/releases/tag/churn%23prod%233)  │ [v3.1.0](https://github.com/iterative/example-gto/releases/tag/churn%23staging%232)     │
17 | │ segment  │ [v0.4.1](https://github.com/iterative/example-gto/releases/tag/segment@v0.4.1)   │ [v0.4.1](https://github.com/iterative/example-gto/releases/tag/segment%23dev%231)  │ -       │ -          │
18 | │ cv-class │ [v0.1.13](https://github.com/iterative/example-gto/releases/tag/cv-class@v0.1.13)  │ -       │ -       │ -          │
19 | ╘══════════╧══════════╧═════════╧═════════╧════════════╛
20 | 
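A state like the one above is produced with plain GTO commands run against this
Git repo, for example (a sketch; the full history of this example lives in its
[Git tags]):

```console
$ gto register churn --version v3.0.0             # cut and annotate a version
$ gto assign churn --version v3.0.0 --stage prod  # promote it to a stage
$ gto show                                        # print the dashboard above
```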
21 |
22 | - The `latest` column shows the latest model versions,
23 | - The `#dev` column represents model versions promoted to the `dev` Stage (same
24 |   for `#prod` and `#staging`),
25 | - Versions are registered and promoted to Stages by [Git tags] - you can click
26 |   the links to see which specific Git tag did it,
27 | - Artifact metadata like `path` and `description` is stored in
28 |   [`artifacts.yaml`],
29 | - The [GitHub Actions page] of this repo has examples of workflows where we act
30 |   upon these Git tags.
31 |
32 | Check out the [public Model Registry] in [Studio], built on top of DVC and GTO,
33 | which provides more insight into your ML model development, including
34 | training params, metrics and plots.
35 |
36 | 🧑‍💻 To continue learning, head to [Get Started with GTO].
37 |
38 | [github actions page]: https://github.com/iterative/example-gto/actions
39 | [get started with gto]: https://dvc.org/doc/gto/get-started
40 | [model registry]: https://dvc.org/doc/use-cases/model-registry
41 | [dvc]: https://github.com/iterative/dvc
42 | [gto]: https://github.com/iterative/gto
43 | [git tags]: https://github.com/iterative/example-gto/tags
44 | [`artifacts.yaml`]:
45 |   https://github.com/iterative/example-gto/blob/main/artifacts.yaml
46 | [public model registry]: https://studio.iterative.ai/team/Iterative/models
47 | [studio]: https://studio.iterative.ai
48 | -------------------------------------------------------------------------------- /example-gto/code/mlem/.github/workflows/deploy-model-with-mlem.yml: --------------------------------------------------------------------------------
1 | name: Deploy MLEM model after GTO Stage assignment
2 | on:
3 |   push:
4 |     tags:
5 |       - "*"
6 | env:
7 |   HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
8 |   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
9 |   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
10 |
11 | jobs:
12 |   parse-git-tag:
13 |     name: Figure out what was registered/promoted
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: actions/checkout@v3
17 |       - name: "GTO: figure out what was registered/promoted and show the Registry state"
18 |         id: gto
19 |         uses: iterative/gto-action@v2
20 |     # we define the Job outputs here to let the next Job use them
21 |     outputs:
22 |       name: ${{ steps.gto.outputs.name }}
23 |       stage: ${{ steps.gto.outputs.stage }}
24 |       event: ${{ steps.gto.outputs.event }}
25 |       path: ${{ steps.gto.outputs.path }}
26 |   deploy-model:
27 |     name: Deploy a MLEM model (act on assigning a new stage)
28 |     needs: parse-git-tag
29 |     if: needs.parse-git-tag.outputs.event == 'assignment'
30 |     runs-on: ubuntu-latest
31 |     steps:
32 |       - uses: actions/checkout@v3
33 |       - uses: actions/setup-python@v2
34 |         with:
35 |           python-version: '3.10'
36 |       - name: Install dependencies
37 |         run: |
38 |           pip install --upgrade pip setuptools wheel
39 |           pip install -r requirements.txt
40 |       - name: Run `mlem deploy`
41 |         run: |
42 |           mlem deployment run --load deploy/${{ needs.parse-git-tag.outputs.stage }} --model ${{ needs.parse-git-tag.outputs.path }}
43 | -------------------------------------------------------------------------------- /example-gto/code/mlem/.mlem.yaml: --------------------------------------------------------------------------------
1 | core:
2 |   state:
3 |     uri: s3://gto-mlem-example/mlem-deployment-state
4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/deploy/dev.mlem: --------------------------------------------------------------------------------
1 | object_type:
deployment 2 | type: heroku 3 | app_name: mlem-dev 4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/deploy/prod.mlem: -------------------------------------------------------------------------------- 1 | object_type: deployment 2 | type: heroku 3 | app_name: mlem-prod 4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/deploy/staging.mlem: -------------------------------------------------------------------------------- 1 | object_type: deployment 2 | type: heroku 3 | app_name: mlem-staging 4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/requirements.txt: -------------------------------------------------------------------------------- 1 | gto 2 | mlem[s3,fastapi,heroku] -------------------------------------------------------------------------------- /example-gto/code/mlem/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import mlem 4 | 5 | if __name__ == "__main__": 6 | value = sys.argv[1] if len(sys.argv) > 1 else "no value" 7 | 8 | def model(data): 9 | return value 10 | 11 | mlem.api.save(model, "models/churn.pkl", sample_data="string") 12 | -------------------------------------------------------------------------------- /example-gto/code/requirements.txt: -------------------------------------------------------------------------------- 1 | gto -------------------------------------------------------------------------------- /example-gto/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script env: 4 | # e Exit immediately if a command exits with a non-zero exit status. 5 | # u Treat unset variables as an error when substituting. 6 | # x Print commands and their arguments as they are executed. 7 | set -eux 8 | 9 | PUSH=false 10 | echo $# 11 | if [ "$#" -eq 1 ] && [ "$0" != "--push" ]; then 12 | PUSH=true 13 | echo "Will push things to GitHub :tada:" 14 | fi 15 | 16 | HERE="$( 17 | cd "$(dirname "$0")" 18 | pwd -P 19 | )" 20 | USER_NAME="iterative" 21 | REPO_NAME="example-gto" 22 | 23 | BUILD_PATH="$HERE/build" 24 | REPO_PATH="$BUILD_PATH/$REPO_NAME" 25 | 26 | if [ -d "$REPO_PATH" ]; then 27 | echo "Repo $REPO_PATH already exists, please remove it first." 28 | exit 1 29 | fi 30 | 31 | mkdir -p $BUILD_PATH 32 | pushd $BUILD_PATH 33 | if [ ! 
-d "$BUILD_PATH/.venv" ]; then 34 | virtualenv -p python3 .venv 35 | source .venv/bin/activate 36 | echo '.venv/' >.gitignore 37 | pip install -r ../code/requirements.txt 38 | git clone git@github.com:iterative/example-gto.git 39 | pip install -e ./gto 40 | fi 41 | popd 42 | 43 | source $BUILD_PATH/.venv/bin/activate 44 | 45 | TOTAL_TAGS=15 46 | STEP_TIME=100000 47 | SLEEP_TIME=90 48 | BEGIN_TIME=$(($(date +%s) - (${TOTAL_TAGS} * ${STEP_TIME}))) 49 | export TAG_TIME=${BEGIN_TIME} 50 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 51 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 52 | tick() { 53 | export TAG_TIME=$((${TAG_TIME} + ${STEP_TIME})) 54 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 55 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 56 | } 57 | 58 | export GIT_AUTHOR_NAME="Alexander Guschin" 59 | export GIT_AUTHOR_EMAIL="1aguschin@gmail.com" 60 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 61 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 62 | 63 | mkdir -p $REPO_PATH 64 | pushd $REPO_PATH 65 | 66 | git init -b main 67 | cp $HERE/code/.gitignore . 68 | git add .gitignore 69 | cp $HERE/code/requirements.txt . 70 | cp $HERE/code/README.md . 71 | cp -R $HERE/code/.github . 72 | git add . 73 | tick 74 | git commit -m "Initialize Git repository with CI workflow" 75 | 76 | if $PUSH; then 77 | # remove GH Actions workflows 78 | gh api repos/$USER_NAME/$REPO_NAME/actions/runs \ 79 | --paginate -q '.workflow_runs[] | "\(.id)"' | 80 | xargs -n1 -I % gh api --silent repos/$USER_NAME/$REPO_NAME/actions/runs/% -X DELETE 81 | # add remote 82 | git remote add origin git@github.com:$USER_NAME/$REPO_NAME.git 83 | # remove all tags from remote 84 | git ls-remote --tags origin | awk '/^(.*)(\s+)(.*[a-zA-Z0-9])$/ {print ":" $2}' | xargs git push origin 85 | fi 86 | 87 | echo "Initialize DVC" 88 | dvc init 89 | git commit -m "Initialize DVC" 90 | echo "Create new models" 91 | mkdir models 92 | echo "1st version" > models/churn.pkl 93 | git add models requirements.txt 94 | tick 95 | git commit -am "Create models" 96 | 97 | cat >> dvc.yaml<< EOF 98 | artifacts: 99 | churn: 100 | type: model 101 | path: models/churn.pkl 102 | segment: 103 | type: model 104 | path: s3://mycorp/proj-ml/segm-model-2022-04-15.pt 105 | cv-class: 106 | type: model 107 | path: s3://mycorp/proj-ml/classif-v2.pt 108 | EOF 109 | git add dvc.yaml 110 | 111 | tick 112 | git commit -m "Annotate models with GTO" 113 | if $PUSH; then 114 | git push --set-upstream origin main -f 115 | fi 116 | 117 | echo "Register new model" 118 | tick 119 | gto register churn --version v3.0.0 120 | tick 121 | gto register segment --version v0.4.1 122 | tick 123 | gto register cv-class --version v0.1.13 124 | if $PUSH; then 125 | git push --tags 126 | sleep $SLEEP_TIME 127 | fi 128 | 129 | echo "Update the model" 130 | echo "2nd version" >models/churn.pkl 131 | tick 132 | git commit -am "Update model" 133 | if $PUSH; then 134 | git push 135 | fi 136 | 137 | echo "Register models" 138 | tick 139 | gto register churn --bump-minor 140 | if $PUSH; then 141 | git push --tags 142 | sleep $SLEEP_TIME 143 | fi 144 | 145 | echo "Promote models" 146 | tick 147 | gto assign churn --version v3.0.0 --stage dev 148 | if $PUSH; then 149 | git push --tags 150 | sleep $SLEEP_TIME 151 | fi 152 | 153 | tick 154 | gto assign churn HEAD --stage staging 155 | if $PUSH; then 156 | git push --tags 157 | sleep $SLEEP_TIME 158 | fi 159 | 160 | tick 161 | gto assign churn --version v3.0.0 --stage prod 162 | if $PUSH; then 163 | git push --tags 164 | sleep $SLEEP_TIME 165 | fi 
166 | 167 | tick 168 | gto assign churn --version v3.1.0 --stage dev 169 | gto assign segment --version v0.4.1 --stage dev 170 | if $PUSH; then 171 | git push --tags 172 | fi 173 | 174 | 175 | gto show 176 | gto history 177 | 178 | 179 | if $PUSH; then 180 | git push --set-upstream origin main -f 181 | fi 182 | 183 | popd 184 | 185 | unset TAG_TIME 186 | unset GIT_AUTHOR_DATE 187 | unset GIT_COMMITTER_DATE 188 | unset GIT_AUTHOR_NAME 189 | unset GIT_AUTHOR_EMAIL 190 | unset GIT_COMMITTER_NAME 191 | unset GIT_COMMITTER_EMAIL 192 | 193 | cat <