├── .devcontainer.json ├── .github └── workflows │ ├── example-get-started-deploy.yaml │ ├── example-get-started-experiments-deploy.yaml │ ├── example-get-started-experiments-test.yaml │ └── example-get-started-test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── example-get-started-experiments ├── .gitignore ├── README.md ├── code │ ├── .devcontainer.json │ ├── .gitattributes │ ├── .github │ │ └── workflows │ │ │ ├── deploy-model-sagemaker.yml │ │ │ ├── deploy-model-template.yml │ │ │ └── dvc-studio.yml │ ├── .gitignore │ ├── .gitlab-ci.yml │ ├── LICENSE │ ├── README.md │ ├── data │ │ ├── .gitignore │ │ └── pool_data.dvc │ ├── gitlab-workflows │ │ └── cloud-experiment.gitlab-ci.yml │ ├── notebooks │ │ └── TrainSegModel.ipynb │ ├── params.yaml │ ├── requirements.txt │ ├── sagemaker │ │ ├── code │ │ │ ├── inference.py │ │ │ └── requirements.txt │ │ └── deploy_model.py │ └── src │ │ ├── data_split.py │ │ ├── endpoint_prediction.py │ │ ├── evaluate.py │ │ └── train.py └── generate.sh ├── example-get-started ├── .gitignore ├── README.md ├── code │ ├── .devcontainer.json │ ├── .gitattributes │ ├── .github │ │ └── workflows │ │ │ └── cml.yaml │ ├── .gitlab-ci.yml │ ├── README.md │ ├── params.yaml │ └── src │ │ ├── evaluate.py │ │ ├── featurization.py │ │ ├── prepare.py │ │ ├── requirements.txt │ │ └── train.py ├── deploy.sh ├── generate.sh └── generate_data.py └── example-gto ├── code ├── .github │ └── workflows │ │ └── gto-act-on-tags.yml ├── .gitignore ├── README.md ├── mlem │ ├── .github │ │ └── workflows │ │ │ └── deploy-model-with-mlem.yml │ ├── .mlem.yaml │ ├── deploy │ │ ├── dev.mlem │ │ ├── prod.mlem │ │ └── staging.mlem │ ├── requirements.txt │ └── train.py └── requirements.txt └── generate.sh /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example-repos-dev", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.10", 4 | "runArgs": ["--ipc=host"], 5 | "extensions": ["Iterative.dvc", "ms-python.python", "redhat.vscode-yaml"], 6 | "features": { 7 | "ghcr.io/devcontainers/features/nvidia-cuda:1": { 8 | "installCudnn": true 9 | }, 10 | "ghcr.io/saml-to/devcontainer-features/assume-aws-role:1": { 11 | "role": "arn:aws:iam::342840881361:role/iterative-saml-codespaces" 12 | }, 13 | "ghcr.io/devcontainers/features/aws-cli:1": {}, 14 | "ghcr.io/devcontainers/features/github-cli:1": {} 15 | }, 16 | "customizations": { 17 | "codespaces": { 18 | "repositories": { 19 | "iterative/example-get-started": { 20 | "permissions": "write-all" 21 | }, 22 | "iterative/example-get-started-experiments": { 23 | "permissions": "write-all" 24 | } 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-deploy.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started deploy 2 | on: 3 | push: 4 | paths: 5 | - example-get-started/** 6 | branches: 7 | - master 8 | workflow_dispatch: 9 | permissions: write-all 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | environment: aws 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.11' 21 | cache: 'pip' 22 | - uses: aws-actions/configure-aws-credentials@v2 23 | with: 24 | aws-region: us-east-2 25 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 26 | role-duration-seconds: 43200 27 | - uses: iterative/setup-dvc@v1 
28 | - name: Generate repo 29 | run: | 30 | pip install virtualenv 31 | cd example-get-started 32 | ./generate.sh prod 33 | - name: Deploy repo 34 | env: 35 | GH_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 36 | run: | 37 | cd example-get-started/build/example-get-started 38 | . .venv/bin/activate 39 | # add remote 40 | git remote add origin https://${{ secrets.PERSONAL_ACCESS_TOKEN }}@github.com/iterative/example-get-started.git 41 | # close open PRs 42 | gh pr close try-large-dataset 43 | gh pr close tune-hyperparams 44 | # drop existing refs 45 | git ls-remote origin | awk '{print $2}' | xargs -n 1 git push --delete origin || true 46 | # force push branches 47 | git push --force origin main 48 | git push --force origin try-large-dataset 49 | git push --force origin tune-hyperparams 50 | # we push git tags one by one for Studio to receive webhooks: 51 | git tag --sort=creatordate | xargs -n 1 git push --force origin 52 | # push exp refs 53 | dvc exp push origin -A 54 | # create PRs 55 | gh pr create -t "Try 40K dataset (4x data)" \ 56 | -b "We are trying here a large dataset, since the smaller one looks unstable" \ 57 | -B main -H try-large-dataset 58 | gh pr create -t "Run experiments tuning random forest params" \ 59 | -b "Better RF split and number of estimators based on small grid search." \ 60 | -B main -H tune-hyperparams 61 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-experiments-deploy.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started-experiments deploy 2 | on: 3 | push: 4 | paths: 5 | - example-get-started-experiments/** 6 | branches: 7 | - master 8 | workflow_dispatch: 9 | permissions: write-all 10 | jobs: 11 | deploy-runner: 12 | environment: aws 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: iterative/setup-cml@v2 17 | - uses: aws-actions/configure-aws-credentials@v1 18 | with: 19 | aws-region: us-east-2 20 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 21 | role-duration-seconds: 43200 22 | - name: Create Runner 23 | env: 24 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 25 | run: | 26 | cml runner launch --single \ 27 | --labels=cml \ 28 | --cloud=aws \ 29 | --cloud-region=us-east \ 30 | --cloud-hdd-size=40 \ 31 | --cloud-type=g5.2xlarge \ 32 | --idle-timeout=3600 \ 33 | test: 34 | needs: deploy-runner 35 | runs-on: [ self-hosted, cml ] 36 | environment: aws 37 | container: 38 | image: iterativeai/cml:0-dvc2-base1-gpu 39 | options: --gpus all --ipc host 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v3 43 | - name: Set up Python 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: '3.11' 47 | cache: 'pip' 48 | - uses: aws-actions/configure-aws-credentials@v2 49 | with: 50 | aws-region: us-east-2 51 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 52 | role-duration-seconds: 43200 53 | - name: Generate repo 54 | env: 55 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 56 | run: | 57 | pip install virtualenv 58 | cd example-get-started-experiments 59 | ./generate.sh 60 | - name: Deploy repo 61 | env: 62 | GH_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 63 | run: | 64 | cd example-get-started-experiments/build/example-get-started-experiments 65 | . 
.venv/bin/activate 66 | # add remote 67 | git remote add origin https://${{ secrets.PERSONAL_ACCESS_TOKEN }}@github.com/iterative/example-get-started-experiments.git 68 | # drop existing refs 69 | git ls-remote origin | awk '{print $2}' | xargs -n 1 git push --delete origin || true 70 | # push updated refs 71 | git push --force origin main 72 | dvc exp push origin -A 73 | # we push git tags one by one for Studio to receive webhooks: 74 | git tag --sort=creatordate | xargs -n 1 git push --force origin 75 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-experiments-test.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started-experiments test 2 | on: 3 | push: 4 | paths: 5 | - example-get-started-experiments/** 6 | branches: 7 | - '**' # matches every branch 8 | - '!master' # excludes master 9 | workflow_dispatch: 10 | schedule: 11 | - cron: '0 0 * * 1' 12 | permissions: 13 | contents: read 14 | id-token: write 15 | jobs: 16 | deploy-runner: 17 | environment: aws 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v3 21 | - uses: iterative/setup-cml@v2 22 | - uses: aws-actions/configure-aws-credentials@v1 23 | with: 24 | aws-region: us-east-2 25 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 26 | role-duration-seconds: 43200 27 | - name: Create Runner 28 | env: 29 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 30 | run: | 31 | cml runner launch --single \ 32 | --labels=cml \ 33 | --cloud=aws \ 34 | --cloud-region=us-east \ 35 | --cloud-hdd-size=40 \ 36 | --cloud-type=g5.2xlarge \ 37 | --idle-timeout=3600 \ 38 | test: 39 | needs: deploy-runner 40 | runs-on: [ self-hosted, cml ] 41 | environment: aws 42 | container: 43 | image: iterativeai/cml:0-dvc2-base1-gpu 44 | options: --gpus all --ipc host 45 | steps: 46 | - name: Checkout repo 47 | uses: actions/checkout@v3 48 | - name: Set up Python 49 | uses: actions/setup-python@v4 50 | with: 51 | python-version: '3.11' 52 | cache: 'pip' 53 | - uses: aws-actions/configure-aws-credentials@v2 54 | with: 55 | aws-region: us-east-2 56 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 57 | role-duration-seconds: 43200 58 | - name: Generate repo 59 | env: 60 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 61 | run: | 62 | pip install virtualenv 63 | cd example-get-started-experiments 64 | ./generate.sh 65 | -------------------------------------------------------------------------------- /.github/workflows/example-get-started-test.yaml: -------------------------------------------------------------------------------- 1 | name: example-get-started test 2 | on: 3 | push: 4 | paths: 5 | - example-get-started/** 6 | branches: 7 | - '**' # matches every branch 8 | - '!master' # excludes master 9 | workflow_dispatch: 10 | schedule: 11 | - cron: '0 0 * * 1' 12 | permissions: 13 | contents: read 14 | id-token: write 15 | jobs: 16 | test: 17 | runs-on: ubuntu-latest 18 | environment: aws 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v3 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.11' 26 | cache: 'pip' 27 | - uses: aws-actions/configure-aws-credentials@v2 28 | with: 29 | aws-region: us-east-2 30 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 31 | role-duration-seconds: 43200 32 | - name: Generate repo 33 | run: | 34 | pip install virtualenv 35 | cd example-get-started 36 | ./generate.sh 37 | 
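For reference, the two test workflows above do nothing beyond regenerating the corresponding project on a fresh runner. A minimal local equivalent is sketched below (it assumes Git, Python 3, and virtualenv are available, and that any S3 access the generation scripts need is already configured):

```console
$ pip install virtualenv
$ cd example-get-started
$ ./generate.sh    # the generated project lands under build/example-get-started
$ cd ..
```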
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | MANIFEST 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | .pytest_cache/ 37 | 38 | # Sphinx documentation 39 | docs/_build/ 40 | 41 | # PyBuilder 42 | target/ 43 | 44 | # Environments 45 | .env 46 | .venv 47 | env/ 48 | venv/ 49 | ENV/ 50 | env.bak/ 51 | venv.bak/ 52 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.4.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: check-case-conflict 9 | - id: check-docstring-first 10 | - id: check-executables-have-shebangs 11 | - id: check-merge-conflict 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: end-of-file-fixer 15 | - id: mixed-line-ending 16 | - id: sort-simple-yaml 17 | - id: trailing-whitespace 18 | - repo: local 19 | hooks: 20 | - id: todo 21 | name: Check TODO 22 | language: pygrep 23 | entry: WIP 24 | args: [-i] 25 | types: [text] 26 | exclude: ^.pre-commit-config.yaml$ 27 | - repo: https://github.com/lovesegfault/beautysh 28 | rev: v6.1.0 29 | hooks: 30 | - id: beautysh 31 | args: [-i, '2'] 32 | - repo: https://gitlab.com/pycqa/flake8 33 | rev: 3.9.2 34 | hooks: 35 | - id: flake8 36 | args: [-j8, --max-line-length=99, --extend-ignore=P1] 37 | additional_dependencies: 38 | - flake8-bugbear 39 | - flake8-comprehensions 40 | - flake8-debugger 41 | - flake8-string-format 42 | - repo: https://github.com/PyCQA/isort 43 | rev: 5.8.0 44 | hooks: 45 | - id: isort 46 | args: [--profile=black, -l=99] 47 | - repo: https://github.com/ambv/black 48 | rev: 22.3.0 49 | hooks: 50 | - id: black 51 | args: [-l, '99'] 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Get Started Tutorial (sources) 2 | 3 | Contains source code and [Shell](https://www.shellscript.sh/) scripts to 4 | generate and deploy example DVC repositories used in the [Get 5 | Started](https://dvc.org/doc/get-started) and other sections of the DVC docs. 
6 | 7 | ## Requirements 8 | 9 | Please make sure you have these available on the environment where these scripts 10 | will run: 11 | 12 | - [Git](https://git-scm.com/) 13 | - [Python](https://www.python.org/) 3 (with `python3` and [pip](https://pypi.org/project/pip/) commands) 14 | - [Virtualenv](https://virtualenv.pypa.io/en/stable/) 15 | 16 | ## Naming Convention for Example Repositories 17 | 18 | In order to have a consistent naming scheme across all example repositories, the 19 | new repositories should be named as: 20 | 21 | ``` 22 | example-PROD-FEATURE 23 | ``` 24 | 25 | where `PROD` is one of the products like `dvc`, `cml`, `studio`, or `dvclive`, and `FEATURE` is 26 | the feature that the repository focuses on, like `experiments` or `pipelines`. 27 | You can also use additional keywords as a suffix to differentiate it from the others. 28 | 29 | ⚠️ Please create all new repositories with the prefix `example-`. 30 | 31 | ## Scripts 32 | 33 | Each example DVC project lives in one of the root directories (below). `cd` into 34 | the directory before running the desired script, for example: 35 | 36 | ```console 37 | $ cd example-get-started 38 | $ ./deploy.sh 39 | ``` 40 | 41 | ### example-get-started 42 | 43 | There are 2 GitHub Actions set up to test and deploy the project: 44 | 45 | - [test](.github/workflows/example-get-started-test.yaml) 46 | - [deploy](.github/workflows/example-get-started-deploy.yaml) 47 | 48 | These will automatically test and deploy the project. If you need to run the project 49 | locally/manually, you only directly need `generate.sh`. `deploy.sh` is a helper script 50 | run within `generate.sh`. 51 | 52 | - `generate.sh`: Generates the `example-get-started` DVC project from 53 | scratch. 54 | 55 | By default, the source code archive is derived from the local workspace for 56 | development purposes. 57 | 58 | For deployment, use `generate.sh prod` to upload/download a source code 59 | archive from S3 the same way as in [Connect Code and 60 | Data](https://dvc.org/doc/get-started/connect-code-and-data). 61 | 62 | - `deploy.sh`: Makes and deploys the code archive from 63 | [example-get-started/code](example-get-started/code) to use for `generate.sh`. 64 | 65 | By default, it makes a local code archive at example-get-started/code.zip. 66 | 67 | For deployment, use `deploy.sh prod` to upload it to S3. 68 | 69 | > Requires AWS CLI and write access to `s3://dvc-public/code/get-started/`. 70 | 71 | ### example-get-started-experiments 72 | 73 | There are 2 GitHub Actions set up to test and deploy the project: 74 | 75 | - [test](.github/workflows/example-get-started-experiments-test.yaml) 76 | - [deploy](.github/workflows/example-get-started-experiments-deploy.yaml) 77 | 78 | These will automatically test and deploy the project. If you need to run the project locally/manually, run `generate.sh`. 79 | 80 | Even after automatic deployment, you still need to follow the 81 | [instructions](example-get-started-experiments/README.md) to: 82 | - Update Studio to create a PR from the best generated experiment. 83 | - Push to GitLab if you want to update the repo there. 
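As a concrete example, the production refresh of `example-get-started` performed by its deploy workflow reduces roughly to the following sketch (it assumes AWS credentials with write access to `s3://dvc-public/code/get-started/` are already configured):

```console
$ pip install virtualenv
$ cd example-get-started
$ ./generate.sh prod    # uses the S3 code archive; deploy.sh is invoked internally
```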
84 | -------------------------------------------------------------------------------- /example-get-started-experiments/.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | *.zip 3 | /tmp 4 | build/ 5 | -------------------------------------------------------------------------------- /example-get-started-experiments/README.md: -------------------------------------------------------------------------------- 1 | Generate the actual repo by running: 2 | 3 | ```shell 4 | bash generate.sh 5 | ``` 6 | 7 | The repo generated in `build/example-get-started-experiments` is intended to be 8 | published on https://github.com/iterative/example-get-started-experiments. 9 | Make sure the GitHub repo exists first and that you have appropriate write 10 | permissions. 11 | 12 | Run the commands below to force push it. 13 | Modify `ORIGIN` as needed; for example, use 14 | `git@gitlab.com:iterative.ai/example-get-started-experiments.git` to force 15 | push a copy to GitLab. 16 | 17 | ```shell 18 | cd build/example-get-started-experiments 19 | export ORIGIN=https://github.com/iterative/example-get-started-experiments.git 20 | git remote add origin ${ORIGIN} 21 | git push --force origin main 22 | # we push git tags one by one for Studio to receive webhooks: 23 | git tag --sort=creatordate | xargs -n 1 git push --force origin 24 | ``` 25 | 26 | Run these to drop and then rewrite the experiment references on the repo: 27 | 28 | ```shell 29 | source .venv/bin/activate 30 | dvc exp remove -A -g origin 31 | dvc exp push origin -A 32 | ``` 33 | 34 | Finally, return to the directory where you started: 35 | 36 | ```shell 37 | cd ../.. 38 | ``` 39 | 40 | You may remove the generated repo with: 41 | 42 | ```shell 43 | rm -fR build 44 | ``` 45 | 46 | To update the project in Studio, follow the instructions at: 47 | 48 | https://github.com/iterative/studio/wiki/Updating-and-synchronizing-demo-project 49 | 50 | 51 | Pay attention to whether the experiments shown in the experiments table are "detached" 52 | and whether the experiments you just pushed show up in the Project table. 53 | 54 | Manual Studio PR: 55 | 56 | Once the repo has been generated and pushed, go to the 57 | [corresponding Studio project](https://studio.iterative.ai/team/Iterative/projects/example-get-started-experiments-y8toqd433r) 58 | and create a PR from the best of the 3 experiments that are found in the latest 59 | commit of the `main` branch. 
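Before opening the Studio PR, it can help to confirm what actually reached the remote. The following sketch assumes `origin` is the remote added above and relies on DVC storing experiment refs under `refs/exps/`:

```console
$ git ls-remote origin 'refs/tags/*'    # tags pushed one by one above
$ git ls-remote origin 'refs/exps/*'    # experiment refs from `dvc exp push`
$ dvc exp list origin --all-commits     # experiments as reported by DVC
```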
60 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example-cv", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.10", 4 | "runArgs": ["--ipc=host"], 5 | "features": { 6 | "ghcr.io/devcontainers/features/nvidia-cuda:1": { 7 | "installCudnn": true 8 | }, 9 | "ghcr.io/iterative/features/nvtop:1": {} 10 | }, 11 | "extensions": [ 12 | "Iterative.dvc", 13 | "ms-python.python", 14 | "redhat.vscode-yaml" 15 | ], 16 | "postCreateCommand": "pip install --user -r requirements.txt" 17 | } 18 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.gitattributes: -------------------------------------------------------------------------------- 1 | *.dvc linguist-language=YAML 2 | dvc.lock linguist-language=YAML 3 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.github/workflows/deploy-model-sagemaker.yml: -------------------------------------------------------------------------------- 1 | name: Deploy model (Sagemaker) 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | permissions: 9 | contents: write 10 | id-token: write 11 | 12 | jobs: 13 | parse: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: "Parse GTO tag" 18 | id: gto 19 | uses: iterative/gto-action@v2 20 | outputs: 21 | event: ${{ steps.gto.outputs.event }} 22 | name: ${{ steps.gto.outputs.name }} 23 | stage: ${{ steps.gto.outputs.stage }} 24 | version: ${{ steps.gto.outputs.version }} 25 | 26 | deploy-model: 27 | needs: parse 28 | if: "${{ needs.parse.outputs.event == 'assignment' }}" 29 | environment: cloud 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v3 33 | with: 34 | fetch-depth: 0 35 | 36 | - uses: aws-actions/configure-aws-credentials@v4 37 | with: 38 | aws-region: us-east-2 39 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 40 | role-duration-seconds: 43200 41 | 42 | - name: Set up Python 43 | uses: actions/setup-python@v4 44 | with: 45 | python-version: '3.8' 46 | cache: 'pip' 47 | cache-dependency-path: requirements.txt 48 | 49 | - run: pip install -r requirements.txt 50 | 51 | - run: dvc remote add -d --local storage s3://dvc-public/remote/get-started-pools 52 | 53 | - run: | 54 | MODEL_DATA=$(dvc get --show-url . 
model.tar.gz) 55 | python sagemaker/deploy_model.py \ 56 | --name ${{ needs.parse.outputs.name }} \ 57 | --stage ${{ needs.parse.outputs.stage }} \ 58 | --version ${{ needs.parse.outputs.version }} \ 59 | --model_data $MODEL_DATA \ 60 | --role ${{ vars.AWS_SANDBOX_ROLE }} 61 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.github/workflows/deploy-model-template.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Model (Template) 2 | 3 | on: 4 | # the workflow is triggered whenever a tag is pushed to the repository 5 | push: 6 | tags: 7 | - "*" 8 | jobs: 9 | 10 | # This job parses the git tag with the GTO GitHub Action to identify model registry actions 11 | parse: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: "Parse GTO tag" 16 | id: gto 17 | uses: iterative/gto-action@v2 18 | outputs: 19 | event: ${{ steps.gto.outputs.event }} 20 | name: ${{ steps.gto.outputs.name }} 21 | stage: ${{ steps.gto.outputs.stage }} 22 | version: ${{ steps.gto.outputs.version }} 23 | 24 | deploy-model: 25 | needs: parse 26 | # using the outputs from the "parse" job, we run this job only for actions 27 | # in the model registry and only when the model was assigned to a stage called "prod" 28 | if: ${{ needs.parse.outputs.event == 'assignment' && needs.parse.outputs.stage == 'prod' }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: iterative/setup-dvc@v1 32 | # this step uses DVC to download the model from our remote repository and deploys the model 33 | # Model deployment is mocked here as it is specific to each deployment environment 34 | # The DVC Studio token is used to avoid having to store specific remote storage credentials on GitHub 35 | - name: Get Model For Deployment 36 | run: | 37 | dvc config --global studio.token ${{ secrets.DVC_STUDIO_TOKEN }} 38 | dvc artifacts get ${{ github.server_url }}/${{ github.repository }} ${{ needs.parse.outputs.name }} --rev ${{ needs.parse.outputs.version }} 39 | echo "The right model is available and you can use the rest of this command to deploy it. Good job!" 
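The artifact lookup in the final step above can be reproduced locally to sanity-check a registered model version before pushing a tag. In this sketch the token and version are placeholders, and `pool-segmentation` is the artifact name logged from the training notebook:

```console
$ dvc config --global studio.token <DVC_STUDIO_TOKEN>
$ dvc artifacts get https://github.com/iterative/example-get-started-experiments \
    pool-segmentation --rev <version>
```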
40 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.github/workflows/dvc-studio.yml: -------------------------------------------------------------------------------- 1 | name: DVC Studio Experiment 2 | 3 | on: 4 | 5 | push: 6 | tags-ignore: 7 | - '**' 8 | 9 | workflow_dispatch: 10 | inputs: 11 | exp-run-args: 12 | description: 'Args to be passed to dvc exp run call' 13 | required: false 14 | type: string 15 | default: '' 16 | parent-sha: 17 | description: 'SHA of the commit to start the experiment from' 18 | required: false 19 | type: string 20 | default: '' 21 | cloud: 22 | description: 'Cloud compute provider to host the runner' 23 | required: false 24 | default: 'aws' 25 | type: choice 26 | options: 27 | - aws 28 | - azure 29 | - gcp 30 | type: 31 | description: 'https://registry.terraform.io/providers/iterative/iterative/latest/docs/resources/task#machine-type' 32 | required: false 33 | default: 'g5.2xlarge' 34 | region: 35 | description: 'https://registry.terraform.io/providers/iterative/iterative/latest/docs/resources/task#cloud-region' 36 | required: false 37 | default: 'us-east' 38 | spot: 39 | description: 'Request a spot instance' 40 | required: false 41 | default: false 42 | type: boolean 43 | storage: 44 | description: 'Disk size in GB' 45 | required: false 46 | default: 40 47 | type: number 48 | timeout: 49 | description: 'Timeout in seconds' 50 | required: false 51 | default: 3600 52 | type: number 53 | 54 | permissions: 55 | contents: write 56 | id-token: write 57 | pull-requests: write 58 | 59 | jobs: 60 | 61 | deploy-runner: 62 | if: ${{ (github.actor == 'iterative-studio[bot]') || (github.event_name == 'workflow_dispatch') }} 63 | environment: cloud 64 | runs-on: ubuntu-latest 65 | 66 | steps: 67 | - uses: actions/checkout@v3 68 | with: 69 | ref: ${{ inputs.parent-sha || '' }} 70 | - uses: iterative/setup-cml@v2 71 | - uses: aws-actions/configure-aws-credentials@v4 72 | with: 73 | aws-region: us-east-2 74 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 75 | role-duration-seconds: 43200 76 | - name: Create Runner 77 | env: 78 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 79 | run: | 80 | cml runner launch --single \ 81 | --labels=cml \ 82 | --cloud=${{ inputs.cloud || 'aws' }} \ 83 | --cloud-region=${{ inputs.region || 'us-east' }} \ 84 | --cloud-hdd-size=${{ inputs.storage || '40' }} \ 85 | --cloud-type=${{ inputs.type || 'g5.2xlarge' }} \ 86 | --idle-timeout=${{ inputs.timeout || '3600' }} \ 87 | ${{ (inputs.spot == 'true' && '--cloud-spot') || '' }} 88 | 89 | runner-job: 90 | needs: deploy-runner 91 | runs-on: [ self-hosted, cml ] 92 | environment: cloud 93 | container: 94 | image: iterativeai/cml:latest-gpu 95 | options: --gpus all --ipc host 96 | 97 | steps: 98 | - uses: actions/checkout@v3 99 | with: 100 | ref: ${{ inputs.parent-sha || '' }} 101 | - uses: aws-actions/configure-aws-credentials@v4 102 | with: 103 | aws-region: us-east-2 104 | role-to-assume: ${{ vars.AWS_SANDBOX_ROLE }} 105 | role-duration-seconds: 43200 106 | 107 | - run: pip install -r requirements.txt 108 | 109 | - name: Train 110 | env: 111 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 112 | DVC_STUDIO_TOKEN: ${{ secrets.DVC_STUDIO_TOKEN }} 113 | DVCLIVE_LOGLEVEL: DEBUG 114 | run: | 115 | cml ci --fetch-depth 0 116 | dvc exp run --pull --allow-missing ${{ github.event.inputs.exp-run-args }} 117 | dvc remote add --local push_remote s3://dvc-public/remote/get-started-pools 118 | 119 | - name: Workflow Dispatch Sharing 
120 | if: github.event_name == 'workflow_dispatch' 121 | env: 122 | DVC_STUDIO_TOKEN: ${{ secrets.DVC_STUDIO_TOKEN }} 123 | run: | 124 | dvc exp push origin -r push_remote 125 | 126 | - name: Commit-based Sharing 127 | if: github.actor == 'iterative-studio[bot]' 128 | env: 129 | REPO_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 130 | run: | 131 | dvc push -r push_remote 132 | cml pr --squash --skip-ci . 133 | echo "## Metrics" > report.md 134 | dvc metrics diff main --md >> report.md 135 | echo "## Params" >> report.md 136 | dvc params diff main --md >> report.md 137 | cml comment create --pr report.md 138 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # DVCLive report 132 | dvclive/report.md 133 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Deploy Model (Template) 2 | 3 | workflow: 4 | rules: 5 | # Run the pipeline whenever a tag is pushed to the repository 6 | - if: $CI_COMMIT_TAG 7 | 8 | parse: 9 | # This job parses the model tag to identify model registry actions 10 | image: python:3.11-slim 11 | script: 12 | # Install GTO to parse model tags 13 | - pip install gto 14 | # This job parses the model tags to identify model registry actions 15 | - echo "CI_COMMIT_TAG - ${CI_COMMIT_TAG}" 16 | - echo MODEL_NAME="$(gto check-ref ${CI_COMMIT_TAG} --name)" >> parse.env 17 | - echo MODEL_VERSION="$(gto check-ref ${CI_COMMIT_TAG} --version)" >> parse.env 18 | - echo MODEL_EVENT="$(gto check-ref ${CI_COMMIT_TAG} --event)" >> parse.env 19 | - echo MODEL_STAGE="$(gto check-ref ${CI_COMMIT_TAG} --stage)" >> parse.env 20 | # Print variables saved to parse.env 21 | - cat parse.env 22 | artifacts: 23 | reports: 24 | dotenv: parse.env 25 | 26 | deploy-model: 27 | needs: 28 | - job: parse 29 | artifacts: true 30 | image: python:3.11-slim 31 | script: 32 | # Check if the model is assigned to prod (variables from parse.env are only available in the 'script' section) 33 | - if [[ $MODEL_EVENT == 'assignment' && $MODEL_STAGE == 'prod' ]]; then echo "Deploy model"; else exit 1; fi 34 | # Install DVC 35 | - pip install dvc 36 | # Build commands to download and deploy the model 37 | - dvc config --global studio.token ${DVC_STUDIO_TOKEN} 38 | - dvc artifacts get ${CI_REPOSITORY_URL} ${MODEL_NAME} --rev ${MODEL_VERSION} 39 | - echo "The right model is available and you can use the rest of this command to deploy it. Good job!" -------------------------------------------------------------------------------- /example-get-started-experiments/code/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Iterative 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/README.md: -------------------------------------------------------------------------------- 1 | [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/projects/example-get-started-experiments-y8toqd433r) 2 | [![DVC-metrics](https://img.shields.io/badge/dynamic/json?style=flat-square&colorA=grey&colorB=F46737&label=Dice%20Metric&url=https://github.com/iterative/example-get-started-experiments/raw/main/results/evaluate/metrics.json&query=dice_multi)](https://github.com/iterative/example-get-started-experiments/raw/main/results/evaluate/metrics.json) 3 | 4 | [Train Report](./results/train/report.md) - [Evaluation Report](./results/evaluate/report.md) 5 | 6 | # DVC Get Started: Experiments 7 | 8 | This is an auto-generated repository for use in [DVC](https://dvc.org) 9 | [Get Started: Experiments](https://dvc.org/doc/start/experiment-management). 10 | 11 | This is a Computer Vision (CV) project that solves the problem of segmenting out 12 | swimming pools from satellite images. 13 | 14 | [Example results](./results/evaluate/plots/images/) 15 | 16 | We use a slightly modified version of the [BH-Pools dataset](http://patreo.dcc.ufmg.br/2020/07/29/bh-pools-watertanks-datasets/): 17 | we split the original 4k images into tiles of 1024x1024 pixels. 18 | 19 | 20 | 🐛 Please report any issues found in this project here: 21 | [example-repos-dev](https://github.com/iterative/example-repos-dev). 22 | 23 | ## Installation 24 | 25 | Python 3.8+ is required to run code from this repo. 26 | 27 | ```console 28 | $ git clone https://github.com/iterative/example-get-started-experiments 29 | $ cd example-get-started-experiments 30 | ``` 31 | 32 | Now let's install the requirements. But before we do that, we **strongly** 33 | recommend creating a virtual environment with a tool such as 34 | [virtualenv](https://virtualenv.pypa.io/en/stable/): 35 | 36 | ```console 37 | $ python -m venv .venv 38 | $ source .venv/bin/activate 39 | $ pip install -r requirements.txt 40 | ``` 41 | 42 | This DVC project comes with a preconfigured DVC 43 | [remote storage](https://dvc.org/doc/command-reference/remote) that holds the raw 44 | input data as well as the intermediate and final results it produces. This is a 45 | read-only HTTP remote. 46 | 47 | ```console 48 | $ dvc remote list 49 | storage https://remote.dvc.org/get-started-pools 50 | ``` 51 | 52 | You can run [`dvc pull`](https://man.dvc.org/pull) to download the data: 53 | 54 | ```console 55 | $ dvc pull 56 | ``` 57 | 58 | ## Running in your environment 59 | 60 | Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the 61 | [pipeline](https://dvc.org/doc/user-guide/pipelines/defining-pipelines): 62 | 63 | ```console 64 | $ dvc exp run 65 | Data and pipelines are up to date. 
66 | ``` 67 | 68 | If you'd like to test commands like [`dvc push`](https://man.dvc.org/push) 69 | that require write access to the remote storage, the easiest way is to set 70 | up a "local remote" on your file system: 71 | 72 | > This kind of remote is located in the local file system, but is external to 73 | > the DVC project. 74 | 75 | ```console 76 | $ mkdir -p /tmp/dvc-storage 77 | $ dvc remote add local /tmp/dvc-storage 78 | ``` 79 | 80 | You should now be able to run: 81 | 82 | ```console 83 | $ dvc push -r local 84 | ``` 85 | 86 | ## Existing stages 87 | 88 | There are a couple of Git tags in this project: 89 | 90 | ### [1-notebook-dvclive](https://github.com/iterative/example-get-started-experiments/tree/1-notebook-dvclive) 91 | 92 | Contains an end-to-end Jupyter notebook that loads data, trains a model, and 93 | reports model performance. 94 | [DVCLive](https://dvc.org/doc/dvclive) is used for experiment tracking. 95 | See this [blog post](https://iterative.ai/blog/exp-tracking-dvc-python) for more 96 | details. 97 | 98 | ### [2-dvc-pipeline](https://github.com/iterative/example-get-started-experiments/tree/2-dvc-pipeline) 99 | 100 | Contains a DVC pipeline `dvc.yaml` that was created by refactoring the above 101 | notebook into individual pipeline stages. 102 | 103 | The pipeline artifacts (processed data, model file, etc.) are automatically 104 | versioned. 105 | 106 | This tag also contains a GitHub Actions workflow that reruns the pipeline if any 107 | changes are introduced to the pipeline-related files. 108 | [CML](https://cml.dev/) is used in this workflow to provision a cloud-based GPU 109 | machine as well as report model performance results in Pull Requests. 110 | 111 | ## Model Deployment 112 | 113 | Check out the [GitHub Workflow](https://github.com/iterative/example-get-started-experiments/blob/main/.github/workflows/deploy-model-sagemaker.yml) 114 | that uses the [Iterative Studio Model Registry](https://dvc.org/doc/studio/user-guide/model-registry/what-is-a-model-registry) 115 | to deploy the model to [AWS SageMaker](https://aws.amazon.com/sagemaker/) whenever a new [version is registered](https://dvc.org/doc/studio/user-guide/model-registry/register-version). 116 | 117 | ## Project structure 118 | 119 | The data files, DVC files, and results change as stages are created one by one. 120 | After cloning and using [`dvc pull`](https://man.dvc.org/pull) to download 121 | data, models, and plots tracked by DVC, the workspace should look like this: 122 | 123 | ```console 124 | $ tree -L 2 125 | . 126 | ├── LICENSE 127 | ├── README.md 128 | ├── data 
# <-- Directory with raw and intermediate data 129 | │ ├── pool_data # <-- Raw image data 130 | │ ├── pool_data.dvc # <-- .dvc file - a placeholder/pointer to raw data 131 | │ ├── test_data # <-- Processed test data 132 | │ └── train_data # <-- Processed train data 133 | ├── dvc.lock 134 | ├── dvc.yaml # <-- DVC pipeline file 135 | ├── models 136 | │ └── model.pkl # <-- Trained model file 137 | ├── notebooks 138 | │ └── TrainSegModel.ipynb # <-- Initial notebook (refactored into `dvc.yaml`) 139 | ├── params.yaml # <-- Parameters file 140 | ├── requirements.txt # <-- Python dependencies needed in the project 141 | ├── results # <-- DVCLive reports and plots 142 | │ ├── evaluate 143 | │ └── train 144 | └── src # <-- Source code to run the pipeline stages 145 | ├── data_split.py 146 | ├── evaluate.py 147 | └── train.py 148 | ``` 149 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/data/.gitignore: -------------------------------------------------------------------------------- 1 | /pool_data 2 | /test_data 3 | /train_data 4 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/data/pool_data.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 14d187e749ee5614e105741c719fa185.dir 3 | size: 18999874 4 | nfiles: 183 5 | path: pool_data 6 | hash: md5 7 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/gitlab-workflows/cloud-experiment.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | EXP_RUN_ARGS: "" 3 | deploy-runner: 4 | image: iterativeai/cml:0-dvc2-base1 5 | script: 6 | - pip install awscli 7 | - > 8 | CREDENTIALS=($(aws sts assume-role-with-web-identity 9 | --region=us-east-1 10 | --role-arn=arn:aws:iam::342840881361:role/SandboxUser 11 | --role-session-name=GitLab 12 | --duration-seconds=3600 13 | --web-identity-token="$CI_JOB_JWT_V2" 14 | --query="Credentials.[AccessKeyId,SecretAccessKey,SessionToken]" 15 | --output=text)) 16 | - export AWS_ACCESS_KEY_ID="${CREDENTIALS[0]}" 17 | - export AWS_SECRET_ACCESS_KEY="${CREDENTIALS[1]}" 18 | - export AWS_SESSION_TOKEN="${CREDENTIALS[2]}" 19 | - aws sts get-caller-identity 20 | - > 21 | cml runner launch --single \ 22 | --labels=cml \ 23 | --cloud=aws \ 24 | --cloud-region=us-east \ 25 | --cloud-hdd-size=40 \ 26 | --cloud-type=g5.2xlarge 27 | runner-job: 28 | needs: 29 | - deploy-runner 30 | tags: 31 | - cml 32 | image: iterativeai/cml:0-dvc2-base1 33 | script: 34 | - pip install awscli 35 | - > 36 | CREDENTIALS=($(aws sts assume-role-with-web-identity 37 | --region=us-east-1 38 | --role-arn=arn:aws:iam::342840881361:role/SandboxUser 39 | --role-session-name=GitLab 40 | --duration-seconds=3600 41 | --web-identity-token="$CI_JOB_JWT_V2" 42 | --query="Credentials.[AccessKeyId,SecretAccessKey,SessionToken]" 43 | --output=text)) 44 | - export AWS_ACCESS_KEY_ID="${CREDENTIALS[0]}" 45 | - export AWS_SECRET_ACCESS_KEY="${CREDENTIALS[1]}" 46 | - export AWS_SESSION_TOKEN="${CREDENTIALS[2]}" 47 | - aws sts get-caller-identity 48 | - pip install -r requirements.txt 49 | - cml ci 50 | - dvc exp run --pull --allow-missing $EXP_RUN_ARGS 51 | - dvc remote add --local push_remote s3://dvc-public/remote/get-started-pools 52 | - dvc exp push origin -r push_remote 53 | 
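Outside GitLab CI, the experiment step of the pipeline above corresponds roughly to the following sketch (the `--set-param` override is illustrative only, and the `push_remote` remote is assumed to be configured as in the job above):

```console
$ pip install -r requirements.txt
$ dvc exp run --pull --allow-missing -S train.fine_tune_args.base_lr=0.005
$ dvc exp push origin -r push_remote
```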
-------------------------------------------------------------------------------- /example-get-started-experiments/code/notebooks/TrainSegModel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import shutil\n", 11 | "from functools import partial\n", 12 | "from pathlib import Path\n", 13 | "import warnings\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import torch\n", 17 | "from box import ConfigBox\n", 18 | "from dvclive import Live\n", 19 | "from dvclive.fastai import DVCLiveCallback\n", 20 | "from fastai.data.all import Normalize, get_files\n", 21 | "from fastai.metrics import DiceMulti\n", 22 | "from fastai.vision.all import (Resize, SegmentationDataLoaders,\n", 23 | " imagenet_stats, models, unet_learner)\n", 24 | "from ruamel.yaml import YAML\n", 25 | "from PIL import Image\n", 26 | "\n", 27 | "os.chdir(\"..\")\n", 28 | "warnings.filterwarnings(\"ignore\")" 29 | ] 30 | }, 31 | { 32 | "attachments": {}, 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### Load data and split it into train/test\n", 37 | "\n", 38 | "We have some [data in DVC](https://dvc.org/doc/start/data-management/data-versioning) that we can pull. \n", 39 | "\n", 40 | "This data includes:\n", 41 | "* satellite images\n", 42 | "* masks of the swimming pools in each satellite image\n", 43 | "\n", 44 | "DVC can help connect your data to your repo, but it isn't necessary to have your data in DVC to start tracking experiments with DVC and DVCLive." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "!dvc pull" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "test_regions = [\"REGION_1-\"]\n", 63 | "\n", 64 | "img_fpaths = get_files(Path(\"data\") / \"pool_data\" / \"images\", extensions=\".jpg\")\n", 65 | "\n", 66 | "train_data_dir = Path(\"data\") / \"train_data\"\n", 67 | "train_data_dir.mkdir(exist_ok=True)\n", 68 | "test_data_dir = Path(\"data\") / \"test_data\"\n", 69 | "test_data_dir.mkdir(exist_ok=True)\n", 70 | "for img_path in img_fpaths:\n", 71 | " msk_path = Path(\"data\") / \"pool_data\" / \"masks\" / f\"{img_path.stem}.png\"\n", 72 | " if any(region in str(img_path) for region in test_regions):\n", 73 | " shutil.copy(img_path, test_data_dir)\n", 74 | " shutil.copy(msk_path, test_data_dir)\n", 75 | " else:\n", 76 | " shutil.copy(img_path, train_data_dir)\n", 77 | " shutil.copy(msk_path, train_data_dir)" 78 | ] 79 | }, 80 | { 81 | "attachments": {}, 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Create a data loader\n", 86 | "\n", 87 | "Load and prepare the images and masks by creating a data loader." 
88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "def get_mask_path(x, train_data_dir):\n", 97 | " return Path(train_data_dir) / f\"{Path(x).stem}.png\"" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "bs = 8\n", 107 | "valid_pct = 0.20\n", 108 | "img_size = 256\n", 109 | "\n", 110 | "data_loader = SegmentationDataLoaders.from_label_func(\n", 111 | " path=train_data_dir,\n", 112 | " fnames=get_files(train_data_dir, extensions=\".jpg\"),\n", 113 | " label_func=partial(get_mask_path, train_data_dir=train_data_dir),\n", 114 | " codes=[\"not-pool\", \"pool\"],\n", 115 | " bs=bs,\n", 116 | " valid_pct=valid_pct,\n", 117 | " item_tfms=Resize(img_size),\n", 118 | " batch_tfms=[\n", 119 | " Normalize.from_stats(*imagenet_stats),\n", 120 | " ],\n", 121 | " )" 122 | ] 123 | }, 124 | { 125 | "attachments": {}, 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### Review a sample batch of data\n", 130 | "\n", 131 | "Below are some examples of the images overlaid with their masks." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "data_loader.show_batch(alpha=0.7)" 141 | ] 142 | }, 143 | { 144 | "attachments": {}, 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### Train multiple models with different learning rates using `DVCLiveCallback`\n", 149 | "\n", 150 | "Set up model training, using DVCLive to capture the results of each experiment." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6):\n", 160 | " dice_list = []\n", 161 | " for c in classes:\n", 162 | " y_true = mask_true == c\n", 163 | " y_pred = mask_pred == c\n", 164 | " intersection = 2.0 * np.sum(y_true * y_pred)\n", 165 | " dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps)\n", 166 | " dice_list.append(dice)\n", 167 | " return np.mean(dice_list)\n", 168 | "\n", 169 | "\n", 170 | "def evaluate(learn):\n", 171 | " test_img_fpaths = sorted(get_files(Path(\"data\") / \"test_data\", extensions=\".jpg\"))\n", 172 | " test_dl = learn.dls.test_dl(test_img_fpaths)\n", 173 | " preds, _ = learn.get_preds(dl=test_dl)\n", 174 | " masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8)\n", 175 | " test_mask_fpaths = [\n", 176 | " get_mask_path(fpath, Path(\"data\") / \"test_data\") for fpath in test_img_fpaths\n", 177 | " ]\n", 178 | " masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths]\n", 179 | "\n", 180 | " dice_multi = 0.0\n", 181 | " for ii in range(len(masks_true)):\n", 182 | " mask_pred, mask_true = masks_pred[ii], masks_true[ii]\n", 183 | " mask_pred = np.array(\n", 184 | " Image.fromarray(mask_pred).resize((mask_true.shape[1], mask_true.shape[0])),\n", 185 | " dtype=int\n", 186 | " )\n", 187 | " mask_true = np.array(mask_true, dtype=int)\n", 188 | " dice_multi += dice(mask_true, mask_pred) / len(masks_true)\n", 189 | "\n", 190 | " return dice_multi" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "train_arch = 'shufflenet_v2_x2_0'\n", 200 | "\n", 201 | "for base_lr in [0.001, 0.005, 0.01]:\n", 202 | " # 
initialize dvclive, optionally provide output path, and show report in notebook\n", 203 | " # don't save dvc experiment until post-training metrics below\n", 204 | " with Live(\"results/train\", report=\"notebook\", save_dvc_exp=False) as live:\n", 205 | " # log a parameter\n", 206 | " live.log_param(\"train_arch\", train_arch)\n", 207 | " fine_tune_args = {\n", 208 | " 'epochs': 8,\n", 209 | " 'base_lr': base_lr\n", 210 | " }\n", 211 | " # log a dict of parameters\n", 212 | " live.log_params(fine_tune_args)\n", 213 | "\n", 214 | " learn = unet_learner(data_loader, \n", 215 | " arch=getattr(models, train_arch), \n", 216 | " metrics=DiceMulti)\n", 217 | " # train model and automatically capture metrics with DVCLiveCallback\n", 218 | " learn.fine_tune(\n", 219 | " **fine_tune_args,\n", 220 | " cbs=[DVCLiveCallback(live=live)])\n", 221 | "\n", 222 | " # save model artifact to dvc\n", 223 | " models_dir = Path(\"models\")\n", 224 | " models_dir.mkdir(exist_ok=True)\n", 225 | " learn.export(fname=(models_dir / \"model.pkl\").absolute())\n", 226 | " torch.save(learn.model, (models_dir / \"model.pth\").absolute())\n", 227 | " live.log_artifact(\n", 228 | " str(models_dir / \"model.pkl\"),\n", 229 | " type=\"model\",\n", 230 | " name=\"pool-segmentation\",\n", 231 | " desc=\"This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.\",\n", 232 | " labels=[\"cv\", \"segmentation\", \"satellite-images\", \"unet\"],\n", 233 | " )\n", 234 | "\n", 235 | " # add additional post-training summary metrics.\n", 236 | " with Live(\"results/evaluate\") as live:\n", 237 | " live.summary[\"dice_multi\"] = evaluate(learn)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# Compare experiments\n", 247 | "!dvc exp show --only-changed" 248 | ] 249 | }, 250 | { 251 | "attachments": {}, 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "### Review sample preditions vs ground truth\n", 256 | "\n", 257 | "Below are some example of the predicted masks." 
258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "learn.show_results(max_n=6, alpha=0.7)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Python 3 (ipykernel)", 280 | "language": "python", 281 | "name": "python3" 282 | }, 283 | "language_info": { 284 | "codemirror_mode": { 285 | "name": "ipython", 286 | "version": 3 287 | }, 288 | "file_extension": ".py", 289 | "mimetype": "text/x-python", 290 | "name": "python", 291 | "nbconvert_exporter": "python", 292 | "pygments_lexer": "ipython3", 293 | "version": "3.11.6" 294 | }, 295 | "vscode": { 296 | "interpreter": { 297 | "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" 298 | } 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 4 303 | } 304 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/params.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | random_seed: 42 3 | 4 | data_split: 5 | test_regions: 6 | - REGION_1 7 | 8 | train: 9 | valid_pct: 0.1 10 | arch: shufflenet_v2_x2_0 11 | img_size: 256 12 | batch_size: 8 13 | fine_tune_args: 14 | epochs: 8 15 | base_lr: 0.01 16 | 17 | evaluate: 18 | n_samples_to_save: 10 19 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/requirements.txt: -------------------------------------------------------------------------------- 1 | dvc[s3]>=3.29.0 2 | dvclive>=3.0.1 3 | fastai 4 | python-box 5 | sagemaker 6 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/sagemaker/code/inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: 3 | https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#id4 4 | """ 5 | import io 6 | import os 7 | 8 | import numpy as np 9 | import torch 10 | from PIL import Image 11 | from torchvision.transforms import Compose, Normalize, Resize, ToTensor 12 | 13 | 14 | def model_fn(model_dir, context): 15 | kwargs = { 16 | "f": os.path.join(model_dir, "code/model.pth") 17 | } 18 | if not torch.cuda.is_available(): 19 | kwargs["map_location"] = torch.device("cpu") 20 | model = torch.load(**kwargs) 21 | return model 22 | 23 | 24 | def input_fn(request_body, request_content_type, context): 25 | if request_content_type: 26 | img_pil = Image.open(io.BytesIO(request_body)) 27 | img_transform = Compose([Resize(512), ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 28 | img_tensor = img_transform(img_pil).unsqueeze_(0) 29 | return img_tensor 30 | else: 31 | raise ValueError(f"Unsupported request_content_type {request_content_type}") 32 | 33 | 34 | def predict_fn(input_object, model, context): 35 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 36 | model.to(device) 37 | with torch.no_grad(): 38 | result = model(input_object) 39 | return result 40 | 41 | 42 | def output_fn(prediction_output, content_type): 43 | output = np.array( 44 | prediction_output[:, 1, :] > 0.5, dtype=np.uint8 45 | ) 46 | if torch.cuda.is_available(): 47 | output = output.cpu() 48 | buffer = io.BytesIO() 49 | np.save(buffer, 
output) 50 | return buffer.getvalue() 51 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/sagemaker/code/requirements.txt: -------------------------------------------------------------------------------- 1 | fastai 2 | pillow 3 | torch 4 | torchvision -------------------------------------------------------------------------------- /example-get-started-experiments/code/sagemaker/deploy_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | import sys 4 | 5 | import boto3 6 | import botocore 7 | 8 | from sagemaker.deserializers import JSONDeserializer 9 | from sagemaker.pytorch import PyTorchModel 10 | from sagemaker.serverless import ServerlessInferenceConfig 11 | 12 | 13 | memory_size = { 14 | "dev": 4096 , 15 | "staging": 4096, 16 | "prod": 6144 , 17 | "default": 4096, 18 | } 19 | max_concurrency = { 20 | "dev": 5, 21 | "staging": 5, 22 | "prod": 10, 23 | "default": 5, 24 | } 25 | 26 | 27 | def deploy( 28 | name: str, 29 | stage: str, 30 | version: str, 31 | model_data: str, 32 | role: str, 33 | ): 34 | sagemaker_logger = logging.getLogger("sagemaker") 35 | sagemaker_logger.setLevel(logging.DEBUG) 36 | sagemaker_logger.addHandler(logging.StreamHandler(sys.stdout)) 37 | 38 | version_name = re.sub( 39 | r"[^a-zA-Z0-9\-]", "-", f"{name}-{version}") 40 | 41 | model = PyTorchModel( 42 | name=version_name, 43 | model_data=model_data, 44 | framework_version="1.12", 45 | py_version="py38", 46 | role=role, 47 | env={ 48 | "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600", 49 | "TS_MAX_RESPONSE_SIZE": "2000000000", 50 | "TS_MAX_REQUEST_SIZE": "2000000000", 51 | "MMS_MAX_RESPONSE_SIZE": "2000000000", 52 | "MMS_MAX_REQUEST_SIZE": "2000000000", 53 | }, 54 | ) 55 | 56 | stage_name = re.sub( 57 | r"[^a-zA-Z0-9\-]", "-", f"{name}-{stage}") 58 | try: 59 | boto3.client("sagemaker").delete_endpoint(EndpointName=stage_name) 60 | except botocore.exceptions.ClientError as e: 61 | sagemaker_logger.warn(e) 62 | try: 63 | boto3.client("sagemaker").delete_endpoint_config(EndpointConfigName=stage_name) 64 | except botocore.exceptions.ClientError as e: 65 | sagemaker_logger.warn(e) 66 | 67 | return model.deploy( 68 | initial_instance_count=1, 69 | deserializer=JSONDeserializer(), 70 | endpoint_name=stage_name, 71 | serverless_inference_config=ServerlessInferenceConfig( 72 | memory_size_in_mb=memory_size[stage], 73 | max_concurrency=max_concurrency[stage] 74 | ) 75 | ) 76 | 77 | 78 | if __name__ == "__main__": 79 | import argparse 80 | 81 | parser = argparse.ArgumentParser(description="Deploy a model to Amazon SageMaker") 82 | 83 | parser.add_argument("--name", type=str, required=True, help="Name of the model") 84 | parser.add_argument("--stage", type=str, required=True, help="Stage of the model") 85 | parser.add_argument("--version", type=str, required=True, help="Version of the model") 86 | parser.add_argument("--model_data", type=str, required=True, help="S3 location of the model data") 87 | parser.add_argument("--role", type=str, required=True, help="ARN of the IAM role to use") 88 | 89 | args = parser.parse_args() 90 | 91 | deploy(name=args.name, stage=args.stage, version=args.version, model_data=args.model_data, role=args.role) 92 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/data_split.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from 
pathlib import Path 3 | 4 | import numpy as np 5 | from box import ConfigBox 6 | from fastai.vision.all import get_files 7 | from ruamel.yaml import YAML 8 | 9 | 10 | yaml = YAML(typ="safe") 11 | 12 | 13 | def data_split(): 14 | params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) 15 | np.random.seed(params.base.random_seed) 16 | img_fpaths = get_files(Path("data") / "pool_data" / "images", extensions=".jpg") 17 | 18 | train_data_dir = Path("data") / "train_data" 19 | train_data_dir.mkdir(exist_ok=True) 20 | test_data_dir = Path("data") / "test_data" 21 | test_data_dir.mkdir(exist_ok=True) 22 | for img_path in img_fpaths: 23 | msk_path = Path("data") / "pool_data" / "masks" / f"{img_path.stem}.png" 24 | if any(region in str(img_path) for region in params.data_split.test_regions): 25 | shutil.copy(img_path, test_data_dir) 26 | shutil.copy(msk_path, test_data_dir) 27 | else: 28 | shutil.copy(img_path, train_data_dir) 29 | shutil.copy(msk_path, train_data_dir) 30 | 31 | 32 | if __name__ == "__main__": 33 | data_split() 34 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/endpoint_prediction.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from pathlib import Path 3 | 4 | import dvc.api 5 | import numpy as np 6 | from PIL import Image 7 | from sagemaker.deserializers import NumpyDeserializer 8 | from sagemaker.pytorch import PyTorchPredictor 9 | from sagemaker.serializers import IdentitySerializer 10 | 11 | 12 | def paint_mask(mask, color_map={0: (0, 0, 0), 1: (0, 0, 255)}): 13 | vis_shape = mask.shape + (3,) 14 | vis = np.zeros(vis_shape) 15 | for i, c in color_map.items(): 16 | vis[mask == i] = color_map[i] 17 | return Image.fromarray(vis.astype(np.uint8)) 18 | 19 | 20 | def endpoint_prediction( 21 | img_path: str, 22 | endpoint_name: str, 23 | output_path: str = "predictions", 24 | ): 25 | params = dvc.api.params_show() 26 | img_size = params["train"]["img_size"] 27 | predictor = PyTorchPredictor(endpoint_name, serializer=IdentitySerializer(), deserializer=NumpyDeserializer()) 28 | name = endpoint_name 29 | 30 | output_file = Path(output_path) / name / Path(img_path).name 31 | output_file.parent.mkdir(exist_ok=True, parents=True) 32 | 33 | io = BytesIO() 34 | Image.open(img_path).resize((img_size, img_size)).save(io, format="PNG") 35 | result = predictor.predict(io.getvalue())[0] 36 | 37 | img_pil = Image.open(img_path) 38 | overlay_img_pil = Image.blend( 39 | img_pil.convert("RGBA"), 40 | paint_mask(result).convert("RGBA").resize(img_pil.size), 41 | 0.5 42 | ) 43 | overlay_img_pil.save(str(output_file.with_suffix(".png"))) 44 | 45 | 46 | if __name__ == "__main__": 47 | import argparse 48 | 49 | parser = argparse.ArgumentParser(description='Run inference on an image using a SageMaker endpoint') 50 | parser.add_argument('--img_path', type=str, help='path to the input image') 51 | parser.add_argument('--endpoint_name', type=str, help='name of the SageMaker endpoint to use') 52 | parser.add_argument('--output_path', type=str, default='predictions', help='path to save the output predictions') 53 | 54 | args = parser.parse_args() 55 | 56 | endpoint_prediction(args.img_path, args.endpoint_name, args.output_path) 57 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/evaluate.py: -------------------------------------------------------------------------------- 1 | from 
pathlib import Path 2 | 3 | import numpy as np 4 | from box import ConfigBox 5 | from dvclive import Live 6 | from fastai.vision.all import get_files, load_learner 7 | from PIL import Image 8 | from ruamel.yaml import YAML 9 | 10 | 11 | yaml = YAML(typ="safe") 12 | 13 | 14 | def dice(mask_pred, mask_true, classes=[0, 1], eps=1e-6): 15 | dice_list = [] 16 | for c in classes: 17 | y_true = mask_true == c 18 | y_pred = mask_pred == c 19 | intersection = 2.0 * np.sum(y_true * y_pred) 20 | dice = intersection / (np.sum(y_true) + np.sum(y_pred) + eps) 21 | dice_list.append(dice) 22 | return np.mean(dice_list) 23 | 24 | 25 | def paint_mask(mask, color_map={0: (0, 0, 0), 1: (0, 0, 255)}): 26 | vis_shape = mask.shape + (3,) 27 | vis = np.zeros(vis_shape) 28 | for i, c in color_map.items(): 29 | vis[mask == i] = color_map[i] 30 | return Image.fromarray(vis.astype(np.uint8)) 31 | 32 | 33 | def stack_images(im1, im2): 34 | dst = Image.new("RGB", (im1.width + im2.width, im1.height)) 35 | dst.paste(im1, (0, 0)) 36 | dst.paste(im2, (im1.width, 0)) 37 | return dst 38 | 39 | 40 | def get_overlay_image(img_fpath, mask_true, mask_pred): 41 | img_pil = Image.open(img_fpath) 42 | overlay_img_true = Image.blend( 43 | img_pil.convert("RGBA"), paint_mask(mask_true).convert("RGBA"), 0.5 44 | ) 45 | 46 | new_color_map = { 47 | 0: (0, 0, 0), # no color - TN 48 | 1: (255, 0, 255), # purple - FN 49 | 2: (255, 255, 0), # yellow - FP 50 | 3: (0, 0, 255), # blue - TP 51 | } 52 | combined_mask = mask_true + 2 * mask_pred 53 | 54 | overlay_img_pred = Image.blend( 55 | img_pil.convert("RGBA"), 56 | paint_mask(combined_mask, color_map=new_color_map).convert("RGBA"), 57 | 0.5, 58 | ) 59 | stacked_image = stack_images(overlay_img_true, overlay_img_pred) 60 | return stacked_image 61 | 62 | 63 | def get_mask_path(x, train_data_dir): 64 | return Path(train_data_dir) / f"{Path(x).stem}.png" 65 | 66 | 67 | def evaluate(): 68 | params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) 69 | model_fpath = Path("models") / "model.pkl" 70 | learn = load_learner(model_fpath, cpu=False) 71 | test_img_fpaths = sorted(get_files(Path("data") / "test_data", extensions=".jpg")) 72 | test_dl = learn.dls.test_dl(test_img_fpaths) 73 | preds, _ = learn.get_preds(dl=test_dl) 74 | masks_pred = np.array(preds[:, 1, :] > 0.5, dtype=np.uint8) 75 | test_mask_fpaths = [ 76 | get_mask_path(fpath, Path("data") / "test_data") for fpath in test_img_fpaths 77 | ] 78 | masks_true = [Image.open(mask_path) for mask_path in test_mask_fpaths] 79 | with Live("results/evaluate") as live: 80 | dice_multi = 0.0 81 | for ii in range(len(masks_true)): 82 | mask_pred, mask_true = masks_pred[ii], masks_true[ii] 83 | mask_pred = np.array( 84 | Image.fromarray(mask_pred).resize((mask_true.shape[1], mask_true.shape[0])), 85 | dtype=int 86 | ) 87 | mask_true = np.array(mask_true, dtype=int) 88 | dice_multi += dice(mask_true, mask_pred) / len(masks_true) 89 | 90 | if ii < params.evaluate.n_samples_to_save: 91 | stacked_image = get_overlay_image( 92 | test_img_fpaths[ii], mask_true, mask_pred 93 | ) 94 | stacked_image = stacked_image.resize((512, 256)) 95 | live.log_image(f"{Path(test_img_fpaths[ii]).stem}.png", stacked_image) 96 | 97 | live.summary["dice_multi"] = dice_multi 98 | 99 | 100 | if __name__ == "__main__": 101 | evaluate() 102 | -------------------------------------------------------------------------------- /example-get-started-experiments/code/src/train.py: -------------------------------------------------------------------------------- 1 | import 
random 2 | from functools import partial 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import torch 7 | from box import ConfigBox 8 | from dvclive import Live 9 | from dvclive.fastai import DVCLiveCallback 10 | from fastai.data.all import Normalize, get_files 11 | from fastai.metrics import DiceMulti 12 | from fastai.vision.all import ( 13 | Resize, 14 | SegmentationDataLoaders, 15 | imagenet_stats, 16 | models, 17 | unet_learner, 18 | ) 19 | from ruamel.yaml import YAML 20 | 21 | yaml = YAML(typ="safe") 22 | 23 | 24 | def get_mask_path(x, train_data_dir): 25 | return Path(train_data_dir) / f"{Path(x).stem}.png" 26 | 27 | 28 | def train(): 29 | params = ConfigBox(yaml.load(open("params.yaml", encoding="utf-8"))) 30 | 31 | np.random.seed(params.base.random_seed) 32 | torch.manual_seed(params.base.random_seed) 33 | random.seed(params.base.random_seed) 34 | train_data_dir = Path("data") / "train_data" 35 | 36 | data_loader = SegmentationDataLoaders.from_label_func( 37 | path=train_data_dir, 38 | fnames=get_files(train_data_dir, extensions=".jpg"), 39 | label_func=partial(get_mask_path, train_data_dir=train_data_dir), 40 | codes=["not-pool", "pool"], 41 | bs=params.train.batch_size, 42 | valid_pct=params.train.valid_pct, 43 | item_tfms=Resize(params.train.img_size), 44 | batch_tfms=[ 45 | Normalize.from_stats(*imagenet_stats), 46 | ], 47 | ) 48 | 49 | model_names = [ 50 | name 51 | for name in dir(models) 52 | if not name.startswith("_") 53 | and name.islower() 54 | and name not in ("all", "tvm", "unet", "xresnet") 55 | ] 56 | if params.train.arch not in model_names: 57 | raise ValueError(f"Unsupported model, must be one of:\n{model_names}") 58 | 59 | with Live("results/train") as live: 60 | learn = unet_learner( 61 | data_loader, arch=getattr(models, params.train.arch), metrics=DiceMulti 62 | ) 63 | 64 | learn.fine_tune( 65 | **params.train.fine_tune_args, 66 | cbs=[DVCLiveCallback(live=live)], 67 | ) 68 | models_dir = Path("models") 69 | models_dir.mkdir(exist_ok=True) 70 | learn.export(fname=(models_dir / "model.pkl").absolute()) 71 | torch.save(learn.model, (models_dir / "model.pth").absolute()) 72 | live.log_artifact( 73 | str(models_dir / "model.pkl"), 74 | type="model", 75 | name="pool-segmentation", 76 | desc="This is a Computer Vision (CV) model that's segmenting out swimming pools from satellite images.", 77 | labels=["cv", "segmentation", "satellite-images", params.train.arch], 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | train() 83 | -------------------------------------------------------------------------------- /example-get-started-experiments/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script env: 4 | # e Exit immediately if a command exits with a non-zero exit status. 5 | # u Treat unset variables as an error when substituting. 6 | # x Print commands and their arguments as they are executed. 7 | set -eux 8 | HERE="$( cd "$(dirname "$0")" ; pwd -P )" 9 | REPO_NAME="example-get-started-experiments" 10 | REPO_PATH="$HERE/build/$REPO_NAME" 11 | PROD=${1:-false} 12 | 13 | if [ -d "$REPO_PATH" ]; then 14 | echo "Repo $REPO_PATH already exists, please remove it first." 
15 | exit 1 16 | fi 17 | 18 | TOTAL_TAGS=8 19 | STEP_TIME=100000 20 | BEGIN_TIME=$(( $(date +%s) - ( ${TOTAL_TAGS} * ${STEP_TIME}) )) 21 | export TAG_TIME=${BEGIN_TIME} 22 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 23 | tick(){ 24 | export TAG_TIME=$(( ${TAG_TIME} + ${STEP_TIME} )) 25 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 26 | } 27 | 28 | export GIT_AUTHOR_NAME="Alex Kim" 29 | export GIT_AUTHOR_EMAIL="alex000kim@gmail.com" 30 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 31 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 32 | 33 | mkdir -p $REPO_PATH 34 | pushd $REPO_PATH 35 | 36 | virtualenv -p python3 .venv 37 | export VIRTUAL_ENV_DISABLE_PROMPT=true 38 | source .venv/bin/activate 39 | echo '.venv/' > .gitignore 40 | 41 | # Installing from main since we'd like to update repo before 42 | # the release 43 | pip install "git+https://github.com/iterative/dvc#egg=dvc[s3]" gto 44 | 45 | git init 46 | cp $HERE/code/README.md . 47 | cp $HERE/code/.devcontainer.json . 48 | cp $HERE/code/.gitattributes . 49 | cp $HERE/code/.gitlab-ci.yml . 50 | cp $HERE/code/requirements.txt . 51 | cp -r $HERE/code/.github . 52 | git add . 53 | tick 54 | git commit -m "Initialize Git repository" 55 | git branch -M main 56 | 57 | 58 | dvc init 59 | # Remote active on this env only, for writing to HTTP redirect below. 60 | dvc remote add -d --local storage s3://dvc-public/remote/get-started-pools 61 | # Actual remote for generated project (read-only). Redirect of S3 bucket above. 62 | dvc remote add -d storage https://remote.dvc.org/get-started-pools 63 | git add . 64 | tick 65 | git commit -m "Initialize DVC project" 66 | 67 | 68 | cp -r $HERE/code/data . 69 | git add data/.gitignore data/pool_data.dvc 70 | tick 71 | git commit -m "Add data" 72 | dvc pull 73 | 74 | 75 | cp -r $HERE/code/notebooks . 76 | git add . 77 | git commit -m "Add notebook using DVCLive" 78 | 79 | pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu118 80 | pip install jupyter 81 | jupyter nbconvert --execute 'notebooks/TrainSegModel.ipynb' --inplace 82 | # Apply best experiment 83 | BEST_EXP_ROW=$(dvc exp show --drop '.*' --keep 'Experiment|results/evaluate/metrics.json:dice_multi|base_lr' --csv --sort-by 'results/evaluate/metrics.json:dice_multi' | tail -n 1) 84 | BEST_EXP_NAME=$(echo $BEST_EXP_ROW | cut -d, -f 1) 85 | BEST_EXP_BASE_LR=$(echo $BEST_EXP_ROW | cut -d, -f 3) 86 | dvc exp apply $BEST_EXP_NAME 87 | git add . 88 | tick 89 | git commit -m "Run notebook and apply best experiment" 90 | git tag -a "1-notebook-dvclive" -m "Experiment using Notebook" 91 | 92 | 93 | cp -r $HERE/code/src . 94 | cp -r $HERE/code/sagemaker . 95 | cp $HERE/code/params.yaml . 
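# Carry the best base_lr found by the notebook experiments above into the freshly
# copied params.yaml, so the dvc.yaml pipeline built below starts from the applied
# experiment's value.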
96 | sed -e "s/base_lr: 0.01/base_lr: $BEST_EXP_BASE_LR/" -i".bkp" params.yaml 97 | rm params.yaml.bkp 98 | 99 | git rm -r --cached 'results' 'models' 100 | git commit -m "stop tracking results" 101 | 102 | dvc stage add -n data_split \ 103 | -p base,data_split \ 104 | -d src/data_split.py -d data/pool_data \ 105 | -o data/train_data -o data/test_data \ 106 | python src/data_split.py 107 | 108 | dvc remove models/model.pkl.dvc 109 | dvc stage add -n train \ 110 | -p base,train \ 111 | -d src/train.py -d data/train_data \ 112 | -o models/model.pkl -o models/model.pth \ 113 | -o results/train python src/train.py 114 | 115 | dvc stage add -n evaluate \ 116 | -p base,evaluate \ 117 | -d src/evaluate.py -d models/model.pkl -d data/test_data \ 118 | -o results/evaluate python src/evaluate.py 119 | 120 | dvc stage add -n sagemaker \ 121 | -d models/model.pth -o model.tar.gz \ 122 | 'cp models/model.pth sagemaker/code/model.pth && cd sagemaker && tar -cpzf model.tar.gz code/ && cd .. && mv sagemaker/model.tar.gz . && rm sagemaker/code/model.pth' 123 | 124 | git add . 125 | tick 126 | git commit -m "Convert Notebook to dvc.yaml pipeline" 127 | 128 | dvc exp run 129 | git add . 130 | tick 131 | git commit -m "Run dvc.yaml pipeline" 132 | git tag -a "2-dvc-pipeline" -m "Experiment run using dvc pipeline" 133 | tick 134 | gto register pool-segmentation --version v1.0.0 135 | gto assign pool-segmentation --version v1.0.0 --stage dev 136 | tick 137 | gto assign pool-segmentation --version v1.0.0 --stage prod 138 | gto deprecate pool-segmentation v1.0.0 dev 139 | 140 | 141 | export GIT_AUTHOR_NAME="David de la Iglesia" 142 | export GIT_AUTHOR_EMAIL="daviddelaiglesiacastro@gmail.com" 143 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 144 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 145 | 146 | dvc exp run --queue --set-param 'train.arch=alexnet,resnet34,squeezenet1_1' --message 'Tune train.arch' 147 | dvc exp run --run-all 148 | 149 | dvc push -A 150 | 151 | popd 152 | 153 | unset TAG_TIME 154 | unset GIT_AUTHOR_DATE 155 | unset GIT_COMMITTER_DATE 156 | unset GIT_AUTHOR_NAME 157 | unset GIT_AUTHOR_EMAIL 158 | unset GIT_COMMITTER_NAME 159 | unset GIT_COMMITTER_EMAIL 160 | 161 | cat README.md 162 | -------------------------------------------------------------------------------- /example-get-started/.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | *.zip 3 | /tmp 4 | build/ 5 | -------------------------------------------------------------------------------- /example-get-started/README.md: -------------------------------------------------------------------------------- 1 | A set of scripts to generate an NLP DVC Studio project with multiple branches, 2 | commit history, experiments, metrics, plots, etc. It used in the DVC docs and in 3 | Studio as a demo project. 4 | 5 | This script can be also used in an advanced scenario to generate a nested 6 | mono-repositories that are used as fixtures in Studio testing, or testing 7 | different remote types. See the section below for the advanced settings. 8 | 9 | ## Demo project 10 | 11 | Note! In some cases, before rebuilding the project you might want to delete the 12 | existing remote tags if you change the order, or names. 13 | 14 | ```shell 15 | git clone git@github.com:/example-get-started.git 16 | cd example-get-started 17 | git tag -l | xargs -n 1 git push --delete origin 18 | ``` 19 | 20 | For the basic use case (docs and Studio demo), use the command below. 
21 | 22 | ```shell 23 | ./generate.sh 24 | ``` 25 | 26 | If change source code, to publish it on S3 (needed for the get started tutorial) 27 | pass `prod` to the command. It's needed when you ready to publish it. 28 | 29 | ```shell 30 | ./generate.sh prod 31 | ``` 32 | 33 | The repo generated in `build/example-get-started` is intended to be published on 34 | to the https://github.com/iterative/example-get-started. Make sure the Github 35 | repo exists first and that you have appropriate write permissions. 36 | 37 | To create it with https://cli.github.com/, run: 38 | 39 | ```shell 40 | gh repo create iterative/example-get-started --public \ 41 | -d "Get Started DVC project" -h "https://dvc.org/doc/get-started" 42 | ``` 43 | 44 | Run these commands to force push it: 45 | 46 | ```shell 47 | cd build/example-get-started 48 | git remote add origin git@github.com:/example-get-started.git 49 | # close open PRs 50 | gh pr close try-large-dataset 51 | gh pr close tune-hyperparams 52 | # remove existing tags, branches, experiments 53 | git ls-remote origin | awk '{print $2}' | xargs -n 1 git push --delete origin || true 54 | # force push branches 55 | git push --force origin main 56 | git push --force origin try-large-dataset 57 | git push --force origin tune-hyperparams 58 | # we push git tags one by one for Studio to receive webhooks: 59 | git tag --sort=creatordate | xargs -n 1 git push --force origin 60 | ``` 61 | 62 | Run these to drop and then rewrite the experiment references on the repo: 63 | 64 | ```shell 65 | source .venv/bin/activate 66 | dvc exp remove -A -g origin 67 | dvc exp push origin -A 68 | ``` 69 | 70 | To create a PR from the `try-large-dataset` branch: 71 | 72 | ```shell 73 | gh pr create -t "Try 40K dataset (4x data)" \ 74 | -b "We are trying here a large dataset, since the smaller one looks unstable" \ 75 | -B main -H try-large-dataset 76 | ``` 77 | 78 | To create a PR from the `tune-hyperparams` branch: 79 | 80 | ```shell 81 | gh pr create -t "Run experiments tuning random forest params" \ 82 | -b "Better RF split and number of estimators based on small grid search." \ 83 | -B main -H tune-hyperparams 84 | ``` 85 | 86 | Finally, return to the directory where you started: 87 | 88 | ```shell 89 | cd ../.. 90 | ``` 91 | 92 | You may remove the generated repo with: 93 | 94 | ```shell 95 | rm -fR build/example-get-started 96 | ``` 97 | 98 | To update the project in Studio, follow the instructions at: 99 | 100 | https://github.com/iterative/studio/wiki/Updating-and-synchronizing-demo-project 101 | 102 | 103 | ## Advanced usage 104 | 105 | Inside the script there a few options that could help generating advanced nested 106 | repositories and/or use different remote types. 107 | 108 | - `OPT_TESTING_REPO='false'` - (default `false`). Set to true to generate a 109 | fixture repo or a testing repo. It generates a `README` in those repos that 110 | has the dump of all the settings that were used to generate them. This way it 111 | can be reproduced next time. 112 | - `OPT_SUBDIR=''` - (default `''`). No leading or trailing slashes. If specified 113 | the new repo will be created inside the 114 | `build/example-get-started/$OPT_SUBDIR` path. 115 | - `OPT_INIT_GIT='true'` - (default `true`). Whether to run or not `git init`. If 116 | there is already initialized Git repo in place we don't need to run it again. 117 | Usually needed if you are generating a nested repo. 118 | - `OPT_INIT_DVC='true'` - (default `true`). Whether to run or not 119 | `dvc init` in the generated directory. 
If it's nested directory `--subdir` is 120 | added. 121 | - `OPT_NON_DVC='false'` - (default `false`). To generate a non DVC repo with 122 | some sources, basic params, and metrics. To test non DVC root, or custom 123 | metrics, etc. 124 | - `OPT_BRANCHES='true'` - (default `true`). Whether we need to generate 125 | branches (bigger dataset, etc). It supports nested repos - branch names will 126 | have prefixes or suffixes to distinguish them. 127 | - `OPT_REMOTE="public-s3"` - (default `private-s3`). Other options: `public-s3`, 128 | `private-http`, `private-ssh`, `private-gdrive`, etc. 129 | - `OPT_DVC_TRACKED_METRICS='true'` - (default `true`). Either we should use 130 | DVC to also track all metric and plot files (e.g. to test that Studio can get 131 | plots from the remote storage). 132 | - `OPT_REGISTER_MODELS='false'` - (default `true`). Use the `gto` to register 133 | models. It supports nested repos. 134 | - `OPT_TAGS='true'` - (default `true`). Generate Git tags for commits. 135 | Independent of `OPT_REGISTER_MODELS` and `OPT_TAG_MODELS`. 136 | - `OPT_SQUASH_COMMITS='false'` - (default `false`). Squash commits into one 137 | after generating a repo or a sub repo. It speedups parsing in tests. Be 138 | careful with Git tags (disable them for example). 139 | - `OPT_TAG_MODELS='true'` - (default `true`). Creates Git tags using GTO. 140 | Independent of `OPT_REGISTER_MODELS` and `OPT_TAGS`. 141 | - `OPT_MODEL_NAME='text-classification'` - (default `text-classification`). 142 | Model name to register. 143 | 144 | ## Remotes 145 | 146 | A variety of remotes could be used to generated different repositories to test 147 | private storage credentials in Studio (manually or via CI). 148 | 149 | `OPT_REMOTE` takes different values (see above or in the `generate.sh`). 150 | 151 | For SSH and HTTP remotes we use a machine that is deployed in GCP with IP 152 | address http://35.194.53.251. Credentials for both could be found in this Slack 153 | [thread](https://iterativeai.slack.com/archives/CUSNDR35K/p1595393188054200). 154 | You might need to change a path to SSH key in the script. HTTP remote doesn't 155 | support PUT/POST so we use SSH to upload data there. 
156 | -------------------------------------------------------------------------------- /example-get-started/code/.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "example-get-started", 3 | "image": "mcr.microsoft.com/devcontainers/python:3.10", 4 | "extensions": ["Iterative.dvc", "ms-python.python", "redhat.vscode-yaml"], 5 | "features": { 6 | "ghcr.io/iterative/features/dvc:1": {} 7 | }, 8 | "postCreateCommand": "pip3 install --user -r src/requirements.txt" 9 | } 10 | -------------------------------------------------------------------------------- /example-get-started/code/.gitattributes: -------------------------------------------------------------------------------- 1 | *.dvc linguist-language=YAML 2 | dvc.lock linguist-language=YAML 3 | -------------------------------------------------------------------------------- /example-get-started/code/.github/workflows/cml.yaml: -------------------------------------------------------------------------------- 1 | name: CML Report 2 | on: pull_request 3 | jobs: 4 | run: 5 | runs-on: [ubuntu-latest] 6 | steps: 7 | - uses: iterative/setup-cml@v2 8 | - uses: iterative/setup-dvc@v1 9 | - uses: actions/checkout@v3 10 | with: 11 | fetch-depth: 2 12 | # Needed for https://github.com/iterative/example-repos-dev/issues/225 13 | - name: Installs JSON5 14 | run: npm install -g json5 15 | - name: Generate metrics report 16 | env: 17 | REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | run: | 19 | cml ci 20 | if [ $GITHUB_REF = refs/heads/main ]; then 21 | PREVIOUS_REF=HEAD~1 22 | else 23 | PREVIOUS_REF=main 24 | git fetch origin main:main 25 | fi 26 | 27 | dvc pull eval 28 | dvc plots diff $PREVIOUS_REF workspace \ 29 | --show-vega --targets ROC | json5 > vega.json 30 | vl2svg vega.json roc.svg 31 | 32 | dvc plots diff $PREVIOUS_REF workspace \ 33 | --show-vega --targets Precision-Recall | json5 > vega.json 34 | vl2svg vega.json prc.svg 35 | 36 | dvc plots diff $PREVIOUS_REF workspace \ 37 | --show-vega --targets Confusion-Matrix | json5 > vega.json 38 | vl2svg vega.json confusion.svg 39 | 40 | cp eval/plots/images/importance.png importance_workspace.png 41 | 42 | git checkout $PREVIOUS_REF -- dvc.lock 43 | cp eval/plots/images/importance.png importance_previous.png 44 | 45 | dvc_report=$(dvc exp diff $PREVIOUS_REF --md) 46 | 47 | cat < report.md 48 | # CML Report 49 | ## Plots 50 | ![ROC](./roc.svg) 51 | ![Precision-Recall](./prc.svg) 52 | ![Confusion Matrix](./confusion.svg) 53 | #### Feature Importance: ${PREVIOUS_REF} 54 | ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png) 55 | #### Feature Importance: workspace 56 | ![Feature Importance: workspace](./importance_workspace.png) 57 | 58 | ## Metrics and Params 59 | ### ${PREVIOUS_REF} → workspace 60 | ${dvc_report} 61 | EOF 62 | 63 | cml comment create --publish --pr=false report.md 64 | -------------------------------------------------------------------------------- /example-get-started/code/.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | report: 2 | rules: 3 | - if: $CI_PIPELINE_SOURCE == 'merge_request_event' 4 | - if: $CI_COMMIT_BRANCH == 'main' 5 | image: dvcorg/cml:0-dvc3-base1 6 | before_script: 7 | - cml ci && cml --version 8 | - npm install -g json5 9 | script: | 10 | if [ $CI_COMMIT_REF_NAME = main ]; then 11 | PREVIOUS_REF=HEAD~1 12 | COMMIT_HASH1=$CI_COMMIT_BEFORE_SHA 13 | COMMIT_HASH2=$CI_COMMIT_SHA 14 | else 15 | PREVIOUS_REF=main 16 | git fetch --depth=1 origin 
main:main 17 | COMMIT_HASH1=$CI_MERGE_REQUEST_DIFF_BASE_SHA 18 | COMMIT_HASH2=$CI_COMMIT_SHA 19 | fi 20 | 21 | dvc pull eval 22 | dvc plots diff $PREVIOUS_REF workspace \ 23 | --show-vega --targets ROC | json5 > vega.json 24 | vl2svg vega.json roc.svg 25 | 26 | dvc plots diff $PREVIOUS_REF workspace \ 27 | --show-vega --targets Precision-Recall | json5 > vega.json 28 | vl2svg vega.json prc.svg 29 | 30 | dvc plots diff $PREVIOUS_REF workspace \ 31 | --show-vega --targets Confusion-Matrix | json5 > vega.json 32 | vl2svg vega.json confusion.svg 33 | 34 | cp eval/plots/images/importance.png importance_workspace.png 35 | 36 | git checkout $PREVIOUS_REF -- dvc.lock 37 | cp eval/plots/images/importance.png importance_previous.png 38 | 39 | dvc_report=$(dvc exp diff $PREVIOUS_REF --md) 40 | 41 | cat < report.md 42 | # CML Report 43 | [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/views/example-get-started-2gpv7kdqx2?panels=plots%2C%3Bcompare%2C&commits=${COMMIT_HASH2}%3B${COMMIT_HASH1}&activeCommits=${COMMIT_HASH1}%3Aprimary%3B${COMMIT_HASH2}%3Apurple) 44 | ## Plots 45 | ![ROC](./roc.svg) 46 | ![Precision-Recall](./prc.svg) 47 | ![Confusion Matrix](./confusion.svg) 48 | #### Feature Importance: ${PREVIOUS_REF} 49 | ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png) 50 | #### Feature Importance: workspace 51 | ![Feature Importance: workspace](./importance_workspace.png) 52 | 53 | ## Metrics and Params 54 | ### ${PREVIOUS_REF} → workspace 55 | ${dvc_report} 56 | EOF 57 | 58 | if [ $CI_COMMIT_REF_NAME = main ]; then 59 | cml comment create --target=commit report.md 60 | else 61 | cml comment update --target=pr report.md 62 | fi 63 | -------------------------------------------------------------------------------- /example-get-started/code/README.md: -------------------------------------------------------------------------------- 1 | [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/views/example-get-started-zde16i6c4g) 2 | 3 | # DVC Get Started 4 | 5 | This is an auto-generated repository for use in [DVC](https://dvc.org) 6 | [Get Started](https://dvc.org/doc/get-started). It is a step-by-step quick 7 | introduction into basic DVC concepts. 8 | 9 | ![](https://static.iterative.ai/img/example-get-started/readme-head.png) 10 | 11 | The project is a natural language processing (NLP) binary classifier problem of 12 | predicting tags for a given StackOverflow question. For example, we want one 13 | classifier which can predict a post that is about the R language by tagging it 14 | `R`. 15 | 16 | 🐛 Please report any issues found in this project here - 17 | [example-repos-dev](https://github.com/iterative/example-repos-dev). 18 | 19 | ## Installation 20 | 21 | Python 3.9+ is required to run code from this repo. 22 | 23 | ```console 24 | $ git clone https://github.com/iterative/example-get-started 25 | $ cd example-get-started 26 | ``` 27 | 28 | Now let's install the requirements. But before we do that, we **strongly** 29 | recommend creating a virtual environment with a tool such as 30 | [virtualenv](https://virtualenv.pypa.io/en/stable/): 31 | 32 | ```console 33 | $ virtualenv -p python3 .venv 34 | $ source .venv/bin/activate 35 | $ pip install -r src/requirements.txt 36 | ``` 37 | 38 | > This instruction assumes that DVC is already installed, as it is frequently 39 | > used as a global tool like Git. 
If DVC is not installed, see the 40 | > [DVC installation guide](https://dvc.org/doc/install) on how to install DVC. 41 | 42 | This DVC project comes with a preconfigured DVC 43 | [remote storage](https://dvc.org/doc/commands-reference/remote) that holds raw 44 | data (input), intermediate, and final results that are produced. This is a 45 | read-only HTTP remote. 46 | 47 | ```console 48 | $ dvc remote list 49 | storage https://remote.dvc.org/get-started 50 | ``` 51 | 52 | You can run [`dvc pull`](https://man.dvc.org/pull) to download the data: 53 | 54 | ```console 55 | $ dvc pull 56 | ``` 57 | 58 | ## Running in your environment 59 | 60 | Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the 61 | [pipeline](https://dvc.org/doc/user-guide/pipelines) and create a new 62 | [experiment](https://dvc.org/doc/user-guide/experiment-management). 63 | 64 | ```console 65 | $ dvc exp run 66 | Ran experiment(s): rapid-cane 67 | Experiment results have been applied to your workspace. 68 | ``` 69 | 70 | If you'd like to test commands like [`dvc push`](https://man.dvc.org/push), 71 | that require write access to the remote storage, the easiest way would be to set 72 | up a "local remote" on your file system: 73 | 74 | > This kind of remote is located in the local file system, but is external to 75 | > the DVC project. 76 | 77 | ```console 78 | $ mkdir -p /tmp/dvc-storage 79 | $ dvc remote add local /tmp/dvc-storage 80 | ``` 81 | 82 | You should now be able to run: 83 | 84 | ```console 85 | $ dvc push -r local 86 | ``` 87 | 88 | ## Existing stages 89 | 90 | This project with the help of the Git tags reflects the sequence of actions that 91 | are run in the DVC [get started](https://dvc.org/doc/get-started) guide. Feel 92 | free to checkout one of them and play with the DVC commands having the 93 | playground ready. 94 | 95 | - `0-git-init`: Empty Git repository initialized. 96 | - `1-dvc-init`: DVC has been initialized. `.dvc/` with the cache directory 97 | created. 98 | - `2-track-data`: Raw data file `data.xml` downloaded and tracked with DVC using 99 | [`dvc add`](https://man.dvc.org/add). First `.dvc` file created. 100 | - `3-config-remote`: Remote HTTP storage initialized. It's a shared read only 101 | storage that contains all data artifacts produced during next steps. 102 | - `4-import-data`: Use `dvc import` to get the same `data.xml` from the DVC data 103 | registry. 104 | - `5-source-code`: Source code downloaded and put into Git. 105 | - `6-prepare-stage`: Create `dvc.yaml` and the first pipeline stage with 106 | [`dvc run`](https://man.dvc.org/run). It transforms XML data into TSV. 107 | - `7-ml-pipeline`: Feature extraction and train stages created. It takes data in 108 | TSV format and produces two `.pkl` files that contain serialized feature 109 | matrices. Train runs random forest classifier and creates the `model.pkl` file. 110 | - `8-evaluation`: Evaluation stage. Runs the model on a test dataset to produce 111 | its performance AUC value. The result is dumped into a DVC metric file so that 112 | we can compare it with other experiments later. 113 | - `9-bigrams-model`: Bigrams experiment, code has been modified to extract more 114 | features. We run [`dvc repro`](https://man.dvc.org/repro) for the first time 115 | to illustrate how DVC can reuse cached files and detect changes along the 116 | computational graph, regenerating the model with the updated data. 117 | - `10-bigrams-experiment`: Reproduce the evaluation stage with the bigrams based 118 | model. 
119 | - `11-random-forest-experiments`: Reproduce experiments to tune the random 120 | forest classifier parameters and select the best experiment. 121 | 122 | There are three additional tags: 123 | 124 | - `baseline-experiment`: First end-to-end result that we have performance metric 125 | for. 126 | - `bigrams-experiment`: Second experiment (model trained using bigrams 127 | features). 128 | - `random-forest-experiments`: Best of additional experiments tuning random 129 | forest parameters. 130 | 131 | These tags can be used to illustrate `-a` or `-T` options across different 132 | [DVC commands](https://man.dvc.org/). 133 | 134 | ## Project structure 135 | 136 | The data files, DVC files, and results change as stages are created one by one. 137 | After cloning and using [`dvc pull`](https://man.dvc.org/pull) to download 138 | data, models, and plots tracked by DVC, the workspace should look like this: 139 | 140 | ```console 141 | $ tree 142 | . 143 | ├── README.md 144 | ├── data # <-- Directory with raw and intermediate data 145 | │   ├── data.xml # <-- Initial XML StackOverflow dataset (raw data) 146 | │   ├── data.xml.dvc # <-- .dvc file - a placeholder/pointer to raw data 147 | │   ├── features # <-- Extracted feature matrices 148 | │   │   ├── test.pkl 149 | │   │   └── train.pkl 150 | │   └── prepared # <-- Processed dataset (split and TSV formatted) 151 | │   ├── test.tsv 152 | │   └── train.tsv 153 | ├── dvc.lock 154 | ├── dvc.yaml # <-- DVC pipeline file 155 | ├── eval 156 | │   ├── metrics.json # <-- Binary classifier final metrics (e.g. AUC) 157 | │   └── plots 158 | │   ├── images 159 | │   │   └── importance.png # <-- Feature importance plot 160 | │   └── sklearn # <-- Data points for ROC, confusion matrix 161 | │   ├── cm 162 | │   │   ├── test.json 163 | │   │   └── train.json 164 | │   ├── prc 165 | │   │   ├── test.json 166 | │   │   └── train.json 167 | │   └── roc 168 | │   ├── test.json 169 | │   └── train.json 170 | ├── model.pkl # <-- Trained model file 171 | ├── params.yaml # <-- Parameters file 172 | └── src # <-- Source code to run the pipeline stages 173 | ├── evaluate.py 174 | ├── featurization.py 175 | ├── prepare.py 176 | ├── requirements.txt # <-- Python dependencies needed in the project 177 | └── train.py 178 | ``` 179 | -------------------------------------------------------------------------------- /example-get-started/code/params.yaml: -------------------------------------------------------------------------------- 1 | prepare: 2 | split: 0.20 3 | seed: 20170428 4 | 5 | featurize: 6 | max_features: 100 7 | ngrams: 1 8 | 9 | train: 10 | seed: 20170428 11 | n_est: 50 12 | min_split: 0.01 13 | 14 | -------------------------------------------------------------------------------- /example-get-started/code/src/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import pickle 5 | import sys 6 | 7 | import pandas as pd 8 | from sklearn import metrics 9 | from sklearn import tree 10 | from dvclive import Live 11 | from matplotlib import pyplot as plt 12 | 13 | 14 | def evaluate(model, matrix, split, live, save_path): 15 | """ 16 | Dump all evaluation metrics and plots for given datasets. 17 | 18 | Args: 19 | model (sklearn.ensemble.RandomForestClassifier): Trained classifier. 20 | matrix (scipy.sparse.csr_matrix): Input matrix. 21 | split (str): Dataset name. 22 | live (dvclive.Live): Dvclive instance. 23 | save_path (str): Path to save the metrics. 
24 | """ 25 | labels = matrix[:, 1].toarray().astype(int) 26 | x = matrix[:, 2:] 27 | 28 | predictions_by_class = model.predict_proba(x) 29 | predictions = predictions_by_class[:, 1] 30 | 31 | # Use dvclive to log a few simple metrics... 32 | avg_prec = metrics.average_precision_score(labels, predictions) 33 | roc_auc = metrics.roc_auc_score(labels, predictions) 34 | if not live.summary: 35 | live.summary = {"avg_prec": {}, "roc_auc": {}} 36 | live.summary["avg_prec"][split] = avg_prec 37 | live.summary["roc_auc"][split] = roc_auc 38 | 39 | # ... and plots... 40 | # ... like an roc plot... 41 | live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}") 42 | # ... and precision recall plot... 43 | # ... which passes `drop_intermediate=True` to the sklearn method... 44 | live.log_sklearn_plot( 45 | "precision_recall", 46 | labels, 47 | predictions, 48 | name=f"prc/{split}", 49 | drop_intermediate=True, 50 | ) 51 | # ... and confusion matrix plot 52 | live.log_sklearn_plot( 53 | "confusion_matrix", 54 | labels.squeeze(), 55 | predictions_by_class.argmax(-1), 56 | name=f"cm/{split}", 57 | ) 58 | 59 | 60 | def save_importance_plot(live, model, feature_names): 61 | """ 62 | Save feature importance plot. 63 | 64 | Args: 65 | live (dvclive.Live): DVCLive instance. 66 | model (sklearn.ensemble.RandomForestClassifier): Trained classifier. 67 | feature_names (list): List of feature names. 68 | """ 69 | fig, axes = plt.subplots(dpi=100) 70 | fig.subplots_adjust(bottom=0.2, top=0.95) 71 | axes.set_ylabel("Mean decrease in impurity") 72 | 73 | importances = model.feature_importances_ 74 | forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30) 75 | forest_importances.plot.bar(ax=axes) 76 | 77 | live.log_image("importance.png", fig) 78 | 79 | 80 | def main(): 81 | EVAL_PATH = "eval" 82 | 83 | if len(sys.argv) != 3: 84 | sys.stderr.write("Arguments error. Usage:\n") 85 | sys.stderr.write("\tpython evaluate.py model features\n") 86 | sys.exit(1) 87 | 88 | model_file = sys.argv[1] 89 | train_file = os.path.join(sys.argv[2], "train.pkl") 90 | test_file = os.path.join(sys.argv[2], "test.pkl") 91 | 92 | # Load model and data. 93 | with open(model_file, "rb") as fd: 94 | model = pickle.load(fd) 95 | 96 | with open(train_file, "rb") as fd: 97 | train, feature_names = pickle.load(fd) 98 | 99 | with open(test_file, "rb") as fd: 100 | test, _ = pickle.load(fd) 101 | 102 | # Evaluate train and test datasets. 103 | with Live(EVAL_PATH) as live: 104 | evaluate(model, train, "train", live, save_path=EVAL_PATH) 105 | evaluate(model, test, "test", live, save_path=EVAL_PATH) 106 | 107 | # Dump feature importance plot. 
108 | save_importance_plot(live, model, feature_names) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /example-get-started/code/src/featurization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.sparse as sparse 8 | import yaml 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 10 | 11 | 12 | def get_df(data): 13 | """Read the input data file and return a data frame.""" 14 | df = pd.read_csv( 15 | data, 16 | encoding="utf-8", 17 | header=None, 18 | delimiter="\t", 19 | names=["id", "label", "text"], 20 | ) 21 | sys.stderr.write(f"The input data frame {data} size is {df.shape}\n") 22 | return df 23 | 24 | 25 | def save_matrix(df, matrix, names, output): 26 | """ 27 | Save the matrix to a pickle file. 28 | 29 | Args: 30 | df (pandas.DataFrame): Input data frame. 31 | matrix (scipy.sparse.csr_matrix): Input matrix. 32 | names (list): List of feature names. 33 | output (str): Output file name. 34 | """ 35 | id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T 36 | label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T 37 | 38 | result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr") 39 | 40 | msg = "The output matrix {} size is {} and data type is {}\n" 41 | sys.stderr.write(msg.format(output, result.shape, result.dtype)) 42 | 43 | with open(output, "wb") as fd: 44 | pickle.dump((result, names), fd) 45 | pass 46 | 47 | 48 | def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf): 49 | """ 50 | Generate train feature matrix. 51 | 52 | Args: 53 | train_input (str): Train input file name. 54 | train_output (str): Train output file name. 55 | bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words. 56 | tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer. 57 | """ 58 | df_train = get_df(train_input) 59 | train_words = np.array(df_train.text.str.lower().values) 60 | 61 | bag_of_words.fit(train_words) 62 | 63 | train_words_binary_matrix = bag_of_words.transform(train_words) 64 | feature_names = bag_of_words.get_feature_names_out() 65 | 66 | tfidf.fit(train_words_binary_matrix) 67 | train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix) 68 | 69 | save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output) 70 | 71 | 72 | def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf): 73 | """ 74 | Generate test feature matrix. 75 | 76 | Args: 77 | test_input (str): Test input file name. 78 | test_output (str): Test output file name. 79 | bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words. 80 | tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer. 
81 | """ 82 | df_test = get_df(test_input) 83 | test_words = np.array(df_test.text.str.lower().values) 84 | 85 | test_words_binary_matrix = bag_of_words.transform(test_words) 86 | test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix) 87 | feature_names = bag_of_words.get_feature_names_out() 88 | 89 | save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output) 90 | 91 | 92 | def main(): 93 | params = yaml.safe_load(open("params.yaml"))["featurize"] 94 | 95 | np.set_printoptions(suppress=True) 96 | 97 | if len(sys.argv) != 3 and len(sys.argv) != 5: 98 | sys.stderr.write("Arguments error. Usage:\n") 99 | sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n") 100 | sys.exit(1) 101 | 102 | in_path = sys.argv[1] 103 | out_path = sys.argv[2] 104 | 105 | train_input = os.path.join(in_path, "train.tsv") 106 | test_input = os.path.join(in_path, "test.tsv") 107 | train_output = os.path.join(out_path, "train.pkl") 108 | test_output = os.path.join(out_path, "test.pkl") 109 | 110 | max_features = params["max_features"] 111 | ngrams = params["ngrams"] 112 | 113 | os.makedirs(out_path, exist_ok=True) 114 | 115 | bag_of_words = CountVectorizer( 116 | stop_words="english", max_features=max_features, ngram_range=(1, ngrams) 117 | ) 118 | tfidf = TfidfTransformer(smooth_idf=False) 119 | 120 | generate_and_save_train_features( 121 | train_input=train_input, 122 | train_output=train_output, 123 | bag_of_words=bag_of_words, 124 | tfidf=tfidf, 125 | ) 126 | 127 | generate_and_save_test_features( 128 | test_input=test_input, 129 | test_output=test_output, 130 | bag_of_words=bag_of_words, 131 | tfidf=tfidf, 132 | ) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /example-get-started/code/src/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import re 4 | import sys 5 | import xml.etree.ElementTree 6 | 7 | import yaml 8 | 9 | 10 | def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split): 11 | """ 12 | Process the input lines and write the output to the output files. 13 | 14 | Args: 15 | input_lines (list): List of input lines. 16 | fd_out_train (file): Output file for the training data set. 17 | fd_out_test (file): Output file for the test data set. 18 | target_tag (str): Target tag. 19 | split (float): Test data set split ratio. 20 | """ 21 | num = 1 22 | for line in input_lines: 23 | try: 24 | fd_out = fd_out_train if random.random() > split else fd_out_test 25 | attr = xml.etree.ElementTree.fromstring(line).attrib 26 | 27 | pid = attr.get("Id", "") 28 | label = 1 if target_tag in attr.get("Tags", "") else 0 29 | title = re.sub(r"\s+", " ", attr.get("Title", "")).strip() 30 | body = re.sub(r"\s+", " ", attr.get("Body", "")).strip() 31 | text = title + " " + body 32 | 33 | fd_out.write("{}\t{}\t{}\n".format(pid, label, text)) 34 | 35 | num += 1 36 | except Exception as ex: 37 | sys.stderr.write(f"Skipping the broken line {num}: {ex}\n") 38 | 39 | 40 | def main(): 41 | params = yaml.safe_load(open("params.yaml"))["prepare"] 42 | 43 | if len(sys.argv) != 2: 44 | sys.stderr.write("Arguments error. 
Usage:\n") 45 | sys.stderr.write("\tpython prepare.py data-file\n") 46 | sys.exit(1) 47 | 48 | # Test data set split ratio 49 | split = params["split"] 50 | random.seed(params["seed"]) 51 | 52 | input = sys.argv[1] 53 | output_train = os.path.join("data", "prepared", "train.tsv") 54 | output_test = os.path.join("data", "prepared", "test.tsv") 55 | 56 | os.makedirs(os.path.join("data", "prepared"), exist_ok=True) 57 | 58 | input_lines = [] 59 | with open(input) as fd_in: 60 | input_lines = fd_in.readlines() 61 | 62 | fd_out_train = open(output_train, "w", encoding="utf-8") 63 | fd_out_test = open(output_test, "w", encoding="utf-8") 64 | 65 | process_posts( 66 | input_lines=input_lines, 67 | fd_out_train=fd_out_train, 68 | fd_out_test=fd_out_test, 69 | target_tag="", 70 | split=split, 71 | ) 72 | 73 | fd_out_train.close() 74 | fd_out_test.close() 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /example-get-started/code/src/requirements.txt: -------------------------------------------------------------------------------- 1 | dvclive>=3.0 2 | pandas 3 | pyaml 4 | scikit-learn>=1.3 5 | scipy 6 | matplotlib 7 | -------------------------------------------------------------------------------- /example-get-started/code/src/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import sys 4 | 5 | import numpy as np 6 | import yaml 7 | from sklearn.ensemble import RandomForestClassifier 8 | 9 | 10 | def train(seed, n_est, min_split, matrix): 11 | """ 12 | Train a random forest classifier. 13 | 14 | Args: 15 | seed (int): Random seed. 16 | n_est (int): Number of trees in the forest. 17 | min_split (int): Minimum number of samples required to split an internal node. 18 | matrix (scipy.sparse.csr_matrix): Input matrix. 19 | 20 | Returns: 21 | sklearn.ensemble.RandomForestClassifier: Trained classifier. 22 | """ 23 | labels = np.squeeze(matrix[:, 1].toarray()) 24 | x = matrix[:, 2:] 25 | 26 | sys.stderr.write("Input matrix size {}\n".format(matrix.shape)) 27 | sys.stderr.write("X matrix size {}\n".format(x.shape)) 28 | sys.stderr.write("Y matrix size {}\n".format(labels.shape)) 29 | 30 | clf = RandomForestClassifier( 31 | n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed 32 | ) 33 | 34 | clf.fit(x, labels) 35 | 36 | return clf 37 | 38 | 39 | def main(): 40 | params = yaml.safe_load(open("params.yaml"))["train"] 41 | 42 | if len(sys.argv) != 3: 43 | sys.stderr.write("Arguments error. 
Usage:\n") 44 | sys.stderr.write("\tpython train.py features model\n") 45 | sys.exit(1) 46 | 47 | input = sys.argv[1] 48 | output = sys.argv[2] 49 | seed = params["seed"] 50 | n_est = params["n_est"] 51 | min_split = params["min_split"] 52 | 53 | # Load the data 54 | with open(os.path.join(input, "train.pkl"), "rb") as fd: 55 | matrix, _ = pickle.load(fd) 56 | 57 | clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix) 58 | 59 | # Save the model 60 | with open(output, "wb") as fd: 61 | pickle.dump(clf, fd) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /example-get-started/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -eux 4 | 5 | PACKAGE_DIR=code 6 | PACKAGE="code.zip" 7 | TEST_DIR=tmp 8 | TEST_PACKAGE=$TEST_DIR/$PACKAGE 9 | PROD=${1:-false} 10 | 11 | rm -f $PACKAGE 12 | rm -rf $TEST_DIR 13 | mkdir $TEST_DIR 14 | 15 | pushd $PACKAGE_DIR 16 | zip -r $PACKAGE params.yaml src/* .github/* 17 | popd 18 | 19 | # Requires AWS CLI and write access to `s3://dvc-public/code/get-started/`. 20 | mv $PACKAGE_DIR/$PACKAGE . 21 | if [ $PROD == 'prod' ]; then 22 | 23 | aws s3 cp $PACKAGE s3://dvc-public/code/get-started/$PACKAGE 24 | 25 | # Sanity check 26 | wget https://code.dvc.org/get-started/$PACKAGE -O $TEST_PACKAGE 27 | unzip $TEST_PACKAGE -d $TEST_DIR 28 | 29 | echo "\nNo output should be produced by the following cmp and diff commands:\n" 30 | 31 | cmp $PACKAGE $TEST_PACKAGE # Expected output: nothing 32 | rm -f $TEST_PACKAGE 33 | cp -f $PACKAGE_DIR/README.md $TEST_DIR 34 | cp -f $PACKAGE_DIR/.devcontainer.json $TEST_DIR 35 | cp -f $PACKAGE_DIR/.gitlab-ci.yml $TEST_DIR 36 | cp -f $PACKAGE_DIR/.gitattributes $TEST_DIR 37 | diff -r $PACKAGE_DIR $TEST_DIR # Expected output: nothing 38 | rm -fR $TEST_DIR 39 | 40 | fi 41 | -------------------------------------------------------------------------------- /example-get-started/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # See https://dvc.org/get-started 3 | 4 | set -eux 5 | 6 | HERE=$( cd "$(dirname "$0")" ; pwd -P ) 7 | REPO_NAME="example-get-started" 8 | REPO_PATH_BASE="$HERE/build/$REPO_NAME" 9 | PROD=${1:-false} 10 | 11 | # Some additional options to tune the exact repo structure that we generate. 12 | # It useful to generate nested (monorepo), private storages, a mix of those 13 | # cases to be used in Studio fixtures or QA. 14 | OPT_TESTING_REPO='false' # Default false. 15 | OPT_SUBDIR='' # No leading or trailing slashes. Default "". 16 | OPT_INIT_GIT='true' # Default true. 17 | OPT_INIT_DVC='true' # Default true. 18 | OPT_NON_DVC='false' # Default false. 19 | OPT_BRANCHES='true' # Default true. 20 | OPT_TAGS='true' # Default true. 21 | # Default "public-s3". Other options: "public-s3", "private-http", "private-ssh", etc. 22 | # See the details below in the `init_remote_storage` and in the README. 23 | OPT_REMOTE='public-s3' 24 | OPT_DVC_TRACKED_METRICS='true' # Default true. 25 | OPT_REGISTER_MODELS='true' # Default true. 26 | OPT_MODEL_NAME='text-classification' # Default "text-classification". 27 | OPT_TAG_MODELS='true' # Default true. 28 | OPT_SQUASH_COMMITS='false' # Default false. 
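# The OPT_* flags above are set directly in this script; see "Advanced usage" in
# this directory's README.md for what each option controls.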
29 | 30 | 31 | if [ -z $OPT_SUBDIR ]; then 32 | COMMIT_PREFIX="" 33 | GIT_TAG_SUFFIX="" 34 | GTO_PREFIX="" 35 | MAIN_REPO_README="" 36 | else 37 | [ -d "$REPO_PATH_BASE" ] && cp -r "$REPO_PATH_BASE" "${REPO_PATH_BASE}-backup-$(date +%s)" 38 | MODIFIER=$(echo ${OPT_SUBDIR} | tr / -) 39 | COMMIT_PREFIX="[$MODIFIER] " 40 | GIT_TAG_SUFFIX="-$MODIFIER" 41 | # In GTO we use : as a separator to get the full model name 42 | GTO_PREFIX="${OPT_SUBDIR}:" 43 | MAIN_REPO_README="${REPO_PATH_BASE}/README.md" 44 | fi 45 | 46 | REPO_PATH="${REPO_PATH_BASE}/${OPT_SUBDIR}" 47 | if [ -d "$REPO_PATH" ]; then 48 | echo "Repo $REPO_PATH already exists, please remove it first." 49 | exit 1 50 | fi 51 | 52 | create_tag() { 53 | if [ $OPT_TAGS == 'true' ]; then 54 | git tag -a "$1" -m "$2" 55 | fi 56 | } 57 | 58 | init_remote_storage() { 59 | if [ $OPT_REMOTE == 'public-s3' ]; then 60 | # Remote active on this env only, for writing. 61 | dvc remote add -f -d --local $OPT_REMOTE s3://dvc-public/remote/get-started 62 | # Actual remote for generated project (read-only). Redirect of S3 bucket above. 63 | dvc remote add -f -d $OPT_REMOTE https://remote.dvc.org/get-started 64 | fi 65 | 66 | if [ $OPT_REMOTE == 'private-gdrive' ]; then 67 | # This corresponds to the Iterative shared GDrive disk. 68 | dvc remote add -f -d $OPT_REMOTE gdrive://1x2tUWiCqcHxmUli7BP6qOrrlmp-12DYY 69 | # In Studio we have to use a custom app for GDrive. This one is created in 70 | # the pydrive-test project and in Studio we provide service account credentials. 71 | # When testing in Studio get a service credentials JSON file from the Google 72 | # Cloud Console. 73 | dvc remote modify $OPT_REMOTE gdrive_client_id "47794215776-cd9ssb6a4vv5otkq6n0iadpgc4efgjb1.apps.googleusercontent.com" 74 | dvc remote modify $OPT_REMOTE gdrive_client_secret 'i2gerGA7uBjZbR08HqSOSt9Z' 75 | fi 76 | 77 | if [ $OPT_REMOTE == 'private-s3' ]; then 78 | dvc remote add -f -d $OPT_REMOTE s3://dvc-private/remote/get-started 79 | fi 80 | 81 | if [ $OPT_REMOTE == 'private-http' ]; then 82 | dvc remote add -f -d --local storage ssh://dvc@35.194.53.251/home/dvc/storage 83 | dvc remote modify --local storage keyfile /Users/ivan/.ssh/dvc_gcp_remotes_rsa 84 | dvc remote add -f -d $OPT_REMOTE http://35.194.53.251 85 | fi 86 | 87 | if [ $OPT_REMOTE == 'private-ssh' ]; then 88 | dvc remote add -f -d $OPT_REMOTE ssh://dvc@35.194.53.251/home/dvc/storage 89 | dvc remote modify $OPT_REMOTE keyfile /Users/ivan/.ssh/dvc_gcp_remotes_rsa 90 | fi 91 | 92 | if [ $OPT_REMOTE == 'private-azure' ]; then 93 | # Make sure that you have connection string in your env or some other way 94 | # provide credentials for the `dvcprivate` storage account. 
Copy the connection 95 | # string from the Azure portal and export it with 96 | # `AZURE_STORAGE_CONNECTION_STRING` 97 | dvc remote add -f -d $OPT_REMOTE azure://nlp 98 | fi 99 | } 100 | 101 | mkdir -p $REPO_PATH 102 | pushd $REPO_PATH 103 | 104 | TOTAL_TAGS=50 105 | STEP_TIME=500000 106 | 107 | if [ $(git rev-parse --show-toplevel) == $REPO_PATH_BASE ]; then 108 | BEGIN_TIME=$(git log -1 --format=%ct) 109 | else 110 | BEGIN_TIME=$(( $(date +%s) - (${TOTAL_TAGS} * ${STEP_TIME}) )) 111 | fi 112 | 113 | export TAG_TIME=${BEGIN_TIME} 114 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 115 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 116 | 117 | tick(){ 118 | TICK_DELTA=$(python3 -c "print(int(${STEP_TIME} * ($RANDOM+1)/32767))") 119 | export TAG_TIME=$(( ${TAG_TIME} + ${TICK_DELTA} )) 120 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 121 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 122 | } 123 | 124 | if [ $OPT_TESTING_REPO == 'true' ]; then 125 | export GIT_AUTHOR_NAME="R. Daneel Olivaw" 126 | export GIT_AUTHOR_EMAIL="olivaw@iterative.ai" 127 | else 128 | export GIT_AUTHOR_NAME="Ivan Shcheklein" 129 | export GIT_AUTHOR_EMAIL="shcheklein@gmail.com" 130 | fi 131 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 132 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 133 | 134 | virtualenv -p python3 .venv 135 | export VIRTUAL_ENV_DISABLE_PROMPT=true 136 | source .venv/bin/activate 137 | echo '.venv/' > .gitignore 138 | 139 | # Installing from main since we'd like to update repo before 140 | # the release 141 | pip install "git+https://github.com/iterative/dvc#egg=dvc[all]" gto 142 | 143 | 144 | if [ $OPT_INIT_GIT == 'true' ]; then 145 | git init 146 | git checkout -b main 147 | cp $HERE/code/README.md . 148 | cp $HERE/code/.devcontainer.json . 149 | cp $HERE/code/.gitlab-ci.yml . 150 | cp $HERE/code/.gitattributes . 151 | git add . 152 | else 153 | git checkout main 154 | fi 155 | 156 | # Dump the config for the repo into README if we are generating a testing repo. 157 | if [ $OPT_TESTING_REPO == 'true' ]; then 158 | echo -e "This is a [DVC Studio](https://studio.iterative.ai) testing (fixture) repository." > README.md 159 | echo -e "\n## \`/${OPT_SUBDIR}\` config\n\n\`\`\`bash" | tee -a README.md $MAIN_REPO_README 160 | while read var; do 161 | echo "$var='$(eval "echo \"\$$var\"")'" | tee -a README.md $MAIN_REPO_README 162 | done < <( declare -p | cut -d " " -f 2 | grep = | grep "^OPT_" | cut -f 1 -d '=') 163 | echo '```' | tee -a README.md $MAIN_REPO_README 164 | git add $REPO_PATH_BASE/. 165 | fi 166 | 167 | if [ $OPT_INIT_GIT == 'true' ] || [ $OPT_TESTING_REPO == 'true' ]; then 168 | if [ $OPT_INIT_GIT == 'true' ]; then 169 | tick 170 | git commit -m "${COMMIT_PREFIX}Initialize Git repository" 171 | create_tag "0-git-init${GIT_TAG_SUFFIX}" "Git initialized." 172 | else 173 | tick 174 | git commit -m "${COMMIT_PREFIX}Add testing repo" 175 | create_tag "0-git-init${GIT_TAG_SUFFIX}" "Testing repo initialized." 176 | fi 177 | fi 178 | 179 | BASE_COMMT=$(git rev-parse HEAD) 180 | 181 | if [ $OPT_INIT_DVC == 'true' ]; then 182 | dvc init --subdir 183 | tick 184 | git commit -m "${COMMIT_PREFIX}Initialize DVC project" 185 | create_tag "1-dvc-init${GIT_TAG_SUFFIX}" "DVC initialized." 
186 | fi
187 |
188 |
189 | mkdir data
190 | dvc get https://github.com/iterative/dataset-registry \
191 |   get-started/data.xml -o data/data.xml
192 |
193 | if [ $OPT_NON_DVC == 'false' ]; then
194 |   if [ $OPT_REGISTER_MODELS == "true" ]; then
195 |     echo "artifacts:
196 |   stackoverflow-dataset:
197 |     path: data/data.xml
198 |     type: dataset
199 |     desc: Initial XML StackOverflow dataset (raw data)" >> dvc.yaml
200 |   fi
201 |   dvc add data/data.xml
202 |   git add data/data.xml.dvc
203 | else
204 |   echo "data.xml" > data/.gitignore
205 | fi
206 | git add data/.gitignore
207 | tick
208 | git commit -m "${COMMIT_PREFIX}Add raw data"
209 | create_tag "2-track-data${GIT_TAG_SUFFIX}" "Data file added."
210 |
211 |
212 | if [ $OPT_NON_DVC == 'false' ]; then
213 |   init_remote_storage
214 |
215 |   git add $REPO_PATH_BASE/.
216 |   tick
217 |   git commit -m "${COMMIT_PREFIX}Configure default remote"
218 |   create_tag "3-config-remote${GIT_TAG_SUFFIX}" "Remote storage configured."
219 |   dvc push
220 | fi
221 |
222 | if [ $OPT_NON_DVC == 'false' ]; then
223 |   rm data/data.xml data/data.xml.dvc
224 |   dvc import https://github.com/iterative/dataset-registry \
225 |     get-started/data.xml -o data/data.xml
226 |   git add data/data.xml.dvc
227 |   tick
228 |   git commit -m "${COMMIT_PREFIX}Import raw data (overwrite)"
229 |   create_tag "4-import-data${GIT_TAG_SUFFIX}" "Data file overwritten with an import."
230 |   dvc push
231 | fi
232 |
233 | # Deploy code
234 | pushd $HERE
235 | source deploy.sh $PROD
236 | popd
237 |
238 | # Get deployed code
239 | if [ $PROD == 'prod' ]; then
240 |   wget https://code.dvc.org/get-started/code.zip
241 | else
242 |   mv $HERE/code.zip code.zip
243 | fi
244 |
245 | unzip code.zip
246 | rm -f code.zip
247 | pip install -r src/requirements.txt
248 | git add .
249 | if [ $OPT_NON_DVC == 'true' ]; then
250 |   cat <<EOF >> metrics.json
251 | {
252 |     "avg_prec": {
253 |         "train": 0.9743681430252835,
254 |         "test": 0.9249974999612706
255 |     },
256 |     "roc_auc": {
257 |         "train": 0.9866678562450621,
258 |         "test": 0.9460213440787918
259 |     }
260 | }
261 | EOF
262 | fi
263 | tick
264 | git commit -m "${COMMIT_PREFIX}Add source code files to repo"
265 | create_tag "5-source-code${GIT_TAG_SUFFIX}" "Source code added."
266 |
267 | if [ $OPT_NON_DVC == 'false' ]; then
268 |   dvc stage add -n prepare \
269 |     -p prepare.seed,prepare.split \
270 |     -d src/prepare.py -d data/data.xml \
271 |     -o data/prepared \
272 |     python src/prepare.py data/data.xml
273 |   dvc repro
274 |   git add data/.gitignore dvc.yaml dvc.lock
275 |   tick
276 |   git commit -m "${COMMIT_PREFIX}Create data preparation stage"
277 |   create_tag "6-prepare-stage${GIT_TAG_SUFFIX}" "First pipeline stage (data preparation) created."
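# For reference, the `dvc stage add -n prepare ...` call above results in a
# dvc.yaml entry roughly like this (a sketch; the exact layout is up to DVC):
#
#   stages:
#     prepare:
#       cmd: python src/prepare.py data/data.xml
#       deps:
#         - data/data.xml
#         - src/prepare.py
#       params:
#         - prepare.seed
#         - prepare.split
#       outs:
#         - data/prepared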
278 | dvc push 279 | 280 | dvc stage add -n featurize \ 281 | -p featurize.max_features,featurize.ngrams \ 282 | -d src/featurization.py -d data/prepared \ 283 | -o data/features \ 284 | python src/featurization.py \ 285 | data/prepared data/features 286 | dvc stage add -n train \ 287 | -p train.seed,train.n_est,train.min_split \ 288 | -d src/train.py -d data/features \ 289 | -o model.pkl \ 290 | python src/train.py data/features model.pkl 291 | dvc repro 292 | 293 | if [ $OPT_REGISTER_MODELS == "true" ]; then 294 | python <> dvc.yaml 349 | 350 | dvc repro 351 | if [ $OPT_DVC_TRACKED_METRICS == "true" ]; then 352 | git add .gitignore dvc.yaml dvc.lock 353 | else 354 | git add .gitignore dvc.yaml dvc.lock eval 355 | fi 356 | tick 357 | git commit -am "${COMMIT_PREFIX}Create evaluation stage" 358 | create_tag "8-dvclive-eval${GIT_TAG_SUFFIX}" "DVCLive evaluation stage created." 359 | create_tag "baseline-experiment${GIT_TAG_SUFFIX}" "Baseline experiment evaluation" 360 | if [ $OPT_TAG_MODELS == "true" ]; then 361 | gto register "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.0.0 362 | gto assign "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.0.0 --stage prod 363 | fi 364 | dvc push 365 | 366 | 367 | sed -e "s/max_features: 100/max_features: 200/" -i".bck" params.yaml 368 | sed -e "s/ngrams: 1/ngrams: 2/" -i".bck" params.yaml 369 | rm -f params.yaml.bck 370 | dvc repro train 371 | tick 372 | git commit -am "${COMMIT_PREFIX}Reproduce model using bigrams" 373 | create_tag "9-bigrams-model${GIT_TAG_SUFFIX}" "Model retrained using bigrams." 374 | if [ $OPT_TAG_MODELS == "true" ]; then 375 | gto register "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.1.0 376 | gto assign "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.1.0 --stage stage 377 | fi 378 | dvc push 379 | 380 | 381 | dvc repro evaluate 382 | tick 383 | git commit -am "${COMMIT_PREFIX}Evaluate bigrams model" 384 | create_tag "bigrams-experiment${GIT_TAG_SUFFIX}" "Bigrams experiment evaluation" 385 | create_tag "10-bigrams-experiment${GIT_TAG_SUFFIX}" "Evaluated bigrams model." 
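# The `gto register` / `gto assign` calls in this script act purely through
# annotated Git tags: registering creates a tag like
# "${GTO_PREFIX}${OPT_MODEL_NAME}@v1.2.0" and assigning a stage creates one like
# "${GTO_PREFIX}${OPT_MODEL_NAME}#dev#<n>" (a sketch; the counter values are
# assigned by GTO). No tracked files change as part of these calls.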
386 | if [ $OPT_TAG_MODELS == "true" ]; then 387 | gto register "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.2.0 388 | gto assign "${GTO_PREFIX}${OPT_MODEL_NAME}" --version v1.2.0 --stage dev 389 | fi 390 | dvc push 391 | fi 392 | 393 | if [ $OPT_SQUASH_COMMITS == 'true' ]; then 394 | git reset --soft $BASE_COMMT 395 | git commit --amend --no-edit 396 | fi 397 | 398 | if [ $OPT_NON_DVC == 'false' ] && [ $OPT_BRANCHES == 'true' ]; then 399 | export GIT_AUTHOR_NAME="Dave Berenbaum" 400 | export GIT_AUTHOR_EMAIL="dave.berenbaum@gmail.com" 401 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 402 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 403 | 404 | git checkout -b "tune-hyperparams${GIT_TAG_SUFFIX}" 405 | 406 | unset GIT_AUTHOR_DATE 407 | unset GIT_COMMITTER_DATE 408 | 409 | dvc exp run --queue --set-param train.min_split=8 410 | dvc exp run --queue --set-param train.min_split=64 411 | dvc exp run --queue --set-param train.min_split=2 --set-param train.n_est=100 412 | dvc exp run --queue --set-param train.min_split=8 --set-param train.n_est=100 413 | dvc exp run --queue --set-param train.min_split=64 --set-param train.n_est=100 414 | dvc exp run --run-all -j 2 415 | # Apply best experiment 416 | EXP=$(dvc exp show --csv --sort-by avg_prec.test | tail -n 1 | cut -d , -f 1) 417 | dvc exp apply $EXP 418 | tick 419 | git commit -am "${COMMIT_PREFIX}Run experiments tuning random forest params" 420 | create_tag "random-forest-experiments${GIT_TAG_SUFFIX}" "Run experiments to tune random forest params" 421 | create_tag "11-random-forest-experiments${GIT_TAG_SUFFIX}" "Tuned random forest classifier." 422 | dvc push 423 | 424 | git checkout main 425 | 426 | export GIT_AUTHOR_NAME="Dmitry Petrov" 427 | export GIT_AUTHOR_EMAIL="dmitry.petrov@nevesomo.com" 428 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 429 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 430 | 431 | git checkout -b "try-large-dataset${GIT_TAG_SUFFIX}" 432 | 433 | dvc update data/data.xml.dvc --rev get-started-40K 434 | sed -e "s/max_features: 200/max_features: 500/" -i".bck" params.yaml 435 | rm -f params.yaml.bck 436 | dvc repro 437 | dvc push 438 | git commit -am "${COMMIT_PREFIX}Try a 40K dataset (4x data)" 439 | fi 440 | 441 | popd 442 | 443 | unset TAG_TIME 444 | unset GIT_AUTHOR_DATE 445 | unset GIT_COMMITTER_DATE 446 | unset GIT_AUTHOR_NAME 447 | unset GIT_AUTHOR_EMAIL 448 | unset GIT_COMMITTER_NAME 449 | unset GIT_COMMITTER_EMAIL 450 | 451 | set +eux 452 | echo 453 | echo "==========================================" 454 | echo "Done! Read README for the next steps." 455 | echo "==========================================" 456 | echo 457 | -------------------------------------------------------------------------------- /example-get-started/generate_data.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import random 4 | import sys 5 | import xml.etree.ElementTree 6 | 7 | # This file is not part of the project but is used to generate a slice of 8 | # data from the full SO dump https://archive.org/details/stackexchange 9 | 10 | 11 | if len(sys.argv) != 3: 12 | sys.stderr.write("Arguments error. 
Usage:\n") 13 | sys.stderr.write("\tpython analyze.py data-file output-file\n") 14 | sys.exit(1) 15 | 16 | 17 | target = 40000 18 | split = 0.3 19 | 20 | 21 | def lines_matched_test(fd, test): 22 | for line in fd: 23 | try: 24 | attr = xml.etree.ElementTree.fromstring(line).attrib 25 | if test(attr.get("Tags", "")): 26 | yield line 27 | except Exception as ex: 28 | sys.stderr.write(f"Skipping the broken line: {ex}\n") 29 | 30 | 31 | def process_posts(fd_in, fd_not, fd_out): 32 | count = 0 33 | in_lines = lines_matched_test(fd_in, lambda x: "" in x) 34 | not_lines = lines_matched_test(fd_not, lambda x: "" not in x) 35 | while count < target: 36 | line = next(not_lines) if random.random() > split else next(in_lines) 37 | fd_out.write(line) 38 | count += 1 39 | 40 | 41 | with io.open(sys.argv[1], encoding="utf8") as fd_in: 42 | with io.open(sys.argv[1], encoding="utf8") as fd_not: 43 | with io.open(sys.argv[2], "w", encoding="utf8") as fd_out: 44 | process_posts(fd_in, fd_not, fd_out) 45 | -------------------------------------------------------------------------------- /example-gto/code/.github/workflows/gto-act-on-tags.yml: -------------------------------------------------------------------------------- 1 | name: Act on artifact registrations and promotions 2 | on: 3 | push: 4 | tags: 5 | - "*" 6 | 7 | jobs: 8 | act: 9 | name: Figure out what was registered/promoted and act on it 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: "GTO: figure out what was registered/promoted and show the Registry state" 14 | id: gto 15 | uses: iterative/gto-action@v2 16 | - uses: actions/setup-python@v2 17 | - name: Install dependencies 18 | run: | 19 | pip install --upgrade pip setuptools wheel 20 | pip install -r requirements.txt 21 | - name: "Publish (act on registering a new version)" 22 | if: steps.gto.outputs.event == 'registration' 23 | run: | 24 | echo "[$GITHUB_REF] You got version '${{ steps.gto.outputs.version }}' registered for model '${{ steps.gto.outputs.name }}' " 25 | echo "[$GITHUB_REF] It is about time to publish it somewhere so others could use it!" 26 | - name: "Deploy (act on assigning a new stage)" 27 | if: steps.gto.outputs.event == 'assignment' 28 | run: | 29 | echo "[$GITHUB_REF] You got model '${{ steps.gto.outputs.name }}' of version '${{ steps.gto.outputs.version }}' promoted to stage '${{ steps.gto.outputs.stage }}'" 30 | echo "[$GITHUB_REF] It is about time to deploy it somewhere!" 31 | -------------------------------------------------------------------------------- /example-gto/code/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /example-gto/code/README.md: -------------------------------------------------------------------------------- 1 | # Example GTO Model Registry 2 | 3 | A [model registry] is a tool to catalog ML models and their versions. Models from 4 | your data science projects can be discovered, tested, shared, deployed, and 5 | audited from there. [DVC] and [GTO] enable these capabilities on top of 6 | Git, so you can stick to an existing software engineering stack. 7 | 8 | This repo is an example of Model Registry built with these tools. The model 9 | dashboard: 10 | 11 |
12 | $ gto show
13 | ╒══════════╤══════════╤═════════╤═════════╤════════════╕
14 | │ name     │ latest   │ #dev    │ #prod   │ #staging   │
15 | ╞══════════╪══════════╪═════════╪═════════╪════════════╡
16 | │ churn    │ [v3.1.1](https://github.com/iterative/example-gto/releases/tag/churn@v3.1.1)   │ [v3.1.0](https://github.com/iterative/example-gto/releases/tag/churn%23dev%234)  │ [v3.0.0](https://github.com/iterative/example-gto/releases/tag/churn%23prod%233)  │ [v3.1.0](https://github.com/iterative/example-gto/releases/tag/churn%23staging%232)     │
17 | │ segment  │ [v0.4.1](https://github.com/iterative/example-gto/releases/tag/segment@v0.4.1)   │ [v0.4.1](https://github.com/iterative/example-gto/releases/tag/segment%23dev%231)  │ -       │ -          │
18 | │ cv-class │ [v0.1.13](https://github.com/iterative/example-gto/releases/tag/cv-class@v0.1.13)  │ -       │ -       │ -          │
19 | ╘══════════╧══════════╧═════════╧═════════╧════════════╛
20 | 
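A state like the one above is produced with plain GTO commands run against this
Git repo, for example (a sketch; the full history of this example lives in its
[Git tags]):

```console
$ gto register churn --version v3.0.0             # cut and annotate a version
$ gto assign churn --version v3.0.0 --stage prod  # promote it to a stage
$ gto show                                        # print the dashboard above
```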
21 |
22 | - The `latest` column shows the latest model versions,
23 | - The `#dev` column represents model versions promoted to the `dev` Stage (same
24 |   for `#prod` and `#staging`),
25 | - Versions are registered and promoted to Stages by [Git tags] - you can click
26 |   the links to see which specific Git tag did it,
27 | - Artifact metadata like `path` and `description` is stored in
28 |   [`artifacts.yaml`],
29 | - The [GitHub Actions page] of this repo has examples of workflows where we act
30 |   upon these Git tags.
31 |
32 | Check out the [public Model Registry] in [Studio], built on top of DVC and GTO,
33 | which provides more insight into your ML model development, including
34 | training params, metrics and plots.
35 |
36 | 🧑‍💻 To continue learning, head to [Get Started with GTO].
37 |
38 | [github actions page]: https://github.com/iterative/example-gto/actions
39 | [get started with gto]: https://dvc.org/doc/gto/get-started
40 | [model registry]: https://dvc.org/doc/use-cases/model-registry
41 | [dvc]: https://github.com/iterative/dvc
42 | [gto]: https://github.com/iterative/gto
43 | [git tags]: https://github.com/iterative/example-gto/tags
44 | [`artifacts.yaml`]:
45 |   https://github.com/iterative/example-gto/blob/main/artifacts.yaml
46 | [public model registry]: https://studio.iterative.ai/team/Iterative/models
47 | [studio]: https://studio.iterative.ai
48 | -------------------------------------------------------------------------------- /example-gto/code/mlem/.github/workflows/deploy-model-with-mlem.yml: --------------------------------------------------------------------------------
1 | name: Deploy MLEM model after GTO Stage assignment
2 | on:
3 |   push:
4 |     tags:
5 |       - "*"
6 | env:
7 |   HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
8 |   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
9 |   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
10 |
11 | jobs:
12 |   parse-git-tag:
13 |     name: Figure out what was registered/promoted
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: actions/checkout@v3
17 |       - name: "GTO: figure out what was registered/promoted and show the Registry state"
18 |         id: gto
19 |         uses: iterative/gto-action@v2
20 |     # we define the Job outputs here to let the next Job use them
21 |     outputs:
22 |       name: ${{ steps.gto.outputs.name }}
23 |       stage: ${{ steps.gto.outputs.stage }}
24 |       event: ${{ steps.gto.outputs.event }}
25 |       path: ${{ steps.gto.outputs.path }}
26 |   deploy-model:
27 |     name: Deploy a MLEM model (act on assigning a new stage)
28 |     needs: parse-git-tag
29 |     if: needs.parse-git-tag.outputs.event == 'assignment'
30 |     runs-on: ubuntu-latest
31 |     steps:
32 |       - uses: actions/checkout@v3
33 |       - uses: actions/setup-python@v2
34 |         with:
35 |           python-version: '3.10'
36 |       - name: Install dependencies
37 |         run: |
38 |           pip install --upgrade pip setuptools wheel
39 |           pip install -r requirements.txt
40 |       - name: Run `mlem deploy`
41 |         run: |
42 |           mlem deployment run --load deploy/${{ needs.parse-git-tag.outputs.stage }} --model ${{ needs.parse-git-tag.outputs.path }}
43 | -------------------------------------------------------------------------------- /example-gto/code/mlem/.mlem.yaml: --------------------------------------------------------------------------------
1 | core:
2 |   state:
3 |     uri: s3://gto-mlem-example/mlem-deployment-state
4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/deploy/dev.mlem: --------------------------------------------------------------------------------
1 | object_type:
deployment 2 | type: heroku 3 | app_name: mlem-dev 4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/deploy/prod.mlem: -------------------------------------------------------------------------------- 1 | object_type: deployment 2 | type: heroku 3 | app_name: mlem-prod 4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/deploy/staging.mlem: -------------------------------------------------------------------------------- 1 | object_type: deployment 2 | type: heroku 3 | app_name: mlem-staging 4 | -------------------------------------------------------------------------------- /example-gto/code/mlem/requirements.txt: -------------------------------------------------------------------------------- 1 | gto 2 | mlem[s3,fastapi,heroku] -------------------------------------------------------------------------------- /example-gto/code/mlem/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import mlem 4 | 5 | if __name__ == "__main__": 6 | value = sys.argv[1] if len(sys.argv) > 1 else "no value" 7 | 8 | def model(data): 9 | return value 10 | 11 | mlem.api.save(model, "models/churn.pkl", sample_data="string") 12 | -------------------------------------------------------------------------------- /example-gto/code/requirements.txt: -------------------------------------------------------------------------------- 1 | gto -------------------------------------------------------------------------------- /example-gto/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Setup script env: 4 | # e Exit immediately if a command exits with a non-zero exit status. 5 | # u Treat unset variables as an error when substituting. 6 | # x Print commands and their arguments as they are executed. 7 | set -eux 8 | 9 | PUSH=false 10 | echo $# 11 | if [ "$#" -eq 1 ] && [ "$0" != "--push" ]; then 12 | PUSH=true 13 | echo "Will push things to GitHub :tada:" 14 | fi 15 | 16 | HERE="$( 17 | cd "$(dirname "$0")" 18 | pwd -P 19 | )" 20 | USER_NAME="iterative" 21 | REPO_NAME="example-gto" 22 | 23 | BUILD_PATH="$HERE/build" 24 | REPO_PATH="$BUILD_PATH/$REPO_NAME" 25 | 26 | if [ -d "$REPO_PATH" ]; then 27 | echo "Repo $REPO_PATH already exists, please remove it first." 28 | exit 1 29 | fi 30 | 31 | mkdir -p $BUILD_PATH 32 | pushd $BUILD_PATH 33 | if [ ! 
-d "$BUILD_PATH/.venv" ]; then 34 | virtualenv -p python3 .venv 35 | source .venv/bin/activate 36 | echo '.venv/' >.gitignore 37 | pip install -r ../code/requirements.txt 38 | git clone git@github.com:iterative/example-gto.git 39 | pip install -e ./gto 40 | fi 41 | popd 42 | 43 | source $BUILD_PATH/.venv/bin/activate 44 | 45 | TOTAL_TAGS=15 46 | STEP_TIME=100000 47 | SLEEP_TIME=90 48 | BEGIN_TIME=$(($(date +%s) - (${TOTAL_TAGS} * ${STEP_TIME}))) 49 | export TAG_TIME=${BEGIN_TIME} 50 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 51 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 52 | tick() { 53 | export TAG_TIME=$((${TAG_TIME} + ${STEP_TIME})) 54 | export GIT_AUTHOR_DATE="${TAG_TIME} +0000" 55 | export GIT_COMMITTER_DATE="${TAG_TIME} +0000" 56 | } 57 | 58 | export GIT_AUTHOR_NAME="Alexander Guschin" 59 | export GIT_AUTHOR_EMAIL="1aguschin@gmail.com" 60 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME" 61 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL" 62 | 63 | mkdir -p $REPO_PATH 64 | pushd $REPO_PATH 65 | 66 | git init -b main 67 | cp $HERE/code/.gitignore . 68 | git add .gitignore 69 | cp $HERE/code/requirements.txt . 70 | cp $HERE/code/README.md . 71 | cp -R $HERE/code/.github . 72 | git add . 73 | tick 74 | git commit -m "Initialize Git repository with CI workflow" 75 | 76 | if $PUSH; then 77 | # remove GH Actions workflows 78 | gh api repos/$USER_NAME/$REPO_NAME/actions/runs \ 79 | --paginate -q '.workflow_runs[] | "\(.id)"' | 80 | xargs -n1 -I % gh api --silent repos/$USER_NAME/$REPO_NAME/actions/runs/% -X DELETE 81 | # add remote 82 | git remote add origin git@github.com:$USER_NAME/$REPO_NAME.git 83 | # remove all tags from remote 84 | git ls-remote --tags origin | awk '/^(.*)(\s+)(.*[a-zA-Z0-9])$/ {print ":" $2}' | xargs git push origin 85 | fi 86 | 87 | echo "Initialize DVC" 88 | dvc init 89 | git commit -m "Initialize DVC" 90 | echo "Create new models" 91 | mkdir models 92 | echo "1st version" > models/churn.pkl 93 | git add models requirements.txt 94 | tick 95 | git commit -am "Create models" 96 | 97 | cat >> dvc.yaml<< EOF 98 | artifacts: 99 | churn: 100 | type: model 101 | path: models/churn.pkl 102 | segment: 103 | type: model 104 | path: s3://mycorp/proj-ml/segm-model-2022-04-15.pt 105 | cv-class: 106 | type: model 107 | path: s3://mycorp/proj-ml/classif-v2.pt 108 | EOF 109 | git add dvc.yaml 110 | 111 | tick 112 | git commit -m "Annotate models with GTO" 113 | if $PUSH; then 114 | git push --set-upstream origin main -f 115 | fi 116 | 117 | echo "Register new model" 118 | tick 119 | gto register churn --version v3.0.0 120 | tick 121 | gto register segment --version v0.4.1 122 | tick 123 | gto register cv-class --version v0.1.13 124 | if $PUSH; then 125 | git push --tags 126 | sleep $SLEEP_TIME 127 | fi 128 | 129 | echo "Update the model" 130 | echo "2nd version" >models/churn.pkl 131 | tick 132 | git commit -am "Update model" 133 | if $PUSH; then 134 | git push 135 | fi 136 | 137 | echo "Register models" 138 | tick 139 | gto register churn --bump-minor 140 | if $PUSH; then 141 | git push --tags 142 | sleep $SLEEP_TIME 143 | fi 144 | 145 | echo "Promote models" 146 | tick 147 | gto assign churn --version v3.0.0 --stage dev 148 | if $PUSH; then 149 | git push --tags 150 | sleep $SLEEP_TIME 151 | fi 152 | 153 | tick 154 | gto assign churn HEAD --stage staging 155 | if $PUSH; then 156 | git push --tags 157 | sleep $SLEEP_TIME 158 | fi 159 | 160 | tick 161 | gto assign churn --version v3.0.0 --stage prod 162 | if $PUSH; then 163 | git push --tags 164 | sleep $SLEEP_TIME 165 | fi 
166 | 167 | tick 168 | gto assign churn --version v3.1.0 --stage dev 169 | gto assign segment --version v0.4.1 --stage dev 170 | if $PUSH; then 171 | git push --tags 172 | fi 173 | 174 | 175 | gto show 176 | gto history 177 | 178 | 179 | if $PUSH; then 180 | git push --set-upstream origin main -f 181 | fi 182 | 183 | popd 184 | 185 | unset TAG_TIME 186 | unset GIT_AUTHOR_DATE 187 | unset GIT_COMMITTER_DATE 188 | unset GIT_AUTHOR_NAME 189 | unset GIT_AUTHOR_EMAIL 190 | unset GIT_COMMITTER_NAME 191 | unset GIT_COMMITTER_EMAIL 192 | 193 | cat <