├── .gitignore ├── LICENSE ├── README.md ├── week01_intro ├── README.md ├── lecture.pdf └── seminar.ipynb ├── week02_management_and_testing ├── README.md ├── example_project │ ├── compute_metrics.py │ ├── dvc.yaml │ ├── hparams.py │ ├── prepare_data.py │ ├── pyproject.toml │ ├── test_basic.py │ ├── train.py │ └── uv.lock ├── homework │ ├── README.md │ ├── main.py │ ├── modeling │ │ ├── __init__.py │ │ ├── diffusion.py │ │ ├── training.py │ │ └── unet.py │ ├── requirements.txt │ └── tests │ │ ├── __init__.py │ │ ├── test_model.py │ │ └── test_pipeline.py └── lecture.pdf ├── week03_fast_pipelines ├── README.md ├── homework │ ├── README.md │ ├── requirements.txt │ ├── task1 │ │ ├── dataset.py │ │ ├── download_data.sh │ │ ├── train.py │ │ └── unet.py │ ├── task2 │ │ ├── dataset.py │ │ ├── run_epoch.py │ │ └── transformer.py │ └── task3 │ │ ├── dataset.py │ │ ├── profiler.py │ │ ├── run_epoch.py │ │ ├── utils.py │ │ └── vit.py ├── lecture.pdf └── seminar │ ├── Mixed_precision.jpeg │ ├── image_loaders_benchmark.py │ ├── memory_snapshot.py │ ├── mnist_training.py │ ├── mnist_training_nsys.py │ ├── pics │ └── 1 │ │ ├── 1.jpg │ │ ├── 2.jpg │ │ ├── 3.jpg │ │ ├── 4.jpg │ │ ├── 5.jpg │ │ ├── 6.jpg │ │ ├── 7.jpg │ │ └── 8.jpg │ ├── practice.ipynb │ ├── requirements.txt │ └── train.py ├── week04_data_parallel ├── README.md ├── homework │ ├── README.md │ ├── allreduce.py │ ├── ddp_cifar100.py │ ├── requirements.txt │ ├── sequential_print.py │ ├── syncbn.py │ └── test_syncbn.py ├── lecture.odp ├── lecture.pdf └── practice.ipynb ├── week05_large_models ├── README.md ├── lecture.odp ├── lecture.pdf ├── practice_part1.ipynb └── practice_part2.ipynb ├── week06_fsdp ├── README.md ├── homework │ ├── .devcontainer.json │ ├── .gitignore │ ├── .vscode │ │ └── launch.json │ ├── Dockerfile │ ├── README.md │ ├── hw_fsdp.py │ ├── setup.sh │ └── train.py ├── practice.ipynb └── slides │ ├── .gitignore │ ├── assets │ ├── 1b_mem_snap.png │ ├── 1b_no_reduce_grads_no_reshard_after_backward_no_reshard_after_forward_mem_snap.png │ ├── 1b_no_reduce_grads_no_reshard_after_backward_no_reshard_after_forward_trace.png │ ├── 1b_no_reshard_after_backward_no_reshard_after_forward_mem_snap.png │ ├── 1b_no_reshard_after_backward_no_reshard_after_forward_trace.png │ ├── 1b_no_reshard_after_forward_mem_snap.png │ ├── 1b_reshard_after_forward_4_mem_snap.png │ ├── 1b_reshard_after_forward_4_trace.png │ ├── 8b_compile_backward_trace.png │ ├── 8b_compile_forward_trace.png │ ├── 8b_compile_trace.png │ ├── 8b_cpu_offload_mem_snap.png │ ├── 8b_no_compile_iter_mem_snap.png │ ├── 8b_on_cuda_model_mem_snap.png │ ├── 8b_reshard_after_forward_4_trace.png │ ├── dcp_1.png │ ├── dcp_2.png │ ├── dcp_3.png │ ├── dcp_saving_flow.png │ ├── device_mesh.png │ ├── dtensor_1.png │ ├── dtensor_2.png │ ├── dtensor_3.png │ ├── forward_hook.png │ ├── forward_pre_hook.png │ ├── fsdp_workflow.png │ ├── fsdp_wrap.png │ └── streams.png │ ├── package.json │ ├── slides.md │ └── slides.pdf ├── week07_application_deployment ├── 01_python_server │ ├── README.md │ ├── labels.json │ ├── requirements.txt │ └── server.py ├── 02_docker │ ├── Dockerfile │ ├── README.md │ ├── docker-compose.yaml │ ├── labels.json │ ├── requirements.txt │ └── server.py ├── 03_orchestration │ ├── README.md │ └── docker-compose.swarm.yaml ├── 04_metrics │ ├── Dockerfile │ ├── README.md │ ├── docker-compose.yaml │ ├── grafana │ │ ├── config.ini │ │ └── datasources │ │ │ └── all.yml │ ├── labels.json │ ├── monitor │ │ └── telegraf.conf │ ├── prom │ │ └── prometheus.yml │ ├── requirements.txt │ └── 
server.py ├── 05_microservices │ ├── Dockerfile.client_api │ ├── Dockerfile.inference_api │ ├── client-api.py │ ├── docker-compose.yaml │ ├── grpc-client.py │ ├── inference-api.py │ ├── labels.json │ ├── protos │ │ └── inference.proto │ ├── requirements.txt │ └── run_codegen.py ├── README.md ├── client-url.py ├── client.py ├── dataset │ ├── 10.jpeg │ ├── 14.jpeg │ ├── 16.jpeg │ ├── 17.jpeg │ ├── 5.jpeg │ ├── 6.jpeg │ └── 9.jpeg ├── homework │ ├── README.md │ ├── proto │ │ └── inference.proto │ └── tests.py ├── requirements.txt ├── supervisord │ ├── Dockerfile │ ├── docker-compose.yaml │ ├── labels.json │ ├── requirements.txt │ ├── server.py │ └── supervisord.conf └── train_model.py ├── week08_inference_software ├── README.md ├── lecture.pdf └── seminar │ ├── images │ ├── fused_kernels1.png │ ├── fused_kernels2.png │ ├── prefixLM.png │ └── rowcolumnarrays.webp │ └── seminar.ipynb └── week09_inference_algorithms ├── README.md ├── homework ├── README.md └── hw-w8a8-specdec.ipynb ├── lecture.pdf └── seminar.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea 132 | .DS_Store 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 course authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient Deep Learning Systems 2 | This repository contains materials for the Efficient Deep Learning Systems course taught at the [Faculty of Computer Science](https://cs.hse.ru/en/) of [HSE University](https://www.hse.ru/en/) and [Yandex School of Data Analysis](https://academy.yandex.com/dataschool/). 3 | 4 | __This branch corresponds to the ongoing 2025 course. If you want to see full materials of past years, see the ["Past versions"](#past-versions) section.__ 5 | 6 | # Syllabus 7 | - [__Week 1:__](./week01_intro) __Introduction__ 8 | - Lecture: Course overview and organizational details. Core concepts of the GPU architecture and CUDA API. 9 | - Seminar: CUDA operations in PyTorch. Introduction to benchmarking. 10 | - [__Week 2:__](./week02_management_and_testing) __Experiment tracking, model and data versioning, testing DL code in Python__ 11 | - Lecture: Experiment management basics and pipeline versioning. Configuring Python applications. Intro to regular and property-based testing. 12 | - Seminar: Example DVC+Weights & Biases project walkthrough. Intro to testing with pytest. 13 | - [__Week 3:__ ](./week03_fast_pipelines) __Training optimizations, FP16/BF16/FP8 formats, profiling deep learning code__ 14 | - Lecture: Measuring performance of GPU-accelerated software. Mixed-precision training. Data storage and loading optimizations. Tools for profiling deep learning workloads. 
15 | - Seminar: Automatic Mixed Precision in PyTorch. Dynamic padding for sequence data and JPEG decoding benchmarks. Basics of profiling with py-spy, PyTorch Profiler, Memory Snapshot and Nsight Systems. 16 | - [__Week 4:__](./week04_data_parallel) __Data-parallel training and All-Reduce__ 17 | - Lecture: Introduction to distributed training. Data-parallel training of neural networks. All-Reduce and its efficient implementations. 18 | - Seminar: Introduction to PyTorch Distributed. Data-parallel training primitives. 19 | - [__Week 5:__](./week05_large_models) __Training large models__ 20 | - Lecture: Tensor, pipeline, sequence parallelism. Gradient checkpointing, offloading. 21 | - Seminar: Gradient checkpointing and tensor parallelism in practice. 22 | - [__Week 6:__](./week06_fsdp) __Sharded data-parallel training, distributed training optimizations__ 23 | - Lecture: Fully-sharded data parallel training and its optimizations. 24 | - Seminar: In-depth overview of FSDP2. 25 | - [__Week 7:__](./week07_application_deployment) __Python web application deployment__ 26 | - Lecture/Seminar: Building and deployment of production-ready web services. App & web servers, Docker, Prometheus, API via HTTP and gRPC. 27 | - [__Week 8:__](./week08_inference_software) __LLM inference optimizations and software__ 28 | - Lecture: Inference speed metrics. KV caching, batch inference, continuous batching. FlashAttention with its modifications and PagedAttention. Overview of popular LLM serving frameworks. 29 | - Seminar: Implementation of KV caching. Basics of the Triton language. Layer fusion in PyTorch and Triton. Liger Kernels. FlashAttention and FlexAttention in practice. 30 | - [__Week 9:__](./week09_inference_algorithms) __Efficient model inference__ 31 | - Lecture: Speculative decoding, architecture optimizations, quantization, knowledge distillation. 32 | - Seminar: Introduction to speculative decoding. Matrix multiplication in Triton for different scenarios. 33 | - __Week 10:__ Guest lecture 34 | 35 | ## Grading 36 | There will be several home assignments (spread over multiple weeks) on the following topics: 37 | - Training pipelines and code profiling 38 | - Distributed and memory-efficient training 39 | - Deploying and optimizing models for production 40 | 41 | The final grade is a weighted sum of per-assignment grades. 42 | Please refer to the course page of your institution for details.
43 | 44 | # Staff 45 | - [Max Ryabinin](https://github.com/mryab) 46 | - [Just Heuristic](https://github.com/justheuristic) 47 | - [Yaroslav Zolotarev](https://github.com/Q-c7) 48 | - [Maksim Abraham](https://github.com/fdrose) 49 | - [Gregory Leleytner](https://github.com/RunFMe) 50 | - [Antony Frolov](https://github.com/antony-frolov) 51 | - [Anton Chigin](https://github.com/achigin) 52 | - [Alexander Markovich](https://github.com/markovka17) 53 | - [Roman Gorb](https://github.com/rvg77) 54 | 55 | # Past versions 56 | - [2024](https://github.com/mryab/efficient-dl-systems/tree/2024) 57 | - [2023](https://github.com/mryab/efficient-dl-systems/tree/2023) 58 | - [2022](https://github.com/mryab/efficient-dl-systems/tree/2022) 59 | - [2021](https://github.com/yandexdataschool/dlatscale_draft) 60 | -------------------------------------------------------------------------------- /week01_intro/README.md: -------------------------------------------------------------------------------- 1 | # Week 1: Introduction 2 | 3 | * Lecture: [link](./lecture.pdf) 4 | * Seminar: [link](./seminar.ipynb) 5 | 6 | ## Further reading 7 | * [CUDA MODE reading group Resource Stream](https://github.com/cuda-mode/resource-stream) 8 | * [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html) and [CUDA C++ Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html) 9 | * [Modal GPU Glossary](https://modal.com/gpu-glossary) 10 | * [How to Optimize a CUDA Matmul Kernel for cuBLAS-like Performance: a Worklog](https://siboehm.com/articles/22/CUDA-MMM) 11 | * [GPU Puzzles](https://github.com/srush/GPU-Puzzles) 12 | * [PyTorch Performance Tuning Guide](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html) 13 | * [Earlier version of this guide from NVIDIA](https://tigress-web.princeton.edu/~jdh4/PyTorchPerformanceTuningGuide_GTC2021.pdf) 14 | * [Docs for caching memory allocation in PyTorch](https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management) 15 | * [Overview of `timeit` for microbenchmarking](https://docs.python.org/3/library/timeit.html) 16 | * [PyTorch Benchmark tutorial](https://pytorch.org/tutorials/recipes/recipes/benchmark.html) 17 | * Links on floating point precision in different libraries and environments: [1](https://discuss.pytorch.org/t/big-difference-between-torch-matmul-and-a-batch-of-torch-mm/101192) [2](https://github.com/pytorch/pytorch/issues/17678) 18 | * [On threading in PyTorch](https://github.com/pytorch/pytorch/issues/19001) 19 | * [Getting started with CUDA Graphs](https://developer.nvidia.com/blog/cuda-graphs/) 20 | * [Accelerating PyTorch with CUDA Graphs](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/) -------------------------------------------------------------------------------- /week01_intro/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week01_intro/lecture.pdf -------------------------------------------------------------------------------- /week02_management_and_testing/README.md: -------------------------------------------------------------------------------- 1 | # Week 2: Experiment tracking and testing 2 | 3 | * Lecture: [link](./lecture.pdf) 4 | * Seminar: see the [example_project](./example_project) directory 5 | * Homework: see [homework/README.md](homework/README.md) 6 | 7 | ## Further reading 8 | * Tools for
experiment tracking: [Aim](https://github.com/aimhubio/aim), [Comet](https://www.comet.ml/site/), [Neptune](https://neptune.ai/), [Sacred](https://github.com/IDSIA/sacred), [Weights and Biases](https://wandb.ai/), [ClearML](https://clear.ml/) 9 | * [DVC](https://dvc.org/) and [Pachyderm](https://www.pachyderm.com/) for artifact versioning 10 | * [Hydra documentation](https://hydra.cc/docs/intro/) 11 | * [Omegaconf](https://github.com/omry/omegaconf) for simpler configuration management 12 | * [Unittest](https://docs.python.org/3/library/unittest.html) built-in module 13 | * [Doctest](https://docs.python.org/3/library/doctest.html) built-in module (useful for testing docstrings!) 14 | * [Pytest](https://github.com/pytest-dev/pytest/) repository 15 | * Pytest plugins: [pytest-xdist](https://pypi.org/project/pytest-xdist/) for parallel execution, [pytest-cov](https://pytest-cov.readthedocs.io/en/latest/readme.html) for coverage reports. 16 | * [Hypothesis quick start guide](https://hypothesis.readthedocs.io/en/latest/quickstart.html) and [integration with pytest](https://hypothesis.readthedocs.io/en/latest/details.html#the-hypothesis-pytest-plugin) 17 | * [Full Stack Deep Learning "Troubleshooting & Testing" lecture](https://fullstackdeeplearning.com/course/2022/lecture-3-troubleshooting-and-testing/#4-resources) 18 | * [Made With ML MLOps Course, "Testing Machine Learning Systems: Code, Data and Models"](https://madewithml.com/courses/mlops/testing/) -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/compute_metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | 4 | import torch 5 | import torchvision.transforms as transforms 6 | from torchvision.datasets import CIFAR10 7 | from torchvision.models import resnet18 8 | 9 | from hparams import config 10 | 11 | 12 | def main(args): 13 | transform = transforms.Compose([ 14 | transforms.ToTensor(), 15 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)) 16 | ]) 17 | 18 | test_dataset = CIFAR10(root='CIFAR10/test', 19 | train=False, 20 | transform=transform, 21 | download=False, 22 | ) 23 | 24 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 25 | batch_size=config["batch_size"]) 26 | 27 | device = torch.device("cuda") 28 | 29 | model = resnet18(pretrained=False, num_classes=10) 30 | model.load_state_dict(torch.load("model.pt")) 31 | model.to(device) 32 | 33 | correct = 0.0 34 | 35 | for test_images, test_labels in test_loader: 36 | test_images = test_images.to(device) 37 | test_labels = test_labels.to(device) 38 | 39 | with torch.inference_mode(): 40 | outputs = model(test_images) 41 | preds = torch.argmax(outputs, 1) 42 | correct += (preds == test_labels).sum() 43 | 44 | accuracy = correct / len(test_dataset) 45 | 46 | with open("final_metrics.json", "w+") as f: 47 | json.dump({"accuracy": accuracy.item()}, f) 48 | print("\n", file=f) 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = ArgumentParser() 53 | args = parser.parse_args() 54 | main(args) 55 | -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | prepare_data: 3 | cmd: python prepare_data.py 4 | deps: 5 | - prepare_data.py 6 | outs: 7 | - CIFAR10 8 | train: 9 | cmd: python train.py 10 | deps: 11 | 
- CIFAR10 12 | - hparams.py 13 | - train.py 14 | outs: 15 | - model.pt 16 | compute_metrics: 17 | cmd: python compute_metrics.py 18 | deps: 19 | - CIFAR10 20 | - compute_metrics.py 21 | - model.pt 22 | metrics: 23 | - final_metrics.json: 24 | cache: false 25 | -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/hparams.py: -------------------------------------------------------------------------------- 1 | config = dict( 2 | batch_size=64, 3 | learning_rate=1e-5, 4 | weight_decay=0.01, 5 | epochs=2, 6 | zero_init_residual=False, 7 | ) 8 | -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/prepare_data.py: -------------------------------------------------------------------------------- 1 | from torchvision.datasets import CIFAR10 2 | 3 | if __name__ == "__main__": 4 | train_dataset = CIFAR10("CIFAR10/train", download=True) 5 | test_dataset = CIFAR10("CIFAR10/test", download=True) 6 | -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "homework" 3 | version = "0.1.0" 4 | description = "Sample Text" 5 | authors = [ "YZ " ] 6 | requires-python = ">=3.10" 7 | readme = "README.md" 8 | 9 | dependencies = [ 10 | "torch==2.1.2", 11 | "torchvision==0.16.2", 12 | "wandb>=0.13.10", 13 | "tqdm==4.66.1", 14 | "numpy==1.26.4", 15 | "dvc==2.44.0", 16 | "hydra-core==1.3.1", 17 | "omegaconf==2.3.0", 18 | ] 19 | 20 | [tool.uv] 21 | dev-dependencies = [ 22 | "pytest==7.4.4", 23 | "pytest-cov==4.1.0", 24 | ] 25 | -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/test_basic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | 4 | from train import compute_accuracy 5 | 6 | def test_arange_elems(): 7 | arr = torch.arange(0, 10, dtype=torch.float) 8 | assert torch.allclose(arr[-1], torch.tensor([9]).float()) # 9 | 10 | def test_div_zero(): 11 | a = torch.zeros(1,dtype=torch.long) 12 | b = torch.ones(1,dtype=torch.long) 13 | 14 | assert not torch.isfinite(b/a) 15 | 16 | 17 | def test_div_zero_python(): 18 | with pytest.raises(ZeroDivisionError): 19 | 1/0 # 20 | 21 | def test_accuracy(): 22 | preds = torch.randint(0,2,size=(100,)) 23 | targets = preds.clone() 24 | 25 | assert compute_accuracy(preds, targets) == 1.0 26 | 27 | preds = torch.tensor([1,2,3,0,0,0]) 28 | targets = torch.tensor([1,2,3,4,5,6]) 29 | 30 | assert compute_accuracy(preds, targets) == 0.5 # This is bad - why? 
31 | 32 | @pytest.mark.parametrize("preds,targets,result",[ 33 | (torch.tensor([1,2,3]),torch.tensor([1,2,3]), 1.0), 34 | (torch.tensor([1,2,3]),torch.tensor([0,0,0]), 0.0), 35 | (torch.tensor([1,2,3]),torch.tensor([1,2,0]), 2/3), 36 | ]) 37 | def test_accuracy_parametrized(preds, targets, result): 38 | assert torch.allclose(compute_accuracy(preds, targets), torch.tensor([result]), rtol=0, atol=1e-5) 39 | -------------------------------------------------------------------------------- /week02_management_and_testing/example_project/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.transforms as transforms 4 | import wandb 5 | from torchvision.datasets import CIFAR10 6 | from torchvision.models import resnet18 7 | from tqdm import tqdm, trange 8 | 9 | from hparams import config 10 | 11 | def compute_accuracy(preds, targets): 12 | result = (targets == preds).float().sum() 13 | return result 14 | 15 | 16 | def main(): 17 | wandb.init(config=config, project="effdl_example", name="baseline") 18 | 19 | transform = transforms.Compose([ 20 | transforms.ToTensor(), 21 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)), 22 | transforms.Resize((224, 224)), 23 | ]) 24 | 25 | train_dataset = CIFAR10(root='CIFAR10/train', 26 | train=True, 27 | transform=transform, 28 | download=False, 29 | ) 30 | 31 | test_dataset = CIFAR10(root='CIFAR10/test', 32 | train=False, 33 | transform=transform, 34 | download=False, 35 | ) 36 | 37 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 38 | batch_size=config["batch_size"], 39 | shuffle=True) 40 | 41 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 42 | batch_size=config["batch_size"]) 43 | 44 | device = torch.device("cuda") 45 | 46 | model = resnet18(pretrained=False, num_classes=10, zero_init_residual=config["zero_init_residual"]) 47 | model.to(device) 48 | wandb.watch(model) 49 | 50 | criterion = nn.CrossEntropyLoss() 51 | optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"]) 52 | 53 | for epoch in trange(config["epochs"]): 54 | for i, (images, labels) in enumerate(tqdm(train_loader)): 55 | images = images.to(device) 56 | labels = labels.to(device) 57 | 58 | outputs = model(images) 59 | loss = criterion(outputs, labels) 60 | 61 | loss.backward() 62 | optimizer.step() 63 | optimizer.zero_grad() 64 | 65 | if i % 100 == 0: 66 | all_preds = [] 67 | all_labels = [] 68 | 69 | for test_images, test_labels in test_loader: 70 | test_images = test_images.to(device) 71 | test_labels = test_labels.to(device) 72 | 73 | with torch.inference_mode(): 74 | outputs = model(test_images) 75 | preds = torch.argmax(outputs, 1) 76 | 77 | all_preds.append(preds) 78 | all_labels.append(test_labels) 79 | 80 | accuracy = compute_accuracy(torch.cat(all_preds), torch.cat(all_labels)) 81 | 82 | metrics = {'test_acc': accuracy, 'train_loss': loss} 83 | wandb.log(metrics, step=epoch * len(train_dataset) + (i + 1) * config["batch_size"]) 84 | torch.save(model.state_dict(), "model.pt") 85 | 86 | with open("run_id.txt", "w+") as f: 87 | print(wandb.run.id, file=f) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/README.md: -------------------------------------------------------------------------------- 1 | # Week 2 home assignment 2 | 3 | This 
assignment consists of 4 parts: you can earn the full amount of points by completing the first two and either of 4 | tasks 3 and 4 (or both of them for bonus points). 5 | However, completing tasks 3 or 4 without the first two will not give you any points. 6 | 7 | # Problem statement 8 | You are given a small codebase that should train an **unconditional** [Denoising Diffusion Probabilistic Model](https://arxiv.org/abs/2006.11239) 9 | on the CIFAR-10 dataset. 10 | However, this project contains several bugs of different severity, and even some of the tests are written incorrectly. 11 | A correct implementation will achieve *somewhat* decent results after training for 100 epochs (~2 hours on an average GPU), 12 | but you should not expect much in terms of quality. 13 | In this homework, we are going to have a deeper look at the training pipeline, try to fix any errors we find and make 14 | the code more reliable and reproducible. 15 | 16 | # Task 1 (6.5 points) 17 | Implement *correct* tests for the training pipeline. 18 | Specifically, have a look at the current [tests](./tests) folder: it contains several files with tests, 19 | some of which fail consistently, fail intermittently, or are plainly incorrect. 20 | Your task is to identify the bugs and make the test suite pass deterministically: this will involve changes 21 | both to `modeling` and to `tests`, as some parts of the testing code need to be modified as well. 22 | 23 | In your report, please tell us how you found the bugs in all parts of the code. 24 | You can find the original implementation of DDPM that we use in this assignment, but citing it as the explanation for 25 | your fixes will earn you no points. 26 | Obviously, "solving" the assignment by removing all tests or having unreasonably high thresholds will not earn 27 | you a good grade either. 28 | 29 | After that, implement the `test_training` function in `test_pipeline.py` that runs an integration test for the 30 | entire training procedure with different hyperparameters and expects different outcomes. 31 | This test should increase the coverage of the `modeling.training` file (measured by [pytest-cov](https://github.com/pytest-dev/pytest-cov)) to **>80%**. 32 | 33 | Importantly, you should ensure that your test code running the actual model can run on both CPU and GPU. 34 | Since training on CPU even for 1 epoch might take too long, you need to implement training on a subset of data. 35 | 36 | 37 | # Task 2 (1.5 points) 38 | Implement logging of the metrics and artifacts during training with [Weights and Biases](https://wandb.ai/site). 39 | You should log the following values: 40 | * Training loss and the learning rate 41 | * All training hyperparameters (including batch size, number of epochs etc., as well as all model and diffusion hyperparameters) 42 | * Inputs to the model (1 batch is enough) and samples from it after each epoch 43 | 44 | However, you should **NOT** log the training code for the model. 45 | 46 | Logging the hyperparameters and metrics will likely involve some refactoring of the original codebase. 47 | You can either place the necessary hyperparameters in a config file or simply have them as constants/argparse defaults 48 | defined somewhere reasonable in the training code. 49 | 50 | After finishing this task, train the model for at least 100 epochs with default hyperparameters and attach the link to 51 | your W&B project containing this run to the final report.
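To make the expected logging concrete, here is a minimal sketch using the W&B Python API; the project name, the config keys and the `inputs`/`samples` variables are illustrative assumptions, not names from the provided code:

```python
# Sketch only: config keys, the project name and the `inputs`/`samples`
# variables are illustrative, not taken from the provided codebase.
import wandb

config = {"batch_size": 128, "num_epochs": 100, "learning_rate": 1e-5}

with wandb.init(project="effdl_ddpm_example", config=config) as run:
    for epoch in range(config["num_epochs"]):
        # ... run one training epoch, obtaining the last loss value and
        # the current learning rate from the optimizer ...
        loss, lr = 0.0, config["learning_rate"]  # placeholders so the sketch runs
        run.log({"train_loss": loss, "lr": lr})
        # Log one batch of inputs and the samples generated after the epoch:
        # run.log({"inputs": wandb.Image(inputs), "samples": wandb.Image(samples)})
```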
52 | 53 | # Task 3 (2 points) 54 | Improve the configuration process of this pipeline using the [Hydra](https://hydra.cc/) library (a minimal entrypoint sketch is given at the end of this README). 55 | You should create a config that allows adjusting at least the following attributes: 56 | * Peak learning rate and optimizer momentum 57 | * Optimizer (Adam by default, at least SGD should be supported) 58 | * Training batch size and the number of epochs 59 | * Number of workers in the dataloader 60 | * Whether random flip augmentations are applied 61 | 62 | Demonstrate that your integration works by running at least three *complete* runs (less than 100 epochs is OK) 63 | with hyperparameters changed via the config file. 64 | From these runs, it should be evident that changing hyperparameters affects the training procedure. 65 | Here, you should log the config using [run.log_artifact](https://docs.wandb.ai/ref/python/run#log_artifact) 66 | and show that this changes the hyperparameters of the run in W&B. 67 | 68 | # Task 4 (2 points) 69 | Make the pipeline reproducible using [Data Version Control](https://dvc.org/). 70 | You should end up with a `dvc.yaml` that represents two stages of your experiment with corresponding inputs and outputs: 71 | getting the data (yes, you need to refactor that part of the code) and training the model itself. 72 | Also, you should specify the relevant code and configuration as dependencies of the corresponding pipeline stages. 73 | Lastly, after running your code, you should have a `dvc.lock` that stores hashes of all artifacts in your pipeline. 74 | Submit both `dvc.yaml` and `dvc.lock` as parts of your solution. 75 | 76 | Importantly, modifying any of the relevant modules or hyperparameters should trigger an invalidation of the 77 | corresponding pipeline stages: that is, `dvc repro` should do nothing if and only if `dvc.lock` is consistent with 78 | hashes of all dependencies in the pipeline. 79 | 80 | If you have also done the Hydra configuration assignment, make sure to check out [this guide](https://dvc.org/doc/user-guide/experiment-management/hydra-composition) 81 | on integrating Hydra with DVC experiment management. 82 | 83 | # Submission format 84 | When submitting this assignment, you should attach a .zip archive that contains: 85 | - The source code with all your fixes and improvements 86 | - A Markdown/PDF report in the root of the project folder that: 87 | 1. Details the changes you made to the original code (we will run `diff` and see if everything is explained) 88 | 2. Tells how to run the modified code (i.e., which command line arguments you have added and how to use them) 89 | 3. Describes your process of fixing and adding new tests for Task 1 and reports the test coverage 90 | 4. Gives a link to the Weights and Biases project with all necessary logs for tasks 2 and 3 91 | - If you solved Tasks 3 or 4, please ensure that the archived project contains the corresponding configuration/lock files as well. 92 | - An updated `requirements.txt` file, if your solution requires new dependencies such as `wandb`, `hydra-core` or `dvc`.
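As referenced in Task 3, here is a minimal sketch of what a Hydra entrypoint could look like; the config path, file name and field names are illustrative assumptions rather than parts of the provided codebase:

```python
# Sketch only: the config path, file name and field names are assumptions.
import hydra
from omegaconf import DictConfig, OmegaConf


@hydra.main(config_path="configs", config_name="train", version_base=None)
def main(cfg: DictConfig) -> None:
    # Fields such as cfg.optimizer.name or cfg.train.batch_size come from
    # configs/train.yaml and can be overridden from the CLI, e.g.
    #   python main.py optimizer.name=sgd train.batch_size=64 data.random_flip=false
    print(OmegaConf.to_yaml(cfg))
    # ... build the dataloader, model and optimizer from cfg and train ...


if __name__ == "__main__":
    main()
```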
-------------------------------------------------------------------------------- /week02_management_and_testing/homework/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torchvision import transforms 4 | from torchvision.datasets import CIFAR10 5 | 6 | from modeling.diffusion import DiffusionModel 7 | from modeling.training import generate_samples, train_epoch 8 | from modeling.unet import UnetModel 9 | 10 | 11 | def main(device: str, num_epochs: int = 100): 12 | ddpm = DiffusionModel( 13 | eps_model=UnetModel(3, 3, hidden_size=128), 14 | betas=(1e-4, 0.02), 15 | num_timesteps=1000, 16 | ) 17 | ddpm.to(device) 18 | 19 | train_transforms = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] 21 | ) 22 | 23 | dataset = CIFAR10( 24 | "cifar10", 25 | train=True, 26 | download=True, 27 | transform=train_transforms, 28 | ) 29 | 30 | dataloader = DataLoader(dataset, batch_size=128, num_workers=4, shuffle=True) 31 | optim = torch.optim.Adam(ddpm.parameters(), lr=1e-5) 32 | 33 | for i in range(num_epochs): 34 | train_epoch(ddpm, dataloader, optim, device) 35 | generate_samples(ddpm, device, f"samples/{i:02d}.png") 36 | 37 | 38 | if __name__ == "__main__": 39 | device = "cuda" if torch.cuda.is_available() else "cpu" 40 | main(device=device) 41 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week02_management_and_testing/homework/modeling/__init__.py -------------------------------------------------------------------------------- /week02_management_and_testing/homework/modeling/diffusion.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class DiffusionModel(nn.Module): 8 | def __init__( 9 | self, 10 | eps_model: nn.Module, 11 | betas: Tuple[float, float], 12 | num_timesteps: int, 13 | ): 14 | super().__init__() 15 | self.eps_model = eps_model 16 | 17 | for name, schedule in get_schedules(betas[0], betas[1], num_timesteps).items(): 18 | self.register_buffer(name, schedule) 19 | 20 | self.num_timesteps = num_timesteps 21 | self.criterion = nn.MSELoss() 22 | 23 | def forward(self, x: torch.Tensor) -> torch.Tensor: 24 | timestep = torch.randint(1, self.num_timesteps + 1, (x.shape[0],)) 25 | eps = torch.rand_like(x) 26 | 27 | x_t = ( 28 | self.sqrt_alphas_cumprod[timestep, None, None, None] * x 29 | + self.one_minus_alpha_over_prod[timestep, None, None, None] * eps 30 | ) 31 | 32 | return self.criterion(eps, self.eps_model(x_t, timestep / self.num_timesteps)) 33 | 34 | def sample(self, num_samples: int, size, device) -> torch.Tensor: 35 | 36 | x_i = torch.randn(num_samples, *size) 37 | 38 | for i in range(self.num_timesteps, 0, -1): 39 | z = torch.randn(num_samples, *size) if i > 1 else 0 40 | eps = self.eps_model(x_i, torch.tensor(i / self.num_timesteps).repeat(num_samples, 1).to(device)) 41 | x_i = self.inv_sqrt_alphas[i] * (x_i - eps * self.one_minus_alpha_over_prod[i]) + self.sqrt_betas[i] * z 42 | 43 | return x_i 44 | 45 | 46 | def get_schedules(beta1: float, beta2: float, num_timesteps: int) -> Dict[str, torch.Tensor]: 47 | assert beta1 < 
beta2 < 1.0, "beta1 and beta2 must be in (0, 1)" 48 | 49 | betas = (beta2 - beta1) * torch.arange(0, num_timesteps + 1, dtype=torch.float32) / num_timesteps + beta1 50 | sqrt_betas = torch.sqrt(betas) 51 | alphas = 1 - betas 52 | 53 | alphas_cumprod = torch.cumprod(alphas, dim=0) 54 | 55 | sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod) 56 | inv_sqrt_alphas = 1 / torch.sqrt(alphas) 57 | 58 | sqrt_one_minus_alpha_prod = torch.sqrt(1 - alphas_cumprod) 59 | one_minus_alpha_over_prod = (1 - alphas) / sqrt_one_minus_alpha_prod 60 | 61 | return { 62 | "alphas": alphas, 63 | "inv_sqrt_alphas": inv_sqrt_alphas, 64 | "sqrt_betas": sqrt_betas, 65 | "alphas_cumprod": alphas_cumprod, 66 | "sqrt_alphas_cumprod": sqrt_alphas_cumprod, 67 | "sqrt_one_minus_alpha_prod": sqrt_one_minus_alpha_prod, 68 | "one_minus_alpha_over_prod": one_minus_alpha_over_prod, 69 | } 70 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/modeling/training.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim.optimizer import Optimizer 3 | from torch.utils.data import DataLoader 4 | from torchvision.utils import make_grid, save_image 5 | from tqdm import tqdm 6 | 7 | from modeling.diffusion import DiffusionModel 8 | 9 | 10 | def train_step(model: DiffusionModel, inputs: torch.Tensor, optimizer: Optimizer, device: str): 11 | optimizer.zero_grad() 12 | inputs = inputs.to(device) 13 | loss = model(inputs) 14 | loss.backward() 15 | optimizer.step() 16 | return loss 17 | 18 | 19 | def train_epoch(model: DiffusionModel, dataloader: DataLoader, optimizer: Optimizer, device: str): 20 | model.train() 21 | pbar = tqdm(dataloader) 22 | loss_ema = None 23 | for x, _ in pbar: 24 | train_loss = train_step(model, x, optimizer, device) 25 | loss_ema = train_loss if loss_ema is None else 0.9 * loss_ema + 0.1 * train_loss 26 | pbar.set_description(f"loss: {loss_ema:.4f}") 27 | 28 | 29 | def generate_samples(model: DiffusionModel, device: str, path: str): 30 | model.eval() 31 | with torch.no_grad(): 32 | samples = model.sample(8, (3, 32, 32), device=device) 33 | grid = make_grid(samples, nrow=4) 34 | save_image(grid, path) 35 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/modeling/unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ConvBlock(nn.Module): 6 | def __init__(self, in_channels: int, out_channels: int, residual: bool = False): 7 | super().__init__() 8 | self.main = nn.Sequential( 9 | nn.Conv2d(in_channels, out_channels, 3, 1, 1), 10 | nn.GroupNorm(8, out_channels), 11 | nn.ReLU(), 12 | ) 13 | self.conv = nn.Sequential( 14 | nn.Conv2d(out_channels, out_channels, 3, 1, 1), 15 | nn.GroupNorm(8, out_channels), 16 | nn.ReLU(), 17 | nn.Conv2d(out_channels, out_channels, 3, 1, 1), 18 | nn.GroupNorm(8, out_channels), 19 | nn.ReLU(), 20 | ) 21 | 22 | self.is_res = residual 23 | 24 | def forward(self, x: torch.Tensor) -> torch.Tensor: 25 | x = self.main(x) 26 | if self.is_res: 27 | x = x + self.conv(x) 28 | return x / 1.414 29 | else: 30 | return self.conv(x) 31 | 32 | 33 | class DownBlock(nn.Module): 34 | def __init__(self, in_channels: int, out_channels: int): 35 | super().__init__() 36 | self.layers = nn.Sequential(ConvBlock(in_channels, out_channels), nn.MaxPool2d(2)) 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 
| return self.layers(x) 40 | 41 | 42 | class UpBlock(nn.Module): 43 | def __init__(self, in_channels: int, out_channels: int): 44 | super().__init__() 45 | self.layers = nn.Sequential( 46 | nn.ConvTranspose2d(in_channels, out_channels, 2, 2), 47 | ConvBlock(out_channels, out_channels), 48 | ConvBlock(out_channels, out_channels), 49 | ) 50 | 51 | def forward(self, x: torch.Tensor, skip: torch.Tensor) -> torch.Tensor: 52 | x = torch.cat((x, skip), 1) 53 | x = self.layers(x) 54 | 55 | return x 56 | 57 | 58 | class TimestepEmbedding(nn.Module): 59 | def __init__(self, emb_dim: int): 60 | super().__init__() 61 | 62 | self.lin1 = nn.Linear(1, emb_dim, bias=False) 63 | self.lin2 = nn.Linear(emb_dim, emb_dim) 64 | 65 | def forward(self, x: torch.Tensor) -> torch.Tensor: 66 | x = x.view(-1, 1) 67 | x = torch.sin(self.lin1(x)) 68 | x = self.lin2(x) 69 | return x 70 | 71 | 72 | class UnetModel(nn.Module): 73 | def __init__(self, in_channels: int, out_channels: int, hidden_size: int = 256): 74 | super().__init__() 75 | self.in_channels = in_channels 76 | self.out_channels = out_channels 77 | 78 | self.hidden_size = hidden_size 79 | 80 | self.init_conv = ConvBlock(in_channels, hidden_size, residual=True) 81 | 82 | self.down1 = DownBlock(hidden_size, hidden_size) 83 | self.down2 = DownBlock(hidden_size, 2 * hidden_size) 84 | self.down3 = DownBlock(2 * hidden_size, 2 * hidden_size) 85 | 86 | self.to_vec = nn.Sequential(nn.AvgPool2d(4), nn.ReLU()) 87 | 88 | self.timestep_embedding = TimestepEmbedding(2 * hidden_size) 89 | 90 | self.up0 = nn.Sequential( 91 | nn.ConvTranspose2d(2 * hidden_size, 2 * hidden_size, 4, 4), 92 | nn.GroupNorm(8, 2 * hidden_size), 93 | nn.ReLU(), 94 | ) 95 | 96 | self.up1 = UpBlock(4 * hidden_size, 2 * hidden_size) 97 | self.up2 = UpBlock(4 * hidden_size, hidden_size) 98 | self.up3 = UpBlock(2 * hidden_size, hidden_size) 99 | self.out = nn.Conv2d(2 * hidden_size, self.out_channels, 3, 1, 1) 100 | 101 | def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor: 102 | x = self.init_conv(x) 103 | 104 | down1 = self.down1(x) 105 | down2 = self.down2(down1) 106 | down3 = self.down3(down2) 107 | 108 | thro = self.to_vec(down3) 109 | temb = self.timestep_embedding(t) 110 | 111 | thro = self.up0(thro + temb) 112 | 113 | up1 = self.up1(thro, down3) + temb 114 | up2 = self.up2(up1, down2) 115 | up3 = self.up3(up2, down1) 116 | 117 | out = self.out(torch.cat((up3, x), 1)) 118 | 119 | return out 120 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.2 2 | torchvision==0.16.2 3 | tqdm==4.66.1 4 | pytest==7.4.4 5 | pytest-cov==4.1.0 6 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week02_management_and_testing/homework/tests/__init__.py -------------------------------------------------------------------------------- /week02_management_and_testing/homework/tests/test_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from modeling.diffusion import DiffusionModel 5 | from modeling.unet import UnetModel 6 | 7 | 8 | @pytest.mark.parametrize( 9 | [ 10 | 
"input_tensor", 11 | "num_timesteps", 12 | ], 13 | [ 14 | ( 15 | torch.randn(2, 3, 32, 32), 16 | 10, 17 | ), 18 | ( 19 | torch.randn(2, 3, 64, 64), 20 | 20, 21 | ), 22 | ( 23 | torch.randn(2, 3, 128, 128), 24 | 30, 25 | ), 26 | ( 27 | torch.randn(2, 3, 256, 256), 28 | 40, 29 | ), 30 | ], 31 | ) 32 | def test_unet(input_tensor, num_timesteps): 33 | B, C, H, W = input_tensor.shape 34 | net = UnetModel(C, C, hidden_size=128) 35 | timestep = torch.randint(1, num_timesteps + 1, (B,)) / num_timesteps 36 | out = net(input_tensor, timestep) 37 | assert out.shape == input_tensor.shape 38 | 39 | 40 | def test_diffusion(num_channels=3, batch_size=4): 41 | # note: you should not need to change the thresholds or the hyperparameters 42 | net = UnetModel(num_channels, num_channels, hidden_size=128) 43 | model = DiffusionModel(eps_model=net, betas=(1e-4, 0.02), num_timesteps=20) 44 | 45 | input_data = torch.randn((batch_size, num_channels, 32, 32)) 46 | 47 | output = model(input_data) 48 | assert output.ndim == 0 49 | assert 1.0 <= output <= 1.2 50 | -------------------------------------------------------------------------------- /week02_management_and_testing/homework/tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch.utils.data import DataLoader 4 | from torchvision.transforms import Compose, ToTensor, Normalize 5 | from torchvision.datasets import CIFAR10 6 | 7 | from modeling.diffusion import DiffusionModel 8 | from modeling.training import train_step 9 | from modeling.unet import UnetModel 10 | 11 | 12 | @pytest.fixture 13 | def train_dataset(): 14 | transforms = Compose([ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 15 | dataset = CIFAR10( 16 | "./data", 17 | train=True, 18 | download=True, 19 | transform=transforms, 20 | ) 21 | return dataset 22 | 23 | 24 | @pytest.mark.parametrize(["device"], [["cpu"], ["cuda"]]) 25 | def test_train_on_one_batch(device, train_dataset): 26 | # note: you should not need to increase the threshold or change the hyperparameters 27 | ddpm = DiffusionModel( 28 | eps_model=UnetModel(3, 3, hidden_size=32), 29 | betas=(1e-4, 0.02), 30 | num_timesteps=1000, 31 | ) 32 | ddpm.to(device) 33 | 34 | optim = torch.optim.Adam(ddpm.parameters(), lr=5e-4) 35 | dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True) 36 | 37 | x, _ = next(iter(dataloader)) 38 | loss = None 39 | for i in range(50): 40 | loss = train_step(ddpm, x, optim, device) 41 | assert loss < 0.5 42 | 43 | 44 | def test_training(): 45 | # note: implement and test a complete training procedure (including sampling) 46 | pass 47 | -------------------------------------------------------------------------------- /week02_management_and_testing/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week02_management_and_testing/lecture.pdf -------------------------------------------------------------------------------- /week03_fast_pipelines/README.md: -------------------------------------------------------------------------------- 1 | # Week 3: Training optimizations, profiling DL code 2 | 3 | * Lecture: [slides](./lecture.pdf) 4 | * Seminar: [folder](./seminar) 5 | * Homework: see [homework/README.md](homework/README.md) 6 | 7 | ## Further reading 8 | * [Blog post about reduced precision FP 
formats](https://moocaholic.medium.com/fp64-fp32-fp16-bfloat16-tf32-and-other-members-of-the-zoo-a1ca7897d407) 9 | * NVIDIA blog posts about [mixed precision training with Tensor Cores](https://developer.nvidia.com/blog/video-mixed-precision-techniques-tensor-cores-deep-learning/), [Tensor Core performance tips](https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/), [TF32 Tensor Cores](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) 10 | * Presentations about Tensor Cores: [one](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9926-tensor-core-performance-the-ultimate-guide.pdf), [two](https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21929-tensor-core-performance-on-nvidia-gpus-the-ultimate-guide.pdf), [three](https://nvlabs.github.io/eccv2020-mixed-precision-tutorial/files/dusan_stosic-training-neural-networks-with-tensor-cores.pdf) 11 | * [Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) and [Mixed Precision Training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#mptrain) sections of the [NVIDIA DL performance guide](https://docs.nvidia.com/deeplearning/performance/index.html) 12 | * [Automatic Mixed Precision in PyTorch](https://pytorch.org/docs/stable/amp.html) 13 | * [TF32 section of PyTorch CUDA docs](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) 14 | * [FP8 Formats for Deep Learning paper](https://arxiv.org/abs/2209.05433) 15 | * [PyTorch Architecture Optimization](https://github.com/pytorch/ao) for FP8 training and other optimizations 16 | * [Float8 in PyTorch discussion](https://dev-discuss.pytorch.org/t/float8-in-pytorch-1-x/1815) 17 | * [AMP](https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options), [FP16](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) and [BF16](https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options) in DeepSpeed 18 | * [PyTorch Performance Tuning Guide](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#) 19 | * [Latency Numbers Every Programmer Should Know](https://colin-scott.github.io/personal_website/research/interactive_latency.html) 20 | * [Pillow Performance benchmarks](https://python-pillow.org/pillow-perf/) 21 | * [Faster Image Processing](https://fastai1.fast.ai/performance.html#faster-image-processing) tips from fastai docs 22 | * [Rapid Data Pre-Processing with NVIDIA DALI](https://developer.nvidia.com/blog/rapid-data-pre-processing-with-nvidia-dali/) 23 | * General-purpose Python profilers: [builtins (cProfile and profile)](https://docs.python.org/3/library/profile.html), [pyinstrument](https://github.com/joerick/pyinstrument), [memory_profiler](https://github.com/pythonprofilers/memory_profiler), [py-spy](https://github.com/benfred/py-spy), [Scalene](https://github.com/plasma-umass/scalene) 24 | * [DLProf user guide](https://docs.nvidia.com/deeplearning/frameworks/dlprof-user-guide/index.html) 25 | * [How to profile with DLProf](https://tigress-web.princeton.edu/~jdh4/how_to_profile_with_dlprof_may_2021.pdf) 26 | * [Profiling and Optimizing Deep Neural Networks with DLProf and PyProf](https://developer.nvidia.com/blog/profiling-and-optimizing-deep-neural-networks-with-dlprof-and-pyprof/) 27 | * NVIDIA presentations on [profiling DL 
networks](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9339-profiling-deep-learning-networks.pdf), [profiling for DL and mixed precision](https://on-demand.gputechconf.com/gtc-cn/2019/pdf/CN9620/presentation.pdf) 28 | * [Profiling Deep Learning Workloads](https://extremecomputingtraining.anl.gov/files/2020/08/ATPESC-2020-Track-8-Talk-7-Emani-ProfilingDLWorkloads.pdf) 29 | * [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) and [PyTorch Profiler with TensorBoard](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html) tutorial 30 | * [torch.utils.bottleneck quick guide](https://pytorch.org/docs/stable/bottleneck.html) 31 | * [PyTorch Autograd profiler tutorial](https://pytorch.org/tutorials/beginner/profiler.html) 32 | * [Nsight Systems](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) and [Nsight Compute](https://docs.nvidia.com/nsight-compute/2022.1/index.html) user guides 33 | * [Video tutorial about speeding up and profiling neural networks](https://www.youtube.com/watch?v=ySGIaOb_RDY) 34 | * [Solving Machine Learning Performance Anti-Patterns: a Systematic Approach](https://paulbridger.com/posts/nsight-systems-systematic-optimization/) -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/README.md: -------------------------------------------------------------------------------- 1 | # Week 3 home assignment 2 | 3 | The assignment for this week consists of three parts: all parts are obligatory, no bonus tasks are given, but you can earn more than 10 points in total. 4 | Implement your solutions in the folders for the corresponding tasks. 5 | Create a report for your homework: briefly describe 6 | the structure of your solution for each section, include benchmark results in the tables, and provide explanations of the observed results. 7 | Poorly written reports will result in a reduced grade for the assignment! 8 | 9 | Make sure to install the necessary packages from `requirements.txt` in the week's folder. 10 | 11 | ## Submission format 12 | - For the report, you need to create an `.ipynb` or a `.pdf` file. 13 | - Create a `.zip` archive that contains: 14 | - Folders with your solutions for each task 15 | - The report file with instructions on how to run each part, results of running the code and (when necessary) your analysis 16 | - Upload this archive when submitting the assignment 17 | 18 | ## Task 1: DIY loss scaling (2 points) 19 | Implement [loss scaling](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#lossscaling) for the AMP training mode. 20 | Use the provided semantic segmentation pipeline in [`task1`](./task1). 21 | Your task is to train the model in the AMP mode with a loss scaler implemented by you. 22 | You **can use** `torch.cuda.amp.autocast`, and you **cannot use** `torch.cuda.amp.GradScaler()` (you may use it only to check your solution). 23 | 24 | Let us recall what loss scaling is. 25 | Loss scaling is used to avoid the gradient underflow problem when computing gradients in FP16 precision. 26 | The issue here is that while training in full precision, we might acquire rather small values in the gradients, which vanish when the tensor is cast to half precision.
27 | To fix the problem, we use the following solution: 28 | 29 | - Make a forward pass for the model and compute the loss 30 | - Multiply the loss value by some factor 31 | - Call `.backward()` 32 | - Update the model's master weights with **unscaled** FP32 gradients 33 | 34 | Loss scaling can be done in two ways: static and dynamic. 35 | In the static mode, you choose a factor for scaling only once and use it for the whole training procedure. 36 | In the dynamic mode, you recompute the factor each time you scale the loss. 37 | 38 | ### Task 39 | - Implement static loss scaling (**1 point**) 40 | - Implement dynamic loss scaling (**1 point**) 41 | 42 | The task is done if you can consistently achieve high accuracy values (0.985+) within 5 training epochs. 43 | Note that you need to implement and successfully train with **both** scaling modes if you want to get a full grade for this task. 44 | As a starting point, you can run the training in the full precision mode, then try to run in the AMP mode with and without the PyTorch loss scaler. 45 | You will observe that adding a scaler gives you additional accuracy points. 46 | 47 | **Hint:** To make sure that you're doing everything right, you might want to examine the values of gradients: (almost) no zeros should be present there. 48 | 49 | ### Report instructions 50 | When you are done with the code, you can either: 51 | - Run the training function with implemented scaling modes in an `.ipynb` report 52 | - Include training logs AND instructions on how to run your code in a `.pdf` report 53 | 54 | ## Task 2: efficient batching for language modeling (4 points) 55 | In this part, you need to examine the efficiency of the four batching approaches we discussed during the seminar. 56 | Let us briefly recall them: 57 | 58 | **BRAIN**: pad everything to a fixed `max_length` 59 | 60 | **BIG BRAIN**: pad only in the `collate_fn` 61 | 62 | **ULTRA BIG BRAIN**: group examples of similar length into buckets, and sample examples for every batch from a single bucket 63 | 64 | **ULTRA DUPER BIG BRAIN**: pack all sequences into one long sequence and generate metadata that indicates where each original sequence starts and ends 65 | 66 | ### Task 67 | More formally, you need to download [WikiText-103 dataset (dropbox)](https://www.dropbox.com/scl/fi/e6oqpx6iuos7kn9m139z7/wikitext-103-raw-v1.zip?rlkey=81evwbaqfkxtckj8zhks7yied&st=6ept2pdm&dl=0), [WikiText-103 dataset (yandex disk)](https://disk.yandex.ru/d/xwMXnteHKDqehw) and implement all the mentioned approaches. 68 | Use only the training subset for all the task's subproblems. 69 | 70 | - For naive batching, implement a PyTorch `Dataset` class that will parse training data from the source files of the dataset and pad every sample to a fixed `max_length=640`. **(0.5 points)** 71 | - For the second approach, reimplement the `collate_fn` demo from the seminar for this dataset. **(0.5 points)** 72 | More specifically, you need to pad sequences only up to the maximum sample length in the current batch. 73 | - For the third approach, implement the `UltraBigBrainDataset` and the `UltraBigBrainBatchSampler` classes. **(1.5 points)** 74 | Objects of the `BatchSampler` class are iterables and yield a list of indices that correspond to dataset objects, which are put into a batch. 75 | You can pass this batch sampler to a `DataLoader`. 76 | For more information, refer to PyTorch [docs](https://pytorch.org/docs/stable/data.html#automatic-batching-default).
77 | Objects in each batch should have the same or similar length. 78 | Sample batches randomly, but ensure that the length difference between the longest and shortest samples is less than or equal to k (try different values of k: 1, 5, 10, 20, 50). 79 | Note that some batches may be shorter than the specified batch size. 80 | The `__init__` method must work in O(n) time, where n is the length of the dataset. 81 | The `__iter__` call must work in O(1) time with respect to the size of the dataset (i.e., in O(batch_size), independent of the dataset size). 82 | While processing the dataset, put all possible lengths of the samples into a hash table, where keys are lengths and values are containers with the indices of samples of this length. 83 | - For the fourth approach, we recommend using `IterableDataset`, which is a good choice when we don't know in advance how many samples are needed to form a batch. **(1.5 points)** 84 | If the last sample is too long, you can either truncate it or drop it from the dataset. 85 | Don't forget that you also need to build a correct attention mask to prevent cross-contamination of training examples and pass it to the model! 86 | 87 | For each of the implemented methods (and all variations of the third method), mock one training epoch and measure the minimum, maximum, mean and median batch processing times. 88 | To mock a training epoch, you need to construct a small GPT-2-like model: use an `nn.Embedding` layer, the `PositionalEncoding` class from the `transformer.py` file and a single `nn.TransformerDecoder` layer with a hidden size of 1024 and 8 heads. 89 | For tokenization, use the `.tokenize()` method of `AutoTokenizer.from_pretrained("bert-base-uncased")`. 90 | Run one epoch **without a backward pass**. 91 | Make sure you've [warmed up](https://forums.developer.nvidia.com/t/why-warm-up/48565) the GPU before computing the statistics and do not forget about asynchronous CUDA kernel execution. 92 | 93 | Keep in mind that all padding in this task must be **implemented by you**: unlike the seminar, PyTorch's default collate padding is not allowed. 94 | In every subproblem, for sequences longer than 640 tokens, just truncate the overflowing part. 95 | Feel free to modify the keyword arguments of functions. 96 | 97 | **Hint:** In the third subtask, you might want to use a hash table multiple times. 98 | **Hint 2:** In the third subtask, when `k=640`, you should obtain the same results as in Subtask 2. 99 | 100 | ### Report instructions 101 | When you are done with the code, you can either: 102 | - Display the benchmark results in a `pandas.DataFrame` in your `.ipynb` report 103 | - Display the benchmark results in a table in your `.pdf` report 104 | 105 | ## Task 3 (5 points) 106 | You are given a training script for a [Vision Transformer model](https://huggingface.co/docs/transformers/model_doc/vit) on the [Clothing dataset](https://www.kaggle.com/datasets/agrigorev/clothing-dataset-full). 107 | In this task, you need to implement a custom profiler to measure the performance of PyTorch models at the layer level. 108 | The profiler should track the execution time of each layer during the forward and backward passes and output the results in the trace event format. 109 | You also need to examine the bottlenecks of the training pipeline, including the model and the training loop (you can use any profilers you want here). 110 | The implementation of the model is based on the [`lucidrains/vit-pytorch`](https://github.com/lucidrains/vit-pytorch) repository.
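To make the expected output concrete, here is a minimal sketch of a trace in that format (the layer names and timings below are invented for illustration; only the keys are prescribed by the trace event format, with `ts` and `dur` given in microseconds):

```python
import json

# A minimal Perfetto-compatible trace: a list of "complete" events ("ph": "X"),
# each with a start timestamp `ts` and a duration `dur` in microseconds.
trace = {
    "traceEvents": [
        {"name": "transformer.layers.0: Attention (forward)",
         "ph": "X", "ts": 0, "dur": 250, "pid": 0, "tid": 0},
        {"name": "transformer.layers.0: FeedForward (forward)",
         "ph": "X", "ts": 250, "dur": 180, "pid": 0, "tid": 0},
    ]
}

with open("trace.json", "w") as f:
    json.dump(trace, f)  # this file can be opened at https://ui.perfetto.dev/
```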
111 | 112 | ### Task 113 | - Implement a basic profiler: (**2.5 points**) 114 | - Implement a [context manager](https://book.pythontips.com/en/latest/context_managers.html) to collect execution times for each layer. You have a skeleton of the `Profile` class; feel free to modify or extend it. We are only doing **layer-level** profiling here (not kernel-level). 115 | - Support **profiling schedule phases** (e.g., wait, warmup, active), similar to the [PyTorch profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#using-profiler-to-analyze-long-running-jobs). 116 | - Implement a `to_perfetto` method that exports data in the [trace event format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw), which is compatible with [Perfetto](https://ui.perfetto.dev/). 117 | - Profile a ViT model for several training iterations using your custom profiler. Visualize the results in the Perfetto UI. Compare your profiler's layer timings with those from the native PyTorch profiler (don't forget a warm-up phase!). **Report** any differences you observe in the measured times. 118 | 119 | - Profile CUDA kernels now: (**1 point**) 120 | - Update your profiler: insert **NVTX markers** via `torch.cuda.nvtx`. This will let you see **individual CUDA kernels** in the timeline when using Nsight Systems. **Remove any explicit synchronization**, because Nsight Systems can capture kernel timings directly from the GPU. 121 | - Run your script with **Nsight Systems**: 122 | ```bash 123 | nsys profile --env-var CUDA_VISIBLE_DEVICES="YOUR_GPU_ID" -o trace python3 main.py 124 | ``` 125 | - Open the resulting **`.nsys-rep`** file in Nsight Systems. Examine kernel-level details in the GPU timeline. **Report** whether you see any timing differences compared to your earlier runs. If you see any difference, can you explain the reasons? 126 | 127 | - Profile model performance during training, find deliberate inefficiencies we've left in the code, and fix them: (**1.5 points**) 128 | - There are 6 inefficiencies in total; you will get 0.25 points for each one you find 129 | - We expect that in your analysis, you will not only examine the time and memory consumption, but also provide explanations of 130 | whether the obtained results are reasonable. 131 | 132 | **Hints:** 133 | - Use PyTorch's forward and backward hooks to collect execution times for each module in the model. 134 | - Use `torch.cuda.synchronize()` and `torch.cuda.Event()` correctly to ensure GPU kernels complete before recording events, since all GPU operations are asynchronous ([Asynchronous Execution](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution)). 135 | - Inefficiencies could be anywhere in the code: they may be in data processing, model performance, the training loop — you name it. 136 | - You might want to look at the trace of operations instead of just per-operation profiling, as it contains a lot of useful information. 137 | 138 | ### Report instructions 139 | When you are done with investigations and fixes, you can either: 140 | - Report the profiler output AND its meaningful analysis in your `.ipynb` report file. 141 | List the fixes you made to the code. Be sure to describe how you found them, why the code was inefficient (with profiler screenshots/outputs), and why the suggested fixes help. 142 | - The same applies to the `.pdf` file, if you decide to submit your report in that format.
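For reference, here is a minimal sketch of the hook-plus-event mechanics described in the hints above. All names here are ours, not part of the provided skeleton; a complete solution also needs backward hooks, schedule phases, and the Perfetto export, and should avoid synchronizing after every module (done here only for simplicity):

```python
import torch

def attach_forward_timers(model: torch.nn.Module, records: dict) -> list:
    """Collect per-module forward GPU times (in milliseconds) into `records`."""
    handles = []
    for name, module in model.named_modules():
        if list(module.children()):
            continue  # time leaf modules only

        def pre_hook(mod, args, _name=name):
            # CUDA events measure GPU time without stopping the whole device
            mod._start = torch.cuda.Event(enable_timing=True)
            mod._end = torch.cuda.Event(enable_timing=True)
            mod._start.record()

        def post_hook(mod, args, output, _name=name):
            mod._end.record()
            torch.cuda.synchronize()  # events are recorded asynchronously; wait before reading
            records.setdefault(_name, []).append(mod._start.elapsed_time(mod._end))

        handles.append(module.register_forward_pre_hook(pre_hook))
        handles.append(module.register_forward_hook(post_hook))
    return handles  # call .remove() on each handle once profiling is done
```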
143 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.9.0 2 | imageio==2.25.0 3 | jpeg4py==0.1.4 4 | nvprof==0.2 5 | opencv-python==4.7.0.68 6 | scikit-image==0.19.3 7 | pandas==1.5.3 8 | py-spy==0.3.14 9 | einops==0.7.0 10 | torch==2.4.0 11 | torchtext 12 | torchvision==0.19.0 13 | tqdm==4.64.1 14 | transformers==4.48.2 15 | vit_pytorch==0.40.2 16 | gdown==4.7.3 17 | matplotlib==3.8.2 18 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task1/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import isfile, join 3 | from typing import List, Tuple 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import torch 8 | from PIL import Image 9 | from torch.utils.data.dataset import Dataset 10 | from torchvision import transforms 11 | 12 | 13 | class Carvana(Dataset): 14 | def __init__(self, root: str, transform: transforms.Compose = None) -> None: 15 | """ 16 | :param root: path to the data folder 17 | :param transform: transforms of the images and labels 18 | """ 19 | self.root = os.path.expanduser(root) 20 | self.transform = transform 21 | (self.data_path, self.labels_path) = ([], []) 22 | 23 | def load_images(path: str) -> List[str]: 24 | """ 25 | Return a list with paths to all images 26 | 27 | :param path: path to the data folder 28 | :return: list with paths to all images 29 | """ 30 | images_dir = [join(path, f) for f in os.listdir(path) if isfile(join(path, f))] 31 | images_dir.sort() 32 | 33 | return images_dir 34 | 35 | self.data_path = load_images(self.root + "/train") 36 | self.labels_path = load_images(self.root + "/train_masks") 37 | 38 | def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]: 39 | """ 40 | :param index: sample index 41 | :return: tuple (img, target) with the input data and its label 42 | """ 43 | img = Image.open(self.data_path[index]) 44 | target = Image.open(self.labels_path[index]) 45 | 46 | if self.transform is not None: 47 | img = self.transform(img) 48 | target = self.transform(target) 49 | target = (target > 0).float() 50 | 51 | return img, target 52 | 53 | def __len__(self): 54 | return len(self.data_path) 55 | 56 | 57 | def get_train_data() -> torch.utils.data.DataLoader: 58 | train_dataset = Carvana( 59 | root=".", transform=transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()]) 60 | ) 61 | 62 | train_loader = torch.utils.data.DataLoader( 63 | dataset=train_dataset, batch_size=128, shuffle=True, pin_memory=True, num_workers=4 64 | ) 65 | 66 | return train_loader 67 | 68 | 69 | def im_show(img_list: List[Tuple[torch.Tensor, torch.Tensor]]) -> None: 70 | """ 71 | Plots images with corresponding segmentation masks 72 | 73 | :param img_list: list of pairs image-mask 74 | """ 75 | to_PIL = transforms.ToPILImage() 76 | if len(img_list) > 9: 77 | raise Exception("len(img_list) must be smaller than 10") 78 | fig, axes = plt.subplots(len(img_list), 2, figsize=(16, 16)) 79 | fig.tight_layout() 80 | 81 | for (idx, sample) in enumerate(img_list): 82 | axes[idx][0].imshow(np.array(to_PIL(sample[0]))) 83 | axes[idx][1].imshow(np.array(to_PIL(sample[1]))) 84 | for ax in axes[idx]: 85 | ax.get_xaxis().set_visible(False) 86 | ax.get_yaxis().set_visible(False) 87 | 88 | plt.show() 89 | 
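For the Task 1 TODO in `train.py` below, a minimal sketch of the static variant could look as follows (this is not the reference solution: the factor `2.0 ** 16` is just a common choice, the variable names are taken from the `train_epoch` template, and dynamic scaling would additionally adjust the factor based on overflow checks):

```python
scale = 2.0 ** 16  # static loss scale (assumed value)

with torch.amp.autocast(device.type, dtype=torch.float16):
    outputs = model(images)
    loss = criterion(outputs, labels)

(loss * scale).backward()      # backward on the scaled loss keeps small FP16 grads alive
for p in model.parameters():   # unscale the FP32 gradients before the update
    if p.grad is not None:
        p.grad.div_(scale)
optimizer.step()
optimizer.zero_grad()
```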
-------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task1/download_data.sh: -------------------------------------------------------------------------------- 1 | # Download and unpack data 2 | wget https://www.dropbox.com/s/tc1qo73rrm3gt3m/CARVANA.zip 3 | unzip -q CARVANA.zip 4 | rm -rf ./train/.DS_Store ./train_masks/.DS_Store CARVANA.zip 5 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task1/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from tqdm.auto import tqdm 4 | 5 | from unet import Unet 6 | 7 | from dataset import get_train_data 8 | 9 | 10 | def train_epoch( 11 | train_loader: torch.utils.data.DataLoader, 12 | model: torch.nn.Module, 13 | criterion: torch.nn.modules.loss._Loss, 14 | optimizer: torch.optim.Optimizer, 15 | device: torch.device, 16 | ) -> None: 17 | model.train() 18 | 19 | pbar = tqdm(enumerate(train_loader), total=len(train_loader)) 20 | for i, (images, labels) in pbar: 21 | images = images.to(device) 22 | labels = labels.to(device) 23 | 24 | with torch.amp.autocast(device.type, dtype=torch.float16): 25 | outputs = model(images) 26 | loss = criterion(outputs, labels) 27 | # TODO: your code for loss scaling here 28 | 29 | accuracy = ((outputs > 0.5) == labels).float().mean() 30 | 31 | pbar.set_description(f"Loss: {round(loss.item(), 4)} " f"Accuracy: {round(accuracy.item() * 100, 4)}") 32 | 33 | 34 | def train(): 35 | device = torch.device("cuda:0") 36 | model = Unet().to(device) 37 | criterion = nn.BCEWithLogitsLoss() 38 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) 39 | 40 | train_loader = get_train_data() 41 | 42 | num_epochs = 5 43 | for epoch in range(0, num_epochs): 44 | train_epoch(train_loader, model, criterion, optimizer, device=device) 45 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task1/unet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | 6 | class ConvBlock(nn.Module): 7 | def __init__(self, in_size, out_size, kernel_size=3, padding=1, stride=1): 8 | super().__init__() 9 | self.conv = nn.Conv2d(in_size, out_size, kernel_size, padding=padding, stride=stride) 10 | self.bn = nn.BatchNorm2d(out_size) 11 | self.relu = nn.ReLU(inplace=True) 12 | 13 | def forward(self, x): 14 | return self.relu(self.bn(self.conv(x))) 15 | 16 | 17 | class Unet(nn.Module): 18 | def __init__(self): 19 | super().__init__() 20 | 21 | self.down_1 = nn.Sequential(ConvBlock(3, 16), ConvBlock(16, 32, stride=2, padding=1)) 22 | self.down_2 = nn.Sequential(ConvBlock(32, 64), ConvBlock(64, 128)) 23 | self.middle = ConvBlock(128, 128, kernel_size=1, padding=0) 24 | self.up_2 = nn.Sequential(ConvBlock(256, 128), ConvBlock(128, 32)) 25 | self.up_1 = nn.Sequential(ConvBlock(64, 64), ConvBlock(64, 32)) 26 | self.output = nn.Sequential(ConvBlock(32, 16), ConvBlock(16, 1, kernel_size=1, padding=0)) 27 | 28 | def forward(self, x): 29 | down1 = self.down_1(x) 30 | out = F.max_pool2d(down1, kernel_size=2, stride=2) 31 | 32 | down2 = self.down_2(out) 33 | out = F.max_pool2d(down2, kernel_size=2, stride=2) 34 | 35 | out = self.middle(out) 36 | 37 | out = nn.functional.interpolate(out, scale_factor=2) 38 | out = torch.cat([down2, out], 1) 39 | out = self.up_2(out) 40 | 41 | out = 
nn.functional.interpolate(out, scale_factor=2) 42 | out = torch.cat([down1, out], 1) 43 | out = self.up_1(out) 44 | 45 | out = nn.functional.interpolate(out, scale_factor=2) 46 | 47 | return self.output(out) 48 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task2/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from torch.utils.data.dataset import Dataset 5 | from torch.utils.data import Sampler, IterableDataset 6 | from transformers import AutoTokenizer 7 | 8 | 9 | MAX_LENGTH = 640 10 | 11 | 12 | class BrainDataset(Dataset): 13 | def __init__(self, data_path: str, max_length: int = MAX_LENGTH): 14 | pass 15 | 16 | def __getitem__(self, idx: int): 17 | pass 18 | 19 | 20 | class BigBrainDataset(Dataset): 21 | def __init__(self, data_path: str, max_length: int = MAX_LENGTH): 22 | pass 23 | 24 | def __getitem__(self, idx: int): 25 | pass 26 | 27 | 28 | class UltraBigBrainDataset(Dataset): 29 | def __init__(self, data_path: str, max_length: int = MAX_LENGTH, n_bins: int = 1): 30 | pass 31 | 32 | def __getitem__(self, idx: int): 33 | pass 34 | 35 | 36 | class UltraDuperBigBrainDataset(IterableDataset): 37 | def __init__(self, data_path: str, max_length: int = MAX_LENGTH): 38 | pass 39 | 40 | def __iter__(self): 41 | pass 42 | 43 | 44 | def collate_fn( 45 | batch: list[tuple[str, torch.Tensor]], max_length: Optional[int] = MAX_LENGTH 46 | ) -> tuple[torch.Tensor, torch.Tensor]: 47 | """ 48 | Pad each sequence of the incoming sequences list 49 | :param batch: a list of the objects received from the dataset by __getitem__ 50 | :param max_length: maximum sequence length to pad to (for "Brain" approach only) 51 | :return: tuple of padded sequences and corresponding training targets 52 | """ 53 | pass 54 | 55 | 56 | class UltraBigBrainBatchSampler(Sampler): 57 | 58 | def __init__(self, batch_size: int, max_length: Optional[int] = MAX_LENGTH): 59 | pass 60 | 61 | def __len__(self): 62 | pass 63 | 64 | def __iter__(self): 65 | pass 66 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task2/run_epoch.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import torch 4 | 5 | 6 | class DataMode(Enum): 7 | BRAIN = 1 8 | BIG_BRAIN = 2 9 | ULTRA_BIG_BRAIN = 3 10 | ULTRA_DUPER_BIG_BRAIN = 4 11 | 12 | 13 | def get_gpt2_model() -> torch.nn.Module: 14 | pass 15 | 16 | 17 | def run_epoch(data_mode: DataMode) -> None: 18 | pass 19 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task3/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as tp 3 | import zipfile 4 | import gdown 5 | 6 | from PIL import Image 7 | from torch.utils.data import Dataset 8 | from torchvision import transforms 9 | 10 | from utils import Clothes, get_labels_dict 11 | 12 | 13 | class ClothesDataset(Dataset): 14 | def __init__(self, folder_path, frame, transform=None): 15 | self.folder_path = folder_path 16 | self.transform = transform 17 | self.frame = frame.set_index("image") 18 | self.img_list = list(self.frame.index.values) 19 | 20 | self.label2ix = get_labels_dict() 21 | 22 | def __len__(self): 23 | return len(self.img_list) 24 | 25 | def __getitem__(self, idx): 26 | img_name = self.img_list[idx] 27 | img = 
Image.open(f"{self.folder_path}/{img_name}.jpg").convert("RGB") 28 | img_transformed = self.transform(img) 29 | label = self.label2ix[self.frame.loc[img_name]["label"]] 30 | 31 | return img_transformed, label 32 | 33 | 34 | def download_extract_dataset(): 35 | if os.path.exists(f"{Clothes.directory}/{Clothes.train_val_img_dir}"): 36 | print("Dataset already extracted") 37 | return 38 | os.makedirs(Clothes.directory, exist_ok=True) 39 | gdown.download( 40 | "https://drive.google.com/uc?id=19QYn7wX9kbBOUT3ofztgRURNR_8WLPj6", 41 | output=f"{Clothes.directory}/{Clothes.archive_name}.zip", 42 | ) 43 | gdown.download( 44 | "https://drive.google.com/uc?id=1rk8CFX-0MdezDue_dSl6pGHzAtFrJefm", 45 | output=f"{Clothes.directory}/{Clothes.csv_name}", 46 | ) 47 | with zipfile.ZipFile(f"{Clothes.directory}/{Clothes.archive_name}.zip") as train_zip: 48 | train_zip.extractall(f"{Clothes.directory}/{Clothes.train_val_img_dir}") 49 | 50 | 51 | def get_train_transforms() -> tp.Any: 52 | return transforms.Compose( 53 | [ 54 | transforms.Resize((320, 320)), 55 | transforms.CenterCrop(224), 56 | transforms.RandomResizedCrop(224), 57 | transforms.RandomHorizontalFlip(), 58 | transforms.AugMix(), 59 | transforms.ToTensor(), 60 | ] 61 | ) 62 | 63 | 64 | def get_val_transforms() -> tp.Any: 65 | return transforms.Compose( 66 | [ 67 | transforms.Resize((320, 320)), 68 | transforms.CenterCrop(224), 69 | transforms.ToTensor(), 70 | ] 71 | ) 72 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task3/profiler.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import torch 4 | import os 5 | from collections import defaultdict 6 | 7 | 8 | class Profile: 9 | def __init__(self, model, name="model", schedule=None): 10 | self.name_map = self._build_name_map(model, name) 11 | self.events = [] 12 | 13 | ### TODO 14 | 15 | def _build_name_map(self, model, name="model"): 16 | name_map = {} 17 | for full_name, module in model.named_modules(): 18 | if full_name == "": 19 | full_name = name 20 | 21 | if self._is_leaf(module): 22 | name_map[module] = module.__class__.__name__ 23 | else: 24 | name_map[module] = f"{full_name}: {module.__class__.__name__}" 25 | 26 | return name_map 27 | 28 | def _is_leaf(self, module): 29 | return len(list(module.children())) == 0 30 | 31 | def _forward_pre_hook(self, module, inputs): 32 | ### TODO 33 | raise NotImplementedError 34 | 35 | def _forward_post_hook(self, module, inputs, outputs): 36 | ### TODO 37 | raise NotImplementedError 38 | 39 | def _backward_pre_hook(self, module, grad_output): 40 | ### TODO 41 | raise NotImplementedError 42 | 43 | def _backward_post_hook(self, module, grad_input, grad_output): 44 | ### TODO 45 | raise NotImplementedError 46 | 47 | def __enter__(self): 48 | ### TODO 49 | raise NotImplementedError 50 | 51 | def __exit__(self, type, value, traceback): 52 | ### TODO 53 | raise NotImplementedError 54 | 55 | def step(self): 56 | ### TODO 57 | raise NotImplementedError 58 | 59 | def summary(self): 60 | print("Summary:") 61 | for event in self.events: 62 | print(event) 63 | 64 | def to_perfetto(self, path="trace.json"): 65 | ### TODO 66 | raise NotImplementedError 67 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task3/run_epoch.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | 3 | import torch 4 | import torch.nn 
as nn 5 | import torch.optim as optim 6 | import dataset 7 | import pandas as pd 8 | 9 | from torch.utils.data import DataLoader 10 | from tqdm import tqdm 11 | 12 | from utils import Settings, Clothes, seed_everything 13 | from vit import ViT 14 | 15 | 16 | def get_vit_model() -> torch.nn.Module: 17 | model = ViT( 18 | depth=12, 19 | heads=4, 20 | image_size=224, 21 | patch_size=32, 22 | num_classes=20, 23 | channels=3, 24 | ).to(Settings.device) 25 | return model 26 | 27 | 28 | def get_loaders() -> torch.utils.data.DataLoader: 29 | dataset.download_extract_dataset() 30 | train_transforms = dataset.get_train_transforms() 31 | val_transforms = dataset.get_val_transforms() 32 | 33 | frame = pd.read_csv(f"{Clothes.directory}/{Clothes.csv_name}") 34 | train_frame = frame.sample(frac=Settings.train_frac) 35 | val_frame = frame.drop(train_frame.index) 36 | 37 | train_data = dataset.ClothesDataset( 38 | f"{Clothes.directory}/{Clothes.train_val_img_dir}", train_frame, transform=train_transforms 39 | ) 40 | val_data = dataset.ClothesDataset( 41 | f"{Clothes.directory}/{Clothes.train_val_img_dir}", val_frame, transform=val_transforms 42 | ) 43 | 44 | print(f"Train Data: {len(train_data)}") 45 | print(f"Val Data: {len(val_data)}") 46 | 47 | train_loader = DataLoader(dataset=train_data, batch_size=Settings.batch_size, shuffle=True) 48 | val_loader = DataLoader(dataset=val_data, batch_size=Settings.batch_size, shuffle=False) 49 | 50 | return train_loader, val_loader 51 | 52 | 53 | def run_epoch(model, train_loader, val_loader, criterion, optimizer) -> tp.Tuple[float, float]: 54 | epoch_loss, epoch_accuracy = 0, 0 55 | val_loss, val_accuracy = 0, 0 56 | model.train() 57 | for data, label in tqdm(train_loader, desc="Train"): 58 | data = data.to(Settings.device) 59 | label = label.to(Settings.device) 60 | output = model(data) 61 | loss = criterion(output, label) 62 | acc = (output.argmax(dim=1) == label).float().mean() 63 | epoch_accuracy += acc.item() / len(train_loader) 64 | epoch_loss += loss.item() / len(train_loader) 65 | optimizer.zero_grad() 66 | loss.backward() 67 | optimizer.step() 68 | 69 | model.eval() 70 | for data, label in tqdm(val_loader, desc="Val"): 71 | data = data.to(Settings.device) 72 | label = label.to(Settings.device) 73 | output = model(data) 74 | loss = criterion(output, label) 75 | acc = (output.argmax(dim=1) == label).float().mean() 76 | val_accuracy += acc.item() / len(train_loader) 77 | val_loss += loss.item() / len(train_loader) 78 | 79 | return epoch_loss, epoch_accuracy, val_loss, val_accuracy 80 | 81 | 82 | def main(): 83 | seed_everything() 84 | model = get_vit_model() 85 | train_loader, val_loader = get_loaders() 86 | criterion = nn.CrossEntropyLoss() 87 | optimizer = optim.Adam(model.parameters(), lr=Settings.lr) 88 | 89 | run_epoch(model, train_loader, val_loader, criterion, optimizer) 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task3/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | import pandas as pd 7 | 8 | 9 | def seed_everything(seed: int = 595959) -> None: 10 | random.seed(seed) 11 | os.environ["PYTHONHASHSEED"] = str(seed) 12 | np.random.seed(seed) 13 | torch.manual_seed(seed) 14 | torch.cuda.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | torch.backends.cudnn.deterministic = True 17 | 18 | 19 | def 
get_device() -> torch.device: 20 | return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") 21 | 22 | 23 | class Settings: 24 | batch_size: int = 256 25 | epochs: int = 2 26 | lr: float = 3e-5 27 | gamma: float = 0.7 28 | seed: int = 42 29 | device: str = get_device() 30 | train_frac: float = 0.8 31 | 32 | 33 | class Clothes: 34 | directory = "data" 35 | train_val_img_dir = "train" 36 | csv_name = "images.csv" 37 | archive_name = "images_original" 38 | 39 | 40 | def get_labels_dict(): 41 | frame = pd.read_csv(f"{Clothes.directory}/{Clothes.csv_name}") 42 | labels = frame["label"].unique() 43 | return {label: i for i, label in enumerate(labels)} 44 | -------------------------------------------------------------------------------- /week03_fast_pipelines/homework/task3/vit.py: -------------------------------------------------------------------------------- 1 | """ 2 | The source is ViT from lucidrains implementation 3 | https://github.com/lucidrains/vit-pytorch 4 | """ 5 | 6 | import torch 7 | from einops import rearrange, repeat 8 | from einops.layers.torch import Rearrange 9 | from torch import nn 10 | 11 | 12 | def pair(t): 13 | return t if isinstance(t, tuple) else (t, t) 14 | 15 | 16 | class FeedForward(nn.Module): 17 | def __init__(self, dim, hidden_dim=255, dropout=0.0): 18 | super().__init__() 19 | self.net = nn.Sequential( 20 | nn.LayerNorm(dim), 21 | nn.Linear(dim, hidden_dim, bias=True), 22 | nn.GELU(), 23 | nn.Dropout(dropout), 24 | nn.Linear(hidden_dim, dim, bias=True), 25 | nn.Dropout(dropout), 26 | ) 27 | 28 | def forward(self, x): 29 | return self.net(x) 30 | 31 | 32 | class Attention(nn.Module): 33 | def __init__(self, dim, heads=8, dim_head=64, dropout=0.0): 34 | super().__init__() 35 | inner_dim = dim_head * heads 36 | project_out = not (heads == 1 and dim_head == dim) 37 | 38 | self.heads = heads 39 | self.scale = dim_head ** (-0.5) 40 | 41 | self.attend = nn.Softmax(dim=-1) 42 | self.dropout = nn.Dropout(dropout) 43 | self.norm = nn.LayerNorm(dim) 44 | self.queries = nn.Linear(dim, inner_dim, bias=False) 45 | self.keys = nn.Linear(dim, inner_dim, bias=False) 46 | self.values = nn.Linear(dim, inner_dim, bias=False) 47 | 48 | self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout)) if project_out else nn.Identity() 49 | 50 | def forward(self, x): 51 | q = self.queries(x) 52 | k = self.keys(x) 53 | v = self.values(x) 54 | 55 | dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale 56 | 57 | attn = self.attend(dots) 58 | attn = self.dropout(attn) 59 | 60 | out = torch.matmul(attn, v) 61 | 62 | return self.to_out(out) 63 | 64 | 65 | class Transformer(nn.Module): 66 | def __init__(self, dim, depth, heads, dim_head, dropout=0.0): 67 | super().__init__() 68 | self.layers = nn.ModuleList([]) 69 | for _ in range(depth): 70 | self.layers.append( 71 | nn.ModuleList( 72 | [ 73 | Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout), 74 | FeedForward(dim, dropout=dropout), 75 | ] 76 | ) 77 | ) 78 | 79 | def forward(self, x): 80 | for attn, ff in self.layers: 81 | x = attn(x) + x 82 | x = ff(x) + x 83 | return x 84 | 85 | 86 | class ViT(nn.Module): 87 | def __init__( 88 | self, 89 | *, 90 | image_size, 91 | patch_size, 92 | num_classes, 93 | depth, 94 | heads, 95 | dim=255, 96 | pool="cls", 97 | channels=3, 98 | dim_head=64, 99 | dropout=0.0, 100 | emb_dropout=0.0, 101 | ): 102 | super().__init__() 103 | image_height, image_width = pair(image_size) 104 | patch_height, patch_width = pair(patch_size) 105 | 106 | assert ( 107 | 
image_height % patch_height == 0 and image_width % patch_width == 0 108 | ), "Image dimensions must be divisible by the patch size." 109 | 110 | num_patches = (image_height // patch_height) * (image_width // patch_width) 111 | patch_dim = channels * patch_height * patch_width 112 | assert pool in {"cls", "mean"}, "pool type must be either cls (cls token) or mean (mean pooling)" 113 | 114 | self.to_patch_embedding = nn.Sequential( 115 | Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=patch_height, p2=patch_width), 116 | nn.Linear(patch_dim, dim), 117 | ) 118 | 119 | self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim)) 120 | self.cls_token = nn.Parameter(torch.randn(1, 1, dim)) 121 | self.dropout = nn.Dropout(emb_dropout) 122 | 123 | self.transformer = Transformer(dim, depth, heads, dim_head, dropout=dropout) 124 | 125 | self.pool = pool 126 | self.to_latent = nn.Identity() 127 | 128 | self.mlp_head = nn.Sequential(nn.BatchNorm1d(dim), nn.Linear(dim, num_classes)) 129 | 130 | def forward(self, img): 131 | x = self.to_patch_embedding(img) 132 | b, n, _ = x.shape 133 | 134 | cls_tokens = repeat(self.cls_token, "1 1 d -> b 1 d", b=b) 135 | x = torch.cat((cls_tokens, x), dim=1) 136 | x += self.pos_embedding[:, : (n + 1)] 137 | 138 | x = self.dropout(x) 139 | 140 | x = self.transformer(x) 141 | 142 | x = x.mean(dim=1) if self.pool == "mean" else x[:, 0] 143 | 144 | x = self.to_latent(x) 145 | 146 | output = self.mlp_head(x) 147 | return output 148 | -------------------------------------------------------------------------------- /week03_fast_pipelines/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/lecture.pdf -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/Mixed_precision.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/Mixed_precision.jpeg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/image_loaders_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Source: https://github.com/ternaus/imread_benchmark/blob/master/imread_benchmark/benchmark.py 3 | Benchmark libraries: cv2, skimage, PIL, jpeg4py, imageio 4 | 5 | for the case jpeg images => numpy array for RGB image 6 | """ 7 | 8 | import argparse 9 | import math 10 | import random 11 | import sys 12 | from abc import ABC 13 | from collections import defaultdict 14 | from pathlib import Path 15 | from timeit import Timer 16 | from typing import Union 17 | 18 | import cv2 19 | import imageio 20 | import jpeg4py 21 | import numpy as np 22 | import pandas as pd 23 | import pkg_resources 24 | import skimage 25 | from PIL import Image 26 | from tqdm import tqdm 27 | 28 | from torchvision.io import read_image 29 | 30 | def print_package_versions(): 31 | packages = ["opencv-python", "pillow-simd", "jpeg4py", "scikit-image", "imageio"] 32 | package_versions = {"python": sys.version} 33 | for package in packages: 34 | try: 35 | package_versions[package] = pkg_resources.get_distribution(package).version 36 | except pkg_resources.DistributionNotFound: 37 | pass 38 | print(package_versions) 39 | 40 | 41 | def 
format_results(images_per_second_for_read, show_std=False): 42 | if images_per_second_for_read is None: 43 | return "-" 44 | result = str(math.floor(np.mean(images_per_second_for_read))) 45 | if show_std: 46 | result += " ± {}".format(math.ceil(np.std(images_per_second_for_read))) 47 | return result 48 | 49 | 50 | class BenchmarkTest(ABC): 51 | def __str__(self): 52 | return self.__class__.__name__ 53 | 54 | def run(self, library, image_paths: list): 55 | operation = getattr(self, library) 56 | for image in image_paths: 57 | operation(image) 58 | 59 | 60 | class GetArray(BenchmarkTest): 61 | def PIL(self, image_path: str) -> np.array: 62 | img = Image.open(image_path) 63 | img = img.convert("RGB") 64 | return np.asarray(img) 65 | 66 | def opencv(self, image_path: str) -> np.array: 67 | img = cv2.imread(image_path) 68 | return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 69 | 70 | def jpeg4py(self, image_path: str) -> np.array: 71 | return jpeg4py.JPEG(image_path).decode() 72 | 73 | def skimage(self, image_path: str) -> np.array: 74 | return skimage.io.imread(image_path, plugin="matplotlib") 75 | 76 | def imageio(self, image_path: str) -> np.array: 77 | return imageio.imread(image_path) 78 | 79 | def torch(self, image_path: str) -> np.array: 80 | return read_image(image_path) 81 | 82 | 83 | def benchmark(libraries: list, benchmarks: list, image_paths: list, num_runs: int, shuffle: bool) -> defaultdict: 84 | images_per_second = defaultdict(dict) 85 | num_images = len(image_paths) 86 | 87 | for library in libraries: 88 | pbar = tqdm(total=len(benchmarks)) 89 | for benchmark in benchmarks: 90 | pbar.set_description("Current benchmark: {} | {}".format(library, benchmark)) 91 | if shuffle: 92 | random.shuffle(image_paths) 93 | timer = Timer(lambda: benchmark.run(library, image_paths)) 94 | run_times = timer.repeat(number=1, repeat=num_runs) 95 | benchmark_images_per_second = [1 / (run_time / num_images) for run_time in run_times] 96 | images_per_second[library][str(benchmark)] = benchmark_images_per_second 97 | pbar.update(1) 98 | 99 | pbar.close() 100 | 101 | return images_per_second 102 | 103 | 104 | def parse_args(): 105 | parser = argparse.ArgumentParser(description="Image reading libraries performance benchmark") 106 | parser.add_argument("-d", "--data-dir", metavar="DIR", help="path to a directory with images") 107 | parser.add_argument( 108 | "-i", 109 | "--num_images", 110 | default=2000, 111 | type=int, 112 | metavar="N", 113 | help="number of images for benchmarking (default: 2000)", 114 | ) 115 | parser.add_argument( 116 | "-r", "--num_runs", default=5, type=int, metavar="N", help="number of runs for each benchmark (default: 5)" 117 | ) 118 | parser.add_argument( 119 | "--show-std", dest="show_std", action="store_true", help="show standard deviation for benchmark runs" 120 | ) 121 | parser.add_argument("-p", "--print-package-versions", action="store_true", help="print versions of packages") 122 | parser.add_argument("-s", "--shuffle", action="store_true", help="Shuffle the list of images.") 123 | return parser.parse_args() 124 | 125 | 126 | def get_image_paths(data_dir: Union[str, Path], num_images: int) -> list: 127 | image_paths = sorted(Path(data_dir).glob("*.*")) 128 | return [str(x) for x in image_paths[:num_images]] 129 | 130 | 131 | def main(): 132 | args = parse_args() 133 | if args.print_package_versions: 134 | print_package_versions() 135 | 136 | benchmarks = [GetArray()] 137 | 138 | libraries = ["opencv", "PIL", "jpeg4py", "skimage", "imageio", "torch"] 139 | 140 | image_paths = 
get_image_paths(args.data_dir, args.num_images) 141 | 142 | images_per_second = benchmark(libraries, benchmarks, image_paths, args.num_runs, args.shuffle) 143 | 144 | pd.set_option("display.width", 1000) 145 | df = pd.DataFrame.from_dict(images_per_second) 146 | df = df.applymap(lambda r: format_results(r, args.show_std)) 147 | df = df[libraries] 148 | 149 | print(df) 150 | 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/memory_snapshot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from tqdm.auto import trange 4 | 5 | device = torch.device("cuda:0") 6 | 7 | in_size = 8192 8 | out_size = 8192 9 | num_layers = 20 10 | num_batches = 10 11 | epochs = 1 12 | 13 | def make_model(in_size: int, out_size: int, num_layers: int) -> torch.nn.Module: 14 | layers = [] 15 | for _ in range(num_layers - 1): 16 | layers.append(torch.nn.Linear(in_size, in_size)) 17 | layers.append(torch.nn.ReLU()) 18 | layers.append(torch.nn.Linear(in_size, out_size)) 19 | return torch.nn.Sequential(*tuple(layers)) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--amp", action="store_true", default=False) 25 | args = parser.parse_args() 26 | 27 | torch.cuda.memory._record_memory_history() 28 | 29 | data = [torch.randn(1024, in_size, device=device) for _ in range(num_batches)] 30 | targets = [torch.randn(1024, out_size, device=device) for _ in range(num_batches)] 31 | 32 | net = make_model(in_size, out_size, num_layers).to(device) 33 | opt = torch.optim.Adam(net.parameters(), lr=0.001) 34 | loss_fn = torch.nn.MSELoss().to(device) 35 | scaler = torch.cuda.amp.GradScaler(enabled=True) 36 | 37 | for epoch in trange(epochs): 38 | for inputs, target in zip(data, targets): 39 | if args.amp: 40 | with torch.amp.autocast(device_type="cuda", dtype=torch.float16): 41 | output = net(inputs) 42 | loss = loss_fn(output, target) 43 | 44 | scaler.scale(loss).backward() 45 | scaler.step(opt) 46 | scaler.update() 47 | opt.zero_grad() 48 | else: 49 | output = net(inputs) 50 | loss = loss_fn(output, target) 51 | 52 | loss.backward() 53 | opt.step() 54 | opt.zero_grad() 55 | 56 | torch.cuda.memory._dump_snapshot(f"snapshot_amp={args.amp}.pickle") 57 | -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/mnist_training.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision 6 | 7 | from train import model_provider, train, train_amp 8 | 9 | 10 | def get_loaders( 11 | transforms_level: int = 1, 12 | ) -> tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]: 13 | if transforms_level == 1: 14 | # no transforms 15 | transform = torchvision.transforms.Compose( 16 | [ 17 | torchvision.transforms.ToTensor(), 18 | torchvision.transforms.Normalize((0.1307,), (0.3081,)), 19 | ] 20 | ) 21 | elif transforms_level == 2: 22 | # modest transforms 23 | transform = torchvision.transforms.Compose( 24 | [ 25 | torchvision.transforms.RandomHorizontalFlip(), 26 | torchvision.transforms.RandomPerspective(), 27 | torchvision.transforms.RandomVerticalFlip(), 28 | torchvision.transforms.ToTensor(), 29 | torchvision.transforms.Normalize((0.1307,), (0.3081,)), 30 | ] 31 | ) 32 | else: 33 | # heavy transforms 34 | transform = 
torchvision.transforms.Compose( 35 | [ 36 | torchvision.transforms.RandomHorizontalFlip(), 37 | torchvision.transforms.RandomPerspective(), 38 | torchvision.transforms.RandomVerticalFlip(), 39 | torchvision.transforms.GaussianBlur(5), 40 | torchvision.transforms.RandomAdjustSharpness(2), 41 | torchvision.transforms.RandomAutocontrast(), 42 | torchvision.transforms.RandomAdjustSharpness(1), 43 | torchvision.transforms.RandomAutocontrast(), 44 | torchvision.transforms.RandomAdjustSharpness(0.5), 45 | torchvision.transforms.RandomAutocontrast(), 46 | torchvision.transforms.RandomEqualize(), 47 | torchvision.transforms.ToTensor(), 48 | torchvision.transforms.RandomSolarize(0.5), 49 | torchvision.transforms.RandomSolarize(0.5), 50 | torchvision.transforms.RandomSolarize(0.5), 51 | torchvision.transforms.Normalize((0.1307,), (0.3081,)), 52 | ] 53 | ) 54 | 55 | mnist_train = torchvision.datasets.MNIST( 56 | "./mnist/", train=True, download=True, transform=transform 57 | ) 58 | mnist_val = torchvision.datasets.MNIST( 59 | "./mnist/", train=False, download=True, transform=transform 60 | ) 61 | 62 | train_dataloader = torch.utils.data.DataLoader( 63 | mnist_train, batch_size=1024, shuffle=True 64 | ) 65 | val_dataloader = torch.utils.data.DataLoader( 66 | mnist_val, batch_size=1024, shuffle=False 67 | ) 68 | 69 | return train_dataloader, val_dataloader 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument("-t", "--transforms-level", type=int, default=1) 75 | parser.add_argument("--amp", action="store_true", default=False) 76 | parser.add_argument("--n-epochs", type=int, default=100) 77 | args = parser.parse_args() 78 | 79 | train_dataloader_, val_dataloader_ = get_loaders(args.transforms_level) 80 | model_ = model_provider() 81 | optimizer = torch.optim.Adam(model_.parameters(), lr=0.01) 82 | loss_fn = nn.CrossEntropyLoss() 83 | if args.amp: 84 | train_amp(model_, loss_fn, optimizer, train_dataloader_, val_dataloader_, n_epochs=args.n_epochs) 85 | else: 86 | train(model_, loss_fn, optimizer, train_dataloader_, val_dataloader_, n_epochs=args.n_epochs) 87 | -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/mnist_training_nsys.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision 6 | 7 | from tqdm.auto import tqdm 8 | 9 | from train import model_provider 10 | 11 | 12 | def get_loaders( 13 | transforms_level: int = 1, 14 | ) -> tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]: 15 | if transforms_level == 1: 16 | # no transforms 17 | transform = torchvision.transforms.Compose( 18 | [ 19 | torchvision.transforms.ToTensor(), 20 | torchvision.transforms.Normalize((0.1307,), (0.3081,)), 21 | ] 22 | ) 23 | elif transforms_level == 2: 24 | # modest transforms 25 | transform = torchvision.transforms.Compose( 26 | [ 27 | torchvision.transforms.RandomHorizontalFlip(), 28 | torchvision.transforms.RandomPerspective(), 29 | torchvision.transforms.RandomVerticalFlip(), 30 | torchvision.transforms.ToTensor(), 31 | torchvision.transforms.Normalize((0.1307,), (0.3081,)), 32 | ] 33 | ) 34 | else: 35 | # heavy transforms 36 | transform = torchvision.transforms.Compose( 37 | [ 38 | torchvision.transforms.RandomHorizontalFlip(), 39 | torchvision.transforms.RandomPerspective(), 40 | torchvision.transforms.RandomVerticalFlip(), 41 | torchvision.transforms.GaussianBlur(5), 42 | 
torchvision.transforms.RandomAdjustSharpness(2), 43 | torchvision.transforms.RandomAutocontrast(), 44 | torchvision.transforms.RandomAdjustSharpness(1), 45 | torchvision.transforms.RandomAutocontrast(), 46 | torchvision.transforms.RandomAdjustSharpness(0.5), 47 | torchvision.transforms.RandomAutocontrast(), 48 | torchvision.transforms.RandomEqualize(), 49 | torchvision.transforms.ToTensor(), 50 | torchvision.transforms.RandomSolarize(0.5), 51 | torchvision.transforms.RandomSolarize(0.5), 52 | torchvision.transforms.RandomSolarize(0.5), 53 | torchvision.transforms.Normalize((0.1307,), (0.3081,)), 54 | ] 55 | ) 56 | 57 | mnist_train = torchvision.datasets.MNIST( 58 | "./mnist/", train=True, download=True, transform=transform 59 | ) 60 | mnist_val = torchvision.datasets.MNIST( 61 | "./mnist/", train=False, download=True, transform=transform 62 | ) 63 | 64 | train_dataloader = torch.utils.data.DataLoader( 65 | mnist_train, batch_size=1024, shuffle=True 66 | ) 67 | val_dataloader = torch.utils.data.DataLoader( 68 | mnist_val, batch_size=1024, shuffle=False 69 | ) 70 | 71 | return train_dataloader, val_dataloader 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--emit-nvtx", type=int, default=1) 77 | parser.add_argument("-t", "--transforms-level", type=int, default=1) 78 | args = parser.parse_args() 79 | 80 | device = torch.device("cuda:0") 81 | train_dataloader_, val_dataloader_ = get_loaders(args.transforms_level) 82 | model_ = model_provider() 83 | optimizer = torch.optim.Adam(model_.parameters(), lr=0.01) 84 | scaler = torch.cuda.amp.GradScaler() 85 | loss_fn = nn.CrossEntropyLoss() 86 | 87 | epoch = 0 88 | i = 0 89 | model_.to(device) 90 | model_.train() 91 | 92 | if args.emit_nvtx: 93 | with torch.autograd.profiler.emit_nvtx(): 94 | for x_train, y_train in tqdm(train_dataloader_, desc=f"Epoch {epoch}: "): 95 | with torch.amp.autocast(device_type="cuda", dtype=torch.float16): 96 | x_train, y_train = x_train.to(device), y_train.to(device) 97 | y_pred = model_(x_train) 98 | loss = loss_fn(y_pred, y_train) 99 | 100 | scaler.scale(loss).backward() 101 | scaler.step(optimizer) 102 | scaler.update() 103 | optimizer.zero_grad() 104 | i += 1 105 | if i == 3: 106 | break 107 | else: 108 | torch.cuda.nvtx.range_push("Train Loop") 109 | for x_train, y_train in tqdm(train_dataloader_, desc=f"Epoch {epoch}: "): 110 | with torch.amp.autocast(device_type="cuda", dtype=torch.float16): 111 | x_train, y_train = x_train.to(device), y_train.to(device) 112 | torch.cuda.nvtx.range_push("Forward") 113 | y_pred = model_(x_train) 114 | loss = loss_fn(y_pred, y_train) 115 | torch.cuda.nvtx.range_pop() 116 | 117 | torch.cuda.nvtx.range_push("Backward") 118 | scaler.scale(loss).backward() 119 | torch.cuda.nvtx.range_pop() 120 | torch.cuda.nvtx.range_push("Optimizer Step") 121 | scaler.step(optimizer) 122 | torch.cuda.nvtx.range_pop() 123 | scaler.update() 124 | optimizer.zero_grad() 125 | i += 1 126 | if i == 3: 127 | break 128 | torch.cuda.nvtx.range_pop() 129 | -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/1.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/2.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/3.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/4.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/5.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/6.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/7.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/pics/1/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week03_fast_pipelines/seminar/pics/1/8.jpg -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.9.0 2 | imageio==2.25.0 3 | jpeg4py==0.1.4 4 | nvprof==0.2 5 | opencv-python==4.7.0.68 6 | scikit-image==0.19.3 7 | pandas==1.5.3 8 | py-spy==0.3.14 9 | torch==2.3.0 10 | torchtext 11 | torchvision==0.18.0 12 | tqdm==4.64.1 13 | vit_pytorch==0.40.2 14 | matplotlib==3.8.2 15 | -------------------------------------------------------------------------------- /week03_fast_pipelines/seminar/train.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import torch 4 | import torch.nn as nn 5 | from tqdm.auto import tqdm 6 | 7 | 8 | def model_provider(): 9 | model = nn.Sequential( 10 | nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5), 11 | nn.ReLU(), 12 | nn.MaxPool2d(kernel_size=4), 13 | nn.Conv2d(in_channels=20, out_channels=10, kernel_size=3), 14 | nn.ReLU(), 15 | nn.MaxPool2d(kernel_size=2), 16 | nn.Flatten(), 17 | nn.Linear(2 * 2 * 10, 128), 18 | nn.ReLU(), 19 | nn.Linear(128, 1024), 20 | nn.ReLU(), 
21 | nn.Linear(1024, 10), 22 | ) 23 | return model 24 | 25 | 26 | def train( 27 | model: nn.Module, 28 | loss_fn: nn.modules.loss._Loss, 29 | optimizer: torch.optim.Optimizer, 30 | train_dataloader: torch.utils.data.DataLoader, 31 | val_dataloader: torch.utils.data.DataLoader, 32 | n_epochs: int = 3, 33 | device: torch.device = torch.device("cuda:0"), 34 | precision: Literal["full", "half"] = "full", 35 | ) -> None: 36 | if precision == "half": 37 | model.half() 38 | model.to(device) 39 | 40 | for epoch in range(n_epochs): 41 | model.train() 42 | for x_train, y_train in tqdm(train_dataloader, desc=f"Epoch {epoch}: "): 43 | if precision == "half": 44 | x_train = x_train.half() 45 | x_train, y_train = x_train.to(device), y_train.to(device) 46 | y_pred = model(x_train) 47 | 48 | loss = loss_fn(y_pred, y_train) 49 | 50 | loss.backward() 51 | optimizer.step() 52 | optimizer.zero_grad() 53 | 54 | if epoch % 2 == 0 or epoch == n_epochs - 1: 55 | print("Starting validation...") 56 | model.eval() 57 | val_loss = torch.empty(len(val_dataloader)) 58 | val_accuracy = torch.empty(len(val_dataloader)) 59 | 60 | with torch.no_grad(): 61 | for i, (x_val, y_val) in enumerate(val_dataloader): 62 | if precision == "half": 63 | x_val = x_val.half() 64 | x_val, y_val = x_val.to(device), y_val.to(device) 65 | y_pred = model(x_val) 66 | loss = loss_fn(y_pred.float(), y_val) 67 | val_loss[i] = loss 68 | val_accuracy[i] = ( 69 | (torch.argmax(y_pred, dim=-1) == y_val).float().mean() 70 | ) 71 | 72 | print( 73 | f"Epoch: {epoch}, loss: {val_loss.mean().detach().cpu()}, " 74 | f"accuracy: {val_accuracy.mean().detach().cpu()}" 75 | ) 76 | model.eval() 77 | 78 | 79 | def train_amp( 80 | model: nn.Module, 81 | loss_fn: nn.modules.loss._Loss, 82 | optimizer: torch.optim.Optimizer, 83 | train_dataloader: torch.utils.data.DataLoader, 84 | val_dataloader: torch.utils.data.DataLoader, 85 | n_epochs: int = 3, 86 | device: torch.device = torch.device("cuda:0"), 87 | precision: Literal["fp16", "bf16"] = "bf16", 88 | loss_scaling: bool = False, 89 | ) -> None: 90 | scaler = torch.cuda.amp.GradScaler() 91 | model.to(device) 92 | 93 | if precision == "fp16": 94 | dtype = torch.float16 95 | elif precision == "bf16": 96 | dtype = torch.bfloat16 97 | else: 98 | raise ValueError("Unsupported precision for amp.") 99 | 100 | for epoch in range(n_epochs): 101 | model.train() 102 | 103 | for x_train, y_train in tqdm(train_dataloader, desc=f"Epoch {epoch}: "): 104 | with torch.amp.autocast(device_type="cuda", dtype=dtype): 105 | x_train, y_train = x_train.to(device), y_train.to(device) 106 | y_pred = model(x_train) 107 | loss = loss_fn(y_pred, y_train) 108 | 109 | if loss_scaling: 110 | scaler.scale(loss).backward() 111 | scaler.step(optimizer) 112 | scaler.update() 113 | else: 114 | loss.backward() 115 | optimizer.step() 116 | 117 | optimizer.zero_grad() 118 | 119 | if epoch % 2 == 0 or epoch == n_epochs - 1: 120 | print("Starting validation...") 121 | model.eval() 122 | val_loss = torch.empty(len(val_dataloader)) 123 | val_accuracy = torch.empty(len(val_dataloader)) 124 | 125 | with torch.no_grad(): 126 | for i, (x_val, y_val) in enumerate(val_dataloader): 127 | x_val, y_val = x_val.to(device), y_val.to(device) 128 | y_pred = model(x_val) 129 | loss = loss_fn(y_pred, y_val) 130 | val_loss[i] = loss 131 | val_accuracy[i] = ( 132 | (torch.argmax(y_pred, dim=-1) == y_val).float().mean() 133 | ) 134 | 135 | print( 136 | f"Epoch: {epoch}, loss: {val_loss.mean().detach().cpu()}, " 137 | f"accuracy: {val_accuracy.mean().detach().cpu()}" 138 | ) 139 | 
model.eval() 140 | -------------------------------------------------------------------------------- /week04_data_parallel/README.md: -------------------------------------------------------------------------------- 1 | # Week 4: Data-parallel training and All-Reduce 2 | 3 | * Lecture: [link](./lecture.pdf) 4 | * Seminar: [link](./practice.ipynb) 5 | * Homework: see the [homework](./homework) folder 6 | * Video: [lecture](https://disk.yandex.ru/i/44cBFo8zKSUYjg), [seminar](https://disk.yandex.ru/i/uxzBjkKYtGFLXA) 7 | 8 | ## Further reading 9 | * [Numba parallel](https://numba.pydata.org/numba-doc/dev/user/parallel.html) - a way to develop threaded parallel code in Python without the GIL 10 | * [joblib](https://joblib.readthedocs.io/) - a library of multiprocessing primitives similar to mp.Pool, but with some extra conveniences 11 | * BytePS paper - https://www.usenix.org/system/files/osdi20-jiang.pdf 12 | * Alternative lecture: Parameter servers from CMU 10-605 - [here](https://www.youtube.com/watch?v=N241lmq5mqk) 13 | * Alternative seminar: Python multiprocessing - [playlist](https://www.youtube.com/watch?v=RR4SoktDQAw&list=PL5tcWHG-UPH3SX16DI6EP1FlEibgxkg_6) 14 | * [Python multiprocessing docs](https://docs.python.org/3/library/multiprocessing.html) (pay attention to `fork` vs `spawn`!) 15 | * [PyTorch Distributed tutorial](https://pytorch.org/tutorials/intermediate/dist_tuto.html) 16 | * [Collective communication protocols in NCCL](https://images.nvidia.com/events/sc15/pdfs/NCCL-Woolley.pdf) 17 | * There's a ton of links on the slides; please check the PDF. 18 | -------------------------------------------------------------------------------- /week04_data_parallel/homework/README.md: -------------------------------------------------------------------------------- 1 | # Week 4 home assignment 2 | 3 | The assignment for this week consists of four parts: the first three are obligatory, and the fourth is a bonus one. 4 | Include all the files with implemented functions/classes and the report for Tasks 2 and 4 in your submission. 5 | 6 | ## Task 1 (1 point) 7 | 8 | Implement the function for deterministic sequential printing of N numbers for N processes, 9 | using [sequential_print.py](./sequential_print.py) as a template. 10 | You should be able to test it with `torchrun --nproc_per_node N sequential_print.py`. 11 | Pay attention to the output format! 12 | 13 | ## Task 2 (7 points) 14 | 15 | The pipeline you saw in the seminar shows only the basic building blocks of distributed training. Now, let's train 16 | something actually interesting! 17 | 18 | ### SyncBatchNorm implementation 19 | For this task, let's take the [CIFAR-100](https://pytorch.org/vision/0.8/datasets.html#torchvision.datasets.CIFAR100) 20 | dataset and train a model with **synchronized** Batch Normalization: this version of the layer aggregates 21 | the statistics **across all workers** during each forward pass. 22 | 23 | Importantly, you have to call a communication primitive **only once** during each forward or backward pass; 24 | if you use it more than once, you will only earn up to 4 points for this task. 25 | Additionally, you are **not allowed** to use internal PyTorch functions that compute the backward pass 26 | of batch normalization: please implement it manually. 27 | 28 | ### Reducing gradient synchronization 29 | Also, implement a version of distributed training that is aware of **gradient accumulation**: 30 | for every batch that doesn't run `optimizer.step`, you do not need to run All-Reduce for gradients at all.
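In a minimal sketch (reusing `average_gradients` from [ddp_cifar100.py](./ddp_cifar100.py); `accum_steps`, `criterion` and the loaders are assumed names, and this is not a complete solution):

```python
for step, (images, labels) in enumerate(train_loader):
    loss = criterion(model(images), labels) / accum_steps
    loss.backward()  # gradients only accumulate locally on this step
    if (step + 1) % accum_steps == 0:
        average_gradients(model)  # the only All-Reduce per optimizer step
        optimizer.step()
        optimizer.zero_grad()
```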
31 | 32 | ### Benchmarking the training pipeline 33 | Compare the performance (in terms of speed, memory footprint, and final quality) of your distributed training 34 | pipeline with the one that uses primitives from PyTorch (i.e., [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel) **and** [torch.nn.SyncBatchNorm](https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html)). 35 | You need to compare the implementations by training with **at least two** processes, and your pipeline needs to have 36 | at least 2 gradient accumulation steps. 37 | 38 | ### Tests for SyncBatchNorm 39 | In addition, **test the SyncBN layer itself** by comparing the results with standard **BatchNorm1d** and changing 40 | the number of workers (1 and 4), the size of activations (128, 256, 512, 1024), and the batch size (32, 64). 41 | 42 | Compare the results of forward/backward passes in the following setup: 43 | * FP32 inputs come from the standard Gaussian distribution; 44 | * The loss function takes the outputs of batch normalization and computes the sum over all dimensions 45 | for the first B/2 samples (B is the total batch size). 46 | 47 | A working implementation of SyncBN should pass the comparison with a reasonably low `atol` (1e-3 or lower) and `rtol` equal to 0. 48 | 49 | This test needs to be implemented via `pytest` in [test_syncbn.py](./test_syncbn.py): in particular, all the above 50 | parameters (including the number of workers) need to be the inputs of that test. 51 | Therefore, you will need to **start worker processes** within the test as well: `test_batchnorm` contains helpful 52 | comments to get you started. 53 | The test can be implemented to work only on the CPU for simplicity. 54 | 55 | ### Performance benchmarks 56 | Finally, measure the GPU time (2+ workers) and the memory footprint of standard **SyncBatchNorm** 57 | and your implementation in the above setup: in total, you should have 8 speed/memory benchmarks for each implementation. 58 | 59 | ### Submission format 60 | Provide the results of your experiments in an `.ipynb`/`.pdf` report and attach it to your code 61 | when submitting the homework. 62 | Your report should include a brief experimental setup (if changed), results of all experiments **with the commands/code 63 | to reproduce them**, and the infrastructure description (version of PyTorch, number of processes, type of GPUs, etc.). 64 | 65 | Use [syncbn.py](./syncbn.py) and [ddp_cifar100.py](./ddp_cifar100.py) as templates. 66 | 67 | ## Task 3 (2 points) 68 | 69 | Until now, we have only aggregated the gradients across different workers during training. But what if we want to run 70 | distributed validation on a large dataset as well? In this assignment, you have to implement distributed metric 71 | aggregation: shard the dataset across different workers (with [scatter](https://pytorch.org/docs/stable/distributed.html#torch.distributed.scatter)), compute accuracy for each subset on 72 | its respective worker and then average the metric values on the master process. 73 | 74 | Also, make one more quality-of-life improvement to the pipeline by logging the loss (and accuracy!) 75 | only from the rank-0 process to avoid flooding the standard output of your training command. 76 | Submit the training code that includes all enhancements from Tasks 2 and 3.
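The aggregation step could look roughly like this (a sketch only: the sharding with `scatter` is left out, the shards are assumed to be of equal size, and `correct`/`total` are this worker's prediction counts):

```python
import torch
import torch.distributed as dist

local_acc = torch.tensor([correct / total])
dist.reduce(local_acc, dst=0, op=dist.ReduceOp.SUM)  # sum per-worker accuracies on rank 0
if dist.get_rank() == 0:
    print(f"val accuracy: {local_acc.item() / dist.get_world_size():.4f}")
```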
77 | 78 | ## Task 4 (bonus, 3 points) 79 | 80 | Using [allreduce.py](./allreduce.py) as a template, implement the Ring All-Reduce algorithm 81 | using only point-to-point communication primitives from `torch.distributed`. 82 | Compare it with the provided implementation of Butterfly All-Reduce 83 | and with `torch.distributed.all_reduce` in terms of CPU speed, memory usage and the accuracy of averaging. 84 | Specifically, compare custom implementations of All-Reduce with 1–32 workers and compare your implementation of 85 | Ring All-Reduce with `torch.distributed.all_reduce` on 1–16 processes and vectors of 1,000–100,000 elements. 86 | -------------------------------------------------------------------------------- /week04_data_parallel/homework/allreduce.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import torch 5 | import torch.distributed as dist 6 | from torch.multiprocessing import Process 7 | 8 | 9 | def init_process(rank, size, fn, master_port, backend="gloo"): 10 | """Initialize the distributed environment.""" 11 | os.environ["MASTER_ADDR"] = "127.0.0.1" 12 | os.environ["MASTER_PORT"] = str(master_port) 13 | dist.init_process_group(backend, rank=rank, world_size=size) 14 | fn(rank, size) 15 | 16 | 17 | def butterfly_allreduce(send, rank, size): 18 | """ 19 | Performs Butterfly All-Reduce over the process group. Modifies the input tensor in place. 20 | Args: 21 | send: torch.Tensor to be averaged with other processes. 22 | rank: Current process rank (in a range from 0 to size) 23 | size: Number of workers 24 | """ 25 | 26 | buffer_for_chunk = torch.empty((size,), dtype=torch.float) 27 | 28 | send_futures = [] 29 | 30 | for i, elem in enumerate(send): 31 | if i != rank: 32 | send_futures.append(dist.isend(elem, i)) 33 | 34 | recv_futures = [] 35 | 36 | for i, elem in enumerate(buffer_for_chunk): 37 | if i != rank: 38 | recv_futures.append(dist.irecv(elem, i)) 39 | else: 40 | elem.copy_(send[i]) 41 | 42 | for future in recv_futures: 43 | future.wait() 44 | 45 | # compute the average 46 | torch.mean(buffer_for_chunk, dim=0, out=send[rank]) 47 | 48 | for i in range(size): 49 | if i != rank: 50 | send_futures.append(dist.isend(send[rank], i)) 51 | 52 | recv_futures = [] 53 | 54 | for i, elem in enumerate(send): 55 | if i != rank: 56 | recv_futures.append(dist.irecv(elem, i)) 57 | 58 | for future in recv_futures: 59 | future.wait() 60 | for future in send_futures: 61 | future.wait() 62 | 63 | 64 | def ring_allreduce(send, rank, size): 65 | """ 66 | Performs Ring All-Reduce over the process group. Modifies the input tensor in place. 67 | Args: 68 | send: torch.Tensor to be averaged with other processes. 
69 | rank: Current process rank (in a range from 0 to size) 70 | size: Number of workers 71 | """ 72 | pass 73 | 74 | 75 | def run_butterfly_allreduce(rank, size): 76 | """Simple point-to-point communication.""" 77 | torch.manual_seed(rank) 78 | tensor = torch.randn((size,), dtype=torch.float) 79 | print("Rank ", rank, " has data ", tensor) 80 | butterfly_allreduce(tensor, rank, size) 81 | print("Rank ", rank, " has data ", tensor) 82 | 83 | 84 | if __name__ == "__main__": 85 | size = 5 86 | processes = [] 87 | port = random.randint(25000, 30000) 88 | for rank in range(size): 89 | p = Process(target=init_process, args=(rank, size, run_butterfly_allreduce, port)) 90 | p.start() 91 | processes.append(p) 92 | 93 | for p in processes: 94 | p.join() 95 | -------------------------------------------------------------------------------- /week04_data_parallel/homework/ddp_cifar100.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torchvision.transforms as transforms 8 | from torch.utils.data import DataLoader 9 | from torch.utils.data.distributed import DistributedSampler 10 | from torchvision.datasets import CIFAR100 11 | 12 | torch.set_num_threads(1) 13 | 14 | 15 | def init_process(local_rank, fn, backend="nccl"): 16 | """Initialize the distributed environment.""" 17 | dist.init_process_group(backend, rank=local_rank) 18 | size = dist.get_world_size() 19 | fn(local_rank, size) 20 | 21 | 22 | class Net(nn.Module): 23 | """ 24 | A very simple model with minimal changes from the tutorial, used for the sake of simplicity. 25 | Feel free to replace it with EffNetV2-XL once you get comfortable injecting SyncBN into models programmatically. 26 | """ 27 | 28 | def __init__(self): 29 | super().__init__() 30 | self.conv1 = nn.Conv2d(3, 32, 3, 1) 31 | self.conv2 = nn.Conv2d(32, 32, 3, 1) 32 | self.dropout1 = nn.Dropout(0.25) 33 | self.dropout2 = nn.Dropout(0.5) 34 | self.fc1 = nn.Linear(6272, 128) 35 | self.fc2 = nn.Linear(128, 100) 36 | self.bn1 = nn.BatchNorm1d(128, affine=False) # to be replaced with SyncBatchNorm 37 | 38 | def forward(self, x): 39 | x = self.conv1(x) 40 | x = F.relu(x) 41 | 42 | x = self.conv2(x) 43 | x = F.relu(x) 44 | 45 | x = F.max_pool2d(x, 2) 46 | x = self.dropout1(x) 47 | 48 | x = torch.flatten(x, 1) 49 | x = self.fc1(x) 50 | x = self.bn1(x) 51 | x = F.relu(x) 52 | x = self.dropout2(x) 53 | output = self.fc2(x) 54 | return output 55 | 56 | 57 | def average_gradients(model): 58 | size = float(dist.get_world_size()) 59 | for param in model.parameters(): 60 | dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) 61 | param.grad.data /= size 62 | 63 | 64 | def run_training(rank, size): 65 | torch.manual_seed(0) 66 | 67 | dataset = CIFAR100( 68 | "./cifar", 69 | transform=transforms.Compose( 70 | [ 71 | transforms.ToTensor(), 72 | transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)), 73 | ] 74 | ), 75 | download=True, 76 | ) 77 | # where's the validation dataset? 
78 | loader = DataLoader(dataset, sampler=DistributedSampler(dataset, size, rank), batch_size=64) 79 | 80 | model = Net() 81 | device = torch.device("cpu") # replace with "cuda" afterwards 82 | model.to(device) 83 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) 84 | 85 | num_batches = len(loader) 86 | 87 | for _ in range(10): 88 | epoch_loss = torch.zeros((1,), device=device) 89 | 90 | for data, target in loader: 91 | data = data.to(device) 92 | target = target.to(device) 93 | 94 | optimizer.zero_grad() 95 | output = model(data) 96 | loss = torch.nn.functional.cross_entropy(output, target) 97 | epoch_loss += loss.detach() 98 | loss.backward() 99 | average_gradients(model) 100 | optimizer.step() 101 | 102 | acc = (output.argmax(dim=1) == target).float().mean() 103 | 104 | print(f"Rank {dist.get_rank()}, loss: {epoch_loss / num_batches}, acc: {acc}") 105 | epoch_loss = 0 106 | # where's the validation loop? 107 | 108 | 109 | if __name__ == "__main__": 110 | local_rank = int(os.environ["LOCAL_RANK"]) 111 | init_process(local_rank, fn=run_training, backend="gloo") # replace with "nccl" when testing on several GPUs 112 | -------------------------------------------------------------------------------- /week04_data_parallel/homework/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==8.3.4 2 | torch==2.4.0 3 | torchvision==0.19.0 -------------------------------------------------------------------------------- /week04_data_parallel/homework/sequential_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch.distributed as dist 4 | 5 | 6 | def run_sequential(rank, size, num_iter=10): 7 | """ 8 | Prints the process rank sequentially according to its number over `num_iter` iterations, 9 | separating the output for each iteration by `---` 10 | Example (3 processes, num_iter=2): 11 | ``` 12 | Process 0 13 | Process 1 14 | Process 2 15 | --- 16 | Process 0 17 | Process 1 18 | Process 2 19 | ``` 20 | """ 21 | 22 | pass 23 | 24 | 25 | if __name__ == "__main__": 26 | local_rank = int(os.environ["LOCAL_RANK"]) 27 | dist.init_process_group(rank=local_rank, backend="gloo") 28 | 29 | run_sequential(local_rank, dist.get_world_size()) 30 | -------------------------------------------------------------------------------- /week04_data_parallel/homework/syncbn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from torch.autograd import Function 4 | from torch.nn.modules.batchnorm import _BatchNorm 5 | 6 | 7 | class sync_batch_norm(Function): 8 | """ 9 | A version of batch normalization that aggregates the activation statistics across all processes. 10 | 11 | This needs to be a custom autograd.Function, because you also need to communicate between processes 12 | on the backward pass (each activation affects all examples, so loss gradients from all examples affect 13 | the gradient for each activation). 
14 | 15 | For a quick tutorial on torch.autograd.function, see 16 | https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html 17 | """ 18 | 19 | @staticmethod 20 | def forward(ctx, input, running_mean, running_std, eps: float, momentum: float): 21 | # Compute statistics, sync statistics, apply them to the input 22 | # Also, store relevant quantities to be used on the backward pass with `ctx.save_for_backward` 23 | pass 24 | 25 | @staticmethod 26 | def backward(ctx, grad_output): 27 | # don't forget to return a tuple of gradients wrt all arguments of `forward`! 28 | pass 29 | 30 | 31 | class SyncBatchNorm(_BatchNorm): 32 | """ 33 | Applies Batch Normalization to the input (over the 0 axis), aggregating the activation statistics 34 | across all processes. You can assume that there are no affine operations in this layer. 35 | """ 36 | 37 | def __init__(self, num_features: int, eps: float = 1e-5, momentum: float = 0.1): 38 | super().__init__( 39 | num_features, 40 | eps, 41 | momentum, 42 | affine=False, 43 | track_running_stats=True, 44 | device=None, 45 | dtype=None, 46 | ) 47 | # your code here 48 | self.running_mean = torch.zeros((num_features,)) 49 | self.running_std = torch.ones((num_features,)) 50 | 51 | def forward(self, input: torch.Tensor) -> torch.Tensor: 52 | # You will probably need to use `sync_batch_norm` from above 53 | pass 54 | -------------------------------------------------------------------------------- /week04_data_parallel/homework/test_syncbn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from syncbn import SyncBatchNorm 3 | 4 | 5 | def test_batchnorm(num_workers, hid_dim, batch_size): 6 | # Verify that the implementation of SyncBatchNorm gives the same results (both for outputs 7 | # and gradients with respect to input) as torch.nn.BatchNorm1d on a variety of inputs. 8 | 9 | # This can help you set up the worker processes. Child processes launched with `spawn` can still run 10 | # torch.distributed primitives, but you can also communicate their outputs back to the main process to compare them 11 | # with the outputs of a non-synchronous BatchNorm. 
12 | ctx = torch.multiprocessing.get_context("spawn") 13 | 14 | pass 15 | -------------------------------------------------------------------------------- /week04_data_parallel/lecture.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week04_data_parallel/lecture.odp -------------------------------------------------------------------------------- /week04_data_parallel/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week04_data_parallel/lecture.pdf -------------------------------------------------------------------------------- /week05_large_models/README.md: -------------------------------------------------------------------------------- 1 | # Week 5: Large Models 2 | * Lecture: [slides](lecture.pdf), [source](lecture.odp), [video](https://disk.yandex.ru/i/zpUT2zZorGilMw) 3 | * Practice: [video](https://disk.yandex.ru/i/Bxp_jXdGa011Xw) 4 | * Homework: see below 5 | 6 | 7 | 8 | ### Practice / homework 9 | This homework consists of two parts: 10 | - Part 1: [`./practice_part1.ipynb`](./practice_part1.ipynb) [![open in colab](https://camo.githubusercontent.com/96889048f8a9014fdeba2a891f97150c6aac6e723f5190236b10215a97ed41f3/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/mryab/efficient-dl-systems/blob/main/week05_large_models/practice_part1.ipynb) - memory-efficient training and inference 11 | - Part 2: [`./practice_part2.ipynb`](./practice_part2.ipynb) [![open in colab](https://camo.githubusercontent.com/96889048f8a9014fdeba2a891f97150c6aac6e723f5190236b10215a97ed41f3/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/github/mryab/efficient-dl-systems/blob/main/week05_large_models/practice_part2.ipynb) - implementing model and sequence parallelism 12 | 13 | Part 1 will require you to implement memory-saving techniques such as offloading and gradient checkpointing / accumulation. To implement offloading, you may either write your own low-level code, or use the recommended trick: write your own [autograd.Function](https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function) (similar to the gradient checkpoint function) that moves the requisite modules onto the device just in time for computation (a hedged sketch of this trick is shown below, right before the references). Our practice video ('25) contains some tips on extending autograd functions, but those are optional. 14 | 15 | Part 2 is much more convenient with multiple GPUs - though, it can *potentially* be solved by emulating GPUs with CPU-only code. 16 | For YSDA and HSE students, you can use either DataSphere or one of the GPU servers available for this course (recommended). If you are an online student, you can try to register for Kaggle kernels ([they let you run on 2x T4](https://www.kaggle.com/discussions/product-feedback/361104)) in a Jupyter-like interface. That said, implementing assignments B and C in Kaggle is more difficult than intended. For non-enrolled online students, we recommend option A unless you have access to some other multi-GPU hardware or are intentionally masochistic.
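To make the offloading trick above concrete, here is a minimal, hedged sketch (the name `OffloadedCall` is ours, not part of the assignment). It assumes the module's weights live on CPU between uses and, for brevity, only propagates gradients to the input: a complete solution must also handle parameter gradients.

```python
import torch


class OffloadedCall(torch.autograd.Function):
    """Runs `module` with its weights offloaded to CPU between uses."""

    @staticmethod
    def forward(ctx, module, x):
        ctx.module = module
        ctx.save_for_backward(x)
        module.to(x.device)   # load weights just in time for computation
        with torch.no_grad():
            out = module(x)
        module.to("cpu")      # evict weights right after use
        return out

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        module = ctx.module
        module.to(grad_out.device)  # reload weights to recompute the forward
        with torch.enable_grad():
            x_ = x.detach().requires_grad_(True)
            out = module(x_)
        (grad_in,) = torch.autograd.grad(out, x_, grad_out)
        module.to("cpu")
        return None, grad_in


# usage (hypothetical): y = OffloadedCall.apply(cpu_resident_module, x)
```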
17 | 18 | 19 | ### References 20 | 21 | * PyTorch gradient checkpointing - [API reference](https://pytorch.org/docs/stable/checkpoint.html) 22 | * PyTorch native ZeRO - [FullyShardedDataParallel](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) 23 | * GPipe (one good implementation of pipelining) - [arxiv](https://arxiv.org/abs/1811.06965) 24 | * Megatron-LM - one honking great implementation of large-scale training for transformers - [repo](https://github.com/NVIDIA/Megatron-LM) 25 | * DeepSpeed (a library of many tricks) - [repo](https://github.com/microsoft/DeepSpeed) 26 | * Parameter/Optimizer State Sharding in ZeRO - [arxiv](https://arxiv.org/pdf/1910.02054v3.pdf) [blog](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/) 27 | * ZeRO-offload - moving gradients and statistics from GPU into RAM - [arxiv](https://arxiv.org/abs/2101.06840) [blog](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html) 28 | * Alpa (automated parallelism in Jax) - https://github.com/alpa-projects/alpa 29 | * ICML'22 tutorial: https://sites.google.com/view/icml-2022-big-model 30 | * FairScale - sharded DDP and pipeline from Meta - [repo](https://github.com/facebookresearch/fairscale) 31 | * [`tensor_parallel`](https://github.com/BlackSamorez/tensor_parallel) - automated tensor parallelism in PyTorch 32 | 33 | 34 | During the in-class practice, we also had several PyTorch code examples that could come in handy when training large models: 35 | 36 | __Gradient checkpointing:__ 37 | ```python 38 | import torch 39 | import torch.nn as nn 40 | from torch.utils.checkpoint import checkpoint, checkpoint_sequential 41 | 42 | class Checkpoint(nn.Sequential): 43 | def forward(self, *inputs): 44 | return checkpoint(super().forward, *inputs) 45 | 46 | class Echo(nn.Module): 47 | def __init__(self, msg: str): 48 | super().__init__() 49 | self.msg = msg # print this message during forward (for debugging) 50 | def forward(self, x): 51 | print("forward", self.msg) 52 | return x 53 | 54 | model = nn.Sequential( 55 | Checkpoint(nn.Linear(1000, 1000), nn.ReLU(), Echo("layer1 done"), 56 | nn.Linear(1000, 1000), nn.ReLU(), Echo("layer2 done")), 57 | Checkpoint(nn.Linear(1000, 1000), nn.ReLU(), Echo("layer3 done"), 58 | nn.Linear(1000, 1000), nn.ReLU(), Echo("layer4 done")), 59 | nn.Linear(1000, 1000), nn.ReLU(), Echo("layer5 done"), 60 | ) 61 | 62 | inputs = torch.randn(16, 1000, requires_grad=True) 63 | # note: we must set inputs' requires_grad=True because checkpoints require at least one input with grad for backprop 64 | outputs = model(inputs) 65 | outputs.norm().backward() # Echo layers will print in the following order: 1 2 3 4 5 3 4 1 2 66 | ``` 67 | -------------------------------------------------------------------------------- /week05_large_models/lecture.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week05_large_models/lecture.odp -------------------------------------------------------------------------------- /week05_large_models/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week05_large_models/lecture.pdf -------------------------------------------------------------------------------- /week06_fsdp/README.md:
-------------------------------------------------------------------------------- 1 | # Week 6: Sharded data-parallel training, distributed training optimizations 2 | 3 | * Lecture: TBA 4 | * Seminar: [link](./practice.ipynb) 5 | * Homework: see the [homework](./homework) folder -------------------------------------------------------------------------------- /week06_fsdp/homework/.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "build": { 3 | "dockerfile": "Dockerfile" 4 | }, 5 | "name": "hw_fsdp", 6 | "containerUser": "vscode", 7 | "runArgs": [ 8 | "--name", "hw_fsdp_${localEnv:USER}", 9 | "--restart", "unless-stopped", 10 | "--gpus", "all", 11 | "--ipc", "host", 12 | "--ulimit", "memlock=-1", 13 | "--ulimit", "stack=67108864" 14 | ], 15 | "postCreateCommand": { 16 | "packages-editable-mode-compat": "./setup.sh" 17 | }, 18 | "customizations": { 19 | "vscode": { 20 | "extensions": [ 21 | "ms-python.python" 22 | ] 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /week06_fsdp/homework/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | train 3 | 1b.patch 4 | torchtitan 5 | -------------------------------------------------------------------------------- /week06_fsdp/homework/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "train.py — FSDP2", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "program": "/usr/local/bin/torchrun", 9 | "console": "integratedTerminal", 10 | "justMyCode": false, 11 | "env": { 12 | "OMP_NUM_THREADS": "1", 13 | "CUDA_VISIBLE_DEVICES": "0,1" 14 | }, 15 | "args": [ 16 | "--master-port", "29600", 17 | "--nproc-per-node", "2", 18 | "--local-ranks-filter", "0", 19 | 20 | "./train.py", 21 | "--flavor", "debugmodel", 22 | "--batch-size", "2", 23 | "--seq-len", "1024", 24 | "--training-steps", "20", 25 | "--warmup-steps", "5", 26 | "--gc-freq", "5", 27 | "--profile-freq", "10", 28 | 29 | "--param-dtype", "float16", 30 | "--reduce-dtype", "float32", 31 | 32 | "--dump-folder", "./train/fsdp-2" 33 | ] 34 | }, 35 | { 36 | "name": "train.py — hwFSDP", 37 | "type": "debugpy", 38 | "request": "launch", 39 | "program": "/usr/local/bin/torchrun", 40 | "console": "integratedTerminal", 41 | "justMyCode": false, 42 | "env": { 43 | "OMP_NUM_THREADS": "1", 44 | "CUDA_VISIBLE_DEVICES": "0,1" 45 | }, 46 | "args": [ 47 | "--master-port", "29601", 48 | "--nproc-per-node", "2", 49 | "--local-ranks-filter", "0", 50 | 51 | "./train.py", 52 | "--flavor", "debugmodel", 53 | "--batch-size", "2", 54 | "--seq-len", "1024", 55 | "--training-steps", "20", 56 | "--warmup-steps", "5", 57 | "--gc-freq", "5", 58 | "--profile-freq", "10", 59 | 60 | "--param-dtype", "float16", 61 | "--reduce-dtype", "float32", 62 | 63 | "--hw-fsdp", 64 | "--dump-folder", "./train/hw-fsdp" 65 | ] 66 | }, 67 | { 68 | "name": "test.py – FSDP2", 69 | "type": "debugpy", 70 | "request": "launch", 71 | "program": "/usr/local/bin/torchrun", 72 | "console": "integratedTerminal", 73 | "justMyCode": false, 74 | "env": { 75 | "OMP_NUM_THREADS": "1", 76 | "CUDA_VISIBLE_DEVICES": "6,7", 77 | // "CUDA_LAUNCH_BLOCKING": "1", 78 | }, 79 | "args": [ 80 | "--master-port", "29600", 81 | "--nproc-per-node", "2", 82 | "--local-ranks-filter", "0", 83 | 84 | "./test.py", 85 | "--framework", "fsdp-2", 86 | "--dump-dir", "./test/fsdp-2" 87 | ] 88 | }, 89 
| { 90 | "name": "test.py – hwFSDP", 91 | "type": "debugpy", 92 | "request": "launch", 93 | "program": "/usr/local/bin/torchrun", 94 | "console": "integratedTerminal", 95 | "justMyCode": false, 96 | "env": { 97 | "OMP_NUM_THREADS": "1", 98 | "CUDA_VISIBLE_DEVICES": "6,7", 99 | // "CUDA_LAUNCH_BLOCKING": "1", 100 | }, 101 | "args": [ 102 | "--master-port", "29601", 103 | "--nproc-per-node", "2", 104 | "--local-ranks-filter", "0", 105 | 106 | "./test.py", 107 | "--framework", "hw-fsdp", 108 | "--dump-dir", "./test/hw-fsdp", 109 | ] 110 | }, 111 | ] 112 | } 113 | -------------------------------------------------------------------------------- /week06_fsdp/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:25.01-py3 2 | 3 | RUN apt-get update \ 4 | && apt-get install --no-install-recommends -y \ 5 | sudo \ 6 | && apt-get clean \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | ARG USERNAME=vscode 10 | RUN useradd -U -m "$USERNAME" \ 11 | && echo "$USERNAME" ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/"$USERNAME" \ 12 | && chmod 0440 /etc/sudoers.d/"$USERNAME" 13 | -------------------------------------------------------------------------------- /week06_fsdp/homework/README.md: -------------------------------------------------------------------------------- 1 | # Week 6 assignment 2 | 3 | ## Task 1 — `FULL_SHARD` (no overlap) (6 points) 4 | 5 | `hw_fsdp.py` contains a draft of a simple FSDP implementation (we'll call it 6 | hwFSDP). 7 | 8 | - Fill in the TODOs in `hw_fsdp.py` to achieve a functioning FSDP 9 | implementation. 10 | - Validate your hwFSDP against FSDP2 using `train.py` (more info in the 11 | [Notes](#notes) section). 12 | - Make sure your losses and grad norms match the ones of an FSDP2 run. Attach 13 | both runs' logs for validation. 14 | - Make sure to free unsharded params after each `FSDPModule`'s forward and 15 | backward. 16 | - Make sure to free unsharded grads after each `FSDPModule`'s reduce-scatter. 17 | - Attach a memory snapshot of a hwFSDP run for validation. 18 | - Make sure the memory usage is similar to the one of FSDP2 (attach a memory 19 | snapshot of an FSDP2 run as well). 20 | - Sharded params must be instances of `DTensor` with the correct mesh and 21 | placements. 22 | - A functioning forward pass gets you `3 points` and a fully functioning step gets 23 | you another `3 points`. 24 | 25 | No computation / communication overlap is required in this part of the 26 | assignment. 27 | 28 | ## Task 2 — `FULL_SHARD` (implicit forward prefetch) (2 points) 29 | 30 | - Make changes to hwFSDP to overlap forward communications (parameter gathering) 31 | with forward computations. Make use of multiple CUDA streams and use CUDA 32 | events to sync them. 33 | - Make sure losses and grad norms still match the FSDP2 ones (or are close). 34 | - Make sure memory usage is still fine. 35 | - Attach a profile trace which depicts the overlap (more on traces in the 36 | [Notes](#notes) section). 37 | 38 | ## Task 3 — `FULL_SHARD` (explicit backward prefetch) (2 points) 39 | 40 | - Overlap backward communications (gradient reduction and parameter gathering) 41 | with backward computations. 42 | - Just as before, validate losses and grad norms, and make sure memory usage is 43 | okay. 44 | - Attach a profile trace which depicts the overlap. 45 | 46 | ## Bonus 47 | 48 | ### Activation checkpointing support (1 point) 49 | 50 | - Make changes to hwFSDP to support using activation checkpointing with hwFSDP.
51 | - Validate losses, grad norms and memory; if you've achieved overlap, make sure 52 | it's still there. 53 | 54 | ### `reshard_after_forward=False` support (1 point) 55 | 56 | - Make changes to hwFSDP to support no resharding after forward (aka ZeRO-2). 57 | - Validate losses, grad norms and memory; if you've achieved overlap, make sure 58 | it's still there. 59 | - Attach a trace which shows that there are no parameter all-gathers during 60 | the backward pass. 61 | 62 | ## Notes 63 | 64 | - It is recommended to debug your code using a 65 | [dev-container](https://code.visualstudio.com/docs/devcontainers/containers) 66 | with the configuration provided in `.devcontainer.json` and debug configs from 67 | `.vscode/launch.json`. 68 | - Debug configs launch hwFSDP and FSDP2 runs of the `train.py` script. 69 | - `train.py` runs a debug Llama pre-train, logs metrics, saves profiling traces 70 | and memory snapshots. 71 | - Overlap can be checked using profiling traces. To visualize them, use 72 | [perfetto.dev](https://perfetto.dev). `train.py` saves profiling traces to 73 | `train/(hw-fsdp|fsdp-2)/profile_trace`. 74 | - Memory snapshots can be visualized using 75 | [pytorch.org/memory_viz](https://pytorch.org/memory_viz). `train.py` saves memory 76 | snapshots to `train/(hw-fsdp|fsdp-2)/memory_snapshot`. 77 | - Tip: to get a clear picture of the overlap, GPUs must be pretty well utilized; 78 | to achieve that, change the model flavour from `debugmodel` to `1B` and 79 | increase seq-len until the utilization is high enough (by default `train.py` 80 | runs a small debug model). 81 | -------------------------------------------------------------------------------- /week06_fsdp/homework/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | 4 | sudo rm -rf torchtitan 5 | git clone -q https://github.com/pytorch/torchtitan 6 | git -C torchtitan checkout -q 49c6d6fc15ef644e5c3b1003ad4e0d9ea5fcb9a9 7 | curl -s https://gist.githubusercontent.com/antony-frolov/c2e69bbda2b4418b1ab1c99839c55877/raw/c873709f6fe34dbf8ba678302e4fa92d6ed8c7f1/1b.patch -o 1b.patch 8 | patch -s -p1 -i ../1b.patch -d torchtitan 9 | sudo pip install -q fire triton -r ./torchtitan/requirements.txt ./torchtitan 10 | sudo apt-get update -qq && sudo apt-get install -qq pciutils 11 | -------------------------------------------------------------------------------- /week06_fsdp/slides/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | pnpm-lock.yaml 3 | -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_no_reduce_grads_no_reshard_after_backward_no_reshard_after_forward_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_no_reduce_grads_no_reshard_after_backward_no_reshard_after_forward_mem_snap.png --------------------------------------------------------------------------------
/week06_fsdp/slides/assets/1b_no_reduce_grads_no_reshard_after_backward_no_reshard_after_forward_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_no_reduce_grads_no_reshard_after_backward_no_reshard_after_forward_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_no_reshard_after_backward_no_reshard_after_forward_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_no_reshard_after_backward_no_reshard_after_forward_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_no_reshard_after_backward_no_reshard_after_forward_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_no_reshard_after_backward_no_reshard_after_forward_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_no_reshard_after_forward_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_no_reshard_after_forward_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_reshard_after_forward_4_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_reshard_after_forward_4_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/1b_reshard_after_forward_4_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/1b_reshard_after_forward_4_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_compile_backward_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_compile_backward_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_compile_forward_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_compile_forward_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_compile_trace.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_compile_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_cpu_offload_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_cpu_offload_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_no_compile_iter_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_no_compile_iter_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_on_cuda_model_mem_snap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_on_cuda_model_mem_snap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/8b_reshard_after_forward_4_trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/8b_reshard_after_forward_4_trace.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dcp_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dcp_1.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dcp_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dcp_2.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dcp_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dcp_3.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dcp_saving_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dcp_saving_flow.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/device_mesh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/device_mesh.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dtensor_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dtensor_1.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dtensor_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dtensor_2.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/dtensor_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/dtensor_3.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/forward_hook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/forward_hook.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/forward_pre_hook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/forward_pre_hook.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/fsdp_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/fsdp_workflow.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/fsdp_wrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/fsdp_wrap.png -------------------------------------------------------------------------------- /week06_fsdp/slides/assets/streams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/assets/streams.png -------------------------------------------------------------------------------- /week06_fsdp/slides/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "@slidev/cli": "^51.3.0", 4 | "@slidev/theme-default": "^0.25.0", 5 | "playwright-chromium": "^1.50.1" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /week06_fsdp/slides/slides.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: FSDP seminar 3 | transition: slide-left 4 | --- 5 | 6 | # FSDP seminar 7 | 8 | --- 9 | 10 | # Plan 11 | 12 |
13 |
14 |
15 | 16 | - Prerequisites: CUDA streams / events, DeviceMesh, DTensor 17 | - FSDP2: interface, options, internals 18 | - PyTorch DCP, efficient garbage collection 19 | 20 | --- 21 | 22 | # CUDA streams and events 23 | 24 | ```python 25 | all_gather_stream = torch.cuda.Stream() 26 | 27 | ... 28 | 29 | # layer 3 unshard 30 | with torch.cuda.stream(all_gather_stream): 31 | model.layers[3].all_gather() 32 | all_gather_event_3 = all_gather_stream.record_event() 33 | # (the event must be recorded on the stream, not just constructed) 34 | 35 | # layer 2 forward 36 | activations = model.layers[2](activations) 37 | 38 | # layer 4 unshard 39 | with torch.cuda.stream(all_gather_stream): 40 | model.layers[4].all_gather() 41 | all_gather_event_4 = all_gather_stream.record_event() 42 | 43 | # layer 3 forward 44 | torch.cuda.default_stream().wait_event(all_gather_event_3) 45 | activations = model.layers[3](activations) 46 | 47 | ... 48 | 49 | ``` 50 | 51 | --- 52 | 53 | # CUDA streams and events 54 | 55 |
56 |
57 |
58 | 59 | 60 | 61 | 64 | 65 | --- 66 | 67 | # DeviceMesh 68 | 69 | 70 | 71 | --- 72 | 73 | # DeviceMesh 74 | 75 | ```python 76 | from torch.distributed.device_mesh import init_device_mesh 77 | 78 | mesh_1d = init_device_mesh("cuda", mesh_shape=(8,), mesh_dim_names=("dp",)) 79 | mesh_2d = init_device_mesh("cpu", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp")) 80 | mesh_3d = init_device_mesh( 81 | "cuda", 82 | mesh_shape=(2, 2, 8), 83 | mesh_dim_names=("pp", "dp", "tp"), 84 | ) 85 | 86 | dp_group = mesh_2d.get_group("dp") 87 | dist.all_gather(..., group=dp_group) 88 | 89 | mesh_2d.get_local_rank("tp") 90 | 91 | mesh_3d["dp", "tp"]._flatten("dp_tp") 92 | 93 | ``` 94 | 95 | --- 96 | 97 | # DTensor 98 | 99 | ```python 100 | from torch.distributed.tensor import DTensor, distribute_tensor 101 | 102 | mesh = init_device_mesh("cuda", mesh_shape=(8,), mesh_dim_names=("dp",)) 103 | big_tensor = torch.randn(1024, 4096) 104 | placements = (Shard(dim=0),) 105 | 106 | dtensor = distribute_tensor( 107 | big_tensor, 108 | device_mesh=mesh, 109 | placements=placements, 110 | ) 111 | dtensor._local_tensor 112 | dtensor.to_local() # .shape = (512, 4096) 113 | 114 | 115 | shard = ... # .shape = (512, 4096) 116 | DTensor.from_local( 117 | shard, 118 | device_mesh=mesh, 119 | placements=placements, 120 | ) # .shape = (1024, 4096) 121 | 122 | dtensor.redistribute(placements=(Replicate(),)) 123 | dtensor.full_tensor() 124 | ``` 125 | 126 | --- 127 | 128 | # DTensor 129 | 130 | 131 |
132 | 133 | 134 | --- 135 | 136 | # DTensor 137 | 138 | 139 | 140 | --- 141 | 142 | # FSDP2 143 | 144 | 145 | 146 | --- 147 | layout: two-cols-header 148 | layoutClass: gap-5 149 | --- 150 | 151 | # FSDP2 152 | 153 | ::left:: 154 | 155 | ```python 156 | from torch.distributed.fsdp import fully_shard 157 | 158 | mesh_2d = init_device_mesh( 159 | "cuda", 160 | mesh_shape=(2, 8), 161 | mesh_dim_names=("dp", "tp"), 162 | ) 163 | model = Model() 164 | 165 | for layer in model.layers: 166 | fully_shard( 167 | module, # (module1, module2) 168 | mesh=dp_mesh, 169 | reshard_after_forward=True, # ZeRO-3 170 | mp_policy=MixedPrecisionPolicy( 171 | param_dtype=torch.float16, 172 | reduce_dtype=torch.float32, 173 | ), 174 | offload_policy=CPUOffloadPolicy(), 175 | ) 176 | 177 | fully_shard(model, ...) 178 | ``` 179 | 180 | ::right:: 181 | 182 | ```python 183 | for step in ...: 184 | for gas_step in ...: 185 | is_last_backward = gas_step == num_gas_steps - 1 186 | # ZeRO-3 187 | model.set_reshard_after_backward(is_last_backward) 188 | # ZeRO-2 189 | model.requires_gradient_sync(is_last_backward) 190 | 191 | loss = loss_fn(model(inputs), targets) 192 | ... 193 | ``` 194 | 195 | --- 196 | 197 | # FSDP2 198 | 199 | 200 | 201 | --- 202 | 203 | # FSDP2 — hooks 204 | 205 | 206 |
207 | 208 | 209 | --- 210 | 211 | # FSDP2 — pre-forward 212 | 213 | ```python 214 | def pre_forward(module, args): 215 | module.unshard() # in all-gather stream 216 | module.wait_for_unshard() # sync compute (default) stream with all-gather stream 217 | module._register_post_backward_hook(args) 218 | return args 219 | 220 | def unshard(module): 221 | with torch.cuda.stream(all_gather_stream): 222 | module.all_gather() 223 | module.all_gather_event = all_gather_stream.record_event() 224 | module.set_unsharded_params() 225 | 226 | def wait_for_unshard(module): 227 | torch.cuda.default_stream().wait_event(module.all_gather_event) 228 | 229 | def fully_shard(module, ...): 230 | ... 231 | module.register_forward_pre_hook(pre_forward) 232 | ``` 233 | 234 | 237 | 238 | --- 239 | 240 | # FSDP2 — post-forward 241 | 242 | ```python 243 | def post_forward(module, args, output): 244 | module.reshard() 245 | module._record_post_forward() 246 | module._register_pre_backward_hook(output) 247 | return output 248 | 249 | def reshard(module): 250 | module.set_sharded_params() # and free unsharded params 251 | 252 | def _record_post_forward(module): 253 | post_forward_index = len(module.comm_ctx.post_forward_order) 254 | module.comm_ctx.post_forward_order.append(module) 255 | module._post_forward_indices.append(post_forward_index) 256 | 257 | def fully_shard(module, ...): 258 | ... 259 | module.register_forward_hook(post_forward) 260 | ``` 261 | 262 | --- 263 | 264 | # FSDP2 — pre-backward 265 | 266 | ```python 267 | def pre_backward(module, *unused): 268 | module.unshard() # no-op if prefetched 269 | module.wait_for_unshard() 270 | module._backward_prefetch() 271 | 272 | def _backward_prefetch(module): 273 | curr_index = module._post_forward_indices.pop() 274 | target_index = curr_index - 1 275 | target_module = self.comm_ctx.post_forward_order[target_index] 276 | target_module.unshard() 277 | 278 | def _register_pre_backward_hook(self, output): 279 | for t in output: 280 | if torch.is_tensor(t) and t.requires_grad: 281 | t.register_hook(self._pre_backward) 282 | return output 283 | ``` 284 | 285 | --- 286 | 287 | # FSDP2 — post-backward 288 | 289 | ```python 290 | def post_backward(module, *unused: Any): 291 | if module.reshard_after_backward: 292 | module.reshard() 293 | if module.reduce_grads: 294 | reduce_scatter_stream.wait_stream(torch.cuda.default_stream()) 295 | with torch.cuda.stream(reduce_scatter_stream): 296 | module.reduce_scatter_grads() 297 | reduce_event = reduce_scatter_stream.record_event() 298 | 299 | def _register_post_backward_hook(module, args): 300 | RegisterPostBackwardFunction.apply(self, *args) 301 | 302 | class RegisterPostBackwardFunction(torch.autograd.Function): 303 | @staticmethod 304 | def forward(ctx, module, *inputs): 305 | ctx.module = module 306 | return inputs 307 | 308 | @staticmethod 309 | def backward(ctx, *grads): 310 | module.post_backward() 311 | return (None,) + grads 312 | ``` 313 | 314 | --- 315 | 316 | # FSDP2 — memory 317 | 318 | 319 | 320 | --- 321 | 322 | # FSDP2 — memory 323 | 324 | 325 | 326 | --- 327 | 328 | # Computation / communication overlap 329 | 330 | - Implicit prefetching 331 | - в `pre_forward` 332 | - Explicit prefetching 333 | - в `pre_backward` 334 | - можно задать руками 335 | 336 | ```python 337 | module.set_modules_to_forward_prefetch(modules) 338 | module.set_modules_to_backward_prefetch(modules) 339 | ``` 340 | 341 | --- 342 | 343 | # Подробнее про работу со стримами 344 | 345 |
346 |
347 |
348 |
349 | 350 | 351 | 352 |
353 | 354 | --- 355 | 356 | # More on working with streams — forward 357 | 358 |
359 |
360 |
361 |
362 | 363 | 364 | 365 |
366 | --- 367 | 368 | # More on working with streams — backward 369 | 370 |
371 |
372 |
373 |
374 | 375 | 376 | 377 |
378 | 379 | --- 380 | 381 | # ZeRO-2 382 | 383 |
384 |
385 |
386 |
387 | 388 | 389 | 390 |
391 | 392 | --- 393 | 394 | # ZeRO-2 395 | 396 | 397 | 398 | --- 399 | 400 | # ZeRO-1 401 | 402 |
403 |
404 |
405 |
406 | 407 | 408 | 409 |
410 | 411 | --- 412 | 413 | # ZeRO-1 414 | 415 | 416 | 417 | --- 418 | 419 | # HSDP 420 | 421 | ```python 422 | mesh_2d = init_device_mesh( 423 | "cpu", 424 | mesh_shape=(2, 8), 425 | mesh_dim_names=("dp_replicate", "dp_shard"), 426 | ) 427 | 428 | fully_shard( 429 | module, 430 | mesh=mesh_2d, 431 | ... 432 | ) 433 | ``` 434 | 435 |
436 | 437 | - the logic becomes noticeably more complex, so I won't show it here :( 438 | 439 | --- 440 | 441 | # CPU offloading 442 | 443 | - [ZeRO-Offload](https://arxiv.org/pdf/2101.06840) 444 | 445 | ```python 446 | with torch.device("cpu"): 447 | model = Model() 448 | 449 | fully_shard( 450 | module, 451 | ... 452 | offload_policy=CPUOffloadPolicy(), 453 | ) 454 | 455 | def unshard(module): 456 | sharded_param = sharded_param.to( 457 | device, 458 | non_blocking=True, 459 | ) 460 | ... 461 | module.all_gather() 462 | 463 | def post_backward(module): 464 | new_sharded_grad = new_sharded_grad.to( 465 | torch.device("cpu"), 466 | non_blocking=True 467 | ) 468 | ``` 469 | 470 | --- 471 | 472 | # CPU offloading 473 | 474 | 475 | 476 | --- 477 | 478 | # hpZ 479 | 480 | - [ZeRO++](https://arxiv.org/pdf/2306.10209) 481 | 482 | ```python 483 | mesh = init_device_mesh( 484 | "cuda", 485 | mesh_shape=(16,), 486 | mesh_dim_names=("dp",), 487 | ) 488 | fully_shard( 489 | module, 490 | mesh, 491 | ... 492 | reshard_after_forward=8, 493 | ) 494 | ``` 495 | 496 | --- 497 | 498 | # hpZ 499 | 500 | 501 | 502 | --- 503 | 504 | # hpZ 505 | 506 | 507 | 508 | --- 509 | 510 | # PyTorch DCP 511 | 512 | - two kinds of `state_dict` 513 | - `SHARDED_STATE_DICT` 514 | - `FULL_STATE_DICT` 515 | - in FSDP2 it is always sharded, but consists of DTensors 516 | - checkpoint sharding can be changed with `.redistribute()` 517 | - DCP can save checkpoints efficiently, with minimal overhead 518 | 519 | --- 520 | 521 | # PyTorch DCP 522 | 523 | ```python 524 | import torch.distributed.checkpoint as dcp 525 | model = Model() 526 | fully_shard(model) 527 | optimizer = Optimizer(model.parameters()) 528 | 529 | state_dict = { 530 | "model": model.state_dict(), 531 | "optimizer": optimizer.state_dict() 532 | } 533 | dcp.state_dict_saver.save(state_dict) 534 | dcp.state_dict_loader.load(state_dict) 535 | ``` 536 | 537 |
538 | 539 | - [truthfully it's a bit more complicated](https://github.com/pytorch/torchtitan/blob/main/torchtitan/checkpoint.py) 540 | 541 | --- 542 | 543 | # PyTorch DCP 544 | 545 | 546 | 547 | --- 548 | 549 | # PyTorch DCP 550 | 551 |
552 |
553 | 554 | 555 | 556 | --- 557 | 558 | # PyTorch DCP 559 | 560 |
561 |
562 | 563 | 564 | 565 | --- 566 | 567 | # PyTorch DCP 568 | 569 |
570 |
571 | 572 | 573 | 574 | --- 575 | 576 | # Garbage collection tuning 577 | 578 | ```python 579 | gc.disable() 580 | gc.collect(1) 581 | 582 | ... init 583 | 584 | for step in ...: 585 | if step > 1 and step % _gc_freq == 0: 586 | gc.collect(1) 587 | 588 | ... step 589 | ``` 590 | 591 | --- 592 | 593 | # Extras 594 | 595 | - [SimpleFSDP](https://arxiv.org/pdf/2411.00284) 596 | - `unshard_in_backward` 597 | - meta device init 598 | - compile 599 | 600 | --- 601 | 602 | # Code 603 | 604 | - you can play around with all of this in the [notebook](https://www.kaggle.com/code/antonyfrolov/practice-ipynb) 605 | - a debugging pipeline 606 | -------------------------------------------------------------------------------- /week06_fsdp/slides/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week06_fsdp/slides/slides.pdf -------------------------------------------------------------------------------- /week07_application_deployment/01_python_server/README.md: -------------------------------------------------------------------------------- 1 | # Python server 2 | 3 | * Put `vgg16.pt` inside this directory 4 | 5 | * Run 6 | ```bash 7 | pip3 install -r requirements.txt 8 | python3 server.py 9 | ``` 10 | 11 | ## Further reading 12 | 13 | * Flask lib - https://flask.palletsprojects.com/en/3.0.x/ 14 | * HTTP - https://developer.mozilla.org/en-US/docs/Web/HTTP/Overview -------------------------------------------------------------------------------- /week07_application_deployment/01_python_server/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==3.1.0 2 | torch==2.1.2 3 | torchvision==0.16.2 4 | numpy==1.26.4 5 | -------------------------------------------------------------------------------- /week07_application_deployment/01_python_server/server.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import torch 4 | 5 | from flask import Flask, request, jsonify 6 | 7 | app = Flask(__name__, static_url_path="") 8 | 9 | model = torch.jit.load(f'{os.path.dirname(os.path.abspath(__file__))}/vgg16.pt') 10 | with open(f'{os.path.dirname(os.path.abspath(__file__))}/labels.json', 'r') as f: 11 | labels_raw = json.loads(f.read()) 12 | labels = {int(index): value for index, value in enumerate(labels_raw)} 13 | 14 | 15 | @app.route("/predict", methods=['POST']) 16 | def predict(): 17 | data = request.get_json(force=True) 18 | features = torch.tensor(data['data']) 19 | 20 | result = model(features).data.numpy().argmax() 21 | label = labels[result] 22 | 23 | return jsonify({ 24 | "label": label 25 | }) 26 | 27 | 28 | if __name__ == '__main__': 29 | app.run(host='0.0.0.0', port=8080) 30 | -------------------------------------------------------------------------------- /week07_application_deployment/02_docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | COPY requirements.txt . 4 | RUN pip3 install -r requirements.txt 5 | 6 | RUN mkdir /app 7 | WORKDIR /app 8 | 9 | COPY labels.json . 10 | COPY vgg16.pt . 11 | COPY server.py .
12 | 13 | ENTRYPOINT ["python3", "server.py"] 14 | -------------------------------------------------------------------------------- /week07_application_deployment/02_docker/README.md: -------------------------------------------------------------------------------- 1 | # Containerized server 2 | 3 | * Put `vgg16.pt` inside this directory 4 | 5 | * Install Docker and Docker Compose 6 | 7 | * On Ubuntu 8 | ```bash 9 | curl https://get.docker.com -L | bash 10 | DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker} 11 | mkdir -p $DOCKER_CONFIG/cli-plugins 12 | curl -SL https://github.com/docker/compose/releases/download/v2.33.0/docker-compose-linux-x86_64 -o $DOCKER_CONFIG/cli-plugins/docker-compose 13 | chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose 14 | ``` 15 | 16 | * On other systems 17 | * [Docker](https://docs.docker.com/engine/install) 18 | * [Docker Compose](https://docs.docker.com/compose/install/) 19 | 20 | * If you are using macOS, disable BuildKit before building: 21 | 22 | 23 | ```bash 24 | export DOCKER_BUILDKIT=0 25 | export COMPOSE_DOCKER_CLI_BUILD=0 26 | ``` 27 | 28 | * Build and run a simple Python server: 29 | 30 | ```bash 31 | docker compose up --build 32 | ``` 33 | 34 | * Build and run a production-ready server: 35 | 36 | ```bash 37 | docker compose -f docker-compose.production.yaml up --build 38 | ``` 39 | 40 | ## Further reading 41 | 42 | * Docker - https://docs.docker.com/ 43 | * Gunicorn - https://docs.gunicorn.org/en/stable/index.html 44 | * Nginx - https://nginx.org/ru/ 45 | * Why gunicorn & nginx - https://docs.gunicorn.org/en/stable/deploy.html 46 | * Supervisord - http://supervisord.org/ 47 | * GIL - https://habr.com/ru/post/84629/ 48 | 49 | -------------------------------------------------------------------------------- /week07_application_deployment/02_docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | app: 5 | build: 6 | dockerfile: Dockerfile 7 | context: .
8 | ports: 9 | - "8080:8080" 10 | -------------------------------------------------------------------------------- /week07_application_deployment/02_docker/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==3.1.0 2 | torch==2.1.2 3 | torchvision==0.16.2 4 | numpy==1.26.4 5 | gunicorn==23.0.0 -------------------------------------------------------------------------------- /week07_application_deployment/02_docker/server.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from flask import Flask, request, jsonify 6 | 7 | app = Flask(__name__, static_url_path="") 8 | 9 | model = torch.jit.load(f'{os.path.dirname(os.path.abspath(__file__))}/vgg16.pt') 10 | with open(f'{os.path.dirname(os.path.abspath(__file__))}/labels.json', 'r') as f: 11 | labels_raw = json.loads(f.read()) 12 | labels = {int(index): value for index, value in enumerate(labels_raw)} 13 | 14 | 15 | @app.route("/predict", methods=['POST']) 16 | def predict(): 17 | data = request.get_json(force=True) 18 | features = torch.tensor(data['data']) 19 | 20 | result = model(features).data.numpy().argmax() 21 | label = labels[result] 22 | 23 | return jsonify({ 24 | "label": label 25 | }) 26 | 27 | 28 | if __name__ == '__main__': 29 | app.run(host='0.0.0.0', port=8080) 30 | -------------------------------------------------------------------------------- /week07_application_deployment/03_orchestration/README.md: -------------------------------------------------------------------------------- 1 | You can use any cloud provider to get some machines to add to the swarm. 2 | The previous years had a guide on how to use docker swarm with machines created locally via VirtualBox. 3 | It used to operate using `docker-machine`, but it is now deprecated. You can check previous years for details. 4 | 5 | YZ: In this guide for previous years, local machines were created using VirtualBox. 6 | Since I am a weirdo who uses WSL2, setting everything up properly was a pain in the ass. 7 | If someone wants to replicate the setup, I **definitely** recommend the cloud way. 8 | Using pure Linux (non-WSL) may be easier, but is still not recommended. 9 | What we'd need here is just some VM with Docker installed that we can ssh into, replicated 3 times. 10 | Ensure the Docker version is the same; this is VERY important! 11 | Also check firewalls (use ChatGPT for `sudo ufw allow icmp`-like commands and troubleshooting) 12 | 13 | On machine 1, run: 14 | ```bash 15 | docker swarm init --advertise-addr MACHINE_1_IP 16 | ``` 17 | 18 | On machines 2 and 3, run 19 | ```bash 20 | docker swarm join --token SWMTKN-1-TOKEN MACHINE_1_IP:2377 21 | ``` 22 | 23 | If docker swarm commands hang, don't forget to check that the machines can ping each other and that something is listening on port 2377: 24 | ```shell 25 | nc -zv MACHINE_1_IP 2377 26 | ``` 27 | 28 | Check that nodes are indeed available (on the manager node, aka node1): 29 | ```shell 30 | docker node ls 31 | ``` 32 | 33 | You should see all nodes. 34 | 35 | Then, on machine 1, 36 | 37 | ```bash 38 | docker service create \ 39 | --name=viz \ 40 | --publish=8080:8080/tcp \ 41 | --constraint=node.role==manager \ 42 | --mount=type=bind,src=/var/run/docker.sock,dst=/var/run/docker.sock \ 43 | dockersamples/visualizer 44 | 45 | docker ps 46 | docker service ls 47 | ``` 48 | 49 | For local setup, you might need to pass additional arguments and set up ingress so it listens to local addresses.
50 | `docker network inspect ingress | grep Subnet` should show the ingress subnet. Again, going through all this is *NOT* recommended. 51 | 52 | Open `MACHINE1_IP:8080` (and, because of ingress routing, `MACHINE2_IP:8080` should give the same result too) 53 | 54 | SSH into machine 1 (you should move the compose file there first) and run 55 | 56 | ```bash 57 | docker stack deploy --compose-file docker-compose.swarm.yaml fileservice 58 | ``` 59 | 60 | Open `MACHINE1_IP:9090` 61 | -------------------------------------------------------------------------------- /week07_application_deployment/03_orchestration/docker-compose.swarm.yaml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | server: 5 | image: halverneus/static-file-server:latest 6 | deploy: 7 | mode: replicated 8 | replicas: 3 # add 2 extra replicas of this service 9 | volumes: 10 | - /etc:/target-folder 11 | environment: 12 | FOLDER: /target-folder 13 | ports: 14 | - "9090:8080" 15 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | COPY requirements.txt . 4 | RUN pip3 install -r requirements.txt 5 | 6 | RUN mkdir /app 7 | WORKDIR /app 8 | 9 | COPY labels.json . 10 | COPY vgg16.pt . 11 | COPY server.py . 12 | 13 | ENTRYPOINT ["python3", "server.py"] 14 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/README.md: -------------------------------------------------------------------------------- 1 | # Metrics 2 | 3 | Put `vgg16.pt` inside this directory 4 | 5 | Run 6 | 7 | ```bash 8 | docker compose up --build 9 | ``` 10 | 11 | Visit 12 | * `http://localhost:8080/metrics` - raw metrics from the app 13 | * `http://localhost:3000/` - Grafana (log in with `admin`/`admin`) to draw metrics 14 | * `http://localhost:9090` - the Prometheus UI 15 | 16 | ## Further reading 17 | 18 | * Prometheus - https://prometheus.io/ 19 | * Prometheus & Flask - https://pypi.org/project/prometheus-flask-exporter/ 20 | * Grafana - https://grafana.com/ 21 | * Telegraf - https://www.influxdata.com/time-series-platform/telegraf/ -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | build: 4 | dockerfile: Dockerfile 5 | context: .
6 | ports: 7 | - 8080:8080 8 | 9 | prom: 10 | image: prom/prometheus:v2.47.0 11 | ports: 12 | - 9090:9090 13 | volumes: 14 | - ./prom/prometheus.yml:/etc/prometheus/prometheus.yml 15 | - prometheus_data:/prometheus 16 | 17 | grafana: 18 | image: grafana/grafana:10.0.3 19 | ports: 20 | - 3000:3000 21 | volumes: 22 | - ./grafana/datasources:/etc/grafana/provisioning/datasources 23 | - ./grafana/config.ini:/etc/grafana/config.ini 24 | - grafana_data:/var/lib/grafana 25 | 26 | telegraf: 27 | image: telegraf:1.29 28 | volumes: 29 | - ./monitor/telegraf.conf:/etc/telegraf/telegraf.conf:ro 30 | ports: 31 | - 9100:9100 32 | 33 | volumes: 34 | # store prom metrics between runs 35 | prometheus_data: {} 36 | grafana_data: {} 37 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/grafana/config.ini: -------------------------------------------------------------------------------- 1 | # place to find startup config 2 | [paths] 3 | provisioning = /etc/grafana/provisioning 4 | 5 | [server] 6 | enable_gzip = true 7 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/grafana/datasources/all.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | # tells grafana where to find the prom connection 4 | datasources: 5 | - name: 'prometheus' 6 | type: 'prometheus' 7 | access: 'proxy' 8 | url: 'http://prom:9090' 9 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/monitor/telegraf.conf: -------------------------------------------------------------------------------- 1 | # Read metrics about cpu usage 2 | [[inputs.cpu]] 3 | ## Whether to report per-cpu stats or not 4 | percpu = true 5 | ## Whether to report total system cpu stats or not 6 | totalcpu = true 7 | ## If true, collect raw CPU time metrics 8 | collect_cpu_time = false 9 | ## If true, compute and report the sum of all non-idle CPU states 10 | report_active = false 11 | 12 | [[inputs.mem]] 13 | 14 | [[outputs.prometheus_client]] 15 | listen = "telegraf:9100" 16 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/prom/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | external_labels: 5 | monitor: 'app-monitor' 6 | 7 | scrape_configs: 8 | - job_name: 'prometheus' 9 | scrape_interval: 5s 10 | static_configs: 11 | - targets: [ 'prom:9090' ] 12 | 13 | - job_name: 'telegraf' 14 | scrape_interval: 5s 15 | static_configs: 16 | - targets: [ 'telegraf:9100' ] 17 | 18 | - job_name: 'app-server' 19 | scrape_interval: 5s 20 | static_configs: 21 | - targets: [ 'app:8080' ] 22 | 23 | - job_name: 'nginx' 24 | scrape_interval: 5s 25 | static_configs: 26 | - targets: [ 'nginx_metrics:9113' ] 27 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==3.1.0 2 | torch==2.1.2 3 | torchvision==0.16.2 4 | numpy==1.26.4 5 | gunicorn==23.0.0 6 | prometheus-flask-exporter==0.23.1 7 | -------------------------------------------------------------------------------- /week07_application_deployment/04_metrics/server.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | from flask import Flask, request, jsonify 5 | from prometheus_flask_exporter import PrometheusMetrics 6 | from prometheus_client import Counter 7 | 8 | 9 | app = Flask(__name__, static_url_path="") 10 | metrics = PrometheusMetrics(app) 11 | PREDICTION_COUNT = Counter("predictions_total", "Number of predictions", ["label"]) 12 | 13 | model = torch.jit.load('vgg16.pt') 14 | with open('labels.json', 'r') as f: 15 | labels_raw = json.loads(f.read()) 16 | labels = {int(index): value for index, value in enumerate(labels_raw)} 17 | 18 | 19 | @app.route("/predict", methods=['POST']) 20 | @metrics.gauge("api_in_progress", "requests in progress") 21 | @metrics.counter("api_invocations_total", "number of invocations") 22 | def predict(): 23 | data = request.get_json(force=True) 24 | features = torch.tensor(data['data']) 25 | 26 | result = model(features).data.numpy().argmax() 27 | label = labels[result] 28 | 29 | PREDICTION_COUNT.labels(label=label).inc() 30 | 31 | return jsonify({ 32 | "label": label 33 | }) 34 | 35 | 36 | if __name__ == '__main__': 37 | app.run(host='0.0.0.0', port=8080) 38 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/Dockerfile.client_api: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | COPY requirements.txt . 4 | RUN pip3 install -r requirements.txt 5 | 6 | RUN mkdir /app 7 | WORKDIR /app 8 | 9 | COPY protos/ /app/protos/ 10 | COPY *.py /app/ 11 | RUN python3 run_codegen.py 12 | 13 | ENTRYPOINT ["python3", "client-api.py"] 14 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/Dockerfile.inference_api: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | COPY requirements.txt .
4 | RUN pip3 install -r requirements.txt 5 | 6 | RUN mkdir /app 7 | WORKDIR /app 8 | 9 | COPY protos/ /app/protos/ 10 | COPY labels.json vgg16.pt /app/ 11 | COPY *.py /app/ 12 | RUN python3 run_codegen.py 13 | 14 | ENTRYPOINT ["python3", "inference-api.py"] 15 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/client-api.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import concurrent.futures 3 | 4 | import aiohttp 5 | import grpc 6 | import uvicorn 7 | from fastapi import FastAPI 8 | from pydantic import BaseModel 9 | import torchvision.transforms as transforms 10 | from PIL import Image 11 | import io 12 | from functools import reduce 13 | import operator 14 | import inference_pb2_grpc 15 | import inference_pb2 16 | 17 | executor = concurrent.futures.ThreadPoolExecutor(max_workers=5) 18 | 19 | transform_pipeline = transforms.Compose([ 20 | transforms.Resize(224), 21 | transforms.ToTensor(), 22 | transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]) 23 | ]) 24 | 25 | 26 | def prepare(img_data): 27 | image = Image.open(io.BytesIO(img_data)) 28 | img_data = transform_pipeline(image).unsqueeze(0) 29 | img_shape = list(img_data.shape) 30 | flat_shape = reduce(operator.mul, img_shape, 1) 31 | 32 | data = img_data.numpy().reshape(flat_shape).tolist() 33 | return data, img_shape 34 | 35 | 36 | class ImageRequest(BaseModel): 37 | image_url: str 38 | 39 | 40 | class LabelResponse(BaseModel): 41 | label: str 42 | 43 | 44 | app = FastAPI() 45 | 46 | # Expected request body: 47 | # { 48 | #     "image_url": "..." 49 | # } 50 | 51 | 52 | @app.post("/predict") 53 | async def predict(req: ImageRequest): 54 | async with aiohttp.ClientSession() as session: 55 | async with session.get(req.image_url) as resp: 56 | data = await resp.read() 57 | loop = asyncio.get_event_loop() 58 | image_data, shape = await loop.run_in_executor(executor, prepare, data) 59 | 60 | async with grpc.aio.insecure_channel('inference-api:50051') as channel: 61 | service = inference_pb2_grpc.ImageClassifierStub(channel) 62 | r = await service.Predict(inference_pb2.ImageClassifierInput( 63 | data=image_data, 64 | shape=shape 65 | )) 66 | 67 | label = r.label 68 | return LabelResponse(label=label) 69 | 70 | 71 | if __name__ == '__main__': 72 | uvicorn.run("client-api:app", port=80, host='0.0.0.0') 73 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | inference-api: 3 | build: 4 | dockerfile: Dockerfile.inference_api 5 | context: . 6 | ports: 7 | - 50051:50051 8 | 9 | client-api: 10 | build: 11 | dockerfile: Dockerfile.client_api 12 | context: .
13 | ports: 14 | - 8081:80 15 | 16 | image-server: 17 | image: python:3.10 18 | command: python3 -m http.server -d /data --bind 0.0.0.0 9090 19 | ports: 20 | - 9091:9090 21 | volumes: 22 | - ./imgs:/data 23 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/grpc-client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import io 4 | import operator 5 | from functools import reduce 6 | 7 | import grpc 8 | import torchvision.transforms as transforms 9 | from PIL import Image 10 | import inference_pb2_grpc 11 | import inference_pb2 12 | 13 | transform_pipeline = transforms.Compose([ 14 | transforms.Resize(224), 15 | transforms.ToTensor(), 16 | transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]) 17 | ]) 18 | 19 | 20 | def prepare(img_data): 21 | image = Image.open(img_data) 22 | img_data = transform_pipeline(image).unsqueeze(0) 23 | img_shape = list(img_data.shape) 24 | flat_shape = reduce(operator.mul, img_shape, 1) 25 | 26 | data = img_data.numpy().reshape(flat_shape).tolist() 27 | return data, img_shape 28 | 29 | 30 | def main_single(img_path, server_url): 31 | image_data, shape = prepare(img_path) 32 | 33 | with grpc.insecure_channel(server_url) as channel: 34 | service = inference_pb2_grpc.ImageClassifierStub(channel) 35 | r = service.Predict(inference_pb2.ImageClassifierInput( 36 | data=image_data, 37 | shape=shape 38 | )) 39 | print("It is {}".format(r.label)) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument("mode", help="mode to run") 45 | parser.add_argument("img", help="path to img") 46 | parser.add_argument("--url", help="url to server", default="localhost:50051") 47 | args = parser.parse_args() 48 | 49 | if args.mode == "single": 50 | main_single(args.img, args.url) 51 | else: 52 | print("Unexpected mode {}".format(args.mode)) 53 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/inference-api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from concurrent import futures 5 | 6 | import grpc 7 | import numpy as np 8 | import torch 9 | import inference_pb2 10 | import inference_pb2_grpc 11 | 12 | 13 | class InferenceClassifier(inference_pb2_grpc.ImageClassifierServicer): 14 | def __init__(self): 15 | self.model = torch.jit.load('vgg16.pt') 16 | with open('labels.json', 'r') as f: 17 | labels_raw = json.loads(f.read()) 18 | self.labels = {int(index): value for index, value in enumerate(labels_raw)} 19 | 20 | def Predict(self, request, context): 21 | shape = request.shape 22 | data = np.array(request.data).reshape(*shape) 23 | features = torch.from_numpy(data).float() 24 | result = self.model(features).data.numpy().argmax() 25 | label = self.labels[result] 26 | return inference_pb2.ImageClassifierOutput(label=label) 27 | 28 | 29 | def serve(): 30 | # to use processes - https://github.com/grpc/grpc/blob/master/examples/python/multiprocessing/server.py 31 | server = grpc.server(futures.ThreadPoolExecutor(max_workers=4)) 32 | inference_pb2_grpc.add_ImageClassifierServicer_to_server(InferenceClassifier(), server) 33 | server.add_insecure_port('[::]:50051') 34 | server.start() 35 | server.wait_for_termination() 36 | 37 | 38 | if __name__ == '__main__': 39 | logging.basicConfig() 40 | print("start 
serving...") 41 | serve() 42 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/protos/inference.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | package inference; 3 | 4 | service ImageClassifier { 5 | rpc Predict(ImageClassifierInput) returns (ImageClassifierOutput); 6 | } 7 | 8 | message ImageClassifierInput { 9 | repeated int32 shape = 2; 10 | repeated double data = 1; 11 | } 12 | 13 | message ImageClassifierOutput { 14 | string label = 1; 15 | } 16 | -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.2 2 | torchvision==0.16.2 3 | numpy==1.26.4 4 | grpcio-tools 5 | grpcio 6 | fastapi 7 | uvicorn[standard] 8 | grequests 9 | aiohttp 10 | furl -------------------------------------------------------------------------------- /week07_application_deployment/05_microservices/run_codegen.py: -------------------------------------------------------------------------------- 1 | from grpc_tools import protoc 2 | 3 | protoc.main(( 4 | '', 5 | '-Iprotos', 6 | '--python_out=.', 7 | '--grpc_python_out=.', 8 | '--pyi_out=.', 9 | 'protos/inference.proto', 10 | )) 11 | -------------------------------------------------------------------------------- /week07_application_deployment/README.md: -------------------------------------------------------------------------------- 1 | # Week 7: Application deployment 2 | 3 | Designed by Alex Kosmos ❤️❤️❤️️ 4 | 5 | * Lecture/practice: TBA. See README files inside numbered directories for more info about how to run stands. 6 | * Homework: see the [homework](./homework) folder 7 | 8 | Run `train_model.py` to generate `vgg16.pt`. 9 | This will not run any training, just trace a pretrained model from torchvision. 10 | 11 | Use `client.py` in order to send requests for services 1-4. 12 | Use `client-url.py` in order to send requests for service 5. 13 | 14 | ## Further reading 15 | See links in README files inside the practice session folders. 
16 | -------------------------------------------------------------------------------- /week07_application_deployment/client-url.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import requests 4 | from furl import furl 5 | 6 | 7 | def main_single(img_path, server_url): 8 | if not img_path.startswith(('http://', 'https://')):  # anything that is not a full URL is treated as an image name on image-server 9 | img_path = str(furl("http://image-server:9090") / img_path)  # the image server listens on 9090 inside the compose network; 9091 is only published on the host 10 | 11 | predict_url = str(furl(server_url) / "predict") 12 | r = requests.post(predict_url, json={'image_url': img_path}) 13 | print("It is {}".format(r.json()['label'])) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("img", help="image name on image-server, or a full URL") 19 | parser.add_argument("--url", help="url to server", default="http://localhost:8081") 20 | args = parser.parse_args() 21 | 22 | main_single(args.img, args.url) 23 | -------------------------------------------------------------------------------- /week07_application_deployment/client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import concurrent.futures 5 | import random 6 | from collections import Counter 7 | 8 | import requests 9 | from furl import furl 10 | import torchvision.transforms as transforms 11 | from PIL import Image 12 | 13 | transform_pipeline = transforms.Compose([ 14 | transforms.Resize(224), 15 | transforms.ToTensor(), 16 | transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]) 17 | ]) 18 | 19 | 20 | def prepare(img_path): 21 | image = Image.open(img_path) 22 | img_data = transform_pipeline(image).unsqueeze(0) 23 | data = img_data.tolist() 24 | return data 25 | 26 | 27 | def main_single(img_path, server_url): 28 | predict_url = str(furl(server_url) / "predict") 29 | print("Sending POST {}".format(predict_url)) 30 | data = prepare(img_path) 31 | r = requests.post(predict_url, json={'data': data}) 32 | print("It is {}".format(r.json()['label'])) 33 | 34 | 35 | def main_stress(folder, server_url, threads): 36 | predict_url = str(furl(server_url) / "predict") 37 | 38 | do_request = lambda x: requests.post(predict_url, json={'data': x}).json() 39 | imgs = os.listdir(folder) 40 | 41 | batch = 100 42 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 43 | while True: 44 | print("preparing") 45 | imgsd = [prepare(str(furl(folder)/random.choice(imgs))) for _ in range(batch)] 46 | print("start batch") 47 | futures = [executor.submit(do_request, img_data) for img_data in imgsd] 48 | 49 | labels = Counter() 50 | for future in concurrent.futures.as_completed(futures): 51 | labels[future.result()['label']] += 1 52 | print("Current batch: {}".format(labels)) 53 | 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("mode", help="mode to run") 58 | parser.add_argument("img", help="path to img (or to a folder of imgs in stress mode)") 59 | parser.add_argument("--url", help="url to server", default="http://localhost:8080") 60 | parser.add_argument("--threads", help="number of threads", default="2") 61 | args = parser.parse_args() 62 | 63 | if args.mode == "single": 64 | main_single(args.img, args.url) 65 | elif args.mode == "stress": 66 | main_stress(args.img, args.url, int(args.threads)) 67 | else: 68 | print("Unexpected mode {}".format(args.mode)) 69 | -------------------------------------------------------------------------------- /week07_application_deployment/dataset/10.jpeg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/10.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/dataset/14.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/14.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/dataset/16.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/16.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/dataset/17.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/17.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/dataset/5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/5.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/dataset/6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/6.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/dataset/9.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week07_application_deployment/dataset/9.jpeg -------------------------------------------------------------------------------- /week07_application_deployment/homework/README.md: -------------------------------------------------------------------------------- 1 | # Service deployment 2 | 3 | Your task is to create an Instance Segmentation (more specifically, Instance Detection) service. 4 | You can use **any pretrained model** (such as [these](https://pytorch.org/vision/stable/models.html#instance-segmentation), for example). 5 | Instance segmentation metrics will not be counted in your final grade, but mAP should be above 0.5 in order to pass the tests. 6 | 7 | Note that object names are the same as in the [COCO 2017 dataset](https://cocodataset.org/#download). 8 | You can retrieve them from the [FasterRCNN_ResNet50_FPN_V2](https://pytorch.org/vision/stable/models.html#instance-segmentation) model: 9 | ```python 10 | from torchvision.models.detection import FasterRCNN_ResNet50_FPN_V2_Weights 11 | 12 | print(FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1.meta['categories']) 13 | ``` 14 | 15 | **[4 points] HTTP endpoint:** 16 | Implement a service that handles `POST /predict` requests on port `8080`.
17 | Request data is a JSON with the following structure: 18 | ```json 19 | { 20 | "url": "" 21 | } 22 | ``` 23 | 24 | Response data is also a JSON with the following structure: 25 | ```json 26 | { 27 | "objects": [ 28 | "", 29 | "", 30 | ... 31 | ] 32 | } 33 | ``` 34 | 35 | Example: 36 | ```bash 37 | curl -XPOST http://localhost:8080/predict -H "Content-Type: application/json" -d '{"url": "http://images.cocodataset.org/val2017/000000001268.jpg"}' 38 | ... 39 | { 40 | "objects": [ 41 | "bird", 42 | "boat", 43 | "boat", 44 | "person", 45 | "person", 46 | "person", 47 | "person", 48 | "cell phone", 49 | "backpack", 50 | "handbag", 51 | "boat" 52 | ] 53 | } 54 | ``` 55 | 56 | **[3 points] Metric endpoint:** 57 | Implement a service which is able to serve its metrics in the [Prometheus format](https://prometheus.io/docs/concepts/data_model/) via `/metrics` on port `8080`. 58 | The most important metric for us is `app_http_inference_count`, the number of HTTP endpoint invocations. 59 | 60 | Example: 61 | ```bash 62 | curl http://localhost:8080/metrics 63 | ... 64 | # HELP app_http_inference_count_total Multiprocess metric 65 | # TYPE app_http_inference_count_total counter 66 | app_http_inference_count_total 12.0 67 | ``` 68 | 69 | **[3 points] gRPC endpoint:** 70 | Implement a separate [gRPC](https://grpc.io/) service on port `9090`. 71 | See the `inference.proto` file in the `proto` directory. 72 | The contract is the same as for the HTTP endpoint. A minimal client sketch for smoke-testing it is shown after the submission steps below. 73 | 74 | ### How to submit? 75 | 76 | * Create a **private** GitHub repository 77 | * Install [greater-solution-extractor-59](https://github.com/apps/greater-solution-extractor-59) to your account and authorize it to your newly created repo. 78 | * Put `Dockerfile` in the root of the repository. This Dockerfile should assemble all code in your repo **and the model checkpoint** into the working service. 79 | * Go to the [Jenkins site](https://jenkins.edl07.mom) and log in with `student`/`Student228!`. 80 | * Build `week07-pipeline` with your repo as a parameter. 81 | * After a successful build, click `Keep this build forever`. 82 | * The link to your successful (or maybe only partially successful) build is the solution for this homework; send it via anytask/LMS. 83 | * Also, add the link to your GitHub repo and the commit hash that corresponds to your submission.
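Before submitting, you can smoke-test your gRPC endpoint with a short client along these lines (a sketch, not part of the graded tests: it assumes you have generated `inference_pb2.py`/`inference_pb2_grpc.py` from `proto/inference.proto`, just as `tests.py` expects, and that your service is listening on `localhost:9090`):

```python
# Hypothetical smoke test; tests.py remains the source of truth for grading.
# Generate the stubs first, e.g.:
#   python -m grpc_tools.protoc -Iproto --python_out=. --grpc_python_out=. proto/inference.proto
import grpc

import inference_pb2
import inference_pb2_grpc

with grpc.insecure_channel("localhost:9090") as channel:
    stub = inference_pb2_grpc.InstanceDetectorStub(channel)
    response = stub.Predict(
        inference_pb2.InstanceDetectorInput(
            url="http://images.cocodataset.org/val2017/000000001268.jpg"
        )
    )
    print(list(response.objects))  # expect labels like ["bird", "boat", ...]
```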
84 | 85 | ### Notes 86 | 87 | * The pretrained PyTorch model `maskrcnn_resnet50_fpn` with `score_threshold = 0.75` works fine for this task 88 | * You are not limited to running only one process inside Docker - use `supervisord` to run as many processes as you want 89 | * You can find the actual tests in `tests.py` 90 | * To sanity-check your model, the tests check that mAP is greater than 0.5; see the tests for details -------------------------------------------------------------------------------- /week07_application_deployment/homework/proto/inference.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | package inference; 3 | 4 | service InstanceDetector { 5 | rpc Predict(InstanceDetectorInput) returns (InstanceDetectorOutput); 6 | } 7 | 8 | message InstanceDetectorInput { 9 | string url = 1; 10 | } 11 | 12 | message InstanceDetectorOutput { 13 | repeated string objects = 1; 14 | } 15 | -------------------------------------------------------------------------------- /week07_application_deployment/homework/tests.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import statistics 4 | 5 | import grpc 6 | import requests 7 | from furl import furl 8 | import pytest 9 | 10 | import inference_pb2_grpc 11 | import inference_pb2 12 | from prometheus_client.parser import text_string_to_metric_families 13 | 14 | 15 | @pytest.fixture(scope='session') 16 | def eval_data(): 17 | with open('eval.json', 'r') as f: 18 | return json.loads(f.read()) 19 | 20 | 21 | @pytest.fixture(scope='session') 22 | def server_ip(): 23 | server_ip_value = os.environ.get('DOCKER_IP')  # .get() returns None for a missing variable instead of raising KeyError 24 | if server_ip_value is None: 25 | pytest.fail("No cluster IP was provided") 26 | return server_ip_value 27 | 28 | 29 | @pytest.fixture(scope='session') 30 | def http_host(server_ip): 31 | return "http://{}:8080/".format(server_ip) 32 | 33 | 34 | @pytest.fixture(scope="session") 35 | def grpc_host(server_ip): 36 | return "{}:9090".format(server_ip) 37 | 38 | 39 | def get_metric_value(samples): 40 | if len(samples) == 0: 41 | return 0 42 | return samples[0].value 43 | 44 | 45 | def parse_prom(metrics_data): 46 | return { 47 | m.name: get_metric_value(m.samples) 48 | for m in text_string_to_metric_families(metrics_data) 49 | } 50 | 51 | 52 | def get_image_link(image_name): 53 | return "http://images.cocodataset.org/val2017/{}".format(image_name) 54 | 55 | 56 | def calc_score(actual, predicted): 57 | actual_copy = [x for x in actual] 58 | score = 0 59 | for label in predicted: 60 | if label in actual_copy: 61 | score += 1 62 | actual_copy.remove(label) 63 | return 2 * score / (len(actual) + len(predicted)) 64 | 65 | 66 | @pytest.mark.run(order=1) 67 | def test_http_endpoint(http_host, eval_data, capsys): 68 | with capsys.disabled(): 69 | predict_url = str(furl(http_host) / "predict") 70 | scores = [] 71 | for img_name, labels in eval_data.items(): 72 | print("Processing {}".format(img_name)) 73 | img_url = get_image_link(img_name) 74 | r = requests.post(predict_url, json={"url": img_url}) 75 | predicted_labels = r.json()['objects'] 76 | scores.append(calc_score(labels, predicted_labels)) 77 | 78 | mean_score = statistics.mean(scores) 79 | assert mean_score > 0.5 80 | 81 | 82 | @pytest.mark.run(order=2) 83 | def test_grpc_endpoint(grpc_host, eval_data, capsys): 84 | with capsys.disabled(): 85 | with grpc.insecure_channel(grpc_host) as channel: 86 | scores = [] 87 | for img_name, labels in eval_data.items(): 88 | 
print("Processing {}".format(img_name)) 89 | img_url = get_image_link(img_name) 90 | service = inference_pb2_grpc.InstanceDetectorStub(channel) 91 | r = service.Predict(inference_pb2.InstanceDetectorInput( 92 | url=img_url 93 | )) 94 | predicted_labels = r.objects 95 | scores.append(calc_score(labels, predicted_labels)) 96 | 97 | mean_score = statistics.mean(scores) 98 | assert mean_score > 0.5 99 | 100 | 101 | @pytest.mark.run(order=3) 102 | def test_http_metrics(http_host, eval_data): 103 | predict_url = str(furl(http_host) / "predict") 104 | metrics_url = str(furl(http_host) / "metrics") 105 | img_name, _ = next(iter(eval_data.items())) 106 | img_url = get_image_link(img_name) 107 | 108 | init = parse_prom(requests.get(metrics_url).text) 109 | print(init) 110 | init_value = int(init['app_http_inference_count']) 111 | 112 | r = requests.post(predict_url, json={'url': img_url}) 113 | assert r.status_code == 200 114 | 115 | next_ = parse_prom(requests.get(metrics_url).text) 116 | next_value = int(next_['app_http_inference_count']) 117 | 118 | assert next_value == init_value + 1 119 | -------------------------------------------------------------------------------- /week07_application_deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==3.1.0 2 | torch==2.1.2 3 | torchvision==0.16.2 4 | numpy==1.26.4 5 | furl 6 | -------------------------------------------------------------------------------- /week07_application_deployment/supervisord/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | COPY requirements.txt . 4 | RUN pip3 install -r requirements.txt 5 | 6 | RUN mkdir /app 7 | WORKDIR /app 8 | 9 | COPY labels.json . 10 | COPY vgg16.pt . 11 | COPY server.py . 12 | 13 | COPY supervisord.conf /etc/supervisord.conf 14 | 15 | ENTRYPOINT ["supervisord", "-c", "/etc/supervisord.conf"] 16 | -------------------------------------------------------------------------------- /week07_application_deployment/supervisord/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | app: 5 | build: 6 | dockerfile: Dockerfile 7 | context: . 
8 | ports: 9 | - 8080:8080 10 | 11 | prom: 12 | image: prom/prometheus:v2.27.1 13 | ports: 14 | - 9090:9090 15 | volumes: 16 | - ./prom/prometheus.yml:/etc/prometheus/prometheus.yml 17 | - prometheus_data:/prometheus 18 | 19 | grafana: 20 | image: grafana/grafana:7.5.7 21 | ports: 22 | - 3000:3000 23 | volumes: 24 | - ./grafana/datasources:/etc/grafana/provisioning/datasources 25 | - ./grafana/config.ini:/etc/grafana/config.ini 26 | - grafana_data:/var/lib/grafana 27 | 28 | telegraf: 29 | image: telegraf:1.8 30 | volumes: 31 | - ./monitor/telegraf.conf:/etc/telegraf/telegraf.conf:ro 32 | ports: 33 | - 9100:9100 34 | 35 | volumes: 36 | # store prom metrics between runs 37 | prometheus_data: {} 38 | grafana_data: {} -------------------------------------------------------------------------------- /week07_application_deployment/supervisord/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.0.2 2 | torch==1.11.0 3 | torchvision==0.12.0 4 | gunicorn==20.1.0 5 | prometheus-flask-exporter==0.18.7 6 | supervisor 7 | -------------------------------------------------------------------------------- /week07_application_deployment/supervisord/server.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | import torch 5 | from flask import Flask, request, jsonify 6 | from prometheus_flask_exporter import PrometheusMetrics 7 | from prometheus_client import Counter 8 | 9 | 10 | app = Flask(__name__, static_url_path="") 11 | metrics = PrometheusMetrics(app) 12 | PREDICTION_COUNT = Counter("predictions_total", "Number of predictions", ["label"]) 13 | 14 | model = torch.jit.load('vgg16.pt') 15 | with open('labels.json', 'r') as f: 16 | labels_raw = json.loads(f.read()) 17 | labels = {int(index): value for index, value in enumerate(labels_raw)} 18 | 19 | 20 | @app.route("/predict", methods=['POST']) 21 | @metrics.gauge("api_in_progress", "requests in progress") 22 | @metrics.counter("api_invocations_total", "number of invocations") 23 | def predict(): 24 | data = request.get_json(force=True) 25 | features = torch.tensor(data['data']) 26 | 27 | result = model(features).data.numpy().argmax() 28 | label = labels[result] 29 | 30 | PREDICTION_COUNT.labels(label=label).inc() 31 | 32 | return jsonify({ 33 | "label": label 34 | }) 35 | 36 | 37 | if __name__ == '__main__': 38 | port = int(sys.argv[1]) 39 | app.run(host='0.0.0.0', port=port) 40 | -------------------------------------------------------------------------------- /week07_application_deployment/supervisord/supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | nodaemon=true 3 | 4 | [program:app1] 5 | directory=/app 6 | command=python3 /app/server.py 9090 7 | stdout_logfile=/dev/stdout 8 | stdout_logfile_maxbytes=0 9 | redirect_stderr=true 10 | priority=1 11 | 12 | [program:app2] 13 | directory=/app 14 | command=python3 /app/server.py 8080 15 | stdout_logfile=/dev/stdout 16 | stdout_logfile_maxbytes=0 17 | redirect_stderr=true 18 | priority=2 19 | -------------------------------------------------------------------------------- /week07_application_deployment/train_model.py: -------------------------------------------------------------------------------- 1 | import torchvision.models as models 2 | import torch 3 | 4 | 5 | def main(): 6 | vgg16 = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).eval()  # weights= replaces the deprecated pretrained=True; eval() disables dropout so the trace is deterministic 7 | 8 | example = torch.rand(1, 3, 224, 224) 9 | traced_script_module = torch.jit.trace(vgg16, 
example) 10 | traced_script_module.save("vgg16.pt") 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /week08_inference_software/README.md: -------------------------------------------------------------------------------- 1 | # Week 8: LLM inference optimizations and software 2 | 3 | * Lecture: [link](./lecture.pdf) 4 | * Seminar: [link](./seminar/seminar.ipynb) 5 | 6 | ## Further reading 7 | * [What is the KV cache?](https://mett29.github.io/posts/kv-cache/) 8 | * [Overview of torch.compiler](https://pytorch.org/docs/stable/torch.compiler.html#torch-compiler-overview) 9 | * [Torch Dynamo Overview](https://pytorch.org/docs/stable/torch.compiler_dynamo_overview.html) 10 | * [Torch Dynamo Deep-Dive](https://pytorch.org/docs/stable/torch.compiler_dynamo_deepdive.html) 11 | * [Torch Compiler Troubleshooting](https://github.com/pytorch/pytorch/blob/main/docs/source/torch.compiler_troubleshooting.rst) 12 | * [Deep Dive into Triton Internals (3 Parts)](https://www.kapilsharma.dev/posts/deep-dive-into-triton-internals/) 13 | * [HF Ultra-Scale Playbook (fused kernels section)](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=fused_kernels) 14 | * [Liger Kernels repo](https://github.com/linkedin/Liger-Kernel/tree/main/src/liger_kernel/ops) 15 | * [Liger Kernels paper](https://arxiv.org/pdf/2410.10989) 16 | * [FlashAttention](https://arxiv.org/pdf/2205.14135) 17 | * [FlashAttention-2](https://arxiv.org/pdf/2307.08691) 18 | * [FlashAttention-3](https://arxiv.org/pdf/2407.08608) 19 | * [Flex Attention Tutorial](https://pytorch.org/blog/flexattention/) 20 | -------------------------------------------------------------------------------- /week08_inference_software/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week08_inference_software/lecture.pdf -------------------------------------------------------------------------------- /week08_inference_software/seminar/images/fused_kernels1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week08_inference_software/seminar/images/fused_kernels1.png -------------------------------------------------------------------------------- /week08_inference_software/seminar/images/fused_kernels2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week08_inference_software/seminar/images/fused_kernels2.png -------------------------------------------------------------------------------- /week08_inference_software/seminar/images/prefixLM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week08_inference_software/seminar/images/prefixLM.png -------------------------------------------------------------------------------- /week08_inference_software/seminar/images/rowcolumnarrays.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week08_inference_software/seminar/images/rowcolumnarrays.webp 
-------------------------------------------------------------------------------- /week09_inference_algorithms/README.md: -------------------------------------------------------------------------------- 1 | # Week 9: Efficient model inference 2 | 3 | * Lecture: [link](./lecture.pdf) 4 | * Seminar: [link](./seminar.ipynb) 5 | * Homework: see the [homework](./homework) folder 6 | 7 | ## Further reading 8 | * [GPU MODE Lecture 14: Practitioners Guide to Triton](https://christianjmills.com/posts/cuda-mode-notes/lecture-014/#auto-tuning) 9 | * [Flash-Decoding for long-context inference](https://pytorch.org/blog/flash-decoding/) 10 | * [Deep Dive on the Hopper TMA Unit for FP8 GEMMs](https://pytorch.org/blog/hopper-tma-unit/) 11 | * [Persistent Matmul](https://triton-lang.org/main/getting-started/tutorials/09-persistent-matmul.html) 12 | * [Matrix Multiplication Background User's Guide](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html) 13 | * [Deep Dive on CUTLASS Ping-Pong GEMM Kernel](https://pytorch.org/blog/cutlass-ping-pong-gemm-kernel/) 14 | * [Accelerating 2D Dynamic Block Quantized Float8 GEMMs in Triton](https://pytorch.org/blog/accelerating-gemms-triton/) 15 | * [SmoothQuant paper](https://arxiv.org/abs/2211.10438) 16 | * [SmoothQuant repo](https://github.com/mit-han-lab/smoothquant) -------------------------------------------------------------------------------- /week09_inference_algorithms/homework/README.md: -------------------------------------------------------------------------------- 1 | # Homework 2 | 3 | This week's homework is devoted to weight and activation quantization and speculative decoding. 4 | The details are described in the [notebook](./hw-w8a8-specdec.ipynb). 5 | 6 | ## W8A8 quantization (6 points + 4 bonus points) 7 | 8 | Here you will implement and benchmark kernels for quantization and matrix multiplication in `int8`. 9 | As a bonus, you can try to implement the SmoothQuant algorithm and earn Roman Gorb's personal respect. 10 | 11 | ## Speculative decoding (4 points) 12 | 13 | You will implement the simplest version of speculative decoding, measure the speedup, and also try the `huggingface` implementation. 14 | -------------------------------------------------------------------------------- /week09_inference_algorithms/lecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mryab/efficient-dl-systems/990f0ef24c049e0782f28944888dfe23a0e198fd/week09_inference_algorithms/lecture.pdf --------------------------------------------------------------------------------