├── .gitignore
├── LICENSE-CC-BY-SA
├── Makefile
├── README.md
├── build
    ├── README.md
    ├── linkcheckerrc
    ├── mdbook
    │   ├── md-to-html.py
    │   ├── mv-links.py
    │   └── utils
    │   │   ├── build_utils.py
    │   │   └── github_md_utils.py
    ├── prince_style.css
    └── requirements.txt
├── chapters-md.txt
├── compute
    ├── README.md
    ├── accelerator
    │   ├── README.md
    │   ├── amd
    │   │   ├── debug.md
    │   │   └── performance.md
    │   ├── benchmarks
    │   │   ├── README.md
    │   │   └── mamf-finder.py
    │   ├── images
    │   │   ├── 8x-H100-node-Dell-PowerEdge-XE9680.png
    │   │   ├── maf-nvidia-amd-efficiency.png
    │   │   ├── nvidia-a100-matmul-tflops.png
    │   │   └── nvidia-a100-spec.png
    │   └── nvidia
    │   │   ├── debug.md
    │   │   └── images
    │   │       └── dcgm-metrics.png
    ├── cpu-memory
    │   └── README.md
    └── cpu
    │   └── README.md
├── contributors.md
├── debug
    ├── NicerTrace.py
    ├── README.md
    ├── images
    │   └── math-fp-discrepancy-outcome-lizard.png
    ├── make-tiny-models-tokenizers-datasets.md
    ├── nccl-performance-debug.md
    ├── pytorch.md
    ├── tiny-scripts
    │   ├── README.md
    │   ├── c4-en-10k.py
    │   ├── cm4-synthetic-testing.py
    │   ├── fsmt-make-super-tiny-model.py
    │   ├── general-pmd-ds-unpack.py
    │   ├── general-pmd-synthetic-testing.py
    │   ├── idefics-make-tiny-model.py
    │   ├── m4-ds-unpack.py
    │   ├── mt5-make-tiny-model.py
    │   ├── openwebtext-10k.py
    │   └── oscar-en-10k.py
    ├── tools.md
    ├── torch-distributed-gpu-test.py
    ├── torch-distributed-hanging-solutions.md
    ├── underflow_overflow.md
    └── underflow_overflow.py
├── images
    └── Machine-Learning-Engineering-book-cover.png
├── inference
    ├── README.md
    └── images
    │   ├── github-vllm-stats-2024-08-24.png
    │   ├── infer-kv-cache.png
    │   ├── mha-gqa-mqa-mla.png
    │   └── softmax-temperature.png
├── insights
    ├── ai-battlefield.md
    ├── how-to-choose-cloud-provider.md
    └── images
    │   └── 640px-Baureihe52Heizer.jpg
├── model-parallelism
    └── README.md
├── network
    ├── README.md
    ├── benchmarks
    │   ├── README.md
    │   ├── all_gather_object_vs_all_gather.py
    │   ├── all_gather_object_vs_all_reduce.py
    │   ├── all_reduce_bench.py
    │   ├── all_reduce_bench_pyxis.sbatch
    │   ├── all_reduce_latency_comp.py
    │   ├── images
    │   │   ├── all-reduce-bench-plot-4n.png
    │   │   └── all-reduce-multi-node-bandwidth.png
    │   └── results
    │   │   ├── README.md
    │   │   └── disable-nvlink.md
    ├── comms.md
    ├── debug
    │   └── README.md
    └── images
    │   ├── all-reduce-bw-2025.png
    │   ├── all-reduce-collective.png
    │   ├── all-reduce-ring-chunk1.png
    │   ├── all-reduce-ring-chunk2.png
    │   ├── all-to-all-bw-2025.png
    │   ├── amd-infinity-arch-MI300X.png
    │   ├── broadcast-ring.png
    │   ├── ccgrid11-low-level-latency.png
    │   ├── ccgrid11-uni-direction-bandwidth.png
    │   ├── collective-all-gather-1.png
    │   ├── collective-all-gather-2.png
    │   ├── collective-all-reduce-1.png
    │   ├── collective-all-reduce-2.png
    │   ├── collective-all-to-all-1.png
    │   ├── collective-all-to-all.png
    │   ├── collective-broadcast-1.png
    │   ├── collective-broadcast-2.png
    │   ├── collective-gather-1.png
    │   ├── collective-gather-2.png
    │   ├── collective-reduce-1.png
    │   ├── collective-reduce-2.png
    │   ├── collective-reduce-scatter.png
    │   ├── collective-scatter-1.png
    │   ├── collective-scatter-2.png
    │   ├── nccl-all-reduce-scan-nvlstree.png
    │   └── nccl-all-reduce-scan.png
├── orchestration
    ├── README.md
    └── slurm
    │   ├── README.md
    │   ├── admin.md
    │   ├── cron-daily.slurm
    │   ├── cron-hourly.slurm
    │   ├── example.slurm
    │   ├── launchers
    │       ├── README.md
    │       ├── accelerate-launcher.slurm
    │       ├── lightning-launcher.slurm
    │       ├── srun-launcher.slurm
    │       └── torchrun-launcher.slurm
    │   ├── performance.md
    │   ├── undrain-good-nodes.sh
    │   └── users.md
├── resources
    └── README.md
├── stabs
    ├── README.md
    └── incoming.md
├── storage
    ├── README.md
    ├── benchmarks
    │   └── results
    │   │   └── hope-2023-12-20-14-37-02-331702-summary.md
    ├── fio-json-extract.py
    └── fio-scan
├── testing
    ├── README.md
    └── testing_utils.py
├── todo.md
└── training
    ├── README.md
    ├── checkpoints
        ├── README.md
        ├── torch-checkpoint-convert-to-bf16
        └── torch-checkpoint-shrink.py
    ├── datasets.md
    ├── dtype.md
    ├── emulate-multi-node.md
    ├── fault-tolerance
        ├── README.md
        ├── fs-watchdog.py
        ├── fs-watchdog.slurm
        ├── slurm-status.py
        └── slurm-status.slurm
    ├── hparams.md
    ├── images
        ├── fp16-bf16-fp8.png
        ├── fp32-tf32-fp16-bf16.png
        └── mixed-precision-fp16.png
    ├── instabilities
        ├── README.md
        ├── images
        │   ├── bloom-176B-success.png
        │   ├── idefics-80b-tr-190-01-image2text.png
        │   ├── idefics-80b-tr-190-01-losses-2023-06-04.png
        │   ├── idefics-80b-tr-190-01-spike-2023-05-27.png
        │   ├── idefics-80b-tr-190-01-spike-recover-2023-05-30.png
        │   ├── llama-7b-grokking-no-zoom.png
        │   ├── llama-7b-grokking.png
        │   ├── pre-bloom-104B-en-fail.png
        │   ├── pre-bloom-tr1-13B-glitch-1-2.png
        │   ├── pre-bloom-tr8-104B-glitch-1.png
        │   ├── pre-bloom-tr8-104B-glitch-5.png
        │   ├── pre-bloom-tr8-104B-glitch-7-10.png
        │   ├── ptl-repeat-data-p1.png
        │   ├── ptl-repeat-data-p2.png
        │   └── ptl-repeat-data-p3.png
        └── training-loss-patterns.md
    ├── model-parallelism
        ├── README.md
        └── images
        │   ├── all-reduce-reduce-scatter-all-gather.png
        │   ├── deepspeed-ulysses-math.png
        │   ├── deepspeed-ulysses.png
        │   ├── dist-flash-attn.png
        │   ├── parallelism-deepspeed-3d.png
        │   ├── parallelism-flexflow.jpeg
        │   ├── parallelism-gpipe-bubble.png
        │   ├── parallelism-pp-dualpipe.png
        │   ├── parallelism-sagemaker-interleaved-pipeline.png
        │   ├── parallelism-tp-independent-gelu.png
        │   ├── parallelism-tp-parallel_gemm.png
        │   ├── parallelism-tp-parallel_self_attention.png
        │   ├── parallelism-tp-parallel_shard_processing.png
        │   ├── parallelism-zero-dp-pp.png
        │   └── parallelism-zero.png
    ├── performance
        ├── README.md
        ├── benchmarks
        │   ├── activation-memory-per-layer.py
        │   ├── dataloader
        │   │   ├── num-workers-bench.py
        │   │   └── pin-memory-non-block-bench.py
        │   ├── matrix-shape
        │   │   └── swiglu-maf-bench.py
        │   └── numa
        │   │   ├── numa-set-pynvml.py
        │   │   └── numa-set.sh
        ├── distributed
        │   └── torch-dist-mem-usage.py
        └── images
        │   ├── a100-server-hwloc.png
        │   ├── attention-less-heads.png
        │   ├── flash-attention.png
        │   ├── tiling.png
        │   └── wave-quant.png
    ├── re-train-hub-models.md
    ├── reproducibility
        └── README.md
    └── tools
        ├── main_process_first.py
        ├── multi-gpu-non-interleaved-print.py
        └── printflock.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # HTML build
  2 | *.html
  3 | chapters-html.txt
  4 | 
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | pip-wheel-metadata/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # usage: make help
 2 | 
 3 | .PHONY: help spell html pdf checklinks clean
 4 | .DEFAULT_GOAL := help
 5 | 
 6 | help: ## this help
 7 | 	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
 8 | 
 9 | # pip install codespell
10 | spell: ## spellcheck
11 | 	@codespell --write-changes --skip "*.pdf" --skip "*.json"
12 | 
13 | prep-html-files: ## prepare html-files
14 | 	echo book-front.html > chapters-html.txt
15 | 	perl -ne 's|\.md|.html|; print' chapters-md.txt >> chapters-html.txt
16 | 
17 | html: prep-html-files ## make html version w/ scripts linking to their url at my github repo
18 | 	python build/mdbook/md-to-html.py
19 | 
20 | html-local: prep-html-files ## make html version w/ scripts remaining local
21 | 	python build/mdbook/md-to-html.py --local
22 | 
23 | pdf: html ## make pdf version (from html files)
24 | 	prince --no-author-style -s build/prince_style.css --pdf-title="Stas Bekman - Machine Learning Engineering ($$(date))" -o "Stas Bekman - Machine Learning Engineering.pdf" $$(cat chapters-html.txt | tr "\n" " ")
25 | 
26 | pdf-upload: pdf ## upload pdf to the hub
27 | 	cp "Stas Bekman - Machine Learning Engineering.pdf" ml-engineering-book/
28 | 	cd ml-engineering-book/ && git commit -m "new version" "Stas Bekman - Machine Learning Engineering.pdf" && git push
29 | 
30 | check-links-local: html-local ## check local links
31 | 	linkchecker --config build/linkcheckerrc $$(cat chapters-html.txt | tr "\n" " ") | tee linkchecker-local.txt
32 | 
33 | check-links-all: html ## check all links including external ones
34 | 	linkchecker --config build/linkcheckerrc $$(cat chapters-html.txt | tr "\n" " ") --check-extern --user-agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0" | tee linkchecker-all.txt
35 | 
36 | clean: ## remove build files
37 | 	find . -name "*html" -exec rm {} \;
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Machine Learning Engineering Open Book
  2 | 
  3 | This is an open collection of methodologies, tools and step-by-step instructions to help with successful training and fine-tuning of large language models and multi-modal models, and with their inference.
  4 | 
  5 | This is technical material suitable for LLM/VLM training engineers and operators. That is, the content here contains lots of scripts and copy-and-paste commands to enable you to quickly address your needs.
  6 | 
  7 | This repo is an ongoing brain dump of my experiences training Large Language Models (LLMs) and VLMs; much of the know-how I acquired while training the open-source [BLOOM-176B](https://huggingface.co/bigscience/bloom) model in 2022 and the [IDEFICS-80B](https://huggingface.co/HuggingFaceM4/idefics-80b-instruct) multi-modal model in 2023, and RAG models at [Contextual.AI](https://contextual.ai/) in 2024.
  8 | 
  9 | I've been compiling this information mostly for myself so that I could quickly find solutions I have already researched in the past and which have worked, but as usual I'm happy to share these notes with the wider ML community.
 10 | 
 11 | 
 12 | ## Table of Contents
 13 | 
 14 | 
 15 | **Part 1. Insights**
 16 | 
 17 | 1. **[The AI Battlefield Engineering](./insights/ai-battlefield.md)** - what you need to know in order to succeed.
 18 | 
 19 | 1. **[How to Choose a Cloud Provider](./insights/how-to-choose-cloud-provider.md)** - these questions will empower you to have a successful compute cloud experience.
 20 | 
 21 | **Part 2. Hardware**
 22 | 
 23 | 1. **[Compute](compute)** - accelerators, CPUs, CPU memory.
 24 | 
 25 | 1. **[Storage](storage)** - local, distributed and shared file systems.
 26 | 
 27 | 1. **[Network](network)** - intra- and inter-node networking.
 28 | 
 29 | 
 30 | **Part 3. Orchestration**
 31 | 
 32 | 1. **[Orchestration Systems](orchestration)** - managing containers and resources
 33 | 1. **[SLURM](orchestration/slurm)** - Simple Linux Utility for Resource Management
 34 | 
 35 | 
 36 | **Part 4. Training**
 37 | 
 38 | 1. **[Training](training)** - model training-related guides
 39 | 
 40 | 
 41 | **Part 5. Inference**
 42 | 
 43 | 1. **[Inference](inference)** - model inference insights
 44 | 
 45 | 
 46 | **Part 6. Development**
 47 | 
 48 | 1. **[Debugging and Troubleshooting](debug)** - how to debug easy and difficult issues
 49 | 
 50 | 1. **[And more debugging](https://github.com/stas00/the-art-of-debugging)**
 51 | 
 52 | 1. **[Testing](testing)** - numerous tips and tools to make test writing enjoyable
 53 | 
 54 | 
 55 | **Part 7. Miscellaneous**
 56 | 
 57 | 1. **[Resources](resources)** - LLM/VLM chronicles
 58 | 
 59 | 
 60 | ## Updates
 61 | 
 62 | I announce any significant updates on my twitter channel [https://twitter.com/StasBekman](https://twitter.com/StasBekman).
 63 | 
 64 | ## PDF version
 65 | 
 66 | Download the [PDF](https://huggingface.co/stas/ml-engineering-book/resolve/main/Stas%20Bekman%20-%20Machine%20Learning%20Engineering.pdf?download=true) version of the book.
 67 | 
 68 | I will try to rebuild it once every few weeks or so, but if you want the latest pdf, the instructions for building it are [here](build).
 69 | 
 70 | Thanks to HuggingFace for giving me permission to host my book's PDF at the [HF hub](https://huggingface.co/stas/ml-engineering-book).
 71 | 
 72 | ## Discussions
 73 | 
 74 | If you want to discuss something related to ML engineering, this repo has the [community discussions](https://github.com/stas00/ml-engineering/discussions) available - so please don't hesitate to share your experience or start a new discussion about something you're passionate about.
 75 | 
 76 | ## Key comparison tables
 77 | 
 78 | High end accelerators:
 79 | 
 80 | - [Theoretical accelerator TFLOPS](compute/accelerator#tflops-comparison-table)
 81 | - [Accelerator memory size and speed](compute/accelerator#accelerator-memory-size-and-speed)
 82 | 
 83 | Networks:
 84 | 
 85 | - [Theoretical inter-node speed](network#inter-node-networking)
 86 | - [Theoretical intra-node speed](network#intra-node-networking)
 87 | 
 88 | ## Shortcuts
 89 | 
 90 | Things that you are likely to need to find quickly and often.
 91 | 
 92 | Tools:
 93 | 
 94 | - [all_reduce_bench.py](network/benchmarks/all_reduce_bench.py) - a much easier way to benchmark network throughput than nccl-tests.
 95 | - [torch-distributed-gpu-test.py](debug/torch-distributed-gpu-test.py) - a tool to quickly test your inter-node connectivity
 96 | - [mamf-finder.py](compute/accelerator/benchmarks/mamf-finder.py) - measures the actual achievable TFLOPS of your accelerator.
 97 | 
 98 | Guides:
 99 | 
100 | - [debugging pytorch applications](debug/pytorch.md) - quick copy-n-paste solutions to resolve hanging or breaking pytorch applications
101 | - [slurm for users](orchestration/slurm/users.md) - a slurm cheatsheet and tricks
102 | - [make tiny models/datasets/tokenizers](debug/make-tiny-models-tokenizers-datasets.md)
103 | - [LLM/VLM chronicles collection](resources#publicly-available-training-llmvlm-logbooks)
104 | 
105 | 
106 | ## Gratitude
107 | 
108 | None of this would have been possible without me being entrusted with the specific LLM/VLM training projects from which I learned the initial know-how. This is a privilege that only a few enjoy due to the prohibitively expensive cost of renting huge ML compute clusters. So hopefully the rest of the ML community will vicariously learn from these notes.
109 | 
110 | Special thanks go to [Thom Wolf](https://github.com/thomwolf) who proposed that I lead the BLOOM-176B training back when I didn't know anything about large scale training. This was the project that catapulted me into the intense learning process. And, of course, HuggingFace for giving me the opportunity to work full time on BLOOM-176B and later on IDEFICS-80B trainings.
111 | 
112 | Recently, I continued expanding my knowledge and experience while training models and building scalable training/inference systems at [Contextual.AI](https://contextual.ai/) and I'm grateful for that opportunity to Aman and Douwe.
113 | 
114 | I'd also like to thank the numerous [contributors](contributors.md) who have been making this text awesome and error-free.
115 | 
116 | ## Contributing
117 | 
118 | If you find a bug or a typo, or would like to propose an improvement, please don't hesitate to open an [Issue](https://github.com/stas00/ml-engineering/issues) or contribute a PR.
119 | 
120 | 
121 | ## License
122 | 
123 | The content of this site is distributed under [Attribution-ShareAlike 4.0 International](LICENSE-CC-BY-SA).
124 | 
125 | 
126 | ## Citation
127 | 
128 | ```bibtex
129 | @misc{bekman2024mlengineering,
130 |   author = {Bekman, Stas},
131 |   title = {Machine Learning Engineering Open Book},
132 |   year = {2023-2024},
133 |   publisher = {Stasosphere Online Inc.},
134 |   journal = {GitHub repository},
135 |   url = {https://github.com/stas00/ml-engineering}
136 | }
137 | ```
138 | 
139 | ## My repositories map
140 | 
141 | ✔ **Machine Learning:**
142 |  [ML Engineering Open Book](https://github.com/stas00/ml-engineering) |
143 |  [ML ways](https://github.com/stas00/ml-ways) |
144 |  [Porting](https://github.com/stas00/porting)
145 | 
146 | ✔ **Guides:**
147 |  [The Art of Debugging](https://github.com/stas00/the-art-of-debugging)
148 | 
149 | ✔ **Applications:**
150 |  [ipyexperiments](https://github.com/stas00/ipyexperiments)
151 | 
152 | ✔ **Tools and Cheatsheets:**
153 |  [bash](https://github.com/stas00/bash-tools) |
154 |  [conda](https://github.com/stas00/conda-tools) |
155 |  [git](https://github.com/stas00/git-tools) |
156 |  [jupyter-notebook](https://github.com/stas00/jupyter-notebook-tools) |
157 |  [make](https://github.com/stas00/make-tools) |
158 |  [python](https://github.com/stas00/python-tools) |
159 |  [tensorboard](https://github.com/stas00/tensorboard-tools) |
160 |  [unix](https://github.com/stas00/unix-tools)
161 | 


--------------------------------------------------------------------------------
/build/README.md:
--------------------------------------------------------------------------------
 1 | # Book Building
 2 | 
 3 | Important: this is still a WIP - it mostly works, but stylesheets need some work to make the pdf really nice. Should be complete in a few weeks.
 4 | 
 5 | This document assumes you're working from the root of the repo.
 6 | 
 7 | ## Installation requirements
 8 | 
 9 | 1. Install python packages used during book build
10 | ```
11 | pip install -r build/requirements.txt
12 | ```
13 | 
14 | 2. Download the free version of [Prince XML](https://www.princexml.com/download/). It's used to build the pdf version of this book.
15 | 
16 | 
17 | ## Build html
18 | 
19 | ```
20 | make html
21 | ```
22 | 
23 | ## Build pdf
24 | 
25 | ```
26 | make pdf
27 | ```
28 | 
29 | It will first build the html target and then will use it to build the pdf version.
30 | 
31 | 
32 | ## Check links and anchors
33 | 
34 | To validate that all local links and anchored links are valid run:
35 | ```
36 | make check-links-local
37 | ```
38 | 
39 | To additionally also check external links
40 | ```
41 | make check-links-all
42 | ```
43 | Use the latter sparingly to avoid being banned for hammering servers.
44 | 
45 | 
46 | ## Move md files/dirs and adjust relative links
47 | 
48 | 
49 | e.g. `slurm` => `orchestration/slurm`
50 | ```
51 | src=slurm
52 | dst=orchestration/slurm
53 | 
54 | mkdir -p orchestration
55 | git mv $src $dst
56 | perl -pi -e "s|$src|$dst|" chapters-md.txt
57 | python build/mdbook/mv-links.py $src $dst
58 | git checkout $dst
59 | make check-links-local
60 | 
61 | ```
62 | 
63 | ## Resize images
64 | 
65 | When included images are too large, make them a bit smaller:
66 | 
67 | ```
68 | mogrify -format png -resize 1024x1024\> *png
69 | ```
70 | 


--------------------------------------------------------------------------------
/build/linkcheckerrc:
--------------------------------------------------------------------------------
 1 | # rtfm https://linkchecker.github.io/linkchecker/man/linkcheckerrc.html
 2 | 
 3 | [output]
 4 | 
 5 | [text]
 6 | colorwarning=blue
 7 | 
 8 | [AnchorCheck]
 9 | 
10 | [filtering]
11 | ignorewarnings=http-redirected,http-moved-permanent
12 | 
13 | [checking]
14 | threads=20
15 | 


--------------------------------------------------------------------------------
/build/mdbook/md-to-html.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import datetime
 3 | import re
 4 | 
 5 | from functools import partial
 6 | from markdown_it import MarkdownIt
 7 | from mdit_py_plugins.anchors import anchors_plugin
 8 | from pathlib import Path
 9 | 
10 | from utils.github_md_utils import md_header_to_anchor, md_process_local_links, md_expand_links, md_convert_md_target_to_html
11 | from utils.build_utils import get_markdown_files
12 | 
13 | mdit = (
14 |     MarkdownIt('commonmark', {'breaks':True, 'html':True})
15 |     .use(anchors_plugin, max_level=7, permalink=False, slug_func=md_header_to_anchor)
16 |     .enable('table')
17 | )
18 | 
19 | my_repo_url = "https://github.com/stas00/ml-engineering/blob/master"
20 | 
21 | def convert_markdown_to_html(markdown_path, args):
22 |     md_content = markdown_path.read_text()
23 | 
24 |     cwd_rel_path = markdown_path.parent
25 | 
26 |     repo_url = my_repo_url if not args.local else ""
27 |     md_content = md_process_local_links(md_content, md_expand_links, cwd_rel_path=cwd_rel_path, repo_url=repo_url)
28 |     md_content = md_process_local_links(md_content, md_convert_md_target_to_html)
29 | 
30 |     #tokens = mdit.parse(md_content)
31 |     html_content = mdit.render(md_content)
32 |     # we don't want <br />, since github doesn't use it in its md presentation
33 |     html_content = re.sub('<br />', '', html_content)
34 | 
35 |     html_file = markdown_path.with_suffix(".html")
36 |     html_file.write_text(html_content)
37 | 
38 | def make_cover_page_file(cover_md_file, date):
39 |     with open(cover_md_file, "w") as f:
40 |         f.write(f"""
41 | ![](images/Machine-Learning-Engineering-book-cover.png)
42 | 
43 | ## Machine Learning Engineering Open Book
44 | 
45 | This is a PDF version of [Machine Learning Engineering Open Book by Stas Bekman](https://github.com/stas00/ml-engineering/) generated on {date}.
46 | 
47 | As this book is constantly being updated, if you downloaded it as a pdf file and the date isn't recent, chances are that it's already outdated - make sure to check the latest version at [https://github.com/stas00/ml-engineering](https://github.com/stas00/ml-engineering/).
48 | """)
49 |     return Path(cover_md_file)
50 | 
51 | def write_html_index(html_chapters_file, markdown_files):
52 |     html_chapters = [str(l.with_suffix(".html")) for l in markdown_files]
53 |     html_chapters_file.write_text("\n".join(html_chapters))
54 | 
55 | 
56 | if __name__ == "__main__":
57 | 
58 |     parser = argparse.ArgumentParser()
59 |     parser.add_argument('--local',  action="store_true", help="all local files remain local")
60 |     args = parser.parse_args()
61 | 
62 |     date = datetime.datetime.now().strftime("%Y-%m-%d")
63 | 
64 |     cover_md_file = "book-front.md"
65 | 
66 |     md_chapters_file = Path("chapters-md.txt")
67 |     html_chapters_file = Path("chapters-html.txt")
68 | 
69 |     pdf_file = f"Stas Bekman - Machine Learning Engineering ({date}).pdf"
70 | 
71 |     markdown_files = [make_cover_page_file(cover_md_file, date)] + get_markdown_files(md_chapters_file)
72 | 
73 |     pdf_files = []
74 |     for markdown_file in markdown_files:
75 |         convert_markdown_to_html(markdown_file, args)
76 | 
77 |     write_html_index(html_chapters_file, markdown_files)
78 | 


--------------------------------------------------------------------------------
/build/mdbook/mv-links.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | when chapters are moved around this script rewrites local relative links
 4 | 
 5 | python build/mdbook/mv-links.py slurm orchestration/slurm
 6 | 
 7 | """
 8 | 
 9 | import datetime
10 | import re
11 | import sys
12 | from pathlib import Path
13 | 
14 | from utils.build_utils import get_markdown_files
15 | from utils.github_md_utils import md_rename_relative_links, md_process_local_links
16 | 
17 | 
18 | def rewrite_links(markdown_path, src, dst):
19 |     md_content = markdown_path.read_text()
20 | 
21 |     cwd_rel_path = markdown_path.parent
22 |     md_content = md_process_local_links(md_content, md_rename_relative_links, cwd_rel_path=cwd_rel_path, src=src, dst=dst)
23 | 
24 |     markdown_path.write_text(md_content)
25 | 
26 | 
27 | if __name__ == "__main__":
28 | 
29 |     src, dst = sys.argv[1:3]
30 | 
31 |     print(f"Renaming {src} => {dst}")
32 | 
33 |     md_chapters_file = Path("chapters-md.txt")
34 |     markdown_files = get_markdown_files(md_chapters_file)
35 | 
36 |     for markdown_file in markdown_files:
37 |         rewrite_links(markdown_file, src=src, dst=dst)
38 | 


--------------------------------------------------------------------------------
/build/mdbook/utils/build_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | def get_markdown_files(md_chapters_file):
4 |     return [Path(l) for l in md_chapters_file.read_text().splitlines() if len(l)>0]
5 | 


--------------------------------------------------------------------------------
/build/prince_style.css:
--------------------------------------------------------------------------------
  1 | /*
  2 |   CSS style sheet for prince html2pdf system (http://www.princexml.com/)
  3 | 
  4 |   Here's an example of how to use the style sheet:
  5 | 
  6 |   prince --no-author-style -s prince_style.css http://en.wikipedia.org/wiki/Winter_war -o foo.pdf
  7 | */
  8 | 
  9 | @import url(http://www.princexml.com/fonts/gentium/index.css);
 10 | 
 11 | /* set headers and footers */
 12 | 
 13 | @page {
 14 |   size: letter;
 15 |   margin: 2cm 2cm;
 16 |   font: 11pt/1.3 "Gentium", serif;
 17 | 
 18 | /*
 19 |   @top-right {
 20 |     content: string(title);
 21 |     font-style: italic;
 22 |   }
 23 |   @top-left {
 24 |     content: string(source);
 25 |     font-style: italic;
 26 |   }
 27 | */
 28 |   @bottom-center {
 29 |     content: counter(page);
 30 |     vertical-align: top;
 31 |     padding-top: 1em;
 32 |   }
 33 | 
 34 |   /* prince-shrink-to-fit: auto; */
 35 | }
 36 | 
 37 | /* #siteSub { string-set: source content() } */
 38 | 
 39 | /* basic style settings*/
 40 | 
 41 | body {
 42 |   font: 10pt/1.3 "Gentium", serif;
 43 |   prince-linebreak-magic: auto;
 44 |   hyphens: none;
 45 |   text-align: justify;
 46 | }
 47 | 
 48 | ul, ol, dl { text-align: left; hyphens: manual; }
 49 | 
 50 | chapter {
 51 |   page-break-before: always;
 52 |   prince-bookmark-level: 1;
 53 |   prince-bookmark-label: attr(title);
 54 | }
 55 | 
 56 | h1 { page-break-before: always; }
 57 | 
 58 | h1, h2, h3, h4, h5, h6 {
 59 |   line-height: 1.2;
 60 |   padding: 0;
 61 |   margin: 0.7em 0 0.2em;
 62 |   font-weight: normal;
 63 |   text-align: left;
 64 |   page-break-after: avoid;
 65 |   clear: both;
 66 | }
 67 | 
 68 | title { prince-bookmark-level: 1 }
 69 | h1 { prince-bookmark-level: 1 }
 70 | h2 { prince-bookmark-level: 2 }
 71 | h3 { prince-bookmark-level: 3 }
 72 | h4 { prince-bookmark-level: 4 }
 73 | h5 { prince-bookmark-level: 5 }
 74 | h6 { prince-bookmark-level: 6 }
 75 | 
 76 | /* a { text-decoration: none; color: inherit; } */
 77 | 
 78 | p {
 79 |   padding: 4px 0;   /* top & bottom, right & left */
 80 |   margin: 0;
 81 | }
 82 | 
 83 | /* blockquote p { */
 84 | /*   font-size: 1em; */
 85 | /*   font-style: italic; */
 86 | /* } */
 87 | 
 88 | blockquote {
 89 |   background: #f9f9f9;
 90 |   border-left: 10px solid #ccc;
 91 |   margin: 1.5em 10px;
 92 |   padding: 0.5em 10px;
 93 | }
 94 | blockquote p {
 95 |   display: inline;
 96 | }
 97 | 
 98 | code {
 99 |   font-family: Consolas, Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, Courier New, monospace, serif;
100 |   font-size: 0.8em; /* seems to be similar in size to the non-monospace font */
101 |   background: #f9f9f9;
102 | }
103 | 
104 | pre {
105 |   background: #f9f9f9;
106 |   margin: 1.5em 10px;
107 |   padding: 0.5em 10px;
108 |   white-space: pre-wrap; /* wrap long code sections to fit the page */
109 |   hyphens: none; /* do not hyphenate code sections */
110 | }
111 | 
112 | ol, ul {
113 |   margin-top: 4px;
114 |   margin-bottom: 4px;
115 |   margin-left: 2em;
116 | }
117 | ul {  list-style-type: disc }
118 | 
119 | 
120 | /* put article heading on top of the page, spanning all columns */
121 | 
122 | h1 {
123 |   string-set: title content();
124 |   padding-bottom: 0.2em;
125 |   border-bottom: thin solid black;
126 |   margin-bottom: 1em;
127 | }
128 | 
129 | 
130 | div {
131 |   max-width: 100%
132 | }
133 | 
134 | /* images */
135 | 
136 | /* this is important to fit huge images */
137 | img {
138 |   max-width: 650px;
139 | }
140 | 
141 | tr, td, th {
142 |   margin: 0;
143 | /*  padding: 0.1em 0.2em; */
144 |   text-align: left;
145 |   vertical-align: top
146 | }
147 | 
148 | div.center, th[align="center"] { text-align: center }
149 | 
150 | /* tables */
151 | 
152 | table {
153 |   width: auto;
154 |   border-collapse: collapse;
155 |   border-bottom: thin solid black;
156 |   margin: 1em 1em 2em 1em;
157 | }
158 | table, table td, table th {
159 |   border: solid black .1px;
160 |   padding: 0.4em;
161 |   text-align: left;
162 | }
163 | 
164 | table th { background: #eee; font-weight: bold}
165 | 
166 | /* hr { display: none } */
167 | 
168 | sup { vertical-align: baseline }
169 | sup { vertical-align: top }
170 | 
171 | /* fix ' characters */
172 | body { prince-text-replace: "'" "\2019" }
173 | 


--------------------------------------------------------------------------------
/build/requirements.txt:
--------------------------------------------------------------------------------
1 | codespell
2 | linkchecker
3 | markdown-it-py
4 | mdit-py-plugins
5 | 


--------------------------------------------------------------------------------
/chapters-md.txt:
--------------------------------------------------------------------------------
 1 | README.md
 2 | 
 3 | insights/ai-battlefield.md
 4 | insights/how-to-choose-cloud-provider.md
 5 | 
 6 | compute/README.md
 7 | compute/accelerator/README.md
 8 | compute/accelerator/benchmarks/README.md
 9 | compute/accelerator/nvidia/debug.md
10 | compute/accelerator/amd/debug.md
11 | compute/accelerator/amd/performance.md
12 | compute/cpu/README.md
13 | compute/cpu-memory/README.md
14 | 
15 | storage/README.md
16 | storage/benchmarks/results/hope-2023-12-20-14-37-02-331702-summary.md
17 | 
18 | network/README.md
19 | network/comms.md
20 | network/debug/README.md
21 | network/benchmarks/README.md
22 | network/benchmarks/results/README.md
23 | network/benchmarks/results/disable-nvlink.md
24 | 
25 | orchestration/README.md
26 | orchestration/slurm/README.md
27 | orchestration/slurm/admin.md
28 | orchestration/slurm/users.md
29 | orchestration/slurm/performance.md
30 | orchestration/slurm/launchers/README.md
31 | 
32 | training/README.md
33 | training/model-parallelism/README.md
34 | training/performance/README.md
35 | training/fault-tolerance/README.md
36 | training/reproducibility/README.md
37 | training/instabilities/README.md
38 | training/instabilities/training-loss-patterns.md
39 | training/checkpoints/README.md
40 | training/hparams.md
41 | training/dtype.md
42 | training/emulate-multi-node.md
43 | training/re-train-hub-models.md
44 | training/datasets.md
45 | 
46 | inference/README.md
47 | 
48 | debug/README.md
49 | debug/pytorch.md
50 | debug/tools.md
51 | debug/torch-distributed-hanging-solutions.md
52 | debug/underflow_overflow.md
53 | debug/make-tiny-models-tokenizers-datasets.md
54 | debug/tiny-scripts/README.md
55 | 
56 | testing/README.md
57 | 
58 | resources/README.md
59 | 
60 | contributors.md
61 | 
62 | build/README.md
63 | 


--------------------------------------------------------------------------------
/compute/README.md:
--------------------------------------------------------------------------------
1 | # Compute
2 | 
3 | 1. **[Accelerator](accelerator)** - the workhorses of ML - GPUs, TPUs, IPUs, FPGAs, HPUs, QPUs, RDUs (WIP)
4 | 
5 | 1. **[CPU](cpu)** - cpus, affinities (WIP)
6 | 
7 | 1. **[CPU Memory](cpu-memory)** - how much CPU memory is enough - the shortest chapter ever.
8 | 


--------------------------------------------------------------------------------
/compute/accelerator/amd/performance.md:
--------------------------------------------------------------------------------
 1 | # AMD GPUs Performance
 2 | 
 3 | As I haven't had a chance to do any serious work with AMD GPUs, just sharing links for now.
 4 | 
 5 | - [AMD Instinct MI300X system optimization](https://rocm.docs.amd.com/en/latest/how-to/system-optimization/mi300x.html)
 6 | - [AMD Instinct MI300X workload optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html)
 7 | 
 8 | ## Profilers
 9 | 
10 | [omniperf](https://github.com/ROCm/omniperf) - Advanced Profiling and Analytics for AMD Hardware - e.g. can plot a roofline performance of your AMD accelerator and many other things.
11 | 


--------------------------------------------------------------------------------
/compute/accelerator/images/8x-H100-node-Dell-PowerEdge-XE9680.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/compute/accelerator/images/8x-H100-node-Dell-PowerEdge-XE9680.png


--------------------------------------------------------------------------------
/compute/accelerator/images/maf-nvidia-amd-efficiency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/compute/accelerator/images/maf-nvidia-amd-efficiency.png


--------------------------------------------------------------------------------
/compute/accelerator/images/nvidia-a100-matmul-tflops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/compute/accelerator/images/nvidia-a100-matmul-tflops.png


--------------------------------------------------------------------------------
/compute/accelerator/images/nvidia-a100-spec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/compute/accelerator/images/nvidia-a100-spec.png


--------------------------------------------------------------------------------
/compute/accelerator/nvidia/images/dcgm-metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/compute/accelerator/nvidia/images/dcgm-metrics.png


--------------------------------------------------------------------------------
/compute/cpu-memory/README.md:
--------------------------------------------------------------------------------
 1 | # CPU memory
 2 | 
 3 | This is a tiny chapter, since usually there are very few nuances one needs to know about CPU memory - which is a good thing!
 4 | 
 5 | Most of the ML workload compute happens on GPUs, but typically there should be at least as much CPU memory on each node as there is GPU memory. So, for example, if you're on an H100 node with 8x 80GB GPUs, you have 640GB of GPU memory, and thus you want at least that much CPU memory. Most recent high-end cloud packages usually come with 1-2TB of CPU memory.
 6 | 
 7 | ## What CPU memory is needed for in ML workloads
 8 | 
 9 | - Loading the model weights, unless they are loaded directly onto the GPUs - this is usually a transitory memory usage that goes back to zero once the model has been moved to GPUs.
10 | - Saving the model weights. In some situations each GPU writes its own checkpoint directly to the disk, in other cases the model is recomposed on the CPU before it's written to disk - this too is a transitory memory usage.
11 | - Possible parameter and optimizer state offloading when using frameworks like [Deepspeed](https://www.deepspeed.ai/tutorials/zero-offload/), in which case quite a lot of CPU memory might be needed.
12 | - Activations calculated in the `forward` pass that need to be available for the `backward` pass can also be offloaded to CPU memory, rather than discarded and then recomputed during the backward pass, saving the recomputation overhead.
13 | - `DataLoader` is usually one of the main users of CPU memory and at times it may consume very large amounts of memory. Typically there are at least 2 DL workers per GPU, i.e. at least 16 DL worker processes on an 8-GPU node, so you need enough memory to support at least 16 processes each holding some data. For example, in the case of streaming data from the cloud, if the data shards are large, these processes could easily eat up hundreds of GBs of CPU memory.
14 | - The software itself and its dependent libraries use a bit of CPU memory, but this amount is usually negligible.
15 | 
16 | ## Things to know
17 | 
18 | - If the `DataLoader` uses HF `datasets` in `mmap` mode, the resident memory usage may appear to be huge, as it'll try to map the whole dataset into memory. This is misleading though: if the memory is needed elsewhere, the OS will simply evict any unneeded mmap'ed pages. You can read more about it [here](https://stasosphere.com/entrepreneur-being/301-mmap-memory-leak-investigation/). This awareness, of course, applies to any dataset using `mmap`; I used HF `datasets` as an example since it's very widely used.
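Here's a minimal sketch of how one might separate reclaimable mmap'ed pages from "real" memory usage - this assumes Linux and the `psutil` package, and the arithmetic is only approximate:

```python
import psutil

# on Linux psutil's memory_info() exposes `rss` (all resident pages) and
# `shared` (resident file-backed pages - which is where mmap'ed dataset pages land)
mem = psutil.Process().memory_info()
anon = mem.rss - mem.shared  # rough estimate of anonymous, non-reclaimable resident memory
print(f"rss:                {mem.rss/2**30:.2f} GiB")
print(f"file-backed (mmap): {mem.shared/2**30:.2f} GiB")
print(f"anonymous (approx): {anon/2**30:.2f} GiB")
```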
19 | 


--------------------------------------------------------------------------------
/compute/cpu/README.md:
--------------------------------------------------------------------------------
 1 | # CPU
 2 | 
 3 | As of this writing, machine learning workloads don't use much CPU, so there isn't much to tell in this chapter. As CPUs evolve to become more like GPUs this is likely to change, so I'm expecting this chapter to evolve along with the evolution of CPUs.
 4 | 
 5 | ## How many cpu cores do you need
 6 | 
 7 | For each accelerator you need:
 8 | 
 9 | 1. 1 cpu core per process that is tied to the accelerator
10 | 2. 1 cpu core for each `DataLoader` worker process - and typically you need 2-4 workers.
11 | 
12 | 2 workers is usually plenty for LMs, especially if the data is already preprocessed.
13 | 
14 | If you need to do dynamic transforms, which is often the case with computer vision models or VLMs, you may need 3-4 and sometimes more workers.
15 | 
16 | The goal is to be able to pull from the `DataLoader` instantly, and not block the accelerator's compute, which means that you need to pre-process a bunch of samples for the next iteration while the current iteration is running. In other words, preparing your next batch needs to take no longer than the accelerator's compute time for a single iteration on a batch of the same size (see the timing sketch below).
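Here is a minimal, framework-agnostic sketch of how one might measure whether the `DataLoader` keeps up - `dataloader` and `train_step` are placeholders for your own objects, and with CUDA you'd additionally need to synchronize before taking the timestamps for the compute measurement to be meaningful:

```python
import time

def measure_data_wait(dataloader, train_step, num_steps=100):
    """Report how long each step spends waiting for data vs. computing."""
    it = iter(dataloader)
    data_wait, compute = 0.0, 0.0
    for _ in range(num_steps):
        t0 = time.perf_counter()
        batch = next(it)       # time blocked waiting on the DataLoader workers
        t1 = time.perf_counter()
        train_step(batch)      # one iteration of forward/backward/step
        t2 = time.perf_counter()
        data_wait += t1 - t0
        compute   += t2 - t1
    print(f"avg data wait: {data_wait/num_steps*1e3:.1f}ms, "
          f"avg compute: {compute/num_steps*1e3:.1f}ms")
```

If the average data wait is consistently near zero, the workers keep up; otherwise add workers or speed up the preprocessing/prefetching.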
17 | 
18 | Besides preprocessing, if you're pulling dynamically from the cloud instead of from local storage, you also need to make sure that the data is pre-fetched fast enough to feed the workers that feed the accelerator furnace.
19 | 
20 | Multiply that by the number of accelerators, and add a few cores for the operating system (let's say 4).
21 | 
22 | If the node has 8 accelerators and you use `num_workers` `DataLoader` workers per accelerator, you need `8*(num_workers+1)+4` cpu cores. If you're doing NLP, it'd usually be about 2 workers per accelerator, so `8*(2+1)+4` => 28 cpu cores. If you do CV training and, say, you need 4 workers per accelerator, then it'd be `8*(4+1)+4` => 44 cpu cores.
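The same rule of thumb expressed as a tiny sketch (the function name and defaults are just illustrative):

```python
def cpu_cores_needed(num_accelerators=8, num_workers=2, os_cores=4):
    # 1 core per accelerator-bound process, plus num_workers DataLoader
    # worker cores per accelerator, plus a few cores for the OS
    return num_accelerators * (num_workers + 1) + os_cores

print(cpu_cores_needed(num_workers=2))  # 28 - typical NLP training
print(cpu_cores_needed(num_workers=4))  # 44 - CV/VLM training with heavy transforms
```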
23 | 
24 | What happens if you have more very active processes than the total number of cpu cores? Some processes will get preempted (put in the queue for when cpu cores become available) and you absolutely want to avoid any context switching.
25 | 
26 | But modern cloud offerings typically have 50-100+ cpu cores, so usually there is no problem having enough cores to go around.
27 | 
28 | See also [Asynchronous DataLoader](../../training/performance#asynchronous-dataloader).
29 | 
30 | 
31 | 
32 | ### CPU offload
33 | 
34 | Some frameworks, like [Deepspeed](https://www.deepspeed.ai/tutorials/zero-offload/), can offload some compute work to the CPU without creating a bottleneck, in which case you'd want additional cpu cores.
35 | 
36 | 
37 | 
38 | ## NUMA affinity
39 | 
40 | See [NUMA affinity](../../training/performance#numa-affinity).
41 | 
42 | 
43 | 
44 | ## Hyperthreads
45 | 
46 | [Hyper-Threads](https://en.wikipedia.org/wiki/Hyper-threading) double the number of cpu cores by presenting each physical core as 2 virtual ones, allowing 2 threads to share the same physical core. Depending on the type of workload this feature may or may not increase the overall performance. Intel, the inventor of this technology, suggests a possible 30% performance increase in some situations.
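To quickly check whether hyper-threading is enabled on a Linux node you can, for example, inspect the `lscpu` output - if `Thread(s) per core` reports 2, it's on:

```bash
lscpu | grep -E "^(CPU\(s\)|Thread\(s\) per core|Core\(s\) per socket)"
```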
47 | 
48 | See also [To enable Hyper-Threads or not](../../orchestration/slurm/performance.md#to-enable-hyper-threads-or-not).
49 | 


--------------------------------------------------------------------------------
/contributors.md:
--------------------------------------------------------------------------------
 1 | # Contributors
 2 | 
 3 | Multiple contributors have kindly helped to improve these ever-expanding notes.
 4 | 
 5 | 1. Some of them did it via PRs, and are thus listed automatically [here](https://github.com/stas00/ml-engineering/graphs/contributors)
 6 | 2. Others did it via various other ways so I'm listing them explicitly here:
 7 | 
 8 | - [Adam Moody](https://github.com/adammoody)
 9 | - [Alex Rogozhnikov](https://github.com/arogozhnikov)
10 | - [Bowei Liu](https://github.com/boweiliu)
11 | - [Darrick Horton](https://www.linkedin.com/in/darrick-horton/)
12 | - [Elio VP](https://www.linkedin.com/in/eliovp/)
13 | - [Garrett Goon](https://github.com/garrett361)
14 | - [Horace He](https://github.com/Chillee)
15 | - [Ivan Yashchuk](https://github.com/IvanYashchuk)
16 | - [Jack Dent](https://github.com/jackdent)
17 | - [Jon Stevens](https://github.com/jon-hotaisle)
18 | - [Jordan Nanos](https://github.com/JordanNanos)
19 | - [Mark Saroufim](https://github.com/msaroufim)
20 | - [Olatunji Ruwase](https://github.com/tjruwase)
21 | - Oren Leung
22 | - [Quentin Anthony](https://github.com/Quentin-Anthony)
23 | - [Ross Wightman](https://github.com/rwightman)
24 | - [Samyam Rajbhandari](https://github.com/samyam)
25 | - [Shikib Mehri](https://github.com/Shikib)
26 | - [Siddharth Singh](https://github.com/siddharth9820)
27 | - [Stéphane Requena](https://twitter.com/s_requena)
28 | - [Zhiqi Tao](https://www.linkedin.com/in/zhiqitao/)
29 | 
30 | 
31 | If you contributed to this text and for some reason you're not on one of these 2 lists - let's fix it by adding your name with a github or similar link here.
32 | 


--------------------------------------------------------------------------------
/debug/README.md:
--------------------------------------------------------------------------------
 1 | # Debugging and Troubleshooting
 2 | 
 3 | 
 4 | ## Guides
 5 | 
 6 | - [Debugging PyTorch programs](./pytorch.md)
 7 | 
 8 | - [Diagnosing Hangings and Deadlocks in Multi-Node Multi-GPU Python Programs](./torch-distributed-hanging-solutions.md)
 9 | 
10 | - [Network Debug](../network/debug/)
11 | 
12 | - [Troubleshooting NVIDIA GPUs](../compute/accelerator/nvidia/debug.md)
13 | 
14 | - [Underflow and Overflow Detection](./underflow_overflow.md)
15 | 
16 | 
17 | 
18 | ## Tools
19 | 
20 | - [Debug Tools](./tools.md)
21 | 
22 | - [torch-distributed-gpu-test.py](./torch-distributed-gpu-test.py) - this is a `torch.distributed` diagnostics
23 |   script that checks that all GPUs in the cluster (one or many nodes) can talk to each other and allocate gpu memory (see the example launch command at the end of this section).
24 | 
25 | - [NicerTrace](./NicerTrace.py) - this is an improved `trace` python module with multiple additional flags added to the constructor and more useful output.
26 | 
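For example, one might launch the diagnostics script across 2 nodes with 8 GPUs each via `torchrun` - the exact flags below are an assumption, so adjust the GPU/node counts and the rendezvous endpoint to your setup, or use your favorite launcher:

```bash
# run on each node (or via your job scheduler)
torchrun --nproc_per_node=8 --nnodes=2 \
    --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:6000 \
    torch-distributed-gpu-test.py
```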


--------------------------------------------------------------------------------
/debug/images/math-fp-discrepancy-outcome-lizard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/debug/images/math-fp-discrepancy-outcome-lizard.png


--------------------------------------------------------------------------------
/debug/nccl-performance-debug.md:
--------------------------------------------------------------------------------
1 | # NCCL: Debug and Performance
2 | 
3 | moved to [Network Debug](../network/debug).
4 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/README.md:
--------------------------------------------------------------------------------
 1 | # A Backup of scripts
 2 | 
 3 | This is a backup of scripts discussed in [Faster debug and development with tiny models, tokenizers and datasets](../make-tiny-models-tokenizers-datasets.md).
 4 | 
 5 | * [c4-en-10k.py](./c4-en-10k.py)
 6 | * [cm4-synthetic-testing.py](./cm4-synthetic-testing.py)
 7 | * [fsmt-make-super-tiny-model.py](./fsmt-make-super-tiny-model.py)
 8 | * [general-pmd-ds-unpack.py](./general-pmd-ds-unpack.py)
 9 | * [general-pmd-synthetic-testing.py](./general-pmd-synthetic-testing.py)
10 | * [m4-ds-unpack.py](./m4-ds-unpack.py)
11 | * [mt5-make-tiny-model.py](./mt5-make-tiny-model.py)
12 | * [openwebtext-10k.py](./openwebtext-10k.py)
13 | * [oscar-en-10k.py](./oscar-en-10k.py)
14 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/c4-en-10k.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The Open WebText Corpus"""
16 | 
17 | 
18 | import os
19 | import json
20 | 
21 | import datasets
22 | 
23 | 
24 | _CITATION = """\
25 | @article{2019t5,
26 |     author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
27 |     title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
28 |     journal = {arXiv e-prints},
29 |     year = {2019},
30 |     archivePrefix = {arXiv},
31 |     eprint = {1910.10683},
32 | }
33 | """
34 | 
35 | _DESCRIPTION = """\
36 | This is a small subset representing the first 10K records of the original C4 dataset, "en" subset - created for testing. The records were extracted after having been shuffled.
37 | 
38 | The full 1TB+ dataset is at https://huggingface.co/datasets/c4.
39 | """
40 | 
41 | _URL = "https://cdn-datasets.huggingface.co/nlp/datasets/c4/c4-en-10k.tar.xz"
42 | 
43 | class C4En10k(datasets.GeneratorBasedBuilder):
44 |     """The C4 dataset."""
45 | 
46 |     BUILDER_CONFIGS = [
47 |         datasets.BuilderConfig(
48 |             name="plain_text",
49 |             description="Plain text",
50 |             version=datasets.Version("1.0.0"),
51 |         )
52 |     ]
53 | 
54 |     def _info(self):
55 |         return datasets.DatasetInfo(
56 |             description=_DESCRIPTION,
57 |             features=datasets.Features({"text": datasets.Value("string")}),
58 |             homepage="https://huggingface.co/datasets/allenai/c4/",
59 |             citation=_CITATION,
60 |         )
61 | 
62 |     def _split_generators(self, dl_manager):
63 |         dl_dir = dl_manager.download_and_extract(_URL)
64 |         jsonl_file = os.path.join(dl_dir, "c4-en-10k", "c4-en-10k.jsonl")
65 |         return [
66 |             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"jsonl_file": jsonl_file}),
67 |         ]
68 | 
69 |     def _generate_examples(self, jsonl_file):
70 |         """Yields examples."""
71 |         with open(jsonl_file, encoding="utf-8") as f:
72 |             idx = 0
73 |             for line in f:
74 |                 rec = json.loads(line)
75 |                 yield idx,  {"text": rec["text"]}
76 |                 idx += 1
77 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/fsmt-make-super-tiny-model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | # Copyright 2020 The HuggingFace Team. All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # This script creates a super tiny model that is useful inside tests, when we just want to test that
18 | # the machinery works, without needing to check the quality of the outcomes.
19 | #
20 | # This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
21 | # all files ~60KB. As compared to taking a full-size model, reducing to the minimum its layers and
22 | # emb dimensions, but keeping the full vocab + merges files, leading to ~3MB in total for all files.
23 | # The latter is done by `fsmt-make-tiny-model.py`.
24 | #
25 | # It will be used then as "stas/tiny-wmt19-en-ru"
26 | 
27 | from pathlib import Path
28 | import json
29 | import tempfile
30 | 
31 | from transformers import FSMTTokenizer, FSMTConfig, FSMTForConditionalGeneration
32 | from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
33 | 
34 | mname_tiny = "tiny-wmt19-en-ru"
35 | 
36 | # Build
37 | 
38 | # borrowed from a test 
39 | vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
40 | vocab_tokens = dict(zip(vocab, range(len(vocab))))
41 | merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
42 | 
43 | with tempfile.TemporaryDirectory() as tmpdirname:
44 |     build_dir = Path(tmpdirname)
45 |     src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
46 |     tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
47 |     merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
48 |     with open(src_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
49 |     with open(tgt_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
50 |     with open(merges_file, "w") as fp   : fp.write("\n".join(merges))
51 | 
52 |     tokenizer = FSMTTokenizer(
53 |         langs=["en", "ru"],
54 |         src_vocab_size = len(vocab),
55 |         tgt_vocab_size = len(vocab),
56 |         src_vocab_file=src_vocab_file,
57 |         tgt_vocab_file=tgt_vocab_file,
58 |         merges_file=merges_file,
59 |     )
60 |     
61 | config = FSMTConfig(
62 |     langs=['ru', 'en'],
63 |     src_vocab_size=1000, tgt_vocab_size=1000,
64 |     d_model=4,
65 |     encoder_layers=1, decoder_layers=1,
66 |     encoder_ffn_dim=4, decoder_ffn_dim=4,
67 |     encoder_attention_heads=1, decoder_attention_heads=1,
68 | )
69 | 
70 | tiny_model = FSMTForConditionalGeneration(config)
71 | print(f"num of params {tiny_model.num_parameters()}")
72 | 
73 | # Test
74 | batch = tokenizer(["Making tiny model"], return_tensors="pt")
75 | outputs = tiny_model(**batch)
76 | 
77 | print("test output:", len(outputs.logits[0]))
78 | 
79 | # Save
80 | tiny_model.half() # makes it smaller
81 | tiny_model.save_pretrained(mname_tiny)
82 | tokenizer.save_pretrained(mname_tiny)
83 | 
84 | print(f"Generated {mname_tiny}")
85 | 
86 | # Upload
87 | # transformers-cli upload tiny-wmt19-en-ru
88 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/general-pmd-ds-unpack.py:
--------------------------------------------------------------------------------
  1 | # unpack the desired dataset records into a filesystem-based subdir structure which can then be
  2 | # used to create a synthetic dataset of desired records. Each record can now be easily modified on
  3 | # the filesystem before being packed back into a dataset
  4 | #
  5 | # each record is a subdir
  6 | # each part of the record is:
  7 | # image.jpg
  8 | # text.txt
  9 | # meta.txt
 10 | # source.txt
 11 | #
 12 | # .null extension is when the slot is empty
 13 | #
 14 | # Example:
 15 | # python general-pmd-ds-unpack.py \
 16 | # --dataset_name_or_path /hf/m4-master/data/general_pmd/image/localized_narratives__ADE20k/train/00000-00002 \
 17 | # --ids 1,4-10 --target_path data
 18 | 
 19 | 
 20 | from argparse import ArgumentParser
 21 | from collections import defaultdict
 22 | from datasets import load_from_disk, Dataset
 23 | from pathlib import Path
 24 | from pprint import pprint
 25 | import gc
 26 | import numpy as np
 27 | import os
 28 | import psutil
 29 | import sys
 30 | import torchvision.transforms as transforms
 31 | from PIL import Image, ImageFile
 32 | 
 33 | Image.MAX_IMAGE_PIXELS = None
 34 | ImageFile.LOAD_TRUNCATED_IMAGES = True
 35 | 
 36 | # LOCALIZE ME!
 37 | DATA_ROOT = "/hf/m4-master/data/cm4"
 38 | 
 39 | DATASET_PATH = f"{DATA_ROOT}/cm4-10000-v0.1"
 40 | 
 41 | parser = ArgumentParser()
 42 | 
 43 | parser.add_argument("--dataset_name_or_path", required=True, type=str, help="source dataset_name_or_path")
 44 | parser.add_argument("--target_path", required=False, default="output", type=str, help="path to where to unpack")
 45 | parser.add_argument("--ids", required=False, default="0", type=str, help="which ids to extract. example: 1,2,5-7,10")
 46 | args = parser.parse_args()
 47 | 
 48 | def list2range(s):
 49 |     """
 50 |     list2range('1,2,5-7,10')
 51 |     [1, 2, 5, 6, 7, 10]
 52 |     # from https://stackoverflow.com/a/6405711/9201239
 53 |     """
 54 |     return sum(((list(range(*[int(j) + k for k,j in enumerate(i.split('-'))]))
 55 |          if '-' in i else [int(i)]) for i in s.split(',')), [])
 56 | 
 57 | def unpack(args, idx, row):
 58 |     #pprint(row)
 59 | 
 60 |     path = f"{args.target_path}/{idx}"
 61 |     Path(path).mkdir(parents=True, exist_ok=True)
 62 | 
 63 |     # all items are text, except 'image'
 64 | 
 65 |     img = row["image"]
 66 |     basename = f"{path}/image"
 67 |     ext = "null" if img is None else "jpg"
 68 |     file = f"{basename}.{ext}"
 69 |     with open(file, "wb") as fh:
 70 |         if img is not None:
 71 |             img.save(fh, 'jpeg')
 72 | 
 73 |     for col in ['meta', 'source', 'text']:
 74 |         item = row[col]
 75 |         basename = f"{path}/{col}"
 76 |         ext = "null" if item is None else "txt"
 77 |         file = f"{basename}.{ext}"
 78 |         with open(file, "w") as fh:
 79 |             if item is not None:
 80 |                 fh.write(item)
 81 | 
 82 | def dump_example_shapes(idx, row):
 83 |     """ dump the row stats """
 84 |     shapes = {}
 85 | 
 86 |     img = row["image"]
 87 |     shapes["image"] = 0 if img is None else "x".join(map(str, img.size))
 88 | 
 89 |     for col in ['meta', 'source', 'text']:
 90 |         item = row[col]
 91 |         shapes[col] = 0 if item is None else len(item)
 92 | 
 93 |     summary = ", ".join([f"{k}: {v:>9}" for k,v in shapes.items()])
 94 |     print(f"rec{idx:>6}: {summary}")
 95 | 
 96 | 
 97 | ids_range = list2range(args.ids)
 98 | 
 99 | ds = load_from_disk(args.dataset_name_or_path)
100 | #rows = ds[ids_range]
101 | 
102 | #pprint(rows[1])
103 | 
104 | for idx, id in enumerate(ids_range):
105 |     unpack(args, id, ds[id])
106 |     dump_example_shapes(id, ds[id])
107 |     #sys.exit()
108 | 
109 | ds.info.write_to_directory(args.target_path)
110 | 
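
The `list2range` one-liner above is quite dense - here is a readable equivalent of what `--ids` accepts (just a sketch for documentation purposes, not used by the script):

```python
def list2range_readable(s):
    """ expand a spec like '1,2,5-7,10' into [1, 2, 5, 6, 7, 10] """
    ids = []
    for part in s.split(","):
        if "-" in part:
            start, end = map(int, part.split("-"))
            ids.extend(range(start, end + 1))
        else:
            ids.append(int(part))
    return ids

assert list2range_readable("1,2,5-7,10") == [1, 2, 5, 6, 7, 10]
```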


--------------------------------------------------------------------------------
/debug/tiny-scripts/general-pmd-synthetic-testing.py:
--------------------------------------------------------------------------------
  1 | 
  2 | """
  3 | 
  4 | This dataset was generated by:
  5 | 
  6 | # prep dataset repo
  7 | https://huggingface.co/new-dataset => HuggingFaceM4/general-pmd-synthetic-testing
  8 | git clone https://huggingface.co/datasets/HuggingFaceM4/general-pmd-synthetic-testing
  9 | cd general-pmd-synthetic-testing
 10 | 
 11 | # select a few seed records so there is some longer and shorter text, records with images and without, a few variations of each type
 12 | rm -rf data
 13 | python general-pmd-ds-unpack.py --dataset_name_or_path /hf/m4-master/data/general_pmd/image/localized_narratives__ADE20k/train/00000-00002 --ids 1-10 --target_path data
 14 | 
 15 | cd data
 16 | 
 17 | # shrink to 32x32 max, keeping ratio
 18 | mogrify -format jpg -resize 32x32\> */*jpg
 19 | 
 20 | # adjust one record to have no image and no text
 21 | cd 1
 22 | rm image.jpg text.txt
 23 | touch image.null text.null
 24 | cd -
 25 | 
 26 | cd ..
 27 | 
 28 | # create tarball
 29 | tar -cvzf data.tar.gz data
 30 | 
 31 | # complete the dataset repo
 32 | echo "This dataset is designed to be used in testing. It's derived from general-pmd/localized_narratives__ADE20k dataset" >> README.md
 33 | 
 34 | # test dataset
 35 | cd ..
 36 | datasets-cli test general-pmd-synthetic-testing/general-pmd-synthetic-testing.py --all_configs
 37 | 
 38 | 
 39 | # push the data
 40 | cd general-pmd-synthetic-testing
 41 | rm -rf data
 42 | git add *
 43 | git commit -am "new dataset"
 44 | git push
 45 | 
 46 | # test
 47 | python -c 'from datasets import load_dataset; load_dataset("HuggingFaceM4/general-pmd-synthetic-testing")["100.unique"]'
 48 | 
 49 | """
 50 | 
 51 | 
 52 | from PIL import Image, ImageFile
 53 | from collections import defaultdict
 54 | from datasets import DatasetInfo
 55 | from pathlib import Path
 56 | from pprint import pprint
 57 | import datasets
 58 | import itertools
 59 | import json
 60 | import os
 61 | 
 62 | _CITATION = """\
 63 | @InProceedings{huggingface:dataset,
 64 | title = {Multimodal synthetic dataset for testing / general PMD},
 65 | author={HuggingFace, Inc.},
 66 | year={2022}
 67 | }
 68 | """
 69 | 
 70 | _DESCRIPTION = """This dataset is designed to be used in testing. It's derived from general-pmd-10k dataset"""
 71 | _HOMEPAGE = "https://huggingface.co/datasets/HuggingFaceM4/general-pmd-synthetic-testing"
 72 | _LICENSE = "bigscience-openrail-m"
 73 | _URL = "https://huggingface.co/datasets/HuggingFaceM4/general-pmd-synthetic-testing/resolve/main/data.tar.gz"
 74 | #_URL = "./data.tar.gz"
 75 | 
 76 | sizes = ["100", "300", "1k", "10k"]
 77 | types = ["unique", "repeat"]
 78 | 
 79 | class GeneralPMDSynthetic(datasets.GeneratorBasedBuilder):
 80 | 
 81 |     VERSION = datasets.Version("1.1.1")
 82 | 
 83 |     # splits = [f"{s}.{t}" for s in sizes for t in types]
 84 |     # BUILDER_CONFIGS = [] # can't use list comprehension and access VERSION due to python scoping design
 85 |     # for split in splits:
 86 |     #     BUILDER_CONFIGS.append(datasets.BuilderConfig(name=split, version=VERSION, description=f"{split} items split"))
 87 |     DEFAULT_CONFIG_NAME = "100.unique"
 88 | 
 89 |     def _info(self):
 90 |         # script_dir = os.path.abspath(os.path.dirname(__file__))
 91 |         # path = os.path.join(script_dir, "dataset_info.json")
 92 |         # ds_info = DatasetInfo.from_directory(path)
 93 |         # pprint(ds_info)
 94 |         # return ds_info
 95 | 
 96 |         # XXX: automate
 97 |         return datasets.DatasetInfo(
 98 |             description=_DESCRIPTION,
 99 |             citation=_CITATION,
100 |             homepage=_HOMEPAGE,
101 |             license=_LICENSE,
102 |             features={
103 |                 "image":  {"decode": True,    "id": None, "_type": "Image"},
104 |                 "text":   {"dtype": "string", "id": None, "_type": "Value"},
105 |                 "source": {"dtype": "string", "id": None, "_type": "Value"},
106 |                 "meta":   {"dtype": "string", "id": None, "_type": "Value"},
107 |             },
108 |         )
109 | 
110 |     def _split_generators(self, dl_manager):
111 |         url = _URL
112 |         data_dir = dl_manager.download_and_extract(url)
113 | 
114 |         return [
115 |             datasets.SplitGenerator(
116 |                 name=self.config.name,
117 |                 # These kwargs will be passed to _generate_examples
118 |                 gen_kwargs={
119 |                     "data_path": os.path.join(data_dir, "data"),
120 |                 },
121 |             )
122 |         ]
123 | 
124 |     def _generate_examples(self, data_path):
125 |         # the split name acts as the designator of how many rows to generate
126 | 
127 |         size, type = self.config.name.split(".")
128 | 
129 |         print(f"Generating {size}-long {type} records split")
130 | 
131 |         # for now handling 100, 10k - can add m
132 |         total_examples = int(size.replace("k", "000"))
133 | 
134 |         def pack_example(path):
135 |             """ put the directory with an image and text cols into a single datasets record """
136 | 
137 |             row = {}
138 | 
139 |             for file in path.glob("*"):
140 |                 if file.suffix == ".null":
141 |                     row[file.stem] = None
142 |                 elif file.stem == "image":
143 |                     row[file.stem] = Image.open(file)
144 |                 elif file.stem in ['meta', 'source', 'text']:
145 |                     row[file.stem] = "".join([l for l in open(file)])
146 |                 else:
147 |                     pass # ignore any other files
148 | 
149 |             return row
150 | 
151 |         def dump_example_shapes(idx, row):
152 |             """ dump the row stats """
153 |             shapes = {}
154 | 
155 |             img = row["image"]
156 |             shapes["image"] = 0 if img is None else "x".join(map(str, img.size))
157 | 
158 |             for col in ['meta', 'source', 'text']:
159 |                 item = row[col]
160 |                 shapes[col] = 0 if item is None else len(item)
161 | 
162 |             summary = ", ".join([f"{k}: {v:>9}" for k,v in shapes.items()])
163 |             print(f"rec{idx:>6}: {summary}")
164 | 
165 |         print()
166 |         rows = [pack_example(subdir) for subdir in sorted(Path(data_path).glob("[0-9]*"))]
167 |         num_rows = len(rows)
168 |         if num_rows == 0:
169 |             raise ValueError(f"can't find any data - check {data_path}")
170 | 
171 |         print(f"\nStats for {len(rows)} unique records used:")
172 |         for i, row in enumerate(rows): dump_example_shapes(i, row)
173 | 
174 |         one_none_texts = 0
175 |         def gen_unique_rec(idx, row):
176 |             """ insert idx as a string at the end of the text entry, overwriting its last few
177 |             characters, so that each generated record is unique. A single record with text=None
178 |             is kept as is, since it is already unique """
179 |             nonlocal one_none_texts
180 | 
181 |             uniq_text = str(idx)
182 |             if row["text"] is None:
183 |                 # keep one record that has text=None (which is still unique)
184 |                 if one_none_texts == 0:
185 |                     one_none_texts = 1
186 |                 else:
187 |                     row["text"] = uniq_text
188 |             else:
189 |                 row["text"] = row["text"][:-len(uniq_text)] + uniq_text
190 | 
191 |             return row
192 | 
193 |         # this being a synthetic dataset we rotate the 1 or more available rows until we generate enough records.
194 |         # in the case of unique type we tweak one text record to be unique
195 |         for i in range(total_examples):
196 |             idx = i % num_rows
197 |             if type == "repeat":
198 |                 yield i, rows[idx]
199 |             elif type == "unique":
200 |                 yield i, gen_unique_rec(i, rows[idx])
201 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/idefics-make-tiny-model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # This script creates a smallish random model, with a few layers to test things quickly
 4 | #
 5 | # It also demonstrates how to change the config in child objects of the model config
 6 | #
 7 | # It will be used then as "stas/idefics-tiny-random"
 8 | 
 9 | from transformers import AutoTokenizer, IdeficsConfig, IdeficsForVisionText2Text
10 | 
11 | mname_from = "HuggingFaceM4/idefics-9b"
12 | mname_very_small = "idefics-tiny-random"
13 | 
14 | tokenizer = AutoTokenizer.from_pretrained(mname_from)
15 | config = IdeficsConfig.from_pretrained(mname_from)
16 | 
17 | config.update(dict(
18 |     hidden_size=64,
19 |     intermediate_size=37,
20 |     num_hidden_layers=5,
21 |     num_attention_heads=4,
22 |     max_position_embeddings=64,
23 |     max_sequence_length=64,
24 | 
25 | ))
26 | 
27 | # This model contains several child config objects
28 | #
29 | # If you need to update the child config objects you can't do it from the top-level dict, but need
30 | # to update these directly via those objects, like so:
31 | config.perceiver_config.update(dict(qk_layer_norms_perceiver=False))
32 | config.vision_config.update(dict(embed_dim=64))
33 | 
34 | print("new config", config)
35 | 
36 | very_small_model = IdeficsForVisionText2Text(config)
37 | print(f"num of params {very_small_model.num_parameters()}")
38 | very_small_model.resize_token_embeddings(len(tokenizer))
39 | 
40 | # Save
41 | very_small_model.bfloat16() # makes it smaller
42 | very_small_model.save_pretrained(mname_very_small)
43 | config.save_pretrained(mname_very_small)
44 | tokenizer.save_pretrained(mname_very_small)
45 | 
46 | print(f"Generated {mname_very_small}")
47 | 
48 | # Upload
49 | # transformers-cli repo create idefics-tiny-random
50 | # clone and add files
51 | 
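
A quick way to check that both the top-level and the child config updates above took effect (a sketch - the attribute names are the ones set earlier in this script):

```python
assert config.hidden_size == 64
assert config.vision_config.embed_dim == 64
assert config.perceiver_config.qk_layer_norms_perceiver is False
```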


--------------------------------------------------------------------------------
/debug/tiny-scripts/m4-ds-unpack.py:
--------------------------------------------------------------------------------
  1 | # unpack the desired dataset records into a filesystem-based subdir structure which can then be
  2 | # used to create a synthetic dataset of desired records. Each record can now be easily modified on
  3 | # the filesystem before being packed back into a dataset
  4 | #
  5 | # each record is a subdir
  6 | # each part of the record is:
  7 | # images_0.png
  8 | # images_1.jpg
  9 | # images_2.null # means no image in this slot
 10 | # ....
 11 | # images_n.png
 12 | # texts_0.txt
 13 | # texts_1.txt
 14 | # texts_2.null
 15 | # ....
 16 | # texts_n.txt
 17 | #
 18 | # .null extension is when the slot is empty
 19 | #
 20 | 
 21 | 
 22 | from argparse import ArgumentParser
 23 | from collections import defaultdict
 24 | from datasets import load_from_disk, Dataset
 25 | from pathlib import Path
 26 | from pprint import pprint
 27 | import gc
 28 | import numpy as np
 29 | import os
 30 | import psutil
 31 | import sys
 32 | import torchvision.transforms as transforms
 33 | from PIL import Image, ImageFile
 34 | 
 35 | Image.MAX_IMAGE_PIXELS = None
 36 | ImageFile.LOAD_TRUNCATED_IMAGES = True
 37 | 
 38 | # LOCALIZE ME!
 39 | DATA_ROOT = "/hf/m4-master/data/cm4"
 40 | 
 41 | DATASET_PATH = f"{DATA_ROOT}/cm4-10000-v0.1"
 42 | 
 43 | parser = ArgumentParser()
 44 | 
 45 | parser.add_argument("--dataset_name_or_path", required=True, type=str, help="source dataset_name_or_path")
 46 | parser.add_argument("--target_path", required=False, default="output", type=str, help="path to where to unpack")
 47 | parser.add_argument("--ids", required=False, default="0", type=str, help="which ids to extract. example: 1,2,5-7,10")
 48 | args = parser.parse_args()
 49 | 
 50 | def list2range(s):
 51 |     """
 52 |     list2range('1,2,5-7,10')
 53 |     [1, 2, 5, 6, 7, 10]
 54 |     # from https://stackoverflow.com/a/6405711/9201239
 55 |     """
 56 |     return sum(((list(range(*[int(j) + k for k,j in enumerate(i.split('-'))]))
 57 |          if '-' in i else [int(i)]) for i in s.split(',')), [])
 58 | 
 59 | def unpack(args, idx, row):
 60 |     #pprint(row)
 61 |     path = f"{args.target_path}/{idx}"
 62 |     Path(path).mkdir(parents=True, exist_ok=True)
 63 |     for i, img in enumerate(row["images"]):
 64 |         basename = f"{path}/images_{i:02d}"
 65 |         ext = "null" if img is None else "jpg"
 66 |         file = f"{basename}.{ext}"
 67 |         with open(file, "wb") as fh:
 68 |             if img is not None:
 69 |                 img.save(fh, 'jpeg')
 70 |     for i, txt in enumerate(row["texts"]):
 71 |         basename = f"{path}/texts_{i:02d}"
 72 |         ext = "null" if txt is None else "txt"
 73 |         file = f"{basename}.{ext}"
 74 |         with open(file, "w") as fh:
 75 |             if txt is not None:
 76 |                 fh.write(txt)
 77 | 
 78 | def dump_example_shapes(idx, row):
 79 |     """ dump the row stats """
 80 | 
 81 |     imgs = defaultdict(int)
 82 |     for img in row["images"]:
 83 |         if img is None:
 84 |             imgs["0"] += 1
 85 |         else:
 86 |             shape = "x".join(map(str, img.size))
 87 |             imgs[shape] += 1
 88 |     imgs_summary = ", ".join([f"{v} {k}" for k,v in sorted(imgs.items(), key=lambda x: int(x[0].split("x")[0]))])
 89 | 
 90 |     txts = defaultdict(int)
 91 |     for txt in row["texts"]:
 92 |         if txt is None:
 93 |             txts[0] += 1
 94 |         else:
 95 |             shape = len(txt)
 96 |             txts[shape] += 1
 97 |     txts_summary = ", ".join([f"{v} {k}" for k,v in sorted(txts.items(), key=lambda x: int(x[0]))])
 98 | 
 99 |     print(f"\nrec{idx}: {len(row['images'])} pairs with {len(row['images'])-imgs['0']} images, {len(row['texts'])-txts[0]} texts")
100 |     print(f"- img: {imgs_summary}")
101 |     print(f"- txt: {txts_summary}")
102 | 
103 | 
104 | 
105 | 
106 | ids_range = list2range(args.ids)
107 | 
108 | ds = load_from_disk(args.dataset_name_or_path)
109 | #rows = ds[ids_range]
110 | 
111 | 
112 | #pprint(rows[1])
113 | 
114 | 
115 | for idx, id in enumerate(ids_range):
116 |     unpack(args, id, ds[id])
117 |     dump_example_shapes(id, ds[id])
118 |     #sys.exit()
119 | 
120 | ds.info.write_to_directory(args.target_path)
121 | 
122 | 
123 | # replicate one record many times
124 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/mt5-make-tiny-model.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | # Copyright 2021 The HuggingFace Team. All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | # This script creates a smallish random model, with a few layers to test things like MP/PP, where
 18 | # tiny and tinier models are just too small
 19 | #
 20 | # It will be used then as "stas/mt5-tiny-random"
 21 | 
 22 | # To build:
 23 | # 1. clone sentencepiece into this dir
 24 | # git clone https://github.com/google/sentencepiece
 25 | #
 26 | # 2. run this script
 27 | 
 28 | from pathlib import Path
 29 | import json
 30 | import tempfile
 31 | 
 32 | from transformers import MT5Tokenizer, MT5TokenizerFast, MT5Config, MT5ForConditionalGeneration
 33 | from transformers.models.t5.tokenization_t5 import VOCAB_FILES_NAMES
 34 | 
 35 | mname_from = "google/mt5-small"
 36 | mname_very_small = "mt5-tiny-random"
 37 | 
 38 | tokenizer = MT5Tokenizer.from_pretrained(mname_from)
 39 | config = MT5Config.from_pretrained(mname_from)
 40 | #tokenizer_fast = MT5TokenizerFast.from_pretrained(mname_from)
 41 | 
 42 | # Shrink the vocab of mt5-small
 43 | import sys
 44 | # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
 45 | sys.path.append("./sentencepiece/python/src/sentencepiece")
 46 | import sentencepiece_model_pb2 as model
 47 | 
 48 | tmp_dir = "/tmp/mt5-small"
 49 | tokenizer.save_pretrained(tmp_dir)
 50 | file = tmp_dir + "/spiece.model"
 51 | with open(file, 'rb') as f: data = f.read()
 52 | 
 53 | # adapted from https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/
 54 | m = model.ModelProto()
 55 | m.ParseFromString(data)
 56 | 
 57 | keep_items = 5000
 58 | 
 59 | print("Shrinking vocab")
 60 | print(f"original dict {len(m.pieces)}")
 61 | for i in range(len(m.pieces)-keep_items): _ = m.pieces.pop()
 62 | print(f"new dict {len(m.pieces)}")
 63 | 
 64 | with open(tmp_dir + "/spiece-short.model", 'wb') as f:
 65 |     f.write(m.SerializeToString())
 66 | 
 67 | tokenizer = MT5Tokenizer(vocab_file=tmp_dir + "/spiece-short.model")
 68 | 
 69 | config.update(dict(
 70 |     vocab_size=keep_items+12,
 71 |     d_model=64,
 72 |     d_ff=256,
 73 |     d_kv=8,
 74 |     num_layers=8,
 75 |     num_decoder_layers=8,
 76 |     num_heads=4,
 77 |     relative_attention_num_buckets=32,
 78 | ))
 79 | print("new config", config)
 80 | 
 81 | very_small_model = MT5ForConditionalGeneration(config)
 82 | print(f"num of params {very_small_model.num_parameters()}")
 83 | very_small_model.resize_token_embeddings(len(tokenizer))
 84 | 
 85 | # Test
 86 | src_texts = ["A long paragraph for summarization.", "Another paragraph for summarization."]
 87 | tgt_texts = ["Summary of the text.", "Another summary."]
 88 | 
 89 | batch = tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts, return_tensors="pt")
 90 | outputs = very_small_model(**batch)
 91 | 
 92 | print("test output:", len(outputs.logits[0]))
 93 | 
 94 | # Save
 95 | very_small_model.half() # makes it smaller
 96 | very_small_model.save_pretrained(mname_very_small)
 97 | config.save_pretrained(mname_very_small)
 98 | tokenizer.save_pretrained(mname_very_small)
 99 | #tokenizer_fast.save_pretrained(mname_very_small)
100 | 
101 | print(f"Generated {mname_very_small}")
102 | 
103 | # Upload
104 | # transformers-cli repo create mt5-tiny-random
105 | # clone and add files
106 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/openwebtext-10k.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The Open WebText Corpus"""
16 | 
17 | 
18 | import os
19 | import re
20 | from itertools import chain
21 | 
22 | import datasets
23 | 
24 | 
25 | _CITATION = """\
26 | @misc{Gokaslan2019OpenWeb,
27 |   title={OpenWebText Corpus},
28 |   author={Aaron Gokaslan*, Vanya Cohen*, Ellie Pavlick, Stefanie Tellex},
 29 |   howpublished={\\url{http://Skylion007.github.io/OpenWebTextCorpus}},
30 |   year={2019}
31 | }
32 | """
33 | 
34 | _DESCRIPTION = """\
35 | An open-source replication of the WebText dataset from OpenAI.
36 | 
37 | This is a small subset representing the first 10K records from the original dataset - created for testing.
38 | 
39 | The full 8M-record dataset is at https://huggingface.co/datasets/openwebtext
40 | """
41 | 
42 | _URL = "https://cdn-datasets.huggingface.co/nlp/datasets/openwebtext/openwebtext-10k.tar.xz"
43 | 
44 | class Openwebtext10k(datasets.GeneratorBasedBuilder):
45 |     """The Open WebText dataset."""
46 | 
47 |     BUILDER_CONFIGS = [
48 |         datasets.BuilderConfig(
49 |             name="plain_text",
50 |             description="Plain text",
51 |             version=datasets.Version("1.0.0"),
52 |         )
53 |     ]
54 | 
55 |     def _info(self):
56 |         return datasets.DatasetInfo(
57 |             description=_DESCRIPTION,
58 |             features=datasets.Features({"text": datasets.Value("string")}),
59 |             homepage="https://skylion007.github.io/OpenWebTextCorpus/",
60 |             citation=_CITATION,
61 |         )
62 | 
63 |     def _split_generators(self, dl_manager):
64 |         dl_dir = dl_manager.download_and_extract(_URL)
65 |         owt_dir = os.path.join(dl_dir, "openwebtext-10k")
66 |         subset_xzs = [
67 |             os.path.join(owt_dir, file_name)
68 |             for file_name in sorted(os.listdir(owt_dir))
69 |             if file_name.endswith("xz")  # filter out ...xz.lock
70 |         ]
71 |         ex_dirs = dl_manager.extract(subset_xzs, num_proc=round(os.cpu_count() * 0.75))
72 |         nested_txt_files = [
73 |             [
74 |                 os.path.join(ex_dir, txt_file_name)
75 |                 for txt_file_name in sorted(os.listdir(ex_dir))
76 |                 if txt_file_name.endswith("txt")
77 |             ]
78 |             for ex_dir in ex_dirs
79 |         ]
80 |         txt_files = chain(*nested_txt_files)
81 |         return [
82 |             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"txt_files": txt_files}),
83 |         ]
84 | 
85 |     def _generate_examples(self, txt_files):
86 |         """Yields examples."""
87 |         for idx, filepath in enumerate(txt_files):
88 |             with open(filepath, encoding="utf-8") as f:
89 |                 yield idx, {"text": re.sub("\n\n\n+", "\n\n", f.read()).strip()}
90 | 


--------------------------------------------------------------------------------
/debug/tiny-scripts/oscar-en-10k.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """The Open WebText Corpus"""
16 | 
17 | 
18 | import os
19 | import json
20 | 
21 | import datasets
22 | 
23 | 
24 | _CITATION = """\
25 | @inproceedings{OrtizSuarezSagotRomary2019,
26 |   author    = {Pedro Javier {Ortiz Su{'a}rez} and Benoit Sagot and Laurent Romary},
27 |   title     = {Asynchronous pipelines for processing huge corpora on medium to low resource infrastructures},
28 |   series = {Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-7) 2019. Cardiff, 22nd July 2019},
29 |   editor    = {Piotr Bański and Adrien Barbaresi and Hanno Biber and Evelyn Breiteneder and Simon Clematide and Marc Kupietz and Harald L{"u}ngen and Caroline Iliadi},
30 |   publisher = {Leibniz-Institut f{"u}r Deutsche Sprache},
31 |   address   = {Mannheim},
32 |   doi       = {10.14618/ids-pub-9021},
33 |   url       = {http://nbn-resolving.de/urn:nbn:de:bsz:mh39-90215},
34 |   pages     = {9 -- 16},
35 |   year      = {2019},
36 |   abstract  = {Common Crawl is a considerably large, heterogeneous multilingual corpus comprised of crawled documents from the internet, surpassing 20TB of data and distributed as a set of more than 50 thousand plain text files where each contains many documents written in a wide variety of languages. Even though each document has a metadata block associated to it, this data lacks any information about the language in which each document is written, making it extremely difficult to use Common Crawl for monolingual applications. We propose a general, highly parallel, multithreaded pipeline to clean and classify Common Crawl by language; we specifically design it so that it runs efficiently on medium to low resource infrastructures where I/O speeds are the main constraint. We develop the pipeline so that it can be easily reapplied to any kind of heterogeneous corpus and so that it can be parameterised to a wide range of infrastructures. We also distribute a 6.3TB version of Common Crawl, filtered, classified by language, shuffled at line level in order to avoid copyright issues, and ready to be used for NLP applications.},
37 |   language  = {en}
38 | }
39 | """
40 | 
41 | _DESCRIPTION = """\
42 | This is a small subset representing 10K records from the original OSCAR dataset, "unshuffled_deduplicated_en" subset - created for testing. The records were extracted after having been shuffled.
43 | 
44 | The full 1TB+ dataset is at https://huggingface.co/datasets/oscar.
45 | """
46 | 
47 | _URL = "https://cdn-datasets.huggingface.co/nlp/datasets/oscar/oscar-en-10k.tar.xz"
48 | 
49 | class OscarEn10k(datasets.GeneratorBasedBuilder):
50 |     """The OSCAR dataset."""
51 | 
52 |     BUILDER_CONFIGS = [
53 |         datasets.BuilderConfig(
54 |             name="plain_text",
55 |             description="Plain text",
56 |             version=datasets.Version("1.0.0"),
57 |         )
58 |     ]
59 | 
60 |     def _info(self):
61 |         return datasets.DatasetInfo(
62 |             description=_DESCRIPTION,
63 |             features=datasets.Features({"text": datasets.Value("string")}),
64 |             homepage="https://oscar-corpus.com/",
65 |             citation=_CITATION,
66 |         )
67 | 
68 |     def _split_generators(self, dl_manager):
69 |         dl_dir = dl_manager.download_and_extract(_URL)
70 |         jsonl_file = os.path.join(dl_dir, "oscar-en-10k", "oscar-en-10k.jsonl")
71 |         return [
72 |             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"jsonl_file": jsonl_file}),
73 |         ]
74 | 
75 |     def _generate_examples(self, jsonl_file):
76 |         """Yields examples."""
77 |         with open(jsonl_file, encoding="utf-8") as f:
78 |             idx = 0
79 |             for line in f:
80 |                 rec = json.loads(line)
81 |                 yield idx,  {"text": rec["text"]}
82 |                 idx += 1
83 | 


--------------------------------------------------------------------------------
/debug/torch-distributed-gpu-test.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | 
  5 | This is a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or
  6 | many nodes) can talk to each other via nccl and allocate gpu memory. It also prints other useful information like NUMA affinities.
  7 | 
  8 | To run it you just need to adjust the number of processes and nodes according to your use case:
  9 | 
 10 | ```
 11 | python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
 12 | ```
 13 | 
 14 | You may need to add `--master_addr $MASTER_ADDR --master_port $MASTER_PORT` if using a custom addr:port
 15 | 
 16 | You can also use the rdzv API: `--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d`
 17 | 
 18 | If the script hangs in the `barrier` calls, you likely have some network issues, and you may try to debug this with:
 19 | 
 20 | ```
 21 | NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
 22 | ```
 23 | 
 24 | which should tell you what's going on behind the scenes.
 25 | 
 26 | This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that
 27 | runs on 2 nodes of 8 gpus per node:
 28 | 
 29 | ```
 30 | #!/bin/bash
 31 | #SBATCH --job-name=test-nodes        # name
 32 | #SBATCH --nodes=2                    # EDIT to the number of nodes
 33 | #SBATCH --ntasks-per-node=1          # crucial - only 1 task per node for this script
 34 | #SBATCH --cpus-per-task=10           # EDIT this to how many cpu cores the node has
 35 | #SBATCH --gres=gpu:8                 # EDIT this if it's not an 8-GPUs node setup
 36 | #SBATCH --partition=dev              # EDIT to the desired partition name
 37 | #SBATCH --time 0:05:00               # 5 min should be enough
 38 | #SBATCH --output=%x-%j.out           # output file name
 39 | 
 40 | export GPUS_PER_NODE=8
 41 | export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
 42 | export MASTER_PORT=6000
 43 | 
 44 | srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
 45 | --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
 46 | --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
 47 | torch-distributed-gpu-test.py'
 48 | ```
 49 | 
 50 | You can also add this to the launcher for automatic prefixing of all logs with `[hostname:rank] ` (e.g. after `--master_addr`):
 51 | 
 52 | ```
 53 | --role `hostname -s`: --tee 3
 54 | ```
 55 | 
 56 | """
 57 | 
 58 | import builtins
 59 | import fcntl
 60 | import os
 61 | import socket
 62 | import torch
 63 | import torch.distributed as dist
 64 | 
 65 | def print(*args, **kwargs):
 66 |     """ solves multi-process interleaved print problem """
 67 |     with open(__file__, "r") as fh:
 68 |         fcntl.flock(fh, fcntl.LOCK_EX)
 69 |         try:
 70 |             builtins.print(*args, **kwargs)
 71 |         finally:
 72 |             fcntl.flock(fh, fcntl.LOCK_UN)
 73 | 
 74 | local_rank = int(os.environ["LOCAL_RANK"])
 75 | torch.cuda.set_device(local_rank)
 76 | device = torch.device("cuda", local_rank)
 77 | hostname = socket.gethostname()
 78 | 
 79 | gpu = f"[{hostname}:{local_rank}]"
 80 | 
 81 | try:
 82 |     # XXX: possibly change the dist timeout to something much shorter to get this script to fail
 83 |     # fast if there is a problem and not wait for the default 30min
 84 | 
 85 |     # test distributed
 86 |     dist.init_process_group("nccl")
 87 | 
 88 |     # global rank
 89 |     rank = dist.get_rank()
 90 |     world_size = dist.get_world_size()
 91 | 
 92 |     # reduction test
 93 |     t = torch.ones(1, device=device)
 94 |     dist.all_reduce(t, op=dist.ReduceOp.SUM)
 95 |     dist.barrier()
 96 |     print(f"{gpu} Reduction op=sum result: {t.item()}")
 97 | 
 98 |     # test cuda is available and can allocate memory
 99 |     torch.cuda.is_available()
100 |     torch.ones(1).cuda(local_rank)
101 | 
102 |     print(f"{gpu} is OK (global rank: {rank}/{world_size})")
103 | 
104 |     dist.barrier()
105 |     if rank == 0:
106 |         print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
107 |         print(f"device compute capabilities={torch.cuda.get_device_capability()}")
108 |         print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")
109 | 
110 | except Exception:
111 |     print(f"{gpu} is broken (but it could also mean that it failed because another gpu didn't respond)")
112 |     raise
113 | 
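
As the XXX comment above suggests, to make the script fail fast instead of waiting out the default 30min timeout, you can pass a shorter `timeout` to `init_process_group` - for example (a sketch; pick a value that fits your cluster):

```python
from datetime import timedelta
import torch.distributed as dist

# fail after 5 minutes instead of the default 30 minutes
dist.init_process_group("nccl", timeout=timedelta(minutes=5))
```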


--------------------------------------------------------------------------------
/images/Machine-Learning-Engineering-book-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/images/Machine-Learning-Engineering-book-cover.png


--------------------------------------------------------------------------------
/inference/images/github-vllm-stats-2024-08-24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/inference/images/github-vllm-stats-2024-08-24.png


--------------------------------------------------------------------------------
/inference/images/infer-kv-cache.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/inference/images/infer-kv-cache.png


--------------------------------------------------------------------------------
/inference/images/mha-gqa-mqa-mla.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/inference/images/mha-gqa-mqa-mla.png


--------------------------------------------------------------------------------
/inference/images/softmax-temperature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/inference/images/softmax-temperature.png


--------------------------------------------------------------------------------
/insights/images/640px-Baureihe52Heizer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/insights/images/640px-Baureihe52Heizer.jpg


--------------------------------------------------------------------------------
/model-parallelism/README.md:
--------------------------------------------------------------------------------
1 | ## Moved
2 | 
3 | **Moved to [here](../training/model-parallelism/).**
4 | 


--------------------------------------------------------------------------------
/network/benchmarks/all_gather_object_vs_all_gather.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #
 4 | # all_gather to gather counts across process group is ~5x faster than the same via all_gather_object
 5 | #
 6 | # python -m torch.distributed.run --nproc_per_node 2 all_gather_object_vs_all_gather.py
 7 | #
 8 | # XXX: in this case the benchmark isn't the most representative since there is almost no data, so
 9 | # the overhead of code is huge, shouldn't be as big for bigger data. But I wanted to compare
10 | # all_gather to all_gather_object and used the same setup as all_gather_object_vs_all_reduce.py as
11 | # the base for the benchmark. Probably need to rework it.
12 | #
13 | # all_gather_object=0.2697904680026113
14 | # all_gather_object=0.26981512399652274
15 | # all_gather       =0.05322460600291379
16 | # all_gather       =0.05485054099699482
17 | 
18 | import torch.distributed as dist
19 | import torch
20 | import os
21 | 
22 | local_rank = int(os.environ["LOCAL_RANK"])
23 | torch.cuda.set_device(local_rank)
24 | dist.init_process_group("nccl")
25 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
26 | 
27 | world_size = dist.get_world_size()
28 | rank = dist.get_rank()
29 | 
30 | flag_pt = torch.tensor(1.0, device=device)
31 | flag_py = 1
32 | 
33 | def all_gather_object():
34 |     output_objects = [None for _ in range(world_size)]
35 |     dist.all_gather_object(output_objects, flag_py)
36 |     flag = sum(output_objects)
37 |     return flag
38 | 
39 | def all_gather():
40 |     tensor_list = [torch.zeros(1, dtype=torch.float, device=device) for _ in range(world_size)]
41 |     dist.all_gather(tensor_list, flag_pt)
42 |     return tensor_list
43 | 
44 | # test
45 | print(f"all_gather_object: {all_gather_object()}\n")
46 | print(f"all_gather: {all_gather()}\n")
47 | 
48 | import timeit
49 | print(f'all_gather_object={timeit.Timer("all_gather_object()", globals=globals()).timeit(number=1000)}')
50 | print(f'all_gather       ={timeit.Timer("all_gather()"       , globals=globals()).timeit(number=1000)}')
51 | 


--------------------------------------------------------------------------------
/network/benchmarks/all_gather_object_vs_all_reduce.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | #
 4 | # all_reduce to gather counts across process group is 23x faster than the same via all_gather_object
 5 | #
 6 | # python -m torch.distributed.run --nproc_per_node 2 all_gather_object_vs_all_reduce.py
 7 | #
 8 | # all_gather_object=0.26279118900129106
 9 | # all_gather_object=0.2628160299973388
10 | # all_reduce       =0.011241967000387376
11 | # all_reduce       =0.011610440000367817
12 | 
13 | import torch.distributed as dist
14 | import torch
15 | import os
16 | 
17 | local_rank = int(os.environ["LOCAL_RANK"])
18 | torch.cuda.set_device(local_rank)
19 | dist.init_process_group("nccl")
20 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21 | 
22 | world_size = dist.get_world_size()
23 | rank = dist.get_rank()
24 | 
25 | flag_pt = torch.tensor(1.0, device=device)
26 | flag_py = 1
27 | 
28 | def all_gather_object():
29 |     output_objects = [None for _ in range(world_size)]
30 |     dist.all_gather_object(output_objects, flag_py)
31 |     flag = sum(output_objects)
32 |     return flag
33 | 
34 | def all_reduce():
35 |     dist.all_reduce(flag_pt, op=dist.ReduceOp.SUM)
36 |     return flag_pt
37 | 
38 | # test
39 | print(f"all_gather_object: {all_gather_object()}\n")
40 | print(f"all_reduce: {all_reduce()}\n")
41 | 
42 | import timeit
43 | print(f'all_gather_object={timeit.Timer("all_gather_object()", globals=globals()).timeit(number=1000)}')
44 | print(f'all_reduce       ={timeit.Timer("all_reduce()"       , globals=globals()).timeit(number=1000)}')
45 | 


--------------------------------------------------------------------------------
/network/benchmarks/all_reduce_bench_pyxis.sbatch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=all_reduce_bench_pyxis
 3 | #SBATCH --nodes=2
 4 | #SBATCH --ntasks-per-node=1
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --time=01:00:00
 7 | 
 8 | # Set up environment variables for torchrun
 9 | GPUS_PER_NODE=8
10 | NNODES=$SLURM_NNODES
11 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
12 | MASTER_PORT=6000
13 | 
14 | srun --container-image=nvcr.io#nvidia/pytorch:25.08-py3 \
15 |      --container-mounts=$PWD:/workspace \
16 |      python -u -m torch.distributed.run \
17 |          --nproc_per_node $GPUS_PER_NODE \
18 |          --nnodes $NNODES \
19 |          --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
20 |          --rdzv_backend c10d \
21 |          --max_restarts 0 \
22 |          --role `hostname -s`':' \
23 |          --tee 3 \
24 |          all_reduce_bench.py
25 | 


--------------------------------------------------------------------------------
/network/benchmarks/all_reduce_latency_comp.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # this is derived from the all_reduce_bench.py
 4 | # but adjusted to show how 1x 4GB reduction is much faster than 1000x 4MB reduction
 5 | #
 6 | # to run on 8 gpus:
 7 | # python -u -m torch.distributed.run --nproc_per_node=8 all_reduce_latency_comp.py
 8 | 
 9 | import os
10 | import socket
11 | import torch
12 | import torch.distributed as dist
13 | 
14 | TRIALS = 1
15 | 
16 | # these emulate the payload which will become a M * N * 4-sized tensor below
17 | N = 500000
18 | M = 2000
19 | 
20 | def timed_allreduce(mat, repeat_times, id, start_event, end_event):
21 |     start_event.record()
22 |     for i in range(repeat_times):
23 |         dist.all_reduce(mat)
24 |     end_event.record()
25 | 
26 |     torch.cuda.synchronize()
27 |     duration = start_event.elapsed_time(end_event) / 1000
28 | 
29 |     size = M * N * 4 # 4 is fp32
30 |     algbw = (size / duration) * 8 # 8 is bytes to bits
31 |     n = dist.get_world_size()
32 |     # the 2*(n-1)/n busbw correction factor specific to all-reduce is explained here:
33 |     # https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md#allreduce
34 |     # busbw reflects how optimally the hardware is used
35 |     busbw = algbw * (2*(n - 1) / n)
36 | 
37 |     # gather all data on global-rank-0 and print the results from there to avoid interleaved prints
38 |     data = [id, duration, algbw, busbw]
39 |     output = [None for _ in range(dist.get_world_size())] if dist.get_rank() == 0 else None
40 |     dist.gather_object(data, output, dst=0)
41 |     if dist.get_rank() == 0:
42 |         for data in output:
43 |             id, duration, algbw, busbw = data
44 |             print(f"{id}:\n",
45 |                   f"duration: {duration:.3f} sec\n",
46 |                   f"algbw: {algbw/1e9:.3f} Gbps\n",
47 |                   f"busbw: {busbw / 1e9:.3f} Gbps"
48 |             )
49 | 
50 | 
51 | 
52 | def run(local_rank):
53 |     hostname = socket.gethostname()
54 |     id = f"{hostname}:{local_rank}"
55 |     global_rank = dist.get_rank()
56 | 
57 |     chunks = 1000
58 |     mat1 = torch.rand(N, M, dtype=torch.float32).cuda(local_rank)
59 |     mat2 = torch.rand(int(N/chunks), M, dtype=torch.float32).cuda(local_rank)
60 | 
61 |     start_event = torch.cuda.Event(enable_timing=True)
62 |     end_event = torch.cuda.Event(enable_timing=True)
63 |     for i in range(TRIALS):
64 |         dist.barrier()
65 | 
66 |         if global_rank == 0:
67 |             print(f"\n\n\n----------- 1x {N*M*4/1e9}GB ----------------")
68 |         timed_allreduce(mat1, 1, id, start_event, end_event)
69 | 
70 |         if global_rank == 0:
71 |             print(f"\n\n\n----------- {chunks}x {(N*M*4/chunks)/1e9}GB ----------------")
72 |         timed_allreduce(mat2, chunks, id, start_event, end_event)
73 | 
74 | def init_processes(local_rank, fn, backend='nccl'):
75 |     torch.cuda.set_device(local_rank)
76 |     dist.init_process_group(backend)
77 |     fn(local_rank)
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     local_rank = int(os.environ["LOCAL_RANK"])
82 |     print("local_rank: %d" % local_rank)
83 |     init_processes(local_rank=local_rank, fn=run)
84 | 
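
To make the algbw/busbw math above concrete, here is a worked example with a made-up duration (the 1.75 factor is simply 2*(n-1)/n for n=8 ranks):

```python
n = 8                              # world size
size = 500_000 * 2_000 * 4         # the 4GB fp32 payload above, in bytes
duration = 0.05                    # seconds - an illustrative made-up measurement
algbw = size / duration * 8        # bytes/s -> bits/s: 640e9, i.e. 640 Gbps
busbw = algbw * (2 * (n - 1) / n)  # 640 * 1.75 = 1120 Gbps
print(f"algbw: {algbw/1e9:.0f} Gbps, busbw: {busbw/1e9:.0f} Gbps")
```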


--------------------------------------------------------------------------------
/network/benchmarks/images/all-reduce-bench-plot-4n.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/benchmarks/images/all-reduce-bench-plot-4n.png


--------------------------------------------------------------------------------
/network/benchmarks/images/all-reduce-multi-node-bandwidth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/benchmarks/images/all-reduce-multi-node-bandwidth.png


--------------------------------------------------------------------------------
/network/benchmarks/results/README.md:
--------------------------------------------------------------------------------
1 | # Network Benchmarks Results
2 | 
3 | - [Disabling NVLink](disable-nvlink.md)
4 | 


--------------------------------------------------------------------------------
/network/benchmarks/results/disable-nvlink.md:
--------------------------------------------------------------------------------
 1 | # Disabling NVLink Benchmark
 2 | 
 3 | Let's compare the training of a gpt2 language model over a small sample of wikitext.
 4 | 
 5 | The results are:
 6 | 
 7 | | NVlink | Time |
 8 | | -----  | ---: |
 9 | | Y      | 101s |
10 | | N      | 131s |
11 | 
12 | You can see that training with NVLink completes ~23% faster (101.9s vs 131.4s). In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink, so the communication falls back to PCIe instead.
13 | 
14 | We will use [HF Transformers examples](https://github.com/huggingface/transformers/blob/58e3d23e97078f361a533b9ec4a6a2de674ea52a/examples/pytorch/language-modeling/run_clm.py).
15 | 
16 | Here is the full benchmark code and outputs:
17 | 
18 | ```bash
19 | # DDP w/ NVLink
20 | 
21 | rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
22 | --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
23 | --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
24 | --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
25 | 
26 | {'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
27 | 
28 | # DDP w/o NVLink
29 | 
30 | rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
31 | --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
32 | --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
33 | --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
34 | 
35 | {'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
36 | ```
37 | 
38 | - Hardware: 2x TITAN RTX 24GB each + NVLink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`)
39 | - Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
40 | 


--------------------------------------------------------------------------------
/network/images/all-reduce-bw-2025.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/all-reduce-bw-2025.png


--------------------------------------------------------------------------------
/network/images/all-reduce-collective.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/all-reduce-collective.png


--------------------------------------------------------------------------------
/network/images/all-reduce-ring-chunk1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/all-reduce-ring-chunk1.png


--------------------------------------------------------------------------------
/network/images/all-reduce-ring-chunk2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/all-reduce-ring-chunk2.png


--------------------------------------------------------------------------------
/network/images/all-to-all-bw-2025.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/all-to-all-bw-2025.png


--------------------------------------------------------------------------------
/network/images/amd-infinity-arch-MI300X.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/amd-infinity-arch-MI300X.png


--------------------------------------------------------------------------------
/network/images/broadcast-ring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/broadcast-ring.png


--------------------------------------------------------------------------------
/network/images/ccgrid11-low-level-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/ccgrid11-low-level-latency.png


--------------------------------------------------------------------------------
/network/images/ccgrid11-uni-direction-bandwidth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/ccgrid11-uni-direction-bandwidth.png


--------------------------------------------------------------------------------
/network/images/collective-all-gather-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-all-gather-1.png


--------------------------------------------------------------------------------
/network/images/collective-all-gather-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-all-gather-2.png


--------------------------------------------------------------------------------
/network/images/collective-all-reduce-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-all-reduce-1.png


--------------------------------------------------------------------------------
/network/images/collective-all-reduce-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-all-reduce-2.png


--------------------------------------------------------------------------------
/network/images/collective-all-to-all-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-all-to-all-1.png


--------------------------------------------------------------------------------
/network/images/collective-all-to-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-all-to-all.png


--------------------------------------------------------------------------------
/network/images/collective-broadcast-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-broadcast-1.png


--------------------------------------------------------------------------------
/network/images/collective-broadcast-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-broadcast-2.png


--------------------------------------------------------------------------------
/network/images/collective-gather-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-gather-1.png


--------------------------------------------------------------------------------
/network/images/collective-gather-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-gather-2.png


--------------------------------------------------------------------------------
/network/images/collective-reduce-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-reduce-1.png


--------------------------------------------------------------------------------
/network/images/collective-reduce-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-reduce-2.png


--------------------------------------------------------------------------------
/network/images/collective-reduce-scatter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-reduce-scatter.png


--------------------------------------------------------------------------------
/network/images/collective-scatter-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-scatter-1.png


--------------------------------------------------------------------------------
/network/images/collective-scatter-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/collective-scatter-2.png


--------------------------------------------------------------------------------
/network/images/nccl-all-reduce-scan-nvlstree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/nccl-all-reduce-scan-nvlstree.png


--------------------------------------------------------------------------------
/network/images/nccl-all-reduce-scan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/network/images/nccl-all-reduce-scan.png


--------------------------------------------------------------------------------
/orchestration/README.md:
--------------------------------------------------------------------------------
 1 | # Orchestration
 2 | 
 3 | There are many container/accelerator orchestration solutions - many of which are open source.
 4 | 
 5 | So far I have been working with SLURM:
 6 | 
 7 | - [SLURM](slurm/) - Simple Linux Utility for Resource Management, which you're practically guaranteed to find on most HPC environments, and it's typically supported by most cloud providers. It has been around for more than two decades.
 8 | - SLURM on Kubernetes: [Slinky](https://github.com/stas00/ml-engineering/pull/99) - a recently created framework for running SLURM on top of Kubernetes.
 9 | 
10 | The other most popular orchestrator is Kubernetes:
11 | 
12 | - [Kubernetes](https://kubernetes.io/) - also known as K8s, is an open source system for automating deployment, scaling, and management of containerized applications. Here is a good [comparison between SLURM and K8s](https://web.archive.org/web/20250324222116/https://www.fluidstack.io/post/is-kubernetes-or-slurm-the-best-orchestrator-for-512-gpu-jobs).
13 | 
14 | Here are various other less popular, but still very mighty orchestration solutions:
15 | 
16 | - [dstack](https://github.com/dstackai/dstack) is a lightweight, open-source alternative to Kubernetes & Slurm, simplifying AI container orchestration with multi-cloud & on-prem support. It natively supports NVIDIA, AMD, & TPU.
17 | - [SkyPilot](https://github.com/skypilot-org/skypilot) is a framework for running AI and batch workloads on any infra, offering unified execution, high cost savings, and high GPU availability.
18 | - [OpenHPC](https://github.com/openhpc/ohpc) provides a variety of common, pre-built ingredients required to deploy and manage an HPC Linux cluster including provisioning tools, resource management, I/O clients, runtimes, development tools, containers, and a variety of scientific libraries.
19 | - [run.ai](https://www.run.ai/) - got acquired by NVIDIA and is planned to be open sourced soon.
20 | - [Docker Swarm](https://docs.docker.com/engine/swarm/) is a container orchestration tool.
21 | - [IBM Platform Load Sharing Facility (LSF)](https://www.ibm.com/products/hpc-workload-management) Suites is a workload management platform and job scheduler for distributed high performance computing (HPC).
22 | 


--------------------------------------------------------------------------------
/orchestration/slurm/README.md:
--------------------------------------------------------------------------------
 1 | # Working in SLURM Environment
 2 | 
 3 | Unless you're lucky and have a dedicated cluster that is completely under your control, chances are that you will have to use SLURM to timeshare the GPUs with others. And even if you train at an HPC facility and are given a dedicated partition, you will still have to use SLURM.
 4 | 
 5 | The SLURM abbreviation stands for: **Simple Linux Utility for Resource Management** - though now it's called
 6 | The Slurm Workload Manager. It is a free and open-source job scheduler for Linux and Unix-like kernels, used by many of the world's supercomputers and computer clusters.
 7 | 
 8 | These chapters will not try to exhaustively teach you SLURM as there are many manuals out there, but will cover some specific nuances that are useful to help in the training process.
 9 | 
10 | - [SLURM For Users](./users.md) - everything you need to know to do your training in the SLURM environment.
11 | - [SLURM Administration](./admin.md) - if you're unlucky enough to also have to manage the SLURM cluster besides using it, this document contains a growing list of recipes to help you get things done faster.
12 | - [Performance](./performance.md) - SLURM performance nuances.
13 | - [Launcher scripts](./launchers) - how to launch with `torchrun`, `accelerate`, pytorch-lightning, etc. in the SLURM environment
14 | 


--------------------------------------------------------------------------------
/orchestration/slurm/admin.md:
--------------------------------------------------------------------------------
  1 | # SLURM Administration
  2 | 
  3 | 
  4 | ## Run a command on multiple nodes
  5 | 
  6 | 1. to avoid being prompted with:
  7 | ```
  8 | Are you sure you want to continue connecting (yes/no/[fingerprint])?
  9 | ```
 10 | for every new node you haven't logged into yet, you can disable this check with:
 11 | ```
 12 | echo "Host *" >> ~/.ssh/config
 13 | echo "  StrictHostKeyChecking no" >> ~/.ssh/config
 14 | ```
 15 | 
 16 | Of course, check if that's secure enough for your needs. I'm making an assumption that you're already on the SLURM cluster and you're not ssh'ing outside of your cluster. You can choose not to set this and then you will have to manually approve each new node.
 17 | 
 18 | 2. Install `pdsh`
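On Debian/Ubuntu-based systems this is typically just the following (adapt to your distribution's package manager):

```
sudo apt install pdsh
```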
 19 | 
 20 | You can now run the desired command on multiple nodes.
 21 | 
 22 | For example, let's run `date`:
 23 | 
 24 | ```
 25 | $ PDSH_RCMD_TYPE=ssh pdsh -w node-[21,23-26] date
 26 | node-25: Sat Oct 14 02:10:01 UTC 2023
 27 | node-21: Sat Oct 14 02:10:02 UTC 2023
 28 | node-23: Sat Oct 14 02:10:02 UTC 2023
 29 | node-24: Sat Oct 14 02:10:02 UTC 2023
 30 | node-26: Sat Oct 14 02:10:02 UTC 2023
 31 | ```
 32 | 
 33 | Let's do something more useful and complex. Let's kill all GPU-tied processes that didn't exit when the SLURM job was cancelled:
 34 | 
 35 | First, this command will give us all process ids that tie up the GPUs:
 36 | 
 37 | ```
 38 | nvidia-smi --query-compute-apps=pid --format=csv,noheader | sort | uniq
 39 | ```
 40 | 
 41 | So we can now kill all those processes in one swoop:
 42 | 
 43 | ```
 44 |  PDSH_RCMD_TYPE=ssh pdsh -w node-[21,23-26]  "nvidia-smi --query-compute-apps=pid --format=csv,noheader | sort | uniq | xargs -n1 sudo kill -9"
 45 | ```
 46 | 
 47 | 
 48 | ## Slurm settings
 49 | 
 50 | Show the slurm settings:
 51 | 
 52 | ```
 53 | sudo scontrol show config
 54 | ```
 55 | 
 56 | The config file is `/etc/slurm/slurm.conf` on the slurm controller node.
 57 | 
 58 | Once `slurm.conf` has been updated, reload the config by running:
 59 | ```
 60 | sudo scontrol reconfigure
 61 | ```
 62 | from the controller node.
 63 | 
 64 | 
 65 | 
 66 | ## Auto-reboot
 67 | 
 68 | If the nodes need to be rebooted safely (e.g. if the image has been updated), adapt the list of nodes and run:
 69 | 
 70 | ```
 71 | scontrol reboot ASAP node-[1-64]
 72 | ```
 73 | 
 74 | For each of the non-idle nodes this command will wait till the current job ends, then reboot the node and bring it back up to `idle`.
 75 | 
 76 | Note that you need to have:
 77 | ```
 78 | RebootProgram = "/sbin/reboot"
 79 | ```
 80 | set in `/etc/slurm/slurm.conf` on the controller node for this to work (and reconfigure the SLURM daemon if you have just added this entry to the config file).
 81 | 
 82 | 
 83 | ## Changing the state of the node
 84 | 
 85 | The change is performed by `scontrol update`
 86 | 
 87 | Examples:
 88 | 
 89 | To undrain a node that is ready to be used:
 90 | ```
 91 | scontrol update nodename=node-5 state=idle
 92 | ```
 93 | 
 94 | To remove a node from the SLURM's pool:
 95 | ```
 96 | scontrol update nodename=node-5 state=drain
 97 | ```
 98 | 
 99 | 
100 | ## Undrain nodes killed due to slow process exit
101 | 
102 | Sometimes processes are slow to exit when a job has been cancelled. If SLURM was configured not to wait forever, it'll automatically drain such nodes. But there is no reason for those nodes not to be available to users.
103 | 
104 | So here is how to automate it.
105 | 
106 | The key is to get the list of nodes that are drained due to `"Kill task failed"`, which is retrieved with:
107 | 
108 | ```
109 | sinfo -R | grep "Kill task failed"
110 | ```
111 | 
112 | Now extract and expand the list of nodes, check that the nodes are indeed free of user processes (or try to kill those processes first) and then undrain them.
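For example, the extraction and expansion step can be done with something like this (assuming node names of the form `node-*`, as in the script below):

```
# turn the `sinfo -R` output into a flat list of individual hostnames
sinfo -R | grep "Kill task failed" | perl -lne '/(node-.*[\d\]]+)/ && print $1' | xargs -n1 scontrol show hostnames
```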
113 | 
114 | Earlier you learned how to [run a command on multiple nodes](#run-a-command-on-multiple-nodes) which we will use in this script.
115 | 
116 | Here is the script that does all that work for you: [undrain-good-nodes.sh](./undrain-good-nodes.sh)
117 | 
118 | Now you can just run this script and any nodes that are ready to serve but are currently drained will be switched to the `idle` state and become available to users.
119 | 
120 | 
121 | ## Modify a job's timelimit
122 | 
123 | To set a new timelimit on a job, e.g., 2 days:
124 | ```
125 | scontrol update JobID=$SLURM_JOB_ID TimeLimit=2-00:00:00
126 | ```
127 | 
128 | To add additional time to the previous setting, e.g. 10 more hours:
129 | ```
130 | scontrol update JobID=$SLURM_JOB_ID TimeLimit=+10:00:00
131 | ```
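To verify the outcome you can query the job's time limit and remaining time, e.g. (`%l` is the time limit, `%L` the time left):

```
squeue -j $SLURM_JOB_ID -o "%l %L"
```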
132 | 
133 | ## When something goes wrong with SLURM
134 | 
135 | Analyze the event log in SLURM's log file:
136 | ```
137 | sudo cat /var/log/slurm/slurmctld.log
138 | ```
139 | 
140 | This, for example, can help you understand why a certain node got its jobs cancelled before their time limit or why the node got removed completely.
141 | 


--------------------------------------------------------------------------------
/orchestration/slurm/cron-daily.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=cron-daily        # job name
 3 | #SBATCH --ntasks=1                   # number of MP tasks
 4 | #SBATCH --nodes=1
 5 | #SBATCH --hint=nomultithread         # we get physical cores not logical
 6 | #SBATCH --time=0:30:00               # maximum execution time (HH:MM:SS)
 7 | #SBATCH --output=%x-%j.out           # output file name
 8 | #SBATCH --partition=PARTITION     # edit me
 9 | #SBATCH --account=GROUP@PARTITION # edit me
10 | 
11 | # do not set -e - we must run all of it
12 | # set -x -e
13 | 
14 | cd $WORK/cron/scheduler
15 | 
16 | # ensure to restart self first
17 | sbatch --begin=now+24hour cron-daily.slurm
18 | 
19 | # now launch any slurm scripts in cron.daily
20 | cd $WORK/cron/cron.daily
21 | for f in *.slurm; do
22 |   sbatch "$f"
23 | done
24 | 


--------------------------------------------------------------------------------
/orchestration/slurm/cron-hourly.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=cron-hourly       # job name
 3 | #SBATCH --ntasks=1                   # number of MP tasks
 4 | #SBATCH --nodes=1
 5 | #SBATCH --hint=nomultithread         # we get physical cores not logical
 6 | #SBATCH --time=0:30:00               # maximum execution time (HH:MM:SS)
 7 | #SBATCH --output=%x-%j.out           # output file name
 8 | #SBATCH --partition=PARTITION     # edit me
 9 | #SBATCH --account=GROUP@PARTITION # edit me
10 | 
11 | # do not set -e - we must run all of it
12 | # set -x -e
13 | 
14 | cd $WORK/cron/scheduler
15 | 
16 | # ensure to restart self first
17 | sbatch --begin=now+1hour cron-hourly.slurm
18 | 
19 | # now launch any slurm scripts in cron.hourly
20 | cd $WORK/cron/cron.hourly
21 | for f in *.slurm; do
22 |   sbatch "$f"
23 | done
24 | 


--------------------------------------------------------------------------------
/orchestration/slurm/example.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this is a 2 node slurm job example, you will most likely need to adapt --cpus-per-task and --partition
 4 | 
 5 | #SBATCH --job-name=example-job
 6 | #SBATCH --nodes=2
 7 | #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 8 | #SBATCH --cpus-per-task=96
 9 | #SBATCH --gres=gpu:8
10 | #SBATCH --time=0:10:00
11 | #SBATCH --exclusive
12 | #SBATCH --partition=xyz-cluster
13 | #SBATCH --output=%x-%j.out
14 | 
15 | 
16 | set -x -e
17 | 
18 | # CHANGE HERE THE CONDA ENV AND ANY STARTUP SCRIPTS
19 | source /path/to/start-xxx-user # if you have something to preload before the job
20 | conda activate stas-xxx        # if you have conda env to activate
21 | 
22 | echo "START TIME: $(date)"
23 | 
24 | # CHANGE TO CUMULATIVELY LOG OUTPUTS
25 | LOG_PATH="main_log.txt"
26 | 
27 | GPUS_PER_NODE=8
28 | NNODES=$SLURM_NNODES
29 | 
30 | # so processes know who to talk to
31 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
32 | MASTER_PORT=6000
33 | 
34 | # OTHER LAUNCHERS CAN BE USED HERE
35 | export LAUNCHER="python -u -m torch.distributed.run \
36 |     --nproc_per_node $GPUS_PER_NODE \
37 |     --nnodes $NNODES \
38 |     --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
39 |     --rdzv_backend c10d \
40 |     --max_restarts 0 \
41 |     --role `hostname -s`: \
42 |     --tee 3 \
43 |     "
44 | 
45 | # CHANGE HERE THE SCRIPT AND WHATEVER ARGS IT NEEDS
46 | CMD="\
47 | torch-distributed-gpu-test.py \
48 | "
49 | 
50 | echo $CMD
51 | 
52 | # hide duplicated errors using this hack - will be properly fixed in pt-1.12
53 | # export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
54 | 
55 | # force crashing on nccl issues like hanging broadcast
56 | export NCCL_ASYNC_ERROR_HANDLING=1
57 | # export NCCL_DEBUG=INFO
58 | # export NCCL_DEBUG_SUBSYS=COLL
59 | # export NCCL_SOCKET_NTHREADS=1
60 | # export NCCL_NSOCKS_PERTHREAD=1
61 | # export CUDA_LAUNCH_BLOCKING=1
62 | 
63 | # srun error handling:
64 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
65 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
66 | SRUN_ARGS=" \
67 |     --wait=60 \
68 |     --kill-on-bad-exit=1 \
69 |     "
70 | 
71 | # py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
72 | clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee -a $LOG_PATH
73 | 
74 | echo "END TIME: $(date)"
75 | 


--------------------------------------------------------------------------------
/orchestration/slurm/launchers/README.md:
--------------------------------------------------------------------------------
 1 | # Single and Multi-node Launchers with SLURM
 2 | 
 3 | The following are complete SLURM scripts that demonstrate how to integrate various launchers with software that uses `torch.distributed` (but should be easily adaptable to other distributed environments).
 4 | 
 5 | - [torchrun](torchrun-launcher.slurm) - to be used with [PyTorch distributed](https://github.com/pytorch/pytorch).
 6 | - [accelerate](accelerate-launcher.slurm) - to be used with [HF Accelerate](https://github.com/huggingface/accelerate).
 7 | - [lightning](lightning-launcher.slurm) - to be used with [Lightning](https://lightning.ai/) (“PyTorch Lightning” and “Lightning Fabric”).
 8 | - [srun](srun-launcher.slurm) - to be used with the native SLURM launcher - here we have to manually preset env vars that `torch.distributed` expects.
 9 | 
10 | All of these scripts use [torch-distributed-gpu-test.py](../../../debug/torch-distributed-gpu-test.py) as the demo script, which you can copy here with just:
11 | ```
12 | cp ../../../debug/torch-distributed-gpu-test.py .
13 | ```
14 | assuming you cloned this repo. But you can replace it with anything else you need.
15 | 


--------------------------------------------------------------------------------
/orchestration/slurm/launchers/accelerate-launcher.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this is a 2 node SLURM script using `accelerate` launcher
 4 | # Important: you will need to adapt settings where you see EDIT in the comments
 5 | 
 6 | #SBATCH --job-name=accelerate-launcher
 7 | #SBATCH --nodes=2
 8 | #SBATCH --ntasks-per-node=1          # crucial - only 1 task per node
 9 | #SBATCH --cpus-per-task=96           # EDIT this to how many cpu cores the node has
10 | #SBATCH --gres=gpu:8                 # EDIT this if it's not 8-gpus per node
11 | #SBATCH --time=0:10:00               # EDIT the desired runtime
12 | #SBATCH --exclusive
13 | #SBATCH --partition=xyz-cluster      # EDIT to the desired partition name
14 | #SBATCH --output=%x-%j.out
15 | 
16 | echo "START TIME: $(date)"
17 | 
18 | # auto-fail on any errors in this script
19 | set -eo pipefail
20 | 
21 | # logging script's variables/commands for future debug needs
22 | set -x
23 | 
24 | # EDIT the conda env and any startup scripts
25 | # source /path/to/start-xxx-user # if you have something to preload before the job
26 | # conda activate stas-xxx        # if you have conda env to activate
27 | 
28 | LOG_PATH="main_log.txt"
29 | 
30 | # EDIT the path to accelerate config file and fill it with actual Accelerate config
31 | ACCELERATE_CONFIG_FILE=accelerate.yaml
32 | 
33 | # EDIT if it's not 8-gpus per node
34 | GPUS_PER_NODE=8
35 | NNODES=$SLURM_NNODES
36 | NUM_PROCESSES=$(($NNODES * $GPUS_PER_NODE))
37 | 
38 | # define the node 0 hostname:port
39 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
40 | MASTER_PORT=6000
41 | 
42 | # note `\$SLURM_PROCID` we don't want it interpolated till `srun` since otherwise all nodes will get
43 | # 0 and the launcher will hang
44 | #
45 | # same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
46 | LAUNCHER="python -u -m accelerate.commands.launch \
47 |     --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
48 |     --config_file $ACCELERATE_CONFIG_FILE \
49 |     --num_processes $NUM_PROCESSES \
50 |     --num_machines $NNODES \
51 |     --main_process_ip $MASTER_ADDR \
52 |     --main_process_port $MASTER_PORT \
53 |     --machine_rank \$SLURM_PROCID \
54 |     --role \$(hostname -s|tr -dc '0-9'): --tee 3 \
55 |     "
56 | 
57 | # EDIT the path+name of the python script and whatever args it needs
58 | PROGRAM="torch-distributed-gpu-test.py"
59 | 
60 | export CMD="$LAUNCHER $PROGRAM"
61 | 
62 | echo $CMD
63 | 
64 | # EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
65 | # export TMPDIR=/scratch
66 | 
67 | # EDIT: useful for debug if needed
68 | #
69 | # to debug NCCL issues
70 | # export NCCL_DEBUG=INFO
71 | #
72 | # to unravel async errors w/o the correct traceback - potentially makes everything much slower
73 | # export CUDA_LAUNCH_BLOCKING=1
74 | #
75 | # to force crashing on nccl issues like hanging broadcast
76 | # export NCCL_ASYNC_ERROR_HANDLING=1
77 | 
78 | # srun error handling:
79 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
80 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
81 | SRUN_ARGS=" \
82 |     --wait=60 \
83 |     --kill-on-bad-exit=1 \
84 |     --jobid $SLURM_JOB_ID \
85 |     "
86 | 
87 | # bash -c is needed for the delayed interpolation of env vars to work
88 | srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH
89 | 
90 | echo "END TIME: $(date)"
91 | 


--------------------------------------------------------------------------------
/orchestration/slurm/launchers/lightning-launcher.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this is a 2 node SLURM script for launching Lightning-based programs
 4 | # Important: you will need to adapt settings where you see EDIT in the comments
 5 | 
 6 | #SBATCH --job-name=lightning-launcher
 7 | #SBATCH --nodes=2
 8 | #SBATCH --ntasks-per-node=8          # EDIT if it's not 8-gpus per node
 9 | #SBATCH --cpus-per-task=12           # EDIT this to how many cpu cores the node has divided by num of gpus
10 | #SBATCH --gres=gpu:8                 # EDIT this if it's not 8-gpus per node
11 | #SBATCH --time=0:10:00               # EDIT the desired runtime
12 | #SBATCH --exclusive
13 | #SBATCH --partition=xyz-cluster      # EDIT to the desired partition name
14 | #SBATCH --output=%x-%j.out
15 | 
16 | echo "START TIME: $(date)"
17 | 
18 | # auto-fail on any errors in this script
19 | set -eo pipefail
20 | 
21 | # logging script's variables/commands for future debug needs
22 | set -x
23 | 
24 | # EDIT the conda env and any startup scripts
25 | # source /path/to/start-xxx-user # if you have something to preload before the job
26 | # conda activate stas-xxx        # if you have conda env to activate
27 | 
28 | LOG_PATH="main_log.txt"
29 | 
30 | # PTL doesn't need a special launcher
31 | LAUNCHER="python -u"
32 | 
33 | # EDIT the path+name of the python script and whatever args it needs
34 | PROGRAM="torch-distributed-gpu-test.py"
35 | 
36 | export CMD="$LAUNCHER $PROGRAM"
37 | 
38 | echo $CMD
39 | 
40 | # EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
41 | # export TMPDIR=/scratch
42 | 
43 | # EDIT: useful for debug if needed
44 | #
45 | # to debug NCCL issues
46 | # export NCCL_DEBUG=INFO
47 | #
48 | # to unravel async errors w/o the correct traceback - potentially makes everything much slower
49 | # export CUDA_LAUNCH_BLOCKING=1
50 | #
51 | # to force crashing on nccl issues like hanging broadcast
52 | # export NCCL_ASYNC_ERROR_HANDLING=1
53 | 
54 | # srun error handling:
55 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
56 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
57 | SRUN_ARGS=" \
58 |     --wait=60 \
59 |     --kill-on-bad-exit=1 \
60 |     --jobid $SLURM_JOB_ID \
61 |     "
62 | 
63 | # bash -c is needed for the delayed interpolation of env vars to work
64 | srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH
65 | 
66 | echo "END TIME: $(date)"
67 | 


--------------------------------------------------------------------------------
/orchestration/slurm/launchers/srun-launcher.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this is a 2 node SLURM script for launching srun-based programs
 4 | # Important: you will need to adapt settings where you see EDIT in the comments
 5 | 
 6 | #SBATCH --job-name=srun-launcher
 7 | #SBATCH --nodes=2
 8 | #SBATCH --ntasks-per-node=8          # EDIT this has to match the number of GPUs per node
 9 | #SBATCH --cpus-per-task=10           # EDIT how many cpu cores per task (total-cores/tasks-per-node)
10 | #SBATCH --gres=gpu:8                 # EDIT this if it's not 8-gpus per node
11 | #SBATCH --time=0:10:00               # EDIT the desired runtime
12 | #SBATCH --exclusive
13 | #SBATCH --partition=xyz-cluster      # EDIT to the desired partition name
14 | #SBATCH --output=%x-%j.out
15 | 
16 | 
17 | echo "START TIME: $(date)"
18 | 
19 | # auto-fail on any errors in this script
20 | set -eo pipefail
21 | 
22 | # logging script's variables/commands for future debug needs
23 | set -x
24 | 
25 | # EDIT the conda env and any startup scripts
26 | # source /path/to/start-xxx-user # if you have something to preload before the job
27 | # conda activate stas-xxx        # if you have conda env to activate
28 | 
29 | LOG_PATH="main_log.txt"
30 | 
31 | # we are preparing for torch.distributed programs so it wants:
32 | # - MASTER_ADDR, MASTER_PORT, WORLD_SIZE - already known before `srun`
33 | # - RANK, LOCAL_RANK - will set at `srun` command
34 | export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
35 | export MASTER_PORT=6000
36 | export WORLD_SIZE=$SLURM_NPROCS
37 | 
38 | # srun acts as the launcher in this case, so just `python` is enough.
39 | LAUNCHER="python -u"
40 | 
41 | # EDIT the path+name of the python script and whatever args it needs
42 | PROGRAM="torch-distributed-gpu-test.py"
43 | 
44 | export CMD="$LAUNCHER $PROGRAM"
45 | 
46 | echo $CMD
47 | 
48 | # EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
49 | # export TMPDIR=/scratch
50 | 
51 | # EDIT: useful for debug if needed
52 | #
53 | # to debug NCCL issues
54 | # export NCCL_DEBUG=INFO
55 | #
56 | # to unravel async errors w/o the correct traceback - potentially makes everything much slower
57 | # export CUDA_LAUNCH_BLOCKING=1
58 | #
59 | # to force crashing on nccl issues like hanging broadcast
60 | # export NCCL_ASYNC_ERROR_HANDLING=1
61 | 
62 | # srun error handling:
63 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
64 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
65 | SRUN_ARGS=" \
66 |     --wait=60 \
67 |     --kill-on-bad-exit=1 \
68 |     --jobid $SLURM_JOB_ID \
69 |     "
70 | 
71 | # bash -c is needed for the delayed interpolation of env vars to work
72 | # we want $SLURM_PROCID and $SLURM_LOCALID values that get set at the actual process launch time
73 | srun $SRUN_ARGS bash -c "RANK=\$SLURM_PROCID LOCAL_RANK=\$SLURM_LOCALID $CMD" 2>&1 | tee -a $LOG_PATH
74 | 
75 | echo "END TIME: $(date)"
76 | 


--------------------------------------------------------------------------------
/orchestration/slurm/launchers/torchrun-launcher.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this is a 2 node SLURM script using `torchrun` launcher
 4 | # Important: you will need to adapt settings where you see EDIT in the comments
 5 | 
 6 | #SBATCH --job-name=torchrun-launcher
 7 | #SBATCH --nodes=2
 8 | #SBATCH --ntasks-per-node=1          # crucial - only 1 task per node
 9 | #SBATCH --cpus-per-task=96           # EDIT this to how many cpu cores the node has
10 | #SBATCH --gres=gpu:8                 # EDIT this if it's not 8-gpus per node
11 | #SBATCH --time=0:10:00               # EDIT the desired runtime
12 | #SBATCH --exclusive
13 | #SBATCH --partition=xyz-cluster      # EDIT to the desired partition name
14 | #SBATCH --output=%x-%j.out
15 | 
16 | echo "START TIME: $(date)"
17 | 
18 | # auto-fail on any errors in this script
19 | set -eo pipefail
20 | 
21 | # logging script's variables/commands for future debug needs
22 | set -x
23 | 
24 | # EDIT the conda env and any startup scripts
25 | # source /path/to/start-xxx-user # if you have something to preload before the job
26 | # conda activate stas-xxx        # if you have conda env to activate
27 | 
28 | LOG_PATH="main_log.txt"
29 | 
30 | # EDIT if it's not 8-gpus per node
31 | GPUS_PER_NODE=8
32 | NNODES=$SLURM_NNODES
33 | 
34 | # define the node 0 hostname:port
35 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
36 | MASTER_PORT=6000
37 | 
38 | # note `\$SLURM_PROCID` we don't want it interpolated till `srun` since otherwise all nodes will get
39 | # 0 and the launcher will hang
40 | #
41 | # same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
42 | LAUNCHER="python -u -m torch.distributed.run \
43 |     --nproc_per_node $GPUS_PER_NODE \
44 |     --nnodes $NNODES \
45 |     --node_rank \$SLURM_PROCID \
46 |     --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
47 |     --rdzv_backend c10d \
48 |     --max_restarts 0 \
49 |     --role \$(hostname -s|tr -dc '0-9'): \
50 |     --tee 3 \
51 |     "
52 | 
53 | # EDIT the path+name of the python script and whatever args it needs
54 | PROGRAM="torch-distributed-gpu-test.py"
55 | 
56 | export CMD="$LAUNCHER $PROGRAM"
57 | 
58 | echo $CMD
59 | 
60 | # EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
61 | # export TMPDIR=/scratch
62 | 
63 | # EDIT: useful for debug if needed
64 | #
65 | # to debug NCCL issues
66 | # export NCCL_DEBUG=INFO
67 | #
68 | # to unravel async errors w/o the correct traceback - potentially makes everything much slower
69 | # export CUDA_LAUNCH_BLOCKING=1
70 | #
71 | # to force crashing on nccl issues like hanging broadcast
72 | # export NCCL_ASYNC_ERROR_HANDLING=1
73 | 
74 | # srun error handling:
75 | # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
76 | # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
77 | SRUN_ARGS=" \
78 |     --wait=60 \
79 |     --kill-on-bad-exit=1 \
80 |     --jobid $SLURM_JOB_ID \
81 |     "
82 | 
83 | # bash -c is needed for the delayed interpolation of env vars to work
84 | srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH
85 | 
86 | echo "END TIME: $(date)"
87 | 


--------------------------------------------------------------------------------
/orchestration/slurm/performance.md:
--------------------------------------------------------------------------------
 1 | # SLURM Performance
 2 | 
 3 | Here you will find discussions of SLURM-specific settings that impact performance.
 4 | 
 5 | ## srun's `--cpus-per-task` may need to be explicit
 6 | 
 7 | You need to make sure that the program launched by `srun` receives as many cpu-cores as intended. For example, in the typical case of an ML training program, each gpu needs at least one cpu-core for the process driving it, plus a few more cores for the `DataLoader` workers. You need multiple cores so that each task can be performed in parallel. If you have 8 gpus and 2 `DataLoader` workers per gpu, you need at least `3*8=24` cpu-cores per node.
 8 | 
 9 | The number of cpus per task is defined by `--cpus-per-task`, which is passed to `sbatch` or `salloc` and originally `srun` would inherit this setting. However, recently this behavior has changed:
10 | 
11 | A quote from the `sbatch` manpage:
12 | 
13 | > NOTE: Beginning with 22.05, srun will not inherit the --cpus-per-task value requested by salloc or sbatch. It must be requested again with the call to srun or set with the SRUN_CPUS_PER_TASK environment variable if desired for the task(s).
14 | 
15 | Which means that if in the past your SLURM script could have been:
16 | 
17 | ```
18 | #SBATCH --cpus-per-task=48
19 | [...]
20 | 
21 | srun myprogram
22 | ```
23 | 
24 | and the program launched by `srun` would have received 48 cpu-cores, because `srun` used to inherit the `--cpus-per-task=48` setting from `sbatch` or `salloc`. According to the quoted documentation, since SLURM 22.05 this behavior is no longer true.
25 | 
26 | footnote: I tested with SLURM@22.05.09 and the old behavior was still in effect, but the new behavior definitely applies to the 23.x series. So the change might have happened in a later 22.05.x release.
27 | 
28 | So if you leave things as is, now the program will receive just 1 cpu-core (unless the `srun` default has been modified).
29 | 
30 | You can easily test if your SLURM setup is affected using `os.sched_getaffinity(0)`, as it shows which cpu-cores are eligible to be used by the current process, so it's easy to count them with `len(os.sched_getaffinity(0))`.
31 | 
32 | Here is how you can test if you're affected:
33 | ```
34 | $ cat test.slurm
35 | #!/bin/bash
36 | #SBATCH --job-name=test-cpu-cores-per-task
37 | #SBATCH --nodes=1
38 | #SBATCH --ntasks-per-node=1
39 | #SBATCH --cpus-per-task=48   # adapt to your env if you have less than 48 cpu cores
40 | #SBATCH --time=0:10:00
41 | #SBATCH --partition=x        # adapt to your env to the right partition name
42 | #SBATCH --output=%x-%j.out
43 | 
44 | srun python -c 'import os; print(f"visible cpu cores: {len(os.sched_getaffinity(0))}")'
45 | ```
46 | 
47 | If you get
48 | ```
49 | visible cpu cores: 48
50 | ```
51 | then you don't need to do anything, if however you get:
52 | ```
53 | visible cpu cores: 1
54 | ```
55 | or another value smaller than 48 then you're affected.
56 | 
57 | To fix that you need to change your SLURM script to either:
58 | 
59 | ```
60 | #SBATCH --cpus-per-task=48
61 | [...]
62 | 
63 | srun --cpus-per-task=48 myprogram
64 | ```
65 | or:
66 | ```
67 | #SBATCH --cpus-per-task=48
68 | [...]
69 | 
70 | export SRUN_CPUS_PER_TASK=48
71 | srun myprogram
72 | ```
73 | 
74 | or automate it with write-once-and-forget:
75 | ```
76 | #SBATCH --cpus-per-task=48
77 | [...]
78 | 
79 | export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK
80 | srun myprogram
81 | ```
82 | 
83 | 
84 | 
85 | ## To enable Hyper-Threads or not
86 | 
87 | As explained in the [Hyper-Threads](users.md#hyper-threads) section, you should be able to double the number of available cpu-cores if your CPUs support hyper-threading, and for some workloads this may lead to overall faster performance.
88 | 
89 | However, you should test the performance w/ and w/o HT, compare the results and choose the setting that gives the best outcome.
90 | 
91 | case study: on AWS p4 nodes I discovered that enabling HT made the network throughput 4x slower. Since then we have been careful to keep HT disabled on that particular setup.
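To check whether HT/SMT is currently enabled on a given node, one quick generic Linux check (not SLURM-specific) is:

```
lscpu | grep "Thread(s) per core"
# 2 means HT/SMT is enabled, 1 means it's disabled
```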
92 | 


--------------------------------------------------------------------------------
/orchestration/slurm/undrain-good-nodes.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # When nodes get automatically placed in drain because SLURM didn't wait long enough for the last job's processes to be killed, this script checks whether all processes tied to the GPUs have since exited and, if so, undrains those nodes
 4 | 
 5 | # get the nodes that were put to `drain` because the job was too slow to exit
 6 | nodes=( $(sinfo -R | grep "Kill task failed" | perl -lne '/(node-.*[\d\]]+)/ && print $1' | xargs -n1 scontrol show hostnames) )
 7 | 
 8 | good=()
 9 | bad=()
10 | 
11 | # check each drained node: if no GPU-tied processes remain (killing stragglers if needed), undrain it
12 | for n in "${nodes[@]}"; do
13 | 	echo "*** checking $n"
14 | 
15 |     # check if any processes are still stuck - when none there should be no output
16 |     output=$(PDSH_RCMD_TYPE=ssh pdsh -w $n "nvidia-smi --query-compute-apps=pid --format=csv,noheader")
17 |     if [ -z "$output" ]; then
18 |         clean=1
19 |     else
20 |         clean=0
21 |         # if there are processes running still try to kill them again and recheck if it was successful
22 | 
23 |         # kill any processes tying up the gpus
24 |         PDSH_RCMD_TYPE=ssh pdsh -w $n "nvidia-smi --query-compute-apps=pid --format=csv,noheader | sort | uniq | xargs -n1 sudo kill -9"
25 | 
26 |         echo "sleeping for 3 secs to let the processes exit"
27 |         sleep 3
28 | 
29 |         # check if any processes are still stuck - when none there should be no output
30 |         output=$(PDSH_RCMD_TYPE=ssh pdsh -w $n "nvidia-smi --query-compute-apps=pid --format=csv,noheader")
31 |         if [ -z "$output" ]; then
32 |             clean=1
33 |         fi
34 |     fi
35 | 
36 |     if [ $clean == 1 ]; then
37 |         echo "no gpu processes are tied, undraining $n"
38 |         sudo scontrol update NodeName=$n State=idle Reason="undrained by $USER"
39 |         good+=($n)
40 |     else
41 |         echo "failed to kill all processes tied to gpus on $n"
42 |         echo "ssh into $n and manually check the state of the node"
43 |         bad+=($n)
44 |     fi
45 |     echo ""
46 | done
47 | 


--------------------------------------------------------------------------------
/resources/README.md:
--------------------------------------------------------------------------------
 1 | # Resources
 2 | 
 3 | ## Similar online guides
 4 | 
 5 | - Boris Dayma wrote [A Recipe for Training Large Models](https://wandb.ai/craiyon/report/reports/Recipe-Training-Large-Models--VmlldzozNjc4MzQz)
 6 | 
 7 | - The HuggingFace team published [The Ultra-Scale Playbook: Training LLMs on GPU Clusters](https://huggingface.co/spaces/nanotron/ultrascale-playbook).
 8 | 
 9 | - Jacob Austin maintains [How to Scale Your Model: A Systems View of LLMs on TPUs](https://jax-ml.github.io/scaling-book/)
10 | 
11 | 
12 | ## Useful compilations
13 | 
14 | - [@StellaAthena](https://github.com/StellaAthena) created the [Common LLM Settings spreadsheet](https://docs.google.com/spreadsheets/d/14vbBbuRMEHoqeuMHkTfw3uiZVmyXNuoSp8s-aHvfvZk/edit#gid=0) which can be a super-useful resource when you're about to embark on a new LLM training - as it tells you how known LLM trainings were configured.
15 | 
16 | - A few years back I started compiling information on [which dtype the models were trained in](https://discuss.huggingface.co/t/model-pre-training-precision-database-fp16-fp32-bf16/5671) - it only contains a handful of models but if you're doing research on dtypes it can still be useful. I was using this information to try and write [a model pretraining dtype auto-detection](https://github.com/stas00/ml-ways/blob/master/numbers/detect-model-pretrained-in-bf16-fp16-fp32.ipynb) and here is a related [float16 vs bfloat16 numerical properties comparison](https://github.com/stas00/ml-ways/blob/master/numbers/bfloat16-vs-float16-study.ipynb).
17 | 
18 | 
19 | ## Publicly available training LLM/VLM logbooks
20 | 
21 | Logbooks and chronicles of LLM/VLM training are among the best sources to learn from about dealing with training instabilities and choosing good hyper-parameters.
22 | 
23 | If you know of a public LLM/VLM training logbook that is not on this list please kindly let me know or add it via a PR. Thank you!
24 | 
25 | The listing is in no particular order other than being grouped by the year.
26 | 
27 | ### 2021
28 | 
29 | - BigScience pre-BLOOM 108B training experiments (2021):
30 | [chronicles](https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md) |
31 | [the full spec and discussions](https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide)
32 | (backup:
33 | [1](https://github.com/stas00/bigscience-backup/blob/master/train/tr8-104B-wide/chronicles.md) |
34 | [2](https://github.com/stas00/bigscience-backup/blob/master/train/tr8-104B-wide))
35 | 
36 | 
37 | ### 2022
38 | 
39 | - BigScience BLOOM-176B (2022):
40 | [chronicles-prequel](https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles-prequel.md) |
41 | [chronicles](https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md) |
42 | [the full spec and discussions](https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/)
43 | (backup:
44 | [1](https://github.com/stas00/bigscience-backup/blob/master/train/tr11-176B-ml/chronicles-prequel.md) |
45 | [2](https://github.com/stas00/bigscience-backup/blob/master/train/tr11-176B-ml/chronicles.md) |
46 | [3](https://github.com/stas00/bigscience-backup/blob/master/train/tr11-176B-ml/))
47 | 
48 | - Meta OPT-175B (2022):
49 |  [logbook](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT/chronicles) | [Video](https://www.youtube.com/watch?v=p9IxoSkvZ-M) (backup: [1](https://github.com/stas00/metaseq-backup/tree/main/projects/OPT/chronicles))
50 | 
51 | - THUDM GLM-130B (2022): [en logbook](https://github.com/THUDM/GLM-130B/blob/main/logs/main-log-en.md) | [Mandarin version](https://github.com/THUDM/GLM-130B/blob/main/logs/main-log.md) (backup:  [1](https://github.com/stas00/GLM-130B-backup/blob/main/logs/main-log-en.md) | [2](https://github.com/stas00/GLM-130B-backup/blob/main/logs/main-log.md))
52 | 
53 | 
54 | ### 2023
55 | 
56 | - HuggingFace IDEFICS-80B multimodal (Flamingo repro) (2023): [Learning log](https://github.com/huggingface/m4-logs/blob/master/memos/README.md) | [Training Chronicles](https://github.com/huggingface/m4-logs/blob/master/tr-190-80b/chronicles.md) (backup: [1](https://github.com/stas00/m4-logs-backup/blob/master/memos/README.md) | [2](https://github.com/stas00/m4-logs-backup/blob/master/tr-190-80b/chronicles.md))
57 | 
58 | - BloombergGPT 50B LLM - section C in [BloombergGPT: A Large Language Model for Finance](https://arxiv.org/abs/2303.17564)
59 | 
60 | 
61 | ### 2024
62 | 
63 | - [MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs](https://arxiv.org/abs/2402.15627) - the paper covers various training issues and their resolution - albeit on proprietary models, the lessons are just as instructional/useful.
64 | 
65 | - Imbue's [From bare metal to a 70B model: infrastructure set-up and scripts](https://imbue.com/research/70b-infrastructure/) is a very detailed technical post that covers many training-related issues they had to overcome while training a proprietary 70B-param model.
66 | 
67 | 
68 | 
69 | 
70 | ## Hardware setup logbooks
71 | 
72 | - Imbue published a detailed log of how they have set up a 512-node IB-fat-tree cluster and made it to work: [From bare metal to a 70B model: infrastructure set-up and scripts](https://imbue.com/research/70b-infrastructure/), they also open-sourced the [cluster tooling](https://github.com/imbue-ai/cluster-health) they created in the process.
73 | 
74 | - SemiAnalysis published a great detailed writeup about [what it takes to set up a Neocloud cluster](https://semianalysis.com/2024/10/03/ai-neocloud-playbook-and-anatomy/).
75 | 


--------------------------------------------------------------------------------
/stabs/README.md:
--------------------------------------------------------------------------------
1 | # Stabs
2 | 
3 | Some very early notes on various topics, not meant for reading or fixing. Please ignore this sub-dir.
4 | 


--------------------------------------------------------------------------------
/storage/benchmarks/results/hope-2023-12-20-14-37-02-331702-summary.md:
--------------------------------------------------------------------------------
 1 | # fio benchmark results for hope on 2023-12-20-14:37:02
 2 | 
 3 | partition /mnt/nvme0/fio/fio-test
 4 | 
 5 | 
 6 | *  filesize=16k read
 7 | 
 8 | | lat msec | bw MBps |   IOPS   | jobs |
 9 | | -------: | ------: | -------: | ---: |
10 | |     4.0  |  1006.3 |   257614 |   16 |
11 | 
12 | *  filesize=16k write
13 | 
14 | | lat msec | bw MBps |   IOPS   | jobs |
15 | | -------: | ------: | -------: | ---: |
16 | |     3.2  |  1239.1 |   317200 |   16 |
17 | 
18 | 
19 | 
20 | *  filesize=1m read
21 | 
22 | | lat msec | bw MBps |   IOPS   | jobs |
23 | | -------: | ------: | -------: | ---: |
24 | |     1.7  |  2400.1 |   614419 |   16 |
25 | 
26 | *  filesize=1m write
27 | 
28 | | lat msec | bw MBps |   IOPS   | jobs |
29 | | -------: | ------: | -------: | ---: |
30 | |     2.1  |  1940.5 |   496765 |   16 |
31 | 
32 | 
33 | 
34 | *  filesize=1g read
35 | 
36 | | lat msec | bw MBps |   IOPS   | jobs |
37 | | -------: | ------: | -------: | ---: |
38 | |     1.4  |  2762.0 |   707062 |   16 |
39 | 
40 | *  filesize=1g write
41 | 
42 | | lat msec | bw MBps |   IOPS   | jobs |
43 | | -------: | ------: | -------: | ---: |
44 | |     2.1  |  1943.9 |   497638 |   16 |
45 | 


--------------------------------------------------------------------------------
/storage/fio-json-extract.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | #
 4 | # usage:
 5 | #
 6 | # ./fio-json-extract.py fio-json-file.json
 7 | #
 8 | # The script expects an fio-generated json file as the only input. That is `filename.json` that
 9 | # comes from `fio ... --output-format=json --output=filename.json`
10 | #
11 | # This will print out a markdown table of average latency, bandwidth and IOPS
12 | 
13 | import io, json, sys
14 | 
15 | if len(sys.argv) != 2:
16 |     raise ValueError("usage: ./fio-json-extract.py fio-json-file.json")
17 | 
18 | with open(sys.argv[1], 'r') as f:
19 |     d = json.load(f)
20 | 
21 | # expects a single job output
22 | job = d['jobs'][0]
23 | rw_type = job['jobname'] # read | write
24 | section = job[rw_type]
25 | numjobs = int(d['global options']['numjobs'])
26 | 
27 | headers = ["lat msec", "bw MBps", "  IOPS  ", "jobs"]
28 | width = [len(h) for h in headers]
29 | 
30 | print("| " + " | ".join(headers)  + " |")
31 | 
32 | print(f"| {'-'*(width[0]-1)}: | {'-'*(width[1]-1)}: | {'-'*(width[2]-1)}: | {'-'*(width[3]-1)}: | ")
33 | 
34 | print(f"| {section['lat_ns']['mean']/10**6:{width[0]}.1f} | {section['bw_bytes']/2**20:{width[1]}.1f} | {int(section['iops']):{width[2]}d} | {numjobs:{width[3]}d} |")
35 | 


--------------------------------------------------------------------------------
/storage/fio-scan:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This script will run fio on a given partition/path for read/write for 16KB, 1MB and 1GB file sizes
 4 | # using a fixed 4k block size.
 5 | #
 6 | # usage:
 7 | #
 8 | # ./fio-scan /mnt/nvme0/fio
 9 | #
10 | # The required argument is the path to the partition you want to benchmark. It will save a json file
11 | # with each of the runs and produce a single summary of average latency, bandwidth and IOPs.
12 | #
13 | # note: this script calls `python ./fio-json-extract.py` so if you copy this script from the repo make sure to copy fio-json-extract.py as well.
14 | #
15 | # Scroll to the end of the script to optionally adapt any of the fio parameters to reflect your reality.
16 | 
17 | #set -x
18 | set -euo pipefail
19 | 
20 | if [ $# -eq 1 ]; then
21 |    partition_path=$1
22 | else
23 |     echo "Usage: $0 /path/to/a/partition/to/run/benchmark/on"
24 |     exit 1
25 | fi
26 | 
27 | # append an extra section in case someone passes a top-level dir
28 | base_path=$partition_path/fio-test
29 | mkdir -p $base_path
30 | 
31 | echo
32 | echo "*** Benchmarking $base_path"
33 | echo
34 | 
35 | DATETIMEFS=$(date +"%Y-%m-%d-%H-%M-%S")
36 | DATETIME=$(date +"%Y-%m-%d-%T")
37 | HOSTNAME=$(hostname -s)
38 | 
39 | # add pid in case multiple benchmarks get started at the same time
40 | report_file=$HOSTNAME-$DATETIMEFS-$$-summary.md
41 | 
42 | echo "# fio benchmark results for $HOSTNAME on $DATETIME" >> $report_file
43 | echo >> $report_file
44 | echo "partition $base_path"  >> $report_file
45 | echo >> $report_file
46 | 
47 | # fio parameters discussion. You might want to read fio's manpage and adapt some of the settings.
48 | #
49 | # I'm using --unlink=1 to prevent fio from doing invalid reporting as it'd otherwise incorrectly
50 | # reuse work files from previous benchmarks and report invalid outcomes. It incidentally also
51 | # removes the need to clean up at the end of the benchmark run.
52 | #
53 | # Use --numjobs=16 if you're planning to have a read/write concurrency of 16 processes. e.g. if
54 | # you write a checkpoint from 8 processes on 8 nodes, you will have a write concurrency of 64 (same
55 | # for loading those 64 checkpoints on resume)
56 | #
57 | # --runtime should be long enough to create a sustainable load - so at least a few minutes
58 | #
59 | 
60 | filesizes=( 16k 1m 1g )
61 | readwrite=( read write )
62 | 
63 | for FS in "${filesizes[@]}"; do
64 |     echo >> $report_file
65 |     for RW in "${readwrite[@]}"; do
66 |         echo "# filesize=$FS $RW" >> $report_file
67 |         output=$HOSTNAME-$RW-$FS-$DATETIMEFS.json
68 |         cmd="fio --ioengine=libaio --filesize=$FS --ramp_time=2s --time_based --runtime=3m --numjobs=16 --direct=1 --verify=0 --randrepeat=0 --group_reporting --unlink=1 --directory=$base_path --name=$RW --blocksize=4k --iodepth=64 --readwrite=$RW --output-format=json --output=$output"
69 |         echo $cmd
70 |         $cmd
71 |         echo >> $report_file
72 |         python ./fio-json-extract.py $output >> $report_file
73 |         echo >> $report_file
74 |     done
75 |     echo >> $report_file
76 | done
77 | 
78 | echo
79 | echo "wrote a summary report into $report_file"
80 | echo
81 | 
82 | cat $report_file
83 | 


--------------------------------------------------------------------------------
/todo.md:
--------------------------------------------------------------------------------
1 | # TODO
2 | 
3 | Also see [stabs](./stabs)
4 | 
5 | - re-run all-reduce bench and update plot+table as the bench switched to KiB/MiB/etc.
6 | https://github.com/stas00/ml-engineering/tree/master/network/benchmarks#all_reduce-benchmark
7 | 


--------------------------------------------------------------------------------
/training/README.md:
--------------------------------------------------------------------------------
 1 | # Training
 2 | 
 3 | **Subsections**:
 4 | 
 5 | - [Model parallelism](model-parallelism)
 6 | 
 7 | - [Performance](performance)
 8 | 
 9 | - [Fault Tolerance](fault-tolerance)
10 | 
11 | - [Reproducibility](reproducibility)
12 | 
13 | - [Instabilities](instabilities)
14 | 
15 | - [Checkpoints](checkpoints)
16 | 
17 | - [Training hyper-parameters and model initializations](hparams.md)
18 | 
19 | - [Tensor precision / Data types](dtype.md)
20 | 
21 | - [Emulate a multi-node setup using just a single node](emulate-multi-node.md) - instructions on how to emulate a multi-node setup using just a single node - we use the `deepspeed` launcher here.
22 | 
23 | - [Re-train HF hub models from scratch using finetuning examples](re-train-hub-models.md)
24 | 
25 | - [Datasets](datasets.md)
26 | 
27 | **Tools**:
28 | 
29 | - [printflock.py](tools/printflock.py) - a tiny library that makes your `print` calls non-interleaved in a multi-gpu environment.
30 | 
31 | - [multi-gpu-non-interleaved-print.py](tools/multi-gpu-non-interleaved-print.py) - a `flock`-based wrapper around `print` that prevents messages from getting interleaved when multiple processes print at the same time - which is the case with `torch.distributed` used with multiple-gpus.
32 | 


--------------------------------------------------------------------------------
/training/checkpoints/README.md:
--------------------------------------------------------------------------------
1 | # Checkpoints
2 | 
3 | - [torch-checkpoint-convert-to-bf16](./torch-checkpoint-convert-to-bf16) - converts an existing fp32 torch checkpoint to bf16. If [safetensors](https://github.com/huggingface/safetensors/) are found those are converted as well. Should be easily adaptable to other similar use cases.
4 | 
5 | - [torch-checkpoint-shrink.py](./torch-checkpoint-shrink.py) - this script fixes checkpoints which for some reason stored tensors with storage larger than their view at the moment of saving. It clones the current view and re-saves them with just the storage of the current view.
6 | 


--------------------------------------------------------------------------------
/training/checkpoints/torch-checkpoint-convert-to-bf16:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this script converts torch's *.bin and *.safetensors files to bf16, creating a new checkpoint under the sub-dir bf16
 4 | #
 5 | # usage:
 6 | # cd checkpoint
 7 | # bash torch-checkpoint-convert-to-bf16
 8 | 
 9 | # set destination dir
10 | target_dir=bf16
11 | 
12 | echo "creating a new checkpoint under dir $target_dir"
13 | mkdir -p $target_dir
14 | 
15 | # cp config and other files - adapt if needed - could also do `cp * $target_dir`
16 | cp *json *model $target_dir
17 | 
18 | # convert *bin
19 | echo "converting *bin torch files"
20 | python -c "import torch, sys; [torch.save({k:v.to(torch.bfloat16) for k,v in torch.load(f).items()}, f'{sys.argv[1]}/{f}') for f in sys.argv[2:]]" $target_dir *bin
21 | 
22 | # convert *safetensors (from original *bin files)
23 | if compgen -G "*.safetensors" > /dev/null; then
24 |     echo "converting *safetensors files"
25 |     cd $target_dir
26 |     python -c "import re, sys, torch; from safetensors.torch import save_file; [save_file(torch.load(f), re.sub(r'.*?(model.*?)\.bin',r'\1.safetensors',f), metadata={'format': 'pt'}) for f in sys.argv[1:]]" *bin
27 |     if test -e "pytorch_model.bin.index.json"; then
28 |         cp pytorch_model.bin.index.json model.safetensors.index.json
29 |         perl -pi -e 's|pytorch_||; s|\.bin|.safetensors|' model.safetensors.index.json
30 |     fi
31 |     cd - > /dev/null
32 | fi
33 | 
34 | echo "the dir $target_dir now contains a copy of the original checkpoint with bf16 weights"
35 | 


--------------------------------------------------------------------------------
/training/checkpoints/torch-checkpoint-shrink.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # This script fixes checkpoints which for some reason stored tensors with storage larger than their
 4 | # view at the moment of saving. It clones the current view and re-saves them with just the storage
 5 | # of the current view.
 6 | #
 7 | # Examples:
 8 | #
 9 | # 1. All files in the checkpoint
10 | # ./torch-checkpoint-shrink.py --checkpoint_dir ./checkpoints/global_step10
11 | #
12 | # 2. Only select files in the checkpoint that match several patterns
13 | # ./torch-checkpoint-shrink.py --checkpoint_dir ./checkpoints/global_step10 --patterns 'layer*pt' 'zero*pt'
14 | 
15 | import argparse
16 | import torch
17 | import glob
18 | import os
19 | import collections.abc
20 | from fnmatch import fnmatch
21 | 
22 | debug = 0
23 | 
24 | # load to cpu
25 | device = torch.device('cpu')
26 | 
27 | def get_pt_files(checkpoint_dir, patterns):
28 | 
29 |     if not os.path.isdir(checkpoint_dir):
30 |         raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
31 | 
32 |     pt_files = sorted(glob.glob(os.path.join(checkpoint_dir, "*.pt")))
33 | 
34 |     if len(pt_files) == 0:
35 |         raise FileNotFoundError(
36 |             f"can't find '*.pt' files in directory '{checkpoint_dir}'")
37 | 
38 |     # filter out by pattern (just the file part without any parent dir)
39 |     pt_files = [f for f in pt_files for p in patterns if fnmatch(os.path.basename(f), p)];
40 | 
41 |     return pt_files
42 | 
43 | def shrink_dict_values(d, prefix=""):
44 |     for k, v in d.items():
45 |         k_full = f"{prefix}.{k}" if len(prefix) else k
46 |         if isinstance(v, collections.abc.Mapping):
47 |             shrink_dict_values(v, k_full)
48 |         else:
49 |             if debug:
50 |                 print(f"{k_full}")
51 |             if v is not None and torch.is_tensor(v):
52 |                 d[k] = v.clone() # drop any unused storage
53 | 
54 | def shrink_pt_file(f):
55 |     print(f"-> {f}")
56 |     size_before = os.path.getsize(f)
57 |     sd = torch.load(f, map_location=device)
58 |     shrink_dict_values(sd)
59 |     torch.save(sd, f)
60 |     size_after = os.path.getsize(f)
61 |     size_delta = size_before - size_after
62 |     if debug:
63 |         print(f"before {size_before / 2**20:.2f}MB, after {size_after / 2**20:.2f}MB, saved {size_delta / 2**20:.2f}MB")
64 |     return size_before, size_after, size_delta
65 | 
66 | def checkpoint_shrink(checkpoint_dir, patterns):
67 |     """
68 |     Args:
69 |         - ``checkpoint_dir``: path to the checkpoint folder (where the *.pt files are)
70 |     """
71 |     print(f"Processing checkpoint '{checkpoint_dir}'")
72 |     pt_files = get_pt_files(checkpoint_dir, patterns)
73 |     before, after, delta = 0, 0, 0
74 |     for f in pt_files:
75 |         size_before, size_after, size_delta = shrink_pt_file(f)
76 |         before += size_before
77 |         after  += size_after
78 |         delta  += size_delta
79 |     print(f"Done. Before {before / 2**20:.2f}MB, after {after / 2**20:.2f}MB, saved {delta / 2**20:.2f}MB")
80 | 
81 | if __name__ == "__main__":
82 | 
83 |     parser = argparse.ArgumentParser()
84 |     parser.add_argument("--checkpoint_dir", type=str, help="path to the desired checkpoint folder, e.g., path/checkpoints/global_step10")
85 |     parser.add_argument("--patterns", nargs='+', default="*.pt", required=False, type=str, help="one or more patterns of checkpoint files - make sure to quote those! by default all *.pt files")
86 |     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
87 |     args = parser.parse_args()
88 | 
89 |     debug = args.debug
90 | 
91 |     checkpoint_shrink(args.checkpoint_dir, args.patterns)
92 | 


--------------------------------------------------------------------------------
/training/datasets.md:
--------------------------------------------------------------------------------
 1 | # Dealing with datasets
 2 | 
 3 | ## Preprocessing and caching datasets on the main process
 4 | 
 5 | HF Accelerate has a very neat container [`main_process_first`](https://huggingface.co/docs/accelerate/v0.4.0/accelerator.html#accelerate.Accelerator.main_process_first) which allows you to write code like:
 6 | 
 7 | ```
 8 | with accelerator.main_process_first():
 9 |     # load and pre-process datasets
10 |     dataset = datasets.load_dataset(...)
11 |     # optionally cache it and have the rest of the processes load the cache
12 | ```
13 | instead of the less intuitive version that requires code repetition:
14 | ```
15 | if rank == 0:
16 |     dataset = datasets.load_dataset(...)
17 | dist.barrier()
18 | if not rank == 0:
19 |     dataset = datasets.load_dataset(...)
20 | ```
21 | 
22 | You want to download and process the data on the main process only, and not on all processes, because they would all be repeating the same work in parallel, and moreover they are likely to write to the same location, which would result in an interleaved, broken outcome. It's also much faster IO-wise to serialize such work.
23 | 
24 | Now there is `main_process_first` and `local_main_process_first` - the first one is for when your data resides on a shared filesystem that all compute nodes can see. The second one is for when the data is local to each node.
25 | 
26 | If you aren't using HF Accelerate, I have recreated similar containers, except called them:
27 | 
28 | - `global_main_process_first` - for shared fs
29 | - `local_main_process_first` - for local to node fs
30 | 
31 | You can find them [here](tools/main_process_first.py).
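
To give a sense of how these containers work, here is a minimal sketch of a `global_main_process_first`-style container built on `torch.distributed` barriers (the actual implementation in [tools/main_process_first.py](tools/main_process_first.py) may differ):

```
from contextlib import contextmanager
import torch.distributed as dist

@contextmanager
def global_main_process_first():
    # non-main ranks wait until the global rank 0 has finished its work
    if dist.is_initialized() and dist.get_rank() != 0:
        dist.barrier()
    yield
    # rank 0 releases everybody else once its work is done
    if dist.is_initialized() and dist.get_rank() == 0:
        dist.barrier()
```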
32 | 
33 | Now, what if you want to write generic code that automatically works on both shared and local filesystems? I added another helper that automatically discovers what type of filesystem a given path resides on and, based on that, calls the right container. I called it `main_process_by_path_first`, which is used like:
34 | 
35 | ```
36 | path = "/path/to/data"
37 | with main_process_by_path_first(path):
38 |     # load and pre-process datasets
39 |     dataset = datasets.load_dataset(...)
40 |     # optionally cache it and have the rest of the processes load the cache
41 | ```
42 | 
43 | You can find it [here](tools/main_process_first.py).
44 | 
45 | Of course, besides the containers you will also want utils to check whether the current process is the main one, and so there are 3 of those corresponding to the containers:
46 | 
47 | - `is_main_process_by_path(path)`
48 | - `is_local_main_process()`
49 | - `is_global_main_process()`
50 | 
51 | They are all found in [here](tools/main_process_first.py).
52 | 
53 | You can see them in action by running:
54 | 
55 | ```
56 | python -u -m torch.distributed.run --nproc_per_node=2 --rdzv_endpoint localhost:6000  --rdzv_backend c10d tools/main_process_first.py
57 | ```
58 | 


--------------------------------------------------------------------------------
/training/dtype.md:
--------------------------------------------------------------------------------
  1 | # Tensor precision / Data types
  2 | 
  3 | These are the common datatypes that are used as of this writing in ML (usually referred to as `dtype`):
  4 | 
  5 | Floating point formats:
  6 | - fp32 - 32 bits
  7 | - tf32 - 19 bits (NVIDIA Ampere+)
  8 | - fp16 - 16 bits
  9 | - bf16 - 16 bits
 10 | - fp8 - 8 bits (E4M3 and E5M2 formats)
 11 | - fp6 - 6 bits
 12 | - fp4 - 4 bits
 13 | 
 14 | For a visual comparison refer to these representations:
 15 | 
 16 | ![fp32-tf32-fp16-bf16](images/fp32-tf32-fp16-bf16.png)
 17 | 
 18 | ([source](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/))
 19 | 
 20 | ![fp16-bf16-fp8](images/fp16-bf16-fp8.png)
 21 | 
 22 | ([source](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html))
 23 | 
 24 | 
 25 | The new formats that are being adopted by new hardware are:
 26 | - fp4: `float4_e2m1fn`
 27 | - fp6: `float6_e2m3fn` and `float6_e3m2fn`
 28 | - fp8: `float8_e3m4`, `float8_e4m3`, `float8_e4m3b11fnuz`, `float8_e4m3fn`, `float8_e4m3fnuz`, `float8_e5m2`, `float8_e5m2fnuz`, `float8_e8m0fnu`
 29 | 
 30 | There is an excellent explanation of each of these variations [here](https://github.com/jax-ml/ml_dtypes?tab=readme-ov-file#specifications-of-implemented-floating-point-formats).
 31 | 
 32 | To decipher the letters followed by the numbers:
 33 | - The `e` indicates the number of exponent bits
 34 | - The `m` indicates the number of mantissa bits
 35 | - The `b` indicates the bias
 36 | 
 37 | To decipher the letters appearing after the numbers:
 38 | - The `f` indicates it is finite values only (no infinities).
 39 | - The `n` indicates it includes NaNs, but only at the outer range.
 40 | - The `u` stands for unsigned format.
 41 | - The `uz` stands for unsigned zero.
 42 | 
 43 | So for example: `float8_e4m3b11fnuz` stands for fp8 + 4-bit exponent + 3-bit mantissa + bias 11 + finite values only + includes NaNs, but only at the outer range + unsigned zero.
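
If you want to inspect the numerical properties of these formats programmatically, here is a small sketch (assuming a PyTorch recent enough to ship the fp8 dtypes):

```
import torch

# print the range info of the two most common fp8 variants
for dt in (torch.float8_e4m3fn, torch.float8_e5m2):
    fi = torch.finfo(dt)
    print(f"{dt}: max={fi.max}, smallest normal={fi.tiny}, eps={fi.eps}")
```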
 44 | 
 45 | 
 46 | Integer formats used in quantization:
 47 | 
 48 | - int8 - 8 bits
 49 | - int4 - 4 bits
 50 | - int1 - 1 bit
 51 | 
 52 | ## ML dtype progression
 53 | 
 54 | Originally ML was using fp32, but it was very slow.
 55 | 
 56 | Next, [mixed precision using a combination of fp16 and fp32](https://developer.nvidia.com/blog/video-mixed-precision-techniques-tensor-cores-deep-learning/) was invented, which tremendously sped up the training.
 57 | 
 58 | ![fp32/fp16 mixed precision](images/mixed-precision-fp16.png)
 59 | 
 60 | ([source](https://developer.nvidia.com/blog/video-mixed-precision-techniques-tensor-cores-deep-learning/))
 61 | 
 62 | But fp16 proved to be not very stable and training LLMs in it was extremely difficult.
 63 | 
 64 | Luckily bf16 came out and replaced fp16 using the same mixed precision protocol. This made the LLM training much more stable.
 65 | 
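A minimal sketch of what bf16 mixed precision looks like with PyTorch autocast (here `model`, `batch` and `optimizer` are assumed to already exist; unlike fp16, no loss scaler is needed):

```
import torch

# forward under bf16 autocast - master weights and optimizer states remain fp32
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    loss = model(**batch).loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
```
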
 66 | Then fp8 arrived and mixed precision [switched to it](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html), which makes the training even faster. See the paper: [FP8 Formats for Deep Learning](https://arxiv.org/abs/2209.05433).
 67 | 
 68 | To appreciate the speedups between the different formats, have a look at this table for the NVIDIA A100 TFLOPS spec (w/o sparsity):
 69 | 
 70 | | Data type              | TFLOPS |
 71 | | :---                   |    --: |
 72 | | FP32                   |   19.5 |
 73 | | Tensor Float 32 (TF32) |    156 |
 74 | | BFLOAT16 Tensor Core   |    312 |
 75 | | FP16 Tensor Core       |    312 |
 76 | | FP8 Tensor Core        |    624 |
 77 | | INT8 Tensor Core       |    624 |
 78 | 
 79 | Each next dtype is about 2x faster than the previous one (except fp32 which is much slower than the rest).
 80 | 
 81 | In parallel with the mixed precision training regime the ML community started coming up with various quantization approaches. Probably one of the best examples is Tim Dettmers' [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) which provides many 4 and 8-bit quantization solutions. The Deepspeed team also has some [interesting quantization solutions](https://www.deepspeed.ai/tutorials/model-compression/).
 82 | 
 83 | ## TF32
 84 | 
 85 | TF32 is a magical datatype that is available on NVIDIA GPUs since Ampere, and which allows fp32 `matmul`s to be performed at a much faster speed than normal fp32 `matmul`s, at a small precision loss.
 86 | 
 87 | Here is an example of A100 TFLOPS (w/o sparsity):
 88 | 
 89 | | Data type              | TFLOPS |
 90 | | :---                   |    --: |
 91 | | FP32                   |   19.5 |
 92 | | Tensor Float 32 (TF32) |    156 |
 93 | 
 94 | As you can see TF32 is 8x faster than FP32!
 95 | 
 96 | It's disabled by default for `matmul`s (cuDNN convolutions have it enabled by default). To enable it add at the beginning of your program:
 97 | 
 98 | ```
 99 | torch.backends.cuda.matmul.allow_tf32 = True
100 | torch.backends.cudnn.allow_tf32 = True
101 | ```
102 | 
103 | For more information about the actual precision loss please see [this](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices).
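
In recent PyTorch versions the TF32 behavior for matmuls can also be controlled with a single higher-level knob (a small sketch, assuming PyTorch >= 1.12):

```
# "high" allows TF32 (or similarly fast reduced-precision backends) for fp32 matmuls,
# "highest" keeps full fp32 precision
torch.set_float32_matmul_precision("high")
```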
104 | 
105 | 
106 | ## When to use fp32 accumulators
107 | 
108 | Whenever a low-precision dtype is used one has to be careful not to accumulate intermediary results in that dtype.
109 | 
110 | `LayerNorm`-like operations must not do their work in half-precision, or they may lose a lot of data. Therefore, when these operations are implemented correctly, they do the efficient internal work in the dtype of the inputs, but accumulate in fp32 registers, and then their outputs are downcast to the precision of the inputs.
111 | 
112 | Generally it's just the accumulation that is done in fp32, since adding up many low-precision numbers is very lossy otherwise.
113 | 
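Here is a tiny runnable illustration of how lossy low-precision accumulation can be:

```
import torch

total_bf16 = torch.tensor(0.0, dtype=torch.bfloat16)
total_fp32 = torch.tensor(0.0, dtype=torch.float32)
x = torch.tensor(0.1, dtype=torch.bfloat16)
for _ in range(10_000):
    total_bf16 += x          # bf16 accumulator: additions eventually get lost to rounding
    total_fp32 += x.float()  # fp32 accumulator: stays close to the true sum (~1000)
print(total_bf16, total_fp32)
```
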
114 | Here are some examples:
115 | 
116 | 1. Reduction collectives
117 | 
118 | * fp16: ok to do in fp16 if loss scaling is in place
119 | 
120 | * bf16: only ok if done in fp32
121 | 
122 | 2. Gradient accumulation
123 | 
124 | * best done in fp32 for both fp16 and bf16, but it is definitely a must for bf16
125 | 
126 | 3. Optimizer step / Vanishing gradients
127 | 
128 | * when adding a tiny gradient to a large number, that addition is often nullified, therefore fp32 master weights and fp32 optim states are typically used.
129 | 
130 | * 16-bit master weights and optim states can be used when using [Kahan Summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
131 | or [Stochastic rounding](https://en.wikipedia.org/wiki/Rounding) (introduced in [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192)).
132 | 
133 | For an example of the latter see: [AnyPrecision optimizer](https://github.com/pytorch/torchdistx/pull/52) with the latest version found [here](https://github.com/facebookresearch/multimodal/blob/6bf3779a064dc72cde48793521a5be151695fc62/torchmultimodal/modules/optimizers/anyprecision.py#L17).
134 | 
135 | 
136 | ## Changing precision post training
137 | 
138 | Sometimes it's OK to change precision after the model was trained.
139 | 
140 | - Using a bf16-pretrained model in an fp16 regime usually fails, due to overflows (the biggest number that can be represented in fp16 is ~64k). For an in-depth discussion and a possible workaround see this [PR](https://github.com/huggingface/transformers/pull/10956).
141 | 
142 | - Using an fp16-pretrained model in a bf16 regime usually works - it will lose some quality on conversion, but should work - it's best to finetune it a bit before using it.
143 | 


--------------------------------------------------------------------------------
/training/fault-tolerance/fs-watchdog.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=fs-watchdog       # job name
 3 | #SBATCH --ntasks=1                   # number of MP tasks
 4 | #SBATCH --nodes=1
 5 | #SBATCH --hint=nomultithread         # we get physical cores not logical
 6 | #SBATCH --time=2:00:00               # maximum execution time (HH:MM:SS)
 7 | #SBATCH --output=%x-%j.out           # output file name
 8 | #SBATCH --partition=compil
 9 | 
10 | set -e
11 | 
12 | echo "START TIME: $(date)"
13 | 
14 | source $six_ALL_CCFRWORK/start-prod
15 | 
16 | echo "running partition watchdog"
17 | 
18 | BIG_SCIENCE_REPO_PATH=$six_ALL_CCFRWORK/code/tr11-176B-ml/bigscience
19 | 
20 | $BIG_SCIENCE_REPO_PATH/tools/fs-watchdog.py
21 | 
22 | echo "END TIME: $(date)"
23 | 


--------------------------------------------------------------------------------
/training/fault-tolerance/slurm-status.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # This tool reports on the status of the job - whether it's running or scheduled and various other
  5 | # useful data
  6 | #
  7 | # Example:
  8 | #
  9 | # slurm-status.py --job-name tr1-13B-round3
 10 | #
 11 | 
 12 | import argparse
 13 | import io
 14 | import json
 15 | import os
 16 | import re
 17 | import shlex
 18 | import smtplib
 19 | import socket
 20 | import subprocess
 21 | import sys
 22 | from datetime import datetime, timedelta
 23 | 
 24 | SLURM_GROUP_NAME = "six"
 25 | 
 26 | # this needs to be an actual email subscribed to bigscience@groups.google.com
 27 | FROM_ADDR = "bigscience-bot@huggingface.co"
 28 | TO_ADDRS = ["bigscience@googlegroups.com", "foo@bar.com"] # wants a list
 29 | 
 30 | def send_email(subject, body):
 31 |     message = f"""\
 32 | From: {FROM_ADDR}
 33 | To: {", ".join(TO_ADDRS)}
 34 | Subject: {subject}
 35 | 
 36 | {body}
 37 | """
 38 | 
 39 |     server = smtplib.SMTP("localhost")
 40 |     #server.set_debuglevel(3)  # uncomment if need to debug
 41 |     server.sendmail(FROM_ADDR, TO_ADDRS, message)
 42 |     server.quit()
 43 | 
 44 | def send_email_alert_job_not_scheduled(job_name):
 45 | 
 46 |     subject = f"[ALERT] {job_name} is neither running nor scheduled to run"
 47 |     body = f"""
 48 | ***ALERT: {job_name} is neither RUNNING nor SCHEDULED! Alert someone at Eng WG***
 49 | 
 50 | Please reply to this email once the issue has been taken care of, or if you are in the process of doing that, so that it's known the issue is being handled should new alerts be sent again.
 51 | 
 52 | If unsure what to do, please post in the #bigscience-engineering slack channel.
 53 | 
 54 | *** Useful info ***
 55 | 
 56 | On call info: https://github.com/bigscience-workshop/bigscience/tree/master/train/tr1-13B-base#on-call
 57 | Training logs: https://github.com/bigscience-workshop/bigscience/tree/master/train/tr1-13B-base#watching-the-training-logs
 58 | Launching training: https://github.com/bigscience-workshop/bigscience/tree/master/train/tr1-13B-base#training-scripts
 59 | """
 60 | 
 61 |     send_email(subject, body)
 62 | 
 63 | def check_running_on_jean_zay():
 64 |     fqdn = socket.getfqdn()
 65 |     # sometimes it gives fqdn, other times it doesn't, so try to use both patterns
 66 |     if not ("idris.fr" in fqdn or "idrsrv" in fqdn):
 67 |         raise ValueError("This script relies on JZ's specific environment and won't work elsewhere. "
 68 |         f"You're attempting to run it on '{fqdn}'.")
 69 | 
 70 | def run_cmd(cmd):
 71 |     try:
 72 |         output = subprocess.run(
 73 |             cmd,
 74 |             stderr=subprocess.PIPE,
 75 |             stdout=subprocess.PIPE,
 76 |             check=True,
 77 |             encoding="utf-8",
 78 |         ).stdout.strip()
 79 |     except subprocess.CalledProcessError as exc:
 80 |         raise EnvironmentError(exc.stderr)
 81 | 
 82 |     return output
 83 | 
 84 | 
 85 | def get_slurm_group_status():
 86 |     # we need to monitor slurm jobs of the whole group six, since the slurm job could be owned by
 87 |     # any user in that group
 88 |     cmd = f"getent group {SLURM_GROUP_NAME}"
 89 |     getent = run_cmd(cmd.split())
 90 |     # sample output: six:*:3015222:foo,bar,tar
 91 |     usernames = getent.split(':')[-1]
 92 | 
 93 |     # get all the scheduled and running jobs
 94 |     # use shlex to split correctly and not on whitespace
 95 |     cmd = f'squeue --user={usernames} -o "%.16i %.9P %.40j %.8T %.10M %.6D %.20S %R"'
 96 |     data = run_cmd(shlex.split(cmd))
 97 |     lines = [line.strip() for line in data.split("\n")]
 98 |     return lines
 99 | 
100 | 
101 | def get_remaining_time(time_str):
102 |     """
103 |     slurm style time_str = "2021-08-06T15:23:46"
104 |     """
105 | 
106 |     delta = datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%S") - datetime.now()
107 |     # round microseconds
108 |     delta -= timedelta(microseconds=delta.microseconds)
109 |     return delta
110 | 
111 | 
112 | def get_preamble():
113 |     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
114 |     # add a string that is easy to grep for:
115 |     return f"[{timestamp}] PULSE:"
116 | 
117 | 
118 | def process_job(jobid, partition, name, state, time, nodes, start_time, notes):
119 | 
120 |     job_on_partition = f"{jobid} on '{partition}' partition"
121 |     preamble = get_preamble()
122 | 
123 |     if state == "RUNNING":
124 |         print(f"{preamble} {name} is running for {time} since {start_time} ({job_on_partition}) ({notes})")
125 |     elif state == "PENDING":
126 |         if start_time == "N/A":
127 |             if notes == "(JobArrayTaskLimit)":
128 |                 print(f"{preamble} {name} is waiting for the previous Job Array job to finish before scheduling a new one ({job_on_partition})")
129 |             elif notes == "(Dependency)":
130 |                 print(f"{preamble} {name} is waiting for the previous job to finish before scheduling a new one using the dependency mechanism ({job_on_partition})")
131 |             else:
132 |                 print(f"{preamble} {name} is waiting to be scheduled ({job_on_partition})")
133 |         else:
134 |             remaining_wait_time = get_remaining_time(start_time)
135 |             print(f"{preamble} {name} is scheduled to start in {remaining_wait_time} (at {start_time}) ({job_on_partition})")
136 | 
137 |         return True
138 |     else:
139 |         # Check that we don't get some 3rd state
140 |         print(f"{preamble} {name} is unknown - fix me: (at {start_time}) ({job_on_partition}) ({notes})")
141 | 
142 | 
143 | def get_args():
144 |     parser = argparse.ArgumentParser()
145 |     parser.add_argument("--job-name", type=str, required=True, help="slurm job name")
146 |     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
147 |     parser.add_argument("--no-email", action='store_true', help="do not email alerts")
148 |     return parser.parse_args()
149 | 
150 | 
151 | def main():
152 | 
153 |     check_running_on_jean_zay()
154 | 
155 |     args = get_args()
156 |     status_lines = get_slurm_group_status()
157 | 
158 |     in_the_system = False
159 |     for l in status_lines:
160 |         #print(f"l=[{l}]")
161 | 
162 |         # XXX: apparently some jobs can be run w/o name and break the split() call, so match our
163 |         # name first and then split
164 |         if args.job_name in l:
165 |             jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
166 |             #print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
167 |             # XXX: add support for regex matching so partial name can be provided
168 |             if name == args.job_name:
169 |                 in_the_system = True
170 |                 process_job(jobid, partition, name, state, time, nodes, start_time, notes)
171 | 
172 |     if not in_the_system:
173 |         preamble = get_preamble()
174 |         print(f"{preamble} ***ALERT: {args.job_name} is not RUNNING or SCHEDULED! Alert someone at Eng WG***")
175 |         if not args.no_email:
176 |             send_email_alert_job_not_scheduled(args.job_name)
177 | 
178 | 
179 | if __name__ == "__main__":
180 | 
181 |     main()
182 | 


--------------------------------------------------------------------------------
/training/fault-tolerance/slurm-status.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --job-name=tr11-176B-ml      # job name
 3 | #SBATCH --ntasks=1                   # number of MP tasks
 4 | #SBATCH --nodes=1                    # number of nodes
 5 | #SBATCH --cpus-per-task=1            # number of cores per task
 6 | #SBATCH --hint=nomultithread         # we get physical cores not logical
 7 | #SBATCH --time=0:30:00               # maximum execution time (HH:MM:SS)
 8 | #SBATCH --output=%x-%j.out           # output file name
 9 | #SBATCH --partition=compil
10 | 
11 | echo "START TIME: $(date)"
12 | 
13 | variant=main
14 | DATA_OUTPUT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml
15 | CHECKPOINT_PATH=$DATA_OUTPUT_PATH/checkpoints/$variant
16 | REPO_PATH=$DATA_OUTPUT_PATH/tr11-176B-ml-logs
17 | LOGS_PATH=$REPO_PATH/logs/$variant
18 | 
19 | MAIN_LOG_FILE=$LOGS_PATH/main_log.txt
20 | BIG_SCIENCE_REPO_PATH=$six_ALL_CCFRWORK/code/tr11-176B-ml/bigscience
21 | WATCH_SLURM_NAME=tr11-176B-ml
22 | 
23 | $BIG_SCIENCE_REPO_PATH/tools/slurm-status.py --job-name $WATCH_SLURM_NAME 2>&1 | tee -a $MAIN_LOG_FILE
24 | 
25 | echo "END TIME: $(date)"
26 | 


--------------------------------------------------------------------------------
/training/hparams.md:
--------------------------------------------------------------------------------
 1 | # Selecting Training Hyper-Parameters And Model Initializations
 2 | 
 3 | The easiest way to find a good hparam and model init starter set is to steal it from a similar training that you know has succeeded. Here is a [collection of public training LLM/VLM logbooks](../resources/README.md#publicly-available-training-llmvlm-logbooks) to get you started. The other common source is papers if they disclose that information. You can also try to reach out to the authors and ask them for these details if they didn't publish it.
 4 | 
 5 | ## Glossary
 6 | 
 7 | Training jargon uses a multitude of abbreviations and terms, so here are some important ones for this chapter.
 8 | 
 9 | - BS: Batch Size - here we mean batch size per gpu, often it is also referred to as MBS (micro-batch-size)
10 | - GBS: Global Batch Size - total batch size per iteration - may include gradient accumulation
11 | - GAS: Gradient Accumulation Steps - how many forward/backward cycles to perform before one full iteration is complete
12 | - TFLOPS: trillion floating point operations per second - see [FLOPS](https://en.wikipedia.org/wiki/FLOPS)
13 | - PP: Pipeline Parallelism
14 | 
15 | ## Global Batch Size Ramp Up
16 | 
17 | If you intend to train with a very large GBS, say 1024 or 2048 samples or even higher, it's very wasteful to feed such large batch sizes to the model when you just start training. At this point the model is essentially random and can't benefit from the more refined gradient estimate that a huge batch provides. Therefore, to save data and resources, one often ramps up the global batch size over some period of time.
18 | 
19 | It's also important not to start with a GBS that is too small, since otherwise progress won't be efficient. When there is too little data per iteration the compute (TFLOPS) is used inefficiently and everything slows down. This is especially so when Pipeline Parallelism (PP) is used, since the most important thing about tuning PP is keeping the GPU idleness bubble small, and the smaller the GBS the larger the bubble is.
20 | 
21 | For example, for BLOOM-176B, where we did use PP, after doing throughput benchmarking we found that starting with GBS=16 was incredibly slow (8 TFLOPs), so we eventually started with GBS=192 (73 TFLOPs) and then we ramped up to GBS=2048 (150 TFLOPs) - we increased GBS by 16 every 9_765_625 samples.
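
As a rough illustration, here is a minimal sketch of such a linear ramp-up schedule (the numbers below are placeholders, not the exact BLOOM configuration):

```
def gbs_for_consumed_samples(consumed_samples, start_gbs=192, increment=16,
                             target_gbs=2048, samples_per_increment=1_000_000):
    # grow the global batch size by `increment` every `samples_per_increment`
    # consumed samples until `target_gbs` is reached
    steps = consumed_samples // samples_per_increment
    return min(target_gbs, start_gbs + steps * increment)

print(gbs_for_consumed_samples(0))           # 192
print(gbs_for_consumed_samples(5_000_000))   # 272
print(gbs_for_consumed_samples(10**9))       # 2048
```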
22 | 
23 | 
24 | 
25 | ### STD Init
26 | 
27 | This hyperparameter is super-important and it requires math to get it right. For details see [STD Init](instabilities#std-init).
28 | 


--------------------------------------------------------------------------------
/training/images/fp16-bf16-fp8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/images/fp16-bf16-fp8.png


--------------------------------------------------------------------------------
/training/images/fp32-tf32-fp16-bf16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/images/fp32-tf32-fp16-bf16.png


--------------------------------------------------------------------------------
/training/images/mixed-precision-fp16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/images/mixed-precision-fp16.png


--------------------------------------------------------------------------------
/training/instabilities/README.md:
--------------------------------------------------------------------------------
 1 | # Avoiding, Recovering From and Understanding Instabilities
 2 | 
 3 | Sub-sections:
 4 | 
 5 | * [Understanding Training Loss Patterns](training-loss-patterns.md) - types of spikes, divergences, grokking moments, resumes, etc.
 6 | 
 7 | ## Learning from Training Logbooks
 8 | 
 9 | The best way to learn is to read [publicly available training LLM/VLM logbooks](../../resources#publicly-available-training-llmvlm-logbooks) because there you can see exactly what happened and how each problem was overcome.
10 | 
11 | 
12 | ## STD Init
13 | 
14 | Correctly choosing the initial distribution of the tensors can have a tremendous impact on the training's stability. The `std` value isn't fixed and depends on the hidden dimension size.
15 | 
16 | This proved to be a crucial setting in our pre-BLOOM 104B experiments, and we couldn't break past the first few thousand iterations until we figured out that the 0.02 default `--init-method-std` in Megatron-LM was way too big for our model.
17 | 
18 | We referred to these two sources:
19 | 
20 | 1. "Transformers without Tears" paper https://arxiv.org/abs/1910.05895 prescribes: `sqrt(2/(NHIDDEN*5))`
21 | 
22 | 2. The 530B training paper https://arxiv.org/abs/2201.11990 used an even smaller init formula: `sqrt(1/(NHIDDEN*3))`
23 | 
24 | and we decided to go with the 530B one as it leads to an even smaller init value.
25 | 
26 | To make it easier to compare the two formulas, they can be rewritten as:
27 | 1. `sqrt(0.4000/NHIDDEN)`
28 | 2. `sqrt(0.3333/NHIDDEN)`
29 | 
30 | Thus for `NHIDDEN=14336` the math was `sqrt(1/(14336*3)) = 0.00482` and that's what we used. It surely wasn't the only reason why we had no stability issues during BLOOM-176B training, but I think it was one of the crucial ones.
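
To reproduce the arithmetic for your own `NHIDDEN`:

```
import math

nhidden = 14336
print(f"sqrt(2/(NHIDDEN*5)) = {math.sqrt(2/(nhidden*5)):.5f}")  # 0.00528 - "Transformers without Tears"
print(f"sqrt(1/(NHIDDEN*3)) = {math.sqrt(1/(nhidden*3)):.5f}")  # 0.00482 - the 530B formula we used
```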
31 | 
32 | 
33 | ## Numerical instabilities
34 | 
35 | Certain mathematical operations could be unstable when dealing with low precision numbers.
36 | 
37 | For example, please see this very interesting [PyTorch guide on numerical stability](https://pytorch.org/docs/stable/notes/numerical_accuracy.html).
38 | 
39 | Now let's look at a specific example of this concept in action.
40 | 
41 | During 104B training experiments where fp16 mixed precision was used - the following improvement was proposed by [Corby Rosset](https://github.com/corbyrosset) to make [self-attention more stable](https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/118).
42 | 
43 | Specifically this [line](https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/c839a8aa30731f71b3738d56009be9668508e366/megatron/model/transformer.py#L303) shows that the `norm_factor` may be multiplied after the Query * Key matrix multiplication. If the dims of Q and K are very large, the output may blow up and the `norm_factor` won't be able to save it.
44 | 
45 | Proposal: move the `norm_factor` inward, so Q and K are scaled down before matrix multiply:
46 | ```
47 |         matmul_result = torch.baddbmm(
48 |             matmul_result,
49 |             1.0/math.sqrt(self.norm_factor) * query_layer.transpose(0, 1),   # [b * np, sq, hn]
50 |             1.0/math.sqrt(self.norm_factor) * key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
51 |             beta=0.0 if alibi is None else 1.0, alpha=1.0)
52 | 
53 |         # change view to [b, np, sq, sk]
54 |         attention_scores = matmul_result.view(*output_size)
55 | ```
56 | 
57 | To make the operation mathematically equivalent, moving the norm factor inward requires taking its
58 | square root, since for a scalar n and matrices A and B:
59 | ```
60 | n * (A dot B) === (sqrt(n) * A) dot (sqrt(n) * B)
61 | ```
62 | 
63 | Now A and B dimensions can be significantly larger.
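
A quick numerical sanity check of that identity:

```
import math
import torch

n = 0.125  # a stand-in for 1/norm_factor
A = torch.randn(4, 8, dtype=torch.float64)
B = torch.randn(8, 4, dtype=torch.float64)
lhs = n * (A @ B)
rhs = (math.sqrt(n) * A) @ (math.sqrt(n) * B)
print(torch.allclose(lhs, rhs))  # True
```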
64 | 
65 | For CUDA kernel writers [CuBlas](https://docs.nvidia.com/cuda/cublas/index.html)'s `GemmStridedBatchedEx` at the time of this writing has a similar issue. It is defined as:
66 | 
67 | ```
68 | C + i*strideC = α op(A + i*strideA) op(B + i*strideB) + β (C + i*strideC), for i ∈ [0, batchCount−1]
69 | ```
70 | 
71 | The issue is that `alpha` is multiplied after the matrix-matrix multiplication is done so it can cause instability.
72 | 
73 | ## "Bad" combination of data batch and model parameter state
74 | 
75 | PaLM team observed dozens of loss spikes at "highly irregular intervals" when training larger models. While they were not able to track down the root cause, they mitigated the issue by restarting from an earlier checkpoint and skipping potentially problematic data batches. [Section 5.1 Training instability](https://arxiv.org/pdf/2204.02311.pdf)
76 | 
77 | 
78 | ## Time-domain correlation divergence in Adam
79 | 
80 | [A Theory on Adam Instability in Large-Scale Machine Learning](https://arxiv.org/abs/2304.09871) performs a rigorous study of divergence spikes while training LLMs at up to 546B parameters - and suggests that the time-domain correlation leads to the divergence of Adam. This is triggered by the epsilon value not being small enough, so that the gradient
81 | estimation components become similar in magnitude to the epsilon.
82 | 
83 | In section 7.1 they propose practical suggestions, the most interesting of which is setting epsilon to 0 and possibly dealing with the division-by-zero condition.
84 | 


--------------------------------------------------------------------------------
/training/instabilities/images/bloom-176B-success.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/bloom-176B-success.png


--------------------------------------------------------------------------------
/training/instabilities/images/idefics-80b-tr-190-01-image2text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/idefics-80b-tr-190-01-image2text.png


--------------------------------------------------------------------------------
/training/instabilities/images/idefics-80b-tr-190-01-losses-2023-06-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/idefics-80b-tr-190-01-losses-2023-06-04.png


--------------------------------------------------------------------------------
/training/instabilities/images/idefics-80b-tr-190-01-spike-2023-05-27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/idefics-80b-tr-190-01-spike-2023-05-27.png


--------------------------------------------------------------------------------
/training/instabilities/images/idefics-80b-tr-190-01-spike-recover-2023-05-30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/idefics-80b-tr-190-01-spike-recover-2023-05-30.png


--------------------------------------------------------------------------------
/training/instabilities/images/llama-7b-grokking-no-zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/llama-7b-grokking-no-zoom.png


--------------------------------------------------------------------------------
/training/instabilities/images/llama-7b-grokking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/llama-7b-grokking.png


--------------------------------------------------------------------------------
/training/instabilities/images/pre-bloom-104B-en-fail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/pre-bloom-104B-en-fail.png


--------------------------------------------------------------------------------
/training/instabilities/images/pre-bloom-tr1-13B-glitch-1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/pre-bloom-tr1-13B-glitch-1-2.png


--------------------------------------------------------------------------------
/training/instabilities/images/pre-bloom-tr8-104B-glitch-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/pre-bloom-tr8-104B-glitch-1.png


--------------------------------------------------------------------------------
/training/instabilities/images/pre-bloom-tr8-104B-glitch-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/pre-bloom-tr8-104B-glitch-5.png


--------------------------------------------------------------------------------
/training/instabilities/images/pre-bloom-tr8-104B-glitch-7-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/pre-bloom-tr8-104B-glitch-7-10.png


--------------------------------------------------------------------------------
/training/instabilities/images/ptl-repeat-data-p1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/ptl-repeat-data-p1.png


--------------------------------------------------------------------------------
/training/instabilities/images/ptl-repeat-data-p2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/ptl-repeat-data-p2.png


--------------------------------------------------------------------------------
/training/instabilities/images/ptl-repeat-data-p3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/instabilities/images/ptl-repeat-data-p3.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/all-reduce-reduce-scatter-all-gather.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/all-reduce-reduce-scatter-all-gather.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/deepspeed-ulysses-math.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/deepspeed-ulysses-math.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/deepspeed-ulysses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/deepspeed-ulysses.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/dist-flash-attn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/dist-flash-attn.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-deepspeed-3d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-deepspeed-3d.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-flexflow.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-flexflow.jpeg


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-gpipe-bubble.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-gpipe-bubble.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-pp-dualpipe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-pp-dualpipe.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-sagemaker-interleaved-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-sagemaker-interleaved-pipeline.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-tp-independent-gelu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-tp-independent-gelu.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-tp-parallel_gemm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-tp-parallel_gemm.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-tp-parallel_self_attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-tp-parallel_self_attention.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-tp-parallel_shard_processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-tp-parallel_shard_processing.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-zero-dp-pp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-zero-dp-pp.png


--------------------------------------------------------------------------------
/training/model-parallelism/images/parallelism-zero.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/model-parallelism/images/parallelism-zero.png


--------------------------------------------------------------------------------
/training/performance/benchmarks/activation-memory-per-layer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # This script derives the coefficient num_of_hidden_states_copies in `num_of_hidden_states_copies * bs * seqlen * hidden_size`, which
 4 | # roughly corresponds to the number of hidden_states copies a given model architecture makes during a single layer's forward.
 5 | 
 6 | import torch
 7 | from transformers import AutoModelForCausalLM
 8 | 
 9 | #model_name_or_path = "Qwen/Qwen3-4B"
10 | model_name_or_path = "google/gemma-1.1-2b-it"
11 | #model_name_or_path = "meta-llama/Llama-3.1-8B-Instruct"
12 | #model_name_or_path = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"
13 | #model_name_or_path = "HuggingFaceTB/SmolLM2-360M"
14 | #model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.3"
15 | #model_name_or_path = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
16 | 
17 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
18 | dtype = torch.bfloat16
19 | dtype_bytes = torch.tensor([], dtype=dtype).element_size() # 2 for bf16
20 | 
21 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=dtype, trust_remote_code=True).to(device)
22 | 
23 | bs = 1
24 | seqlen = 32384
25 | hidden_size = model.config.hidden_size
26 | 
27 | hidden_states = torch.rand((bs, seqlen, hidden_size), requires_grad=True, dtype=dtype, device=device)
28 | position_ids = torch.randint(0, seqlen, [bs, seqlen], device=device)
29 | position_embeddings = model.model.rotary_emb(hidden_states, position_ids)
30 | 
31 | decoder_layer = model.model.layers[0]
32 | 
33 | torch.cuda.empty_cache()
34 | before = torch.cuda.memory_allocated()
35 | hidden_states = decoder_layer(hidden_states=hidden_states,
36 |         attention_mask=None,
37 |         position_ids=position_ids,
38 |         position_embeddings=position_embeddings)
39 | after = torch.cuda.memory_allocated()
40 | delta = after - before
41 | 
42 | print(f'{delta / (bs * seqlen * hidden_size * dtype_bytes):.1f} "{model_name_or_path}"')
43 | 


--------------------------------------------------------------------------------
/training/performance/benchmarks/dataloader/num-workers-bench.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | 
 5 | This benchmark shows that num_workers>0 leads to better performance
 6 | 
 7 | usage:
 8 | 
 9 | ./num-workers-bench.py
10 | 
11 | """
12 | 
13 | import torch
14 | import time
15 | 
16 | class MyDataset(torch.utils.data.Dataset):
17 | 
18 |     def __init__(self):
19 |         self.tensor = torch.ones(1*2**18) # 1 mb tensor
20 | 
21 |     def __len__(self):
22 |         return 1000
23 | 
24 |     def __getitem__(self, idx):
25 |         # emulate a slow data transform
26 |         time.sleep(0.005)
27 |         return self.tensor
28 | 
29 | num_runs = 10
30 | num_workers = 5
31 | batch_size = 100
32 | compute_emulation_time = 0.2
33 | 
34 | ds = MyDataset()
35 | start_event = torch.cuda.Event(enable_timing=True)
36 | end_event = torch.cuda.Event(enable_timing=True)
37 | device = "cuda:0"
38 | 
39 | for num_workers in range(5):
40 |     dl = torch.utils.data.DataLoader(
41 |         ds,
42 |         batch_size=batch_size,
43 |         pin_memory=True,
44 |         num_workers=num_workers,
45 |     )
46 |     duration = 0
47 |     for i in range(num_runs):
48 |         slept_time = 0
49 |         start_event.record()
50 |         for batch in dl:
51 |             batch = batch.to(device=device, non_blocking=True)
52 |             # emulate a compute delay to give workers a chance to reload, otherwise the benchmark
53 |             # will be measuring waiting for workers
54 |             time.sleep(compute_emulation_time)
55 |             # will then subtract this artificial delay from the total to try to isolate
56 |             # the iterator's overhead
57 |             slept_time += compute_emulation_time
58 |         end_event.record()
59 |         torch.cuda.synchronize()
60 |         duration += start_event.elapsed_time(end_event) / 1000 - slept_time
61 |     duration /= num_runs
62 |     print(f"num_workers={num_workers}: average time: {duration:0.3f}")
63 | 


--------------------------------------------------------------------------------
/training/performance/benchmarks/dataloader/pin-memory-non-block-bench.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | 
 5 | This benchmark shows that a combo of:
 6 | 
 7 | (1) DataLoader(pin_memory=True, ...)
 8 | (2) batch.to(device="cuda", non_blocking=True)
 9 | 
10 | leads to a faster transfer from the workers to the process doing compute and a potential overlap between the compute and the data movement
11 | 
12 | See:
13 | - https://pytorch.org/docs/stable/notes/cuda.html#use-pinned-memory-buffers
14 | - https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/
15 | 
16 | usage:
17 | 
18 | ./pin-memory-non-block-bench.py
19 | 
20 | """
21 | 
22 | import torch
23 | import time
24 | 
25 | class MyDataset(torch.utils.data.Dataset):
26 | 
27 |     def __init__(self):
28 |         self.tensor = torch.ones(1*2**18) # 1 mb tensor
29 | 
30 |     def __len__(self):
31 |         return 1000
32 | 
33 |     def __getitem__(self, idx):
34 |         return self.tensor
35 | 
36 | num_runs = 10
37 | num_workers = 5
38 | batch_size = 100
39 | compute_emulation_time = 0.2
40 | 
41 | ds = MyDataset()
42 | start_event = torch.cuda.Event(enable_timing=True)
43 | end_event = torch.cuda.Event(enable_timing=True)
44 | device = "cuda:0"
45 | 
46 | for pm in [True, False]:
47 |     for nb in [True, False]:
48 | 
49 |         dl = torch.utils.data.DataLoader(
50 |             ds,
51 |             batch_size=batch_size,
52 |             pin_memory=pm,
53 |             num_workers=num_workers,
54 |         )
55 |         duration = 0
56 |         for i in range(num_runs):
57 |             slept_time = 0
58 |             start_event.record()
59 |             for batch in dl:
60 |                 # non_blocking=True further speeds things up in addition to pinned memory
61 |                 batch = batch.to(device=device, non_blocking=nb)
62 |                 # emulate a compute delay to give workers a chance to reload, otherwise the benchmark
63 |                 # will be measuring waiting for workers
64 |                 time.sleep(compute_emulation_time)
65 |                 # will then subtract this artificial delay from the total to try to isolate
66 |                 # the iterator's overhead
67 |                 slept_time += compute_emulation_time
68 |             end_event.record()
69 |             torch.cuda.synchronize()
70 |             duration += start_event.elapsed_time(end_event) / 1000 - slept_time
71 |         duration /= num_runs
72 |         print(f"pin_memory={pm!s:>5}, non_blocking={nb!s:>5}: average time: {duration:0.3f}")
73 | 


--------------------------------------------------------------------------------
/training/performance/benchmarks/matrix-shape/swiglu-maf-bench.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | 
 5 | This script will help you find the intermediate value of the hidden layer of the MLP when SwiGLU is
 6 | used.
 7 | 
 8 | It performs a brute force search for the best number closest to 8/3*h that would give the highest
 9 | TFLOPS for a matmul of [b*s, h]×[h, 8/3*h]
10 | 
11 | Despite SwiGLU MLP using 3 matrices, this script searches only one matmul, since the performance is
12 | the same for each matmul.
13 | 
14 | In the situation where tensor parallelism is used with tp>1 it'd be even faster to search for m1 =
15 | m/tp - so 1/8th with tp=8
16 | 
17 | To adapt for your situation please modify the search parameters below.
18 | 
19 | This benchmark was written for the paper The Case for Co-Designing Model Architectures with
20 | Hardware: https://arxiv.org/abs/2401.14489
21 | 
22 | """
23 | 
24 | import torch
25 | from tqdm import trange
26 | 
27 | ### Modify the Search Parameters Begin ###
28 | 
29 | # this is the hidden_size of the model
30 | d_hidden = 4096
31 | 
32 | # Now either let the 8/3 ratio give the starting dimension size or choose your own - the 8/3 is
33 | # only a suggestion to compensate for the 3rd additional matrix
34 | d_ff_base = int(8/3*d_hidden)
35 | #d_ff_base = 11008
36 | 
37 | # batch size - make it larger for small matrices
38 | batch_size = 2**2
39 | 
40 | # add more profiler iterations for small matrices
41 | num_iterations = 100
42 | 
43 | # searching range: d_ff_base-distance < d_ff_base < d_ff_base+distance
44 | distance = 100
45 | 
46 | ### Modify the Search Parameters End ###
47 | 
48 | def benchmark_bmm(b, m, n, k, num_iterations=100, num_matmuls=1):
49 |     A = torch.randn((b, m, n)).half().to("cuda:0")
50 |     B = torch.randn((b, n, k)).half().to("cuda:0")
51 |     C = torch.empty((b, m, k)).half().to("cuda:0")
52 |     num_warmup_iterations = 50
53 | 
54 |     start_event = torch.cuda.Event(enable_timing=True)
55 |     end_event = torch.cuda.Event(enable_timing=True)
56 | 
57 |     for i in range(num_warmup_iterations + num_iterations):
58 |         if i == num_warmup_iterations:
59 |             start_event.record()
60 |         with torch.no_grad():
61 |             for i in range(num_matmuls):
62 |                 torch.bmm(A, B, out=C)
63 |     end_event.record()
64 |     torch.cuda.synchronize()
65 |     elapsed_time = start_event.elapsed_time(end_event) / (1000 * num_iterations)
66 |     flops_per_sec = (2 * b * m * n * k * num_matmuls) / (elapsed_time * 10**12)
67 |     #print(f"Elapsed time for {num_matmuls} times {b}x{m}x{n}x{k} : {elapsed_time:.3f}")
68 |     #print(f"Throughput (in TFLOP/s) for {b}x{m}x{n}x{k}: {flops_per_sec:.3f}")
69 |     #print("-" * 80)
70 |     return flops_per_sec
71 | 
72 | 
73 | print(f"Wanted the closest to {d_ff_base} d_ff value that leads to the highest TFLOPS (d_hidden={d_hidden})\n")
74 | print(f"Searching {int(distance/2)} steps in the range of {d_ff_base-distance} .. {d_ff_base+distance}")
75 | results = {}
76 | for d in trange(-distance, distance, 4):
77 |     d_ff = d_ff_base + d
78 |     # find closest div 4 number, pointless to search odd numbers
79 |     d_ff -= d_ff % 4
80 |     #print(d_ff)
81 |     results[d_ff] = benchmark_bmm(batch_size, m=d_hidden, n=d_ff, k=d_hidden, num_iterations=num_iterations, num_matmuls=1)
82 | 
83 | starting_tflops_per_sec = benchmark_bmm(batch_size, m=d_hidden, n=d_ff_base, k=d_hidden, num_iterations=num_iterations, num_matmuls=1)
84 | print("Results: baseline, followed by near-by best performing d_ff results:\n")
85 | print(" d_ff  tflops mlp_params")
86 | print("-" * 25)
87 | print(f"{d_ff_base} {starting_tflops_per_sec:7.2f} {3*d_ff_base*d_hidden}")
88 | print("-" * 25)
89 | cut_off = 5  # how many results do you want to see
90 | for d_ff in list(reversed(sorted(results, key=lambda x: results[x])))[:cut_off]:
91 |     print(f"{d_ff} {results[d_ff]:7.2f} {3*d_ff*d_hidden}")
92 | 


--------------------------------------------------------------------------------
/training/performance/benchmarks/numa/numa-set-pynvml.py:
--------------------------------------------------------------------------------
 1 | # this helper util will assign the cpu-cores belonging to the same NUMA node as the GPU
 2 | 
 3 | # derived from
 4 | # https://github.com/NVIDIA/DeepLearningExamples/blob/9dd9fcb98f56187e49c5ee280cf8dbd530dde57b/TensorFlow2/LanguageModeling/BERT/gpu_affinity.py
 5 | 
 6 | import os
 7 | import math
 8 | import pynvml as nvml
 9 | 
10 | nvml.nvmlInit()
11 | 
12 | def set_numa_affinity(gpu_index, verbose=False):
13 |     """This util will assign to the current process the cpu cores set that resides on the same NUMA
14 |     node as the GPU. Typically if you have 8 GPUs, then the first 4 are on the first NUMA node and
15 |     the remaining 4 are on the second.
16 | 
17 |     `gpu_index` is typically the same as `LOCAL_RANK` in the distributed training, but beware that
18 |     `CUDA_VISIBLE_DEVICES` could impact that. e.g. `CUDA_VISIBLE_DEVICES=0,7` won't do the right
19 |     thing - then you will probably want to remap the ids with something like:
20 | 
21 |     ```
22 |     if "CUDA_VISIBLE_DEVICES" in os.environ:
23 |         ids = list(map(int, os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")))
24 |         gpu_index = ids[gpu_index] # remap
25 |     ```
26 | 
27 |     """
28 | 
29 | 
30 |     num_elements = math.ceil(os.cpu_count() / 64)
31 |     handle = nvml.nvmlDeviceGetHandleByIndex(gpu_index)
32 |     affinity_string = ""
33 |     for j in nvml.nvmlDeviceGetCpuAffinity(handle, num_elements):
34 |         # assume nvml returns list of 64 bit ints
35 |         affinity_string = f"{j:064b}{affinity_string}"
36 |     affinity_list = [int(x) for x in affinity_string]
37 |     affinity_list.reverse()  # so core 0 is the 0th element
38 |     affinity_to_set = [i for i, e in enumerate(affinity_list) if e != 0]
39 | 
40 |     if verbose:
41 |         cores = os.sched_getaffinity(0)
42 |         print(f"before: {len(cores)} visible cpu cores: {cores}")
43 |     os.sched_setaffinity(0, affinity_to_set)
44 |     if verbose:
45 |         cores = os.sched_getaffinity(0)
46 |         print(f"after: {len(cores)} visible cpu cores: {cores}")
47 | 
48 | if __name__ == "__main__":
49 | 
50 |     # pretend we are process that drives gpu 0
51 |     set_numa_affinity(0, verbose=True)
52 | 


--------------------------------------------------------------------------------
/training/performance/benchmarks/numa/numa-set.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash
 2 | 
 3 | # this helper util performs NUMA node binding which can be used with torchrun, and other launchers
 4 | # contributed by https://github.com/yifuwang
 5 | 
 6 | # 1. first make it executable:
 7 | #
 8 | # chmod a+x ./numa-set.sh
 9 | #
10 | # 2. launch torchrun and test that it assigns the cores correctly
11 | #
12 | # torchrun --nproc_per_node=8 --no-python ./numa-set.sh \
13 | # python -c 'import os; cs=os.sched_getaffinity(0); print(f"{len(cs)} visible cpu cores: {cs}")'
14 | #
15 | # so if your original torchrun launcher looked like:
16 | #
17 | # torchrun --nproc_per_node=8 --nnodes 2 ... train.py
18 | #
19 | # now it'll become:
20 | #
21 | # torchrun --nproc_per_node=8 --nnodes 2 ... --no-python ./numa-set.sh python train.py
22 | 
23 | # Query the bus ID for device LOCAL_RANK
24 | BUS_ID=$(nvidia-smi --query-gpu=pci.bus_id -i $LOCAL_RANK --format=csv,noheader)
25 | BUS_ID=${BUS_ID,,}
26 | 
27 | # Find the numa node for device LOCAL_RANK
28 | NODE=$(cat /sys/bus/pci/devices/${BUS_ID:4}/numa_node)
29 | 
30 | echo "Starting local rank $RANK on NUMA node $NODE"
31 | numactl --cpunodebind=$NODE --membind=$NODE "$@"
32 | 


--------------------------------------------------------------------------------
/training/performance/distributed/torch-dist-mem-usage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | 
 5 | This script demonstrates that when using `torch.distributed` a few GBs of GPU memory are taken up per GPU.
 6 | 
 7 | *** To do a quick test on 2 GPUs:
 8 | 
 9 | python -u -m torch.distributed.run --nproc_per_node=2 --rdzv_endpoint localhost:6000  --rdzv_backend c10d \
10 | torch-dist-mem-usage.py
11 | 
12 | Watch the NV column (which is the equivalent of memory usage in `nvidia-smi`).
13 | 
14 | 
15 | """
16 | 
17 | import gc
18 | import os
19 | import psutil
20 | import pynvml
21 | import torch
22 | import torch.distributed as dist
23 | 
24 | def see_memory_usage(message, force=False, ranks=[0]):
25 |     """
26 |     Arguments:
27 |         message: a preamble message to print before the counter dumps - useful for annotating where each measurement has been taken - e.g. "before foo" and later "after foo"
28 |         force: allows you to leave `see_memory_usage` calls in the code without them doing anything; pass force=True to activate
29 |         ranks: by default prints only on rank 0 but sometimes we need to debug other ranks, so pass the list. Example: ranks=[1,3]
30 |     """
31 | 
32 |     if not force:
33 |         return
34 |     rank = dist.get_rank() if dist.is_initialized() else 0
35 |     if rank not in ranks:
36 |         return
37 | 
38 |     # python's cyclic garbage collection isn't immediate, so run it explicitly to get correct RAM reports
39 |     gc.collect()
40 | 
41 |     # this would be bad for production, only use during debug
42 |     torch.cuda.empty_cache()
43 | 
44 |     # collect raw memory usage outside pytorch
45 |     pynvml.nvmlInit()
46 |     # `rank` was already computed above; it doubles as the gpu index here (assumes one process per gpu on a single node)
47 |     handle = pynvml.nvmlDeviceGetHandleByIndex(rank)
48 |     memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
49 |     pynvml.nvmlShutdown()
50 |     nv_mem = memory_info.used
51 | 
52 |     vm_stats = psutil.virtual_memory()
53 |     used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2)
54 | 
55 |     accelerator_mem_str = " | ".join([
56 |         f"MA {round(torch.cuda.memory_allocated() / 2**30, 2):0.2f} GB",
57 |         f"Max_MA {round(torch.cuda.max_memory_allocated() / 2**30, 2):0.2f} GB",
58 |         f"CA {round(torch.cuda.memory_reserved() / 2**30, 2):0.2f} GB",
59 |         f"Max_CA {round(torch.cuda.max_memory_reserved() / 2**30, 2):0.2f} GB",
60 |         f"NV {round(nv_mem / 2**30, 2):0.2f} GB",
61 |     ])
62 |     cpu_mem_str = f"CPU Virtual Memory:  used = {used_GB} GB, percent = {vm_stats.percent}%"
63 | 
64 |     # add '[rank] mp' prefix to enable easy grep
65 |     print(f"[{rank}] mp: {message}")
66 |     print(f"[{rank}] mp: " + " | ".join([accelerator_mem_str, cpu_mem_str]))
67 | 
68 |     # get the peak memory to report correct data, so reset the counter for the next call
69 |     torch.cuda.reset_peak_memory_stats()
70 | 
71 | 
72 | def init_processes(local_rank, backend='nccl'):
73 |     torch.cuda.set_device(local_rank)
74 | 
75 |     # if we don't pass `device_id` arg, the memory allocation won't happen till the first `barrier` call in this example.
76 |     dist.init_process_group(backend)
77 |     # if passing device_id arg, some memory will get used earlier already in `init_process_group`
78 |     # device = torch.device("cuda", local_rank)
79 |     # dist.init_process_group(backend, device_id=device)
80 |     see_memory_usage("before barrier", force=True)
81 |     dist.barrier()
82 |     see_memory_usage("after barrier", force=True)
83 |     dist.barrier()
84 |     see_memory_usage("after 2nd barrier", force=True)
85 |     dist.destroy_process_group()
86 |     see_memory_usage("after dist destroy", force=True)
87 | 
88 | if __name__ == "__main__":
89 |     local_rank = int(os.environ["LOCAL_RANK"])
90 |     init_processes(local_rank=local_rank)
91 | 


--------------------------------------------------------------------------------
/training/performance/images/a100-server-hwloc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/performance/images/a100-server-hwloc.png


--------------------------------------------------------------------------------
/training/performance/images/attention-less-heads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/performance/images/attention-less-heads.png


--------------------------------------------------------------------------------
/training/performance/images/flash-attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/performance/images/flash-attention.png


--------------------------------------------------------------------------------
/training/performance/images/tiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/performance/images/tiling.png


--------------------------------------------------------------------------------
/training/performance/images/wave-quant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stas00/ml-engineering/0099885db36a8f06556efe1faf552518852cb1e0/training/performance/images/wave-quant.png


--------------------------------------------------------------------------------
/training/re-train-hub-models.md:
--------------------------------------------------------------------------------
 1 | # Re-train HF Hub Models From Scratch Using Finetuning Examples
 2 | 
 3 | HF Transformers has awesome finetuning examples at https://github.com/huggingface/transformers/tree/main/examples/pytorch, which cover pretty much any modality, and these examples work out of the box.
 4 | 
 5 | **But what if you wanted to re-train from scratch rather than finetune?**
 6 | 
 7 | Here is a simple hack to accomplish that.
 8 | 
 9 | As an example we will use `facebook/opt-1.3b` and plan to train it in the bf16 regime:
10 | 
11 | ```
12 | cat << EOT > prep-bf16.py
13 | from transformers import AutoConfig, AutoModel, AutoTokenizer
14 | import torch
15 | 
16 | mname = "facebook/opt-1.3b"
17 | 
18 | config = AutoConfig.from_pretrained(mname)
19 | model = AutoModel.from_config(config, torch_dtype=torch.bfloat16)
20 | tokenizer = AutoTokenizer.from_pretrained(mname)
21 | 
22 | path = "opt-1.3b-bf16"
23 | 
24 | model.save_pretrained(path)
25 | tokenizer.save_pretrained(path)
26 | EOT
27 | ```
28 | 
29 | now run:
30 | 
31 | ```
32 | python prep-bf16.py
33 | ```
34 | 
35 | This will create a folder `opt-1.3b-bf16` with everything you need to train the model from scratch. In other words you have a pretrained-like model, except only its weight initialization has been done and no training yet.
36 | 
37 | Adjust the script above to use `torch.float16` or `torch.float32` if that's what you plan to use instead, as shown below.
38 | 
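For example, a minimal sketch of an fp16 variant - only the dtype argument changes (the output path name here is just a suggested convention):

```
model = AutoModel.from_config(config, torch_dtype=torch.float16)
path = "opt-1.3b-fp16"
```
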
39 | Now you can proceed with finetuning this saved model as normal:
40 | 
41 | ```
42 | python -m torch.distributed.run \
43 | --nproc_per_node=1 --nnodes=1 --node_rank=0 \
44 | --master_addr=127.0.0.1 --master_port=9901 \
45 | examples/pytorch/language-modeling/run_clm.py --bf16 \
46 | --seed 42 --model_name_or_path opt-1.3b-bf16 \
47 | --dataset_name wikitext --dataset_config_name wikitext-103-raw-v1 \
48 | --per_device_train_batch_size 12 --per_device_eval_batch_size 12 \
49 | --gradient_accumulation_steps 1 --do_train --do_eval --logging_steps 10 \
50 | --save_steps 1000 --eval_steps 100 --weight_decay 0.1 --num_train_epochs 1 \
51 | --adam_beta1 0.9 --adam_beta2 0.95 --learning_rate 0.0002 --lr_scheduler_type \
52 | linear --warmup_steps 500 --report_to tensorboard --output_dir save_dir
53 | ```
54 | 
55 | The key entry being:
56 | ```
57 | --model_name_or_path opt-1.3b-bf16
58 | ```
59 | 
60 | where `opt-1.3b-bf16` is the local directory you have just generated in the previous step.
61 | 
62 | Sometimes it's possible to find the same dataset that the original model was trained on, and sometimes you have to use an alternative dataset.
63 | 
64 | The rest of the hyper-parameters can often be found in the paper or documentation that came with the model.
65 | 
66 | To summarize, this recipe allows you to use finetuning examples to re-train whatever model you can find on [the HF hub](https://huggingface.co/models).
67 | 


--------------------------------------------------------------------------------
/training/reproducibility/README.md:
--------------------------------------------------------------------------------
  1 | # Reproducibility
  2 | 
  3 | ## Achieve determinism in randomness based software
  4 | 
  5 | When debugging, always set a fixed seed for all the Random Number Generators (RNGs) in use so that you get the same data / code path on each re-run.
  6 | 
  7 | Though with so many different systems it can be tricky to cover them all. Here is an attempt to cover a few:
  8 | 
  9 | ```
 10 | import random, torch, numpy as np
 11 | def enforce_reproducibility(use_seed=None):
 12 |     seed = use_seed if use_seed is not None else random.randint(1, 1000000)
 13 |     print(f"Using seed: {seed}")
 14 | 
 15 |     random.seed(seed)    # python RNG
 16 |     np.random.seed(seed) # numpy RNG
 17 | 
 18 |     # pytorch RNGs
 19 |     torch.manual_seed(seed)          # cpu + cuda
 20 |     torch.cuda.manual_seed_all(seed) # multi-gpu - can be called without gpus
 21 |     if use_seed: # slower speed! https://pytorch.org/docs/stable/notes/randomness.html#cuda-convolution-benchmarking
 22 |         torch.backends.cudnn.deterministic = True
 23 |         torch.backends.cudnn.benchmark     = False
 24 | 
 25 |     return seed
 26 | ```
 27 | a few possible others if you use those subsystems/frameworks instead:
 28 | ```
 29 |     torch.npu.manual_seed_all(seed)
 30 |     torch.xpu.manual_seed_all(seed)
 31 |     tf.random.set_seed(seed)
 32 | ```
 33 | 
 34 | When you rerun the same code again and again to solve some problem set a specific seed at the beginning of your code with:
 35 | ```
 36 | enforce_reproducibility(42)
 37 | ```
 38 | But as mentioned above, this is for debugging only, since it activates various torch flags that help with determinism but can slow things down, so you don't want this in production.
 39 | 
 40 | In production, however, you can call it like this instead:
 41 | ```
 42 | enforce_reproducibility()
 43 | ```
 44 | i.e. w/o an explicit seed. It'll then pick a random seed and log it, so if something happens in production you can reproduce the exact RNG state the issue was observed in. And there is no performance penalty this time, as the `torch.backends.cudnn` flags are only set if you provide the seed explicitly. Say it logged:
 45 | ```
 46 | Using seed: 1234
 47 | ```
 48 | you then just need to change the code to:
 49 | ```
 50 | enforce_reproducibility(1234)
 51 | ```
 52 | and you will get the same RNG setup.
 53 | 
 54 | As mentioned in the first paragraphs, there could be many other RNGs involved in a system. For example, if you want the data to be fed in the same order by a `DataLoader` you need [to have its seed set as well](https://pytorch.org/docs/stable/notes/randomness.html#dataloader), as sketched below.
 55 | 
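For completeness, here is a minimal sketch of the `DataLoader` seeding recipe from the linked PyTorch notes (here `train_dataset` stands in for your actual dataset and the numbers are arbitrary):

```
import random, torch, numpy as np
from torch.utils.data import DataLoader

def seed_worker(worker_id):
    # derive each worker's seed from torch's seed so that re-runs match
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(42)

dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
)
```
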
 56 | Additional resources:
 57 | - [Reproducibility in pytorch](https://pytorch.org/docs/stable/notes/randomness.html)
 58 | 
 59 | 
 60 | 
 61 | ## Reproduce the software and system environment
 62 | 
 63 | This methodology is useful when you discover a discrepancy in outcomes - in quality or throughput, for example.
 64 | 
 65 | The idea is to log the key components of the environment used to launch a training (or inference) so that if at a later stage it needs to be reproduced exactly as it was it can be done.
 66 | 
 67 | Since there is a huge variety of systems and components being used it's impossible to prescribe a way that will always work. So let's discuss one possible recipe and you can then adapt it to your particular environment.
 68 | 
 69 | This is added to your slurm launcher script (or whatever other way you use to launch the training) - it's a Bash script:
 70 | 
 71 | ```bash
 72 | SAVE_DIR=/tmp # edit to a real persistent path
 73 | export REPRO_DIR=$SAVE_DIR/repro/$SLURM_JOB_ID
 74 | mkdir -p $REPRO_DIR
 75 | # 1. modules (writes to stderr) (remove if you don't use lmod or similar modules implementation)
 76 | module list 2> $REPRO_DIR/modules.txt
 77 | # 2. shell env vars
 78 | /usr/bin/printenv | sort > $REPRO_DIR/env.txt
 79 | # 3. pip (this includes devel installs SHA)
 80 | pip freeze > $REPRO_DIR/requirements.txt
 81 | # 4. uncommitted diff in git clones installed into conda
 82 | perl -nle 'm|"file://(.*?/([^/]+))"| && qx[cd $1; if [ ! -z "\$(git diff)" ]; then git diff > \$REPRO_DIR/$2.diff; fi]' $CONDA_PREFIX/lib/python*/site-packages/*.dist-info/direct_url.json
 83 | ```
 84 | 
 85 | As you can see this recipe is used in a SLURM environment, so every new training will dump the environment specific to the SLURM job. But you can adapt it to any other environment.
 86 | 
 87 | Let's expand on each step in the recipe:
 88 | 
 89 | 1. We save which `modules` were loaded, e.g. in cloud cluster/HPC setups you're likely to be loading the CUDA and cuDNN libraries using this.
 90 | 
 91 |    If you don't use `modules` then remove that entry.
 92 | 
 93 | 2. We dump the shell environment variables. This can be crucial since a single env var like `LD_PRELOAD` or `LD_LIBRARY_PATH` could have a huge impact on performance in some environments.
 94 | 
 95 | 3. We then dump the python environment packages and their versions - this should work with any virtual python environment like `conda`, `venv` or even if you don't use a virtual environment.
 96 | 
 97 |    If you use `uv` instead of `pip`, switch to `uv pip freeze` as it'd be much faster.
 98 | 
 99 | 4. If you use a devel install with `pip install -e .`, `pip freeze` knows nothing about the git clone repository it was installed from other than its git SHA. The issue is that you have likely modified the files locally and `pip freeze` will miss those changes. So this part goes through all the packages that were installed from a local path (we find them by looking inside `site-packages/*.dist-info/direct_url.json`) and saves a diff of any uncommitted changes in those git clones into `$REPRO_DIR`.
100 | 
101 | To save the `apt` packages add:
102 | ```
103 | apt list --installed > $REPRO_DIR/apt-packages.txt
104 | ```
105 | 
106 | If using `conda`, an additional useful tool is [conda-env-compare.pl](https://github.com/stas00/conda-tools/blob/master/conda-env-compare.md) which helps you find the exact differences between 2 conda environments.
107 | 
108 | Anecdotally, my colleague and I were getting very different training TFLOPs on a cloud cluster while running the exact same code - literally launching the same slurm script from the same shared directory. We first compared our conda environments using [conda-env-compare.pl](https://github.com/stas00/conda-tools/blob/master/conda-env-compare.md) and found some differences - I installed the exact packages she had to match her environment and it was still showing a huge performance difference. We then compared the output of `printenv` and discovered that I had `LD_PRELOAD` set up whereas she didn't - and that made a huge difference since this particular cloud provider required multiple env vars to be set to custom paths to get the most out of their hardware.
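
With the dumps described above such comparisons become trivial. A rough sketch of comparing two saved jobs (the job ids here are made up):

```
diff $SAVE_DIR/repro/1001/env.txt          $SAVE_DIR/repro/1002/env.txt
diff $SAVE_DIR/repro/1001/requirements.txt $SAVE_DIR/repro/1002/requirements.txt
```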
109 | 


--------------------------------------------------------------------------------
/training/tools/main_process_first.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tooling for dealing with efficient dataset loading in a multi-process, potentially multi-node environment with shared and local filesystems.
  3 | 
  4 | For notes please see https://github.com/stas00/ml-engineering/blob/master/training/datasets.md#preprocessing-and-caching-datasets-on-the-main-process
  5 | 
  6 | """
  7 | 
  8 | 
  9 | import os
 10 | from contextlib import contextmanager
 11 | from pathlib import Path
 12 | import torch.distributed as dist
 13 | 
 14 | def get_local_rank() -> int:
 15 |     return int(os.getenv("LOCAL_RANK", 0))
 16 | 
 17 | def get_global_rank() -> int:
 18 |     if dist.is_initialized():
 19 |         return dist.get_rank()
 20 |     else:
 21 |         return 0
 22 | 
 23 | # delay the local filesystems lookup until it's needed
 24 | node_fs_types = None
 25 | 
 26 | local_node_fs_types = ["ext", "ext2", "ext3", "ext4", "reiserfs", "jfs", "xfs", "zfs", "btrfs", "ntfs", "overlay"]
 27 | def is_local_fs(path):
 28 |     """ returns True if the `path` resides on the local fs or False otherwise """
 29 |     global node_fs_types
 30 |     if node_fs_types is None:
 31 |         from psutil import disk_partitions
 32 |         node_fs_types = {Path(r.mountpoint):r.fstype for r in disk_partitions(all=True)}
 33 | 
 34 |     return path_to_fs_type(path) in local_node_fs_types
 35 | 
 36 | def path_to_fs_type(path):
 37 |     """
 38 |     Given a fs `path` returns the fs type (ext, ext2, etc.) it resides on.
 39 |     Note that in this implementation a non-existing path will return the fs type of its closest existing parent mount point (ultimately `/`, which is often mapped to "overlay").
 40 |     This is useful since, as long as the partitions are already mounted, you can detect the fs type even before the sub-dirs have been created.
 41 |     """
 42 |     path = Path(path).resolve()
 43 |     if path.is_symlink():
 44 |         path = path.readlink() # py3.9+
 45 | 
 46 |     # assuming at the end we percolate to `/` which is always there so the exit condition is assured
 47 |     if path in node_fs_types:
 48 |         return node_fs_types[path]
 49 | 
 50 |     return path_to_fs_type(path.parent)
 51 | 
 52 | def is_main_process_by_path(path):
 53 |     if is_local_fs(path):
 54 |         return is_local_main_process()
 55 |     else:
 56 |         return is_global_main_process()
 57 | 
 58 | def is_local_main_process():
 59 |     return get_local_rank() == 0
 60 | 
 61 | def is_global_main_process():
 62 |     return dist.get_rank() == 0
 63 | 
 64 | @contextmanager
 65 | def _goes_first(is_main: bool):
 66 |     if not is_main:
 67 |         dist.barrier()
 68 | 
 69 |     yield
 70 | 
 71 |     if is_main:
 72 |         dist.barrier()
 73 | 
 74 | 
 75 | @contextmanager
 76 | def main_process_by_path_first(path):
 77 |     """
 78 |     Lets the global or the local main process go first inside a with block. The decision which to use is based on the `path`. If the `path` is on a local non-shared fs, we use the local main process. If the path is on the shared fs then it's a global main process.
 79 | 
 80 |     The other processes will enter the with block after the main process defined above exits.
 81 | 
 82 |     Important: since this context manager uses a barrier it can't be used around code that requires all ranks to work in sync - e.g. gather, barrier, etc. - it'd lead to a deadlock
 83 | 
 84 |     Example:
 85 | 
 86 |         import time
 87 |         with main_process_by_path_first("/shared_fs/cache"):
 88 |             # This will be printed first by global process 0 then in a seemingly
 89 |             # random order by the other processes.
 90 |             # we presume in this example the path is on a shared fs
 91 |             global_rank = torch.distributed.get_rank()
 92 |             print(f"This will be printed by process {global_rank}")
 93 |             time.sleep(5) # emulate actual work
 94 |     """
 95 |     if is_local_fs(path):
 96 |         with _goes_first(is_local_main_process()):
 97 |             yield
 98 |     else:
 99 |         with _goes_first(is_global_main_process()):
100 |             yield
101 | 
102 | @contextmanager
103 | def global_main_process_first():
104 |     """
105 |     Lets the global main process go first inside a with block.
106 | 
107 |     The other processes will enter the with block after the global main process exits.
108 | 
109 |     Important: since this context manager uses a barrier it can't be used around code that requires all ranks to work in sync - e.g. gather, barrier, etc. - it'd lead to a deadlock
110 | 
111 |     Example:
112 | 
113 |         import time
114 |         global_rank = torch.distributed.get_rank()
115 |         with global_main_process_first():
116 |             # This will be printed first by global process 0 then in a seemingly
117 |             # random order by the other processes.
118 |             print(f"This will be printed by process {global_rank}")
119 |             time.sleep(5) # emulate actual work
120 |     """
121 |     with _goes_first(is_global_main_process()):
122 |         yield
123 | 
124 | @contextmanager
125 | def local_main_process_first():
126 |     """
127 |     Lets the local main process go first inside a with block.
128 | 
129 |     The other processes will enter the with block after the local main process exits.
130 | 
131 |     Important: since this context manager uses a barrier it can't be used around code that requires all ranks to work in sync - e.g. gather, barrier, etc. - it'd lead to a deadlock
132 | 
133 |     Example:
134 | 
135 |         import time
136 |         local_rank = get_local_rank()
137 |         with local_main_process_first():
138 |             # This will be printed first by local process 0 then in a seemingly
139 |             # random order by the other processes.
140 |             print(f"This will be printed by process {local_rank}")
141 |             time.sleep(5) # emulate actual work
142 |     """
143 |     with _goes_first(is_local_main_process()):
144 |         yield
145 | 
146 | if __name__ == "__main__":
147 |     # to test run:
148 |     #
149 |     # python -u -m torch.distributed.run --nproc_per_node=2 --rdzv_endpoint localhost:6000  --rdzv_backend c10d main_process_first.py
150 | 
151 |     dist.init_process_group()
152 | 
153 |     import time
154 |     global_rank = get_global_rank()
155 |     local_rank  = get_local_rank()
156 | 
157 |     def ds_load_emulate():
158 |         print("Loading dataset")
159 |         time.sleep(2)
160 | 
161 |     if global_rank == 0:
162 |         print("\n\n*** Demo global_main_process_first")
163 |     with global_main_process_first():
164 |         print(f"Running on global rank {global_rank}")
165 |         ds_load_emulate()
166 |     dist.barrier()
167 | 
168 |     if global_rank == 0:
169 |         print("\n\n*** Demo local_main_process_first")
170 |     with local_main_process_first():
171 |         print(f"Running on local rank {local_rank}")
172 |         ds_load_emulate()
173 |     dist.barrier()
174 | 
175 |     if global_rank == 0:
176 |         print("\n\n*** Demo is_main_process_by_path")
177 |     path = "./"
178 |     rank_type = "main" if is_main_process_by_path(path) else "non-main"
179 |     with main_process_by_path_first(path):
180 |         print(f"Running on {rank_type} rank local={local_rank} global={global_rank}")
181 |         ds_load_emulate()
182 |     dist.barrier()
183 | 
184 |     dist.destroy_process_group()
185 | 


--------------------------------------------------------------------------------
/training/tools/multi-gpu-non-interleaved-print.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # printflock allows one to print in a non-interleaved fashion when printing from multiple processes.
 4 | # Typically this is only an issue within a single node. When processes from different nodes print their
 5 | # output it doesn't get interleaved.
 6 | #
 7 | # This file includes the wrapper and a full example on how to use it.
 8 | #
 9 | # e.g., if you have 2 gpus run it as:
10 | #
11 | # python -m torch.distributed.run --nproc_per_node 2 multi-gpu-non-interleaved-print.py
12 | #
13 | 
14 | import fcntl
15 | def printflock(*args, **kwargs):
16 |     """
17 |     non-interleaved print function for using when printing concurrently from many processes,
18 |     like the case under torch.distributed
19 |     """
20 |     with open(__file__, "r") as fh:
21 |         fcntl.flock(fh, fcntl.LOCK_EX)
22 |         try:
23 |             print(*args, **kwargs)
24 |         finally:
25 |             fcntl.flock(fh, fcntl.LOCK_UN)
26 | 
27 | 
28 | 
29 | if __name__ == "__main__":
30 | 
31 |     import torch.distributed as dist
32 |     import torch
33 |     import os
34 |     local_rank = int(os.environ["LOCAL_RANK"])
35 |     torch.cuda.set_device(local_rank)
36 |     dist.init_process_group("nccl")
37 | 
38 |     world_size = dist.get_world_size()
39 |     rank = dist.get_rank()
40 |     printflock(f"This is a very long message from rank {rank} (world_size={world_size})")
41 | 


--------------------------------------------------------------------------------
/training/tools/printflock.py:
--------------------------------------------------------------------------------
 1 | # If you have ever done multi-gpu work and tried to `print` for debugging you quickly discovered
 2 | # that some messages get interleaved and are impossible to make sense of. Especially so if you're
 3 | # using `print` to debug values.
 4 | #
 5 | # This simple solution, which uses the good old `flock`, solves the interleaving problem. To use this
 6 | # version of print you can either do:
 7 | #
 8 | # from printflock import printflock
 9 | # import torch.distributed as dist
10 | # printflock(f"{dist.get_rank()}: my long debug message")
11 | #
12 | # or you can override `print` with a better one:
13 | #
14 | # from printflock import printflock as print
15 | # import torch.distributed as dist
16 | # print(f"{dist.get_rank()}: my long debug message")
17 | #
18 | 
19 | import builtins
20 | import fcntl
21 | 
22 | def printflock(*args, **kwargs):
23 |     """
24 |     This is a wrapper around the built-in Python `print` which calls `flock` before calling
25 |     `print` and unlocks it immediately after. This wrapper is useful for when each rank needs to
26 |     print a message without getting it interleaved with prints from other ranks.
27 |     The lock file is the file this wrapper is defined in.
28 |     The output order will be random per rank.
29 | 
30 |     Example:
31 |         >>> # assuming 4 GPUs
32 |         >>> world_size = dist.get_world_size()
33 |         >>> rank = dist.get_rank()
34 |         >>> printflock(f"This is a very long message from rank {rank}/{world_size}")
35 |        This is a very long message from rank 0/4
36 |        This is a very long message from rank 2/4
37 |        This is a very long message from rank 3/4
38 |        This is a very long message from rank 1/4
39 | 
40 |     It can also be used to override normal `print`:
41 | 
42 |     from printflock import printflock as print
43 | 
44 |     and then you don't need to change anything in your code.
45 |     """
46 | 
47 |     with open(__file__, "r") as fh:
48 |         fcntl.flock(fh, fcntl.LOCK_EX)
49 |         try:
50 |             builtins.print(*args, **kwargs)
51 |         finally:
52 |             fcntl.flock(fh, fcntl.LOCK_UN)
53 | 


--------------------------------------------------------------------------------