├── .gitignore ├── README.md ├── config └── ViT.yaml ├── distributed ├── __init__.py ├── helpers.py ├── layers.py └── mappings.py ├── example_logs └── base │ └── 1GPU │ └── 00 │ └── logs │ └── events.out.tfevents.1698839594.nid001332.2071366.0 ├── export_DDP_vars.sh ├── figs └── minibatch_0.jpg ├── networks ├── helpers.py └── vit.py ├── sample_nsys_profiles ├── 4workers.nsys-rep ├── baseline.nsys-rep ├── dali.nsys-rep ├── dali_amp_bf16.nsys-rep └── dali_amp_bf16_fused_jit.nsys-rep ├── start_tensorboard.ipynb ├── submit_pm.sh ├── submit_pm_dp.sh ├── submit_pm_mp.sh ├── test_data_loader.py ├── test_model_dims.py ├── tests ├── run_tests.sh └── test_distributed.py ├── train.py ├── train_mp.py ├── train_mp_graphs.py ├── tutorial_images ├── baseline_tb.png ├── dp_timings.png ├── mp_comp.png ├── mp_dp_comp.png ├── nsys_4workers.png ├── nsys_4workers_zoomed.png ├── nsys_baseline.png ├── nsys_baseline_zoomed.png ├── nsys_dali.png ├── nsys_dali_bf16.png ├── nsys_dali_bf16_fused_jit.png ├── nsys_dali_bf16_fused_jit_zoomed.png ├── nsys_dali_bf16_zoomed.png ├── nsys_dali_zoomed.png ├── vit_schematic.png └── weather_forecasting.gif └── utils ├── YParams.py ├── __init__.py ├── check_rank_generator.ipynb ├── comm.py ├── dali_es_helper.py ├── data_loader.py ├── data_loader_dali.py ├── logging_utils.py ├── loss.py ├── metrics.py ├── plots.py └── rank_generator.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/README.md -------------------------------------------------------------------------------- /config/ViT.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/config/ViT.yaml -------------------------------------------------------------------------------- /distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # model parallelism helpers and routines 2 | -------------------------------------------------------------------------------- /distributed/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/distributed/helpers.py -------------------------------------------------------------------------------- /distributed/layers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/distributed/layers.py -------------------------------------------------------------------------------- /distributed/mappings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/distributed/mappings.py -------------------------------------------------------------------------------- /example_logs/base/1GPU/00/logs/events.out.tfevents.1698839594.nid001332.2071366.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/example_logs/base/1GPU/00/logs/events.out.tfevents.1698839594.nid001332.2071366.0 -------------------------------------------------------------------------------- /export_DDP_vars.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/export_DDP_vars.sh -------------------------------------------------------------------------------- /figs/minibatch_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/figs/minibatch_0.jpg -------------------------------------------------------------------------------- /networks/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/networks/helpers.py -------------------------------------------------------------------------------- /networks/vit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/networks/vit.py -------------------------------------------------------------------------------- /sample_nsys_profiles/4workers.nsys-rep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/sample_nsys_profiles/4workers.nsys-rep -------------------------------------------------------------------------------- /sample_nsys_profiles/baseline.nsys-rep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/sample_nsys_profiles/baseline.nsys-rep -------------------------------------------------------------------------------- /sample_nsys_profiles/dali.nsys-rep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/sample_nsys_profiles/dali.nsys-rep -------------------------------------------------------------------------------- /sample_nsys_profiles/dali_amp_bf16.nsys-rep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/sample_nsys_profiles/dali_amp_bf16.nsys-rep -------------------------------------------------------------------------------- /sample_nsys_profiles/dali_amp_bf16_fused_jit.nsys-rep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/sample_nsys_profiles/dali_amp_bf16_fused_jit.nsys-rep -------------------------------------------------------------------------------- /start_tensorboard.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/start_tensorboard.ipynb -------------------------------------------------------------------------------- /submit_pm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/submit_pm.sh -------------------------------------------------------------------------------- /submit_pm_dp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/submit_pm_dp.sh -------------------------------------------------------------------------------- /submit_pm_mp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/submit_pm_mp.sh -------------------------------------------------------------------------------- /test_data_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/test_data_loader.py -------------------------------------------------------------------------------- /test_model_dims.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/test_model_dims.py -------------------------------------------------------------------------------- /tests/run_tests.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tests/run_tests.sh -------------------------------------------------------------------------------- /tests/test_distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tests/test_distributed.py -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/train.py -------------------------------------------------------------------------------- /train_mp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/train_mp.py -------------------------------------------------------------------------------- /train_mp_graphs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/train_mp_graphs.py -------------------------------------------------------------------------------- /tutorial_images/baseline_tb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/baseline_tb.png -------------------------------------------------------------------------------- /tutorial_images/dp_timings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/dp_timings.png -------------------------------------------------------------------------------- /tutorial_images/mp_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/mp_comp.png -------------------------------------------------------------------------------- /tutorial_images/mp_dp_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/mp_dp_comp.png -------------------------------------------------------------------------------- /tutorial_images/nsys_4workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_4workers.png -------------------------------------------------------------------------------- /tutorial_images/nsys_4workers_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_4workers_zoomed.png -------------------------------------------------------------------------------- /tutorial_images/nsys_baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_baseline.png -------------------------------------------------------------------------------- /tutorial_images/nsys_baseline_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_baseline_zoomed.png -------------------------------------------------------------------------------- /tutorial_images/nsys_dali.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_dali.png -------------------------------------------------------------------------------- /tutorial_images/nsys_dali_bf16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_dali_bf16.png -------------------------------------------------------------------------------- /tutorial_images/nsys_dali_bf16_fused_jit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_dali_bf16_fused_jit.png -------------------------------------------------------------------------------- /tutorial_images/nsys_dali_bf16_fused_jit_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_dali_bf16_fused_jit_zoomed.png -------------------------------------------------------------------------------- /tutorial_images/nsys_dali_bf16_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_dali_bf16_zoomed.png -------------------------------------------------------------------------------- /tutorial_images/nsys_dali_zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/nsys_dali_zoomed.png -------------------------------------------------------------------------------- /tutorial_images/vit_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/vit_schematic.png -------------------------------------------------------------------------------- /tutorial_images/weather_forecasting.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/tutorial_images/weather_forecasting.gif -------------------------------------------------------------------------------- /utils/YParams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/YParams.py -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/__init__.py -------------------------------------------------------------------------------- /utils/check_rank_generator.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/check_rank_generator.ipynb -------------------------------------------------------------------------------- /utils/comm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/comm.py -------------------------------------------------------------------------------- /utils/dali_es_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/dali_es_helper.py -------------------------------------------------------------------------------- /utils/data_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/data_loader.py -------------------------------------------------------------------------------- /utils/data_loader_dali.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/data_loader_dali.py -------------------------------------------------------------------------------- /utils/logging_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/logging_utils.py -------------------------------------------------------------------------------- /utils/loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/loss.py -------------------------------------------------------------------------------- /utils/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/metrics.py -------------------------------------------------------------------------------- /utils/plots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/plots.py -------------------------------------------------------------------------------- /utils/rank_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NERSC/sc24-dl-tutorial/HEAD/utils/rank_generator.py --------------------------------------------------------------------------------