├── .gitignore ├── README.md ├── collect_env.py ├── communication.ipynb ├── data ├── winequality-red.csv ├── winequality-white.csv └── winequality.names ├── imgs ├── accelerate.png ├── adam-mixed-precision.png ├── adam.png ├── amp_32_16.png ├── attn_with_shape.png ├── attn_with_shape_tp.png ├── bloom.png ├── bw_grad_broadcast.png ├── bw_m3_activations.png ├── bw_m3_del.png ├── cuda-tile.jpeg ├── cuda_rpc.png ├── ddp │ ├── ddp_allreduce.png │ ├── ddp_fsdp.png │ ├── fsdp_allgather.png │ ├── fsdp_column.png │ ├── fsdp_overall.png │ ├── fsdp_red_scatter.png │ ├── overlap_comm_comp.png │ └── unit_sharding.png ├── ds_loss.png ├── ds_loss_bp.png ├── ds_update_1.png ├── ds_update_2.png ├── ds_update_3.png ├── ds_update_4.png ├── embedding-tp.png ├── fp32-fp16.png ├── global_rank.png ├── gpipe-pp.png ├── gpus.png ├── local_rank.png ├── loss-scaling.png ├── m0_broadcast.png ├── m0_delete.png ├── m0_forward.png ├── master-weights-scale.png ├── master-weights.png ├── megtron-lm-sp.png ├── mixed-precision-bigo.png ├── mixed-precision-model-size.png ├── mixed-precision.png ├── multi-turn-bi-uni.png ├── multigpus_1.png ├── no_pipe.png ├── offload-cpu.png ├── offload_cpu.png ├── pipe.png ├── qkv │ ├── QK.png │ └── QK^T.jpeg ├── quant-clustering.jpeg ├── ring.png ├── split_two_layer_mlp.png ├── syn-gpus.png ├── tf32.jpeg ├── torchrun_nodes.png ├── verbose.png ├── zero-offload-single-gpu-pipeline.png └── zero-red.png ├── torch_distributed_supp.ipynb └── tutorials ├── 01_multi_gpus_data_parallelism.ipynb ├── 02_ddp_basics.ipynb ├── 03_ddp_toy_example.ipynb ├── 04_model_parallel_resnet50.ipynb ├── 3D-parallel ├── SP-序列并行.ipynb ├── fsdp_fsdp2.ipynb ├── imgs │ ├── ring-allreduce.png │ └── ring-attn.png ├── mesh.py └── ring-allreduce.ipynb ├── CUDA_RPC.ipynb ├── FSDP.ipynb ├── amp_autocast_mixed_precision_training.ipynb ├── backends.ipynb ├── bitsandbytes └── bnd_basics.ipynb ├── cpu_memory └── cpu_memory.ipynb ├── cpu_multi_cores_machine_learning.ipynb ├── cpu_parallel_openmp.ipynb ├── ddp_gpus.py ├── ddp_gpus_torchrun.py ├── deepspeed_accelerate ├── 3D并行.ipynb ├── accelerate_basics_scripts.py ├── accelerate_config.ipynb ├── accelerate_inference.ipynb ├── bert_ds.py ├── bitsandbytes_accelerate.ipynb ├── deepspeed_basics.ipynb ├── deepspeed_实践.ipynb ├── ds_examples.ipynb ├── ds_offload.ipynb ├── megtron_lm.ipynb ├── torchrun_deepspeed_accelerate.ipynb └── utils │ ├── helloworld.py │ └── multi_gpus_inference.py ├── infra ├── PPO_workflow.ipynb ├── mfu_flops.ipynb ├── misc │ └── flash_attn.ipynb ├── openrlhf │ ├── imgs │ │ ├── openrlhf_flow.png │ │ └── openrlhf_ppo.png │ ├── openrlhf_basics.ipynb │ └── openrlhf_分布式训练流程.ipynb ├── ray │ ├── imgs │ │ ├── ray_1.png │ │ ├── ray_2.png │ │ ├── ray_3.png │ │ ├── ray_4.png │ │ └── ray_5.png │ ├── ray_basics.ipynb │ ├── ray_debugger.ipynb │ ├── ray_python.ipynb │ └── ray_资源管理与调度.ipynb └── verl │ ├── imgs │ ├── controlers.png │ ├── dispatch_collect.png │ ├── dp_tp.png │ ├── generalized_hybrid_engine.png │ ├── ray_tp.png │ ├── tp_ffn.png │ ├── verl_components.png │ └── verl_ppo.png │ ├── parquet_data_process.ipynb │ ├── verl.ipynb │ └── verl_in_action.ipynb ├── kv-cache.ipynb ├── mp_vs_rn.png ├── mpi.ipynb ├── nvcc_cuda ├── cuda_mm.ipynb ├── pinned_memory_non_blocking.ipynb ├── tensor-core.ipynb ├── tensorrt-docker.ipynb └── untitled.txt ├── overall.ipynb ├── pipeline_parallel.ipynb ├── quant ├── clustering.ipynb ├── fp32_fp16_bf16_tf32.ipynb └── 基本概念.ipynb ├── snapshot.pt ├── tensor_cores.ipynb ├── tensor_parallel.ipynb ├── torch_dist.py ├── torch_distributed_basics.ipynb ├── torch_nccl_collective_communication.ipynb ├── torch_nccl_test.py ├── 一些补充(ddp、多机多卡).ipynb └── 分布式训练细节.ipynb /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/README.md -------------------------------------------------------------------------------- /collect_env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/collect_env.py -------------------------------------------------------------------------------- /communication.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/communication.ipynb -------------------------------------------------------------------------------- /data/winequality-red.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/data/winequality-red.csv -------------------------------------------------------------------------------- /data/winequality-white.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/data/winequality-white.csv -------------------------------------------------------------------------------- /data/winequality.names: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/data/winequality.names -------------------------------------------------------------------------------- /imgs/accelerate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/accelerate.png -------------------------------------------------------------------------------- /imgs/adam-mixed-precision.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/adam-mixed-precision.png -------------------------------------------------------------------------------- /imgs/adam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/adam.png -------------------------------------------------------------------------------- /imgs/amp_32_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/amp_32_16.png -------------------------------------------------------------------------------- /imgs/attn_with_shape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/attn_with_shape.png -------------------------------------------------------------------------------- /imgs/attn_with_shape_tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/attn_with_shape_tp.png -------------------------------------------------------------------------------- /imgs/bloom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/bloom.png -------------------------------------------------------------------------------- /imgs/bw_grad_broadcast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/bw_grad_broadcast.png -------------------------------------------------------------------------------- /imgs/bw_m3_activations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/bw_m3_activations.png -------------------------------------------------------------------------------- /imgs/bw_m3_del.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/bw_m3_del.png -------------------------------------------------------------------------------- /imgs/cuda-tile.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/cuda-tile.jpeg -------------------------------------------------------------------------------- /imgs/cuda_rpc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/cuda_rpc.png -------------------------------------------------------------------------------- /imgs/ddp/ddp_allreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/ddp_allreduce.png -------------------------------------------------------------------------------- /imgs/ddp/ddp_fsdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/ddp_fsdp.png -------------------------------------------------------------------------------- /imgs/ddp/fsdp_allgather.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/fsdp_allgather.png -------------------------------------------------------------------------------- /imgs/ddp/fsdp_column.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/fsdp_column.png -------------------------------------------------------------------------------- /imgs/ddp/fsdp_overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/fsdp_overall.png -------------------------------------------------------------------------------- /imgs/ddp/fsdp_red_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/fsdp_red_scatter.png -------------------------------------------------------------------------------- /imgs/ddp/overlap_comm_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/overlap_comm_comp.png -------------------------------------------------------------------------------- /imgs/ddp/unit_sharding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ddp/unit_sharding.png -------------------------------------------------------------------------------- /imgs/ds_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ds_loss.png -------------------------------------------------------------------------------- /imgs/ds_loss_bp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ds_loss_bp.png -------------------------------------------------------------------------------- /imgs/ds_update_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ds_update_1.png -------------------------------------------------------------------------------- /imgs/ds_update_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ds_update_2.png -------------------------------------------------------------------------------- /imgs/ds_update_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ds_update_3.png -------------------------------------------------------------------------------- /imgs/ds_update_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ds_update_4.png -------------------------------------------------------------------------------- /imgs/embedding-tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/embedding-tp.png -------------------------------------------------------------------------------- /imgs/fp32-fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/fp32-fp16.png -------------------------------------------------------------------------------- /imgs/global_rank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/global_rank.png -------------------------------------------------------------------------------- /imgs/gpipe-pp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/gpipe-pp.png -------------------------------------------------------------------------------- /imgs/gpus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/gpus.png -------------------------------------------------------------------------------- /imgs/local_rank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/local_rank.png -------------------------------------------------------------------------------- /imgs/loss-scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/loss-scaling.png -------------------------------------------------------------------------------- /imgs/m0_broadcast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/m0_broadcast.png -------------------------------------------------------------------------------- /imgs/m0_delete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/m0_delete.png -------------------------------------------------------------------------------- /imgs/m0_forward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/m0_forward.png -------------------------------------------------------------------------------- /imgs/master-weights-scale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/master-weights-scale.png -------------------------------------------------------------------------------- /imgs/master-weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/master-weights.png -------------------------------------------------------------------------------- /imgs/megtron-lm-sp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/megtron-lm-sp.png -------------------------------------------------------------------------------- /imgs/mixed-precision-bigo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/mixed-precision-bigo.png -------------------------------------------------------------------------------- /imgs/mixed-precision-model-size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/mixed-precision-model-size.png -------------------------------------------------------------------------------- /imgs/mixed-precision.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/mixed-precision.png -------------------------------------------------------------------------------- /imgs/multi-turn-bi-uni.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/multi-turn-bi-uni.png -------------------------------------------------------------------------------- /imgs/multigpus_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/multigpus_1.png -------------------------------------------------------------------------------- /imgs/no_pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/no_pipe.png -------------------------------------------------------------------------------- /imgs/offload-cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/offload-cpu.png -------------------------------------------------------------------------------- /imgs/offload_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/offload_cpu.png -------------------------------------------------------------------------------- /imgs/pipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/pipe.png -------------------------------------------------------------------------------- /imgs/qkv/QK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/qkv/QK.png -------------------------------------------------------------------------------- /imgs/qkv/QK^T.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/qkv/QK^T.jpeg -------------------------------------------------------------------------------- /imgs/quant-clustering.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/quant-clustering.jpeg -------------------------------------------------------------------------------- /imgs/ring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/ring.png -------------------------------------------------------------------------------- /imgs/split_two_layer_mlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/split_two_layer_mlp.png -------------------------------------------------------------------------------- /imgs/syn-gpus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/syn-gpus.png -------------------------------------------------------------------------------- /imgs/tf32.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/tf32.jpeg -------------------------------------------------------------------------------- /imgs/torchrun_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/torchrun_nodes.png -------------------------------------------------------------------------------- /imgs/verbose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/verbose.png -------------------------------------------------------------------------------- /imgs/zero-offload-single-gpu-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/zero-offload-single-gpu-pipeline.png -------------------------------------------------------------------------------- /imgs/zero-red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/imgs/zero-red.png -------------------------------------------------------------------------------- /torch_distributed_supp.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/torch_distributed_supp.ipynb -------------------------------------------------------------------------------- /tutorials/01_multi_gpus_data_parallelism.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/01_multi_gpus_data_parallelism.ipynb -------------------------------------------------------------------------------- /tutorials/02_ddp_basics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/02_ddp_basics.ipynb -------------------------------------------------------------------------------- /tutorials/03_ddp_toy_example.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/03_ddp_toy_example.ipynb -------------------------------------------------------------------------------- /tutorials/04_model_parallel_resnet50.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/04_model_parallel_resnet50.ipynb -------------------------------------------------------------------------------- /tutorials/3D-parallel/SP-序列并行.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/3D-parallel/SP-序列并行.ipynb -------------------------------------------------------------------------------- /tutorials/3D-parallel/fsdp_fsdp2.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/3D-parallel/fsdp_fsdp2.ipynb -------------------------------------------------------------------------------- /tutorials/3D-parallel/imgs/ring-allreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/3D-parallel/imgs/ring-allreduce.png -------------------------------------------------------------------------------- /tutorials/3D-parallel/imgs/ring-attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/3D-parallel/imgs/ring-attn.png -------------------------------------------------------------------------------- /tutorials/3D-parallel/mesh.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/3D-parallel/mesh.py -------------------------------------------------------------------------------- /tutorials/3D-parallel/ring-allreduce.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/3D-parallel/ring-allreduce.ipynb -------------------------------------------------------------------------------- /tutorials/CUDA_RPC.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/CUDA_RPC.ipynb -------------------------------------------------------------------------------- /tutorials/FSDP.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/FSDP.ipynb -------------------------------------------------------------------------------- /tutorials/amp_autocast_mixed_precision_training.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/amp_autocast_mixed_precision_training.ipynb -------------------------------------------------------------------------------- /tutorials/backends.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/backends.ipynb -------------------------------------------------------------------------------- /tutorials/bitsandbytes/bnd_basics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/bitsandbytes/bnd_basics.ipynb -------------------------------------------------------------------------------- /tutorials/cpu_memory/cpu_memory.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/cpu_memory/cpu_memory.ipynb -------------------------------------------------------------------------------- /tutorials/cpu_multi_cores_machine_learning.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/cpu_multi_cores_machine_learning.ipynb -------------------------------------------------------------------------------- /tutorials/cpu_parallel_openmp.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/cpu_parallel_openmp.ipynb -------------------------------------------------------------------------------- /tutorials/ddp_gpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/ddp_gpus.py -------------------------------------------------------------------------------- /tutorials/ddp_gpus_torchrun.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/ddp_gpus_torchrun.py -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/3D并行.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/3D并行.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/accelerate_basics_scripts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/accelerate_basics_scripts.py -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/accelerate_config.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/accelerate_config.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/accelerate_inference.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/accelerate_inference.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/bert_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/bert_ds.py -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/bitsandbytes_accelerate.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/bitsandbytes_accelerate.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/deepspeed_basics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/deepspeed_basics.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/deepspeed_实践.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/deepspeed_实践.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/ds_examples.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/ds_examples.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/ds_offload.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/ds_offload.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/megtron_lm.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/megtron_lm.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/torchrun_deepspeed_accelerate.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/torchrun_deepspeed_accelerate.ipynb -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/utils/helloworld.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/utils/helloworld.py -------------------------------------------------------------------------------- /tutorials/deepspeed_accelerate/utils/multi_gpus_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/deepspeed_accelerate/utils/multi_gpus_inference.py -------------------------------------------------------------------------------- /tutorials/infra/PPO_workflow.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/PPO_workflow.ipynb -------------------------------------------------------------------------------- /tutorials/infra/mfu_flops.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/mfu_flops.ipynb -------------------------------------------------------------------------------- /tutorials/infra/misc/flash_attn.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/misc/flash_attn.ipynb -------------------------------------------------------------------------------- /tutorials/infra/openrlhf/imgs/openrlhf_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/openrlhf/imgs/openrlhf_flow.png -------------------------------------------------------------------------------- /tutorials/infra/openrlhf/imgs/openrlhf_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/openrlhf/imgs/openrlhf_ppo.png -------------------------------------------------------------------------------- /tutorials/infra/openrlhf/openrlhf_basics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/openrlhf/openrlhf_basics.ipynb -------------------------------------------------------------------------------- /tutorials/infra/openrlhf/openrlhf_分布式训练流程.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/openrlhf/openrlhf_分布式训练流程.ipynb -------------------------------------------------------------------------------- /tutorials/infra/ray/imgs/ray_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/imgs/ray_1.png -------------------------------------------------------------------------------- /tutorials/infra/ray/imgs/ray_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/imgs/ray_2.png -------------------------------------------------------------------------------- /tutorials/infra/ray/imgs/ray_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/imgs/ray_3.png -------------------------------------------------------------------------------- /tutorials/infra/ray/imgs/ray_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/imgs/ray_4.png -------------------------------------------------------------------------------- /tutorials/infra/ray/imgs/ray_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/imgs/ray_5.png -------------------------------------------------------------------------------- /tutorials/infra/ray/ray_basics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/ray_basics.ipynb -------------------------------------------------------------------------------- /tutorials/infra/ray/ray_debugger.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/ray_debugger.ipynb -------------------------------------------------------------------------------- /tutorials/infra/ray/ray_python.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/ray_python.ipynb -------------------------------------------------------------------------------- /tutorials/infra/ray/ray_资源管理与调度.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/ray/ray_资源管理与调度.ipynb -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/controlers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/controlers.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/dispatch_collect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/dispatch_collect.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/dp_tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/dp_tp.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/generalized_hybrid_engine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/generalized_hybrid_engine.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/ray_tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/ray_tp.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/tp_ffn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/tp_ffn.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/verl_components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/verl_components.png -------------------------------------------------------------------------------- /tutorials/infra/verl/imgs/verl_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/imgs/verl_ppo.png -------------------------------------------------------------------------------- /tutorials/infra/verl/parquet_data_process.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/parquet_data_process.ipynb -------------------------------------------------------------------------------- /tutorials/infra/verl/verl.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/verl.ipynb -------------------------------------------------------------------------------- /tutorials/infra/verl/verl_in_action.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/infra/verl/verl_in_action.ipynb -------------------------------------------------------------------------------- /tutorials/kv-cache.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/kv-cache.ipynb -------------------------------------------------------------------------------- /tutorials/mp_vs_rn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/mp_vs_rn.png -------------------------------------------------------------------------------- /tutorials/mpi.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/mpi.ipynb -------------------------------------------------------------------------------- /tutorials/nvcc_cuda/cuda_mm.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/nvcc_cuda/cuda_mm.ipynb -------------------------------------------------------------------------------- /tutorials/nvcc_cuda/pinned_memory_non_blocking.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/nvcc_cuda/pinned_memory_non_blocking.ipynb -------------------------------------------------------------------------------- /tutorials/nvcc_cuda/tensor-core.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/nvcc_cuda/tensor-core.ipynb -------------------------------------------------------------------------------- /tutorials/nvcc_cuda/tensorrt-docker.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/nvcc_cuda/tensorrt-docker.ipynb -------------------------------------------------------------------------------- /tutorials/nvcc_cuda/untitled.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorials/overall.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/overall.ipynb -------------------------------------------------------------------------------- /tutorials/pipeline_parallel.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/pipeline_parallel.ipynb -------------------------------------------------------------------------------- /tutorials/quant/clustering.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/quant/clustering.ipynb -------------------------------------------------------------------------------- /tutorials/quant/fp32_fp16_bf16_tf32.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/quant/fp32_fp16_bf16_tf32.ipynb -------------------------------------------------------------------------------- /tutorials/quant/基本概念.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/quant/基本概念.ipynb -------------------------------------------------------------------------------- /tutorials/snapshot.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/snapshot.pt -------------------------------------------------------------------------------- /tutorials/tensor_cores.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/tensor_cores.ipynb -------------------------------------------------------------------------------- /tutorials/tensor_parallel.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/tensor_parallel.ipynb -------------------------------------------------------------------------------- /tutorials/torch_dist.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/torch_dist.py -------------------------------------------------------------------------------- /tutorials/torch_distributed_basics.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/torch_distributed_basics.ipynb -------------------------------------------------------------------------------- /tutorials/torch_nccl_collective_communication.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/torch_nccl_collective_communication.ipynb -------------------------------------------------------------------------------- /tutorials/torch_nccl_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/torch_nccl_test.py -------------------------------------------------------------------------------- /tutorials/一些补充(ddp、多机多卡).ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/一些补充(ddp、多机多卡).ipynb -------------------------------------------------------------------------------- /tutorials/分布式训练细节.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunhuizhang/pytorch_distribute_tutorials/HEAD/tutorials/分布式训练细节.ipynb --------------------------------------------------------------------------------