├── DeepSpeed └── Megatron-LM │ ├── README.md │ └── scripts │ ├── deepspeed_hosts │ ├── ds_zero2_config.json │ └── ds_zero2_pretrain_gpt2_model_parallel.sh ├── HugeCTR ├── README.md ├── dlrm │ ├── README.md │ ├── baseline_log_info.csv │ ├── dlrm.py │ └── dlrm_baseline_auc_lossses.sh └── wdl │ ├── README.md │ ├── scripts │ ├── 300k_iter.sh │ ├── 500_iter.sh │ ├── bsz_x2.sh │ ├── fix_bsz_per_device.sh │ ├── fix_total_bsz.sh │ ├── gpu_memory_usage.py │ ├── vocab_size_x2.sh │ └── wdl.py │ └── tools │ ├── extract_hugectr_logs.py │ └── extract_losses_aucs.sh ├── LICENSE ├── Megatron-LM ├── README.md └── scripts │ └── mutil_perf_pretrain_gpt_dp_mp_pp.sh ├── MindSpore ├── bert │ ├── README.md │ ├── extract_mindspore_logs_time.py │ ├── run_pretrain.py │ └── scripts │ │ ├── run_distributed_pretrain_for_gpu.sh │ │ ├── run_multi_node.sh │ │ ├── run_single_node.sh │ │ └── run_standalone_pretrain_for_gpu.sh └── resnet50v1.5 │ ├── README.md │ ├── extract_mindspore_logs_time.py │ ├── scripts │ ├── run_distribute_train_gpu.sh │ ├── run_multi_node.sh │ ├── run_single_node.sh │ └── run_standalone_train_gpu.sh │ └── train.py ├── MxNet ├── BERT │ ├── README.md │ ├── extract_mxnet_logs.py │ ├── pretrain.sh │ └── run_test.sh ├── Classification │ └── RN50v1b │ │ ├── README.md │ │ ├── extract_mxnet_logs.py │ │ ├── run_test.sh │ │ └── runner.sh └── InsightFace │ ├── ArcFace │ ├── README.md │ ├── extract_mxnet_logs.py │ ├── run_test.sh │ └── runner.sh │ └── PartailFC │ ├── README.md │ ├── extract_mxnet_logs.py │ ├── run_multi_node_test.sh │ ├── run_test.sh │ └── runner.sh ├── NVIDIADeepLearningExamples ├── MxNet │ └── Classification │ │ └── RN50v1.5 │ │ ├── README.md │ │ ├── extract_mxnet_logs.py │ │ ├── extract_mxnet_logs_time.py │ │ ├── run_test.sh │ │ └── runner.sh ├── PyTorch │ ├── BERT │ │ ├── README.md │ │ ├── extract_pytorch_logs_time.py │ │ └── scripts │ │ │ ├── run_multi_nodes.sh │ │ │ ├── run_single_node.sh │ │ │ ├── run_two_nodes.sh │ │ │ └── single_node_train.sh │ └── resnet50v1.5 │ │ ├── README.md │ │ ├── extract_pytorch_logs_time.py │ │ └── scripts │ │ ├── run_multi_nodes.sh │ │ ├── run_single_node.sh │ │ ├── run_two_nodes.sh │ │ └── single_node_train.sh ├── README.md └── TensorFlow │ ├── Classification │ └── ConvNets │ │ └── resnet50v1.5 │ │ ├── README.md │ │ ├── extract_tensorflow_logs.py │ │ ├── extract_tensorflow_logs_time.py │ │ └── scripts │ │ ├── multi_node_train.sh │ │ ├── run_multi_node.sh │ │ ├── run_single_node.sh │ │ ├── run_two_node.sh │ │ └── single_node_train.sh │ └── LanguageModeling │ └── BERT │ ├── README.md │ ├── extract_tensorflow_logs.py │ ├── extract_tensorflow_logs_time.py │ └── scripts │ ├── multi_node_run_pretraining_adam.sh │ ├── run_multi_node.sh │ ├── run_pretraining_adam.sh │ ├── run_single_node.sh │ └── run_two_node.sh ├── OneFlow ├── Classification │ └── ConvNets │ │ └── resnet50v1.5 │ │ ├── README.md │ │ ├── extract_cnn_result.py │ │ ├── extract_util.py │ │ ├── reports │ │ ├── README.md │ │ ├── resnet50_oneflow_v0.2_report_1009.md │ │ ├── resnet50_oneflow_v0.3.1_report_1202.md │ │ └── rn50_fp32_report_0821.md │ │ └── scripts │ │ ├── cp_logs.sh │ │ ├── launch_all.sh │ │ ├── rn50_train.sh │ │ ├── schedule_launch.sh │ │ └── stop_all.sh ├── ClickThroughRate │ └── WideDeepLearning │ │ ├── README.md │ │ ├── docker │ │ ├── build.sh │ │ ├── launch.sh │ │ └── ubuntu.dockerfile │ │ ├── extract_info_from_log.py │ │ ├── extract_info_from_log.sh │ │ ├── gpu_memory_usage.py │ │ ├── imgs │ │ ├── of_1node1Device_latency_memory.png │ │ ├── of_1node1Device_vocabsize_latency_memory.png │ │ 
├── of_1node8Device_latency_memory.png │ │ ├── of_1node8Device_vocabsize_latency_memory.png │ │ ├── of_4node32Device_latency_memory.png │ │ ├── of_4node32Device_vocabsize_latency_memory.png │ │ ├── of_fixed_batchsize_per_device.png │ │ └── of_fixed_total_batchsize.png │ │ └── scripts │ │ ├── 300k_iters.sh │ │ ├── 500_iters.sh │ │ ├── bsz_x2.sh │ │ ├── fix_bsz_per_device.sh │ │ ├── fix_total_bsz.sh │ │ └── vocab_x2.sh ├── LanguageModeling │ ├── BERT │ │ ├── README.md │ │ ├── extract_bert_result.py │ │ ├── extract_util.py │ │ ├── reports │ │ │ ├── README.md │ │ │ ├── bert_base_fp32_report_0822.md │ │ │ ├── bert_base_oneflow_v0.2_report_1009.md │ │ │ ├── bert_base_oneflow_v0.3.1_report_1202.md │ │ │ └── imgs │ │ │ │ ├── of_bert_base_latency_throughput.png │ │ │ │ ├── of_bert_base_speedup.png │ │ │ │ └── of_bert_base_throughput.png │ │ └── scripts │ │ │ ├── bert_base_pretrain.sh │ │ │ ├── cp_logs.sh │ │ │ ├── launch_all.sh │ │ │ ├── schedule_launch.sh │ │ │ └── stop_all.sh │ └── GPT │ │ ├── README.md │ │ └── scripts │ │ ├── openweb_to_json.py │ │ ├── pretrain.sh │ │ └── pretrain_with_container.sh ├── Megatron-LM │ ├── README.md │ └── scripts │ │ └── train_gpt2.sh └── Recognition │ └── insightface │ ├── README.md │ ├── extract_oneflow_logs_time.py │ └── scripts │ ├── multi_run.sh │ ├── run_multi_nodes.sh │ ├── run_single_node.sh │ └── train_insightface.sh ├── PaddlePaddle ├── bert │ ├── README.md │ ├── extract_paddle_logs.py │ └── scripts │ │ ├── make_pretrain_data.sh │ │ ├── multi_node_train.sh │ │ ├── run_multi_node.sh │ │ ├── run_single_node.sh │ │ ├── run_two_node.sh │ │ └── single_node_train.sh └── resnet50v1.5 │ ├── README.md │ ├── extract_paddle_logs.py │ ├── extract_paddle_logs_time.py │ └── scripts │ ├── multi_node_train.sh │ ├── run_multi_node.sh │ ├── run_single_node.sh │ ├── run_two_node.sh │ └── single_node_train.sh ├── PyTorch └── resnet50v1.5 │ ├── README.md │ ├── extract_pytorch_logs_time.py │ └── scripts │ ├── run_multi_nodes.sh │ ├── run_single_node.sh │ ├── run_two_nodes.sh │ └── single_node_train.sh ├── README.md ├── TensorFlow ├── bert │ ├── README.md │ ├── extract_tensorflow_logs_time.py │ └── scripts │ │ ├── run_multi_node.sh │ │ ├── run_single_node.sh │ │ ├── run_two_node.sh │ │ └── single_node_train.sh └── resnet50v1.5 │ ├── README.md │ ├── extract_tensorflow_logs.py │ ├── extract_tensorflow_logs_time.py │ └── scripts │ ├── gpu.yaml │ ├── gpu_fp16.yaml │ ├── multi_node_gpu.yaml │ ├── multi_node_gpu_fp16.yaml │ ├── multi_node_train.sh │ ├── run_multi_node.sh │ ├── run_single_node.sh │ ├── run_two_node.sh │ ├── single_node_train.sh │ ├── two_node_gpu.yaml │ ├── two_node_gpu_fp16.yaml │ └── two_node_train.sh └── reports ├── DLPerf_report_v1.0.xlsm ├── GPT └── dlperf_gpt_test_report_210512.md ├── README.md ├── WideDeepLearning ├── dlperf_wide_and_deep_test_report_v1.md ├── dlperf_wide_and_deep_test_report_v1_cn.md └── imgs │ ├── wdl_vecx2_1n1g_mem_latency.png │ ├── wdl_vecx2_1n8g_mem_latency.png │ └── wdl_vecx2_4n8g_mem_latency.png ├── dlperf_benchmark_test_report_v1.md ├── dlperf_benchmark_test_report_v1_cn.md ├── imgs ├── NCCL_debug_0.jpg ├── NCCL_debug_1.jpg ├── NCCL_debug_2.jpg ├── bert_base_amp_bz64_speedup.png ├── bert_base_amp_bz64_throughput.png ├── bert_base_amp_bz_max_speedup.png ├── bert_base_amp_bz_max_throughput.png ├── bert_base_fp32_bz32_speedup.png ├── bert_base_fp32_bz32_throughput.png ├── bert_base_fp32_bz_max_speedup.png ├── bert_base_fp32_bz_max_throughput.png ├── data_parallel_face_emore_r100_bz64.png ├── data_parallel_face_emore_r100_bz_max.png ├── 
data_parallel_face_emore_y1_bz256.png ├── data_parallel_face_emore_y1_bz_max.png ├── emore_r100_fp32_b64_pf_en.png ├── emore_r100_fp32_bmax_pf_en.png ├── model_parallel_face_emore_r100_bz64.png ├── model_parallel_face_emore_r100_bz_max.png ├── model_parallel_face_emore_y1_bz256.png ├── model_parallel_face_emore_y1_bz_max.png ├── partial_fc_sample_ratio_0_1_face_emore_r100_bz64.png ├── partial_fc_sample_ratio_0_1_face_emore_r100_bz_max.png ├── partial_fc_sample_ratio_0_1_glint_r100_bz64.png ├── partial_fc_sample_ratio_0_1_glint_r100_bz_max.png ├── r50_amp_bz256_speedup.png ├── r50_amp_bz256_throughput.png ├── r50_fp32_bz128_speedup.png └── r50_fp32_bz128_throughput.png └── insightface ├── dlperf_insightface_test_report_v1.md └── dlperf_insightface_test_report_v1_cn.md /DeepSpeed/Megatron-LM/scripts/deepspeed_hosts: -------------------------------------------------------------------------------- 1 | vs002 slots=8 2 | vs003 slots=8 3 | vs004 slots=8 4 | vs005 slots=8 5 | -------------------------------------------------------------------------------- /DeepSpeed/Megatron-LM/scripts/ds_zero2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.00015, 9 | "weight_decay": 1e-2 10 | } 11 | }, 12 | "zero_optimization": { 13 | "stage": 1 14 | }, 15 | "zero_allow_untested_optimizer": true, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "activation_checkpointing": { 25 | "partition_activations": false, 26 | "contiguous_memory_optimization": false 27 | }, 28 | "wall_clock_breakdown": false 29 | } 30 | -------------------------------------------------------------------------------- /DeepSpeed/Megatron-LM/scripts/ds_zero2_pretrain_gpt2_model_parallel.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Change for multinode config 4 | BATCH_SIZE=${1:-4} 5 | NUM_GPUS_PER_WORKER=${2:-8} 6 | ZERO_STAGE=${3:-0} 7 | CHECKPOINT_ACTIVATIONS=${4:-"off"} 8 | NUM_WORKERS=${5:-1} 9 | MP_SIZE=${6:-1} 10 | ITER_NUM=${7:-1000} 11 | 12 | script_path=$(realpath $0) 13 | script_dir=$(dirname $script_path) 14 | 15 | echo "BATCH_SIZE: ${BATCH_SIZE}, NUM_GPUS_PER_WORKER:${NUM_GPUS_PER_WORKER}, ZERO_STAGE:${ZERO_STAGE}, CHECKPOINT_ACTIVATIONS:${CHECKPOINT_ACTIVATIONS} " 16 | 17 | a=`expr ${#GPUS} + 1` 18 | gpu_num_per_node=`expr ${a} / 2` 19 | gpu_num=`expr ${NUM_GPUS_PER_WORKER} \* ${NUM_WORKERS}` 20 | total_bz=`expr ${BATCH_SIZE} \* ${gpu_num}` 21 | 22 | sed -i "s/\"train_batch_size\":.*$/\"train_batch_size\": $total_bz,/" $script_dir/ds_zero2_config.json 23 | if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then 24 | sed -i "s/\"partition_activations\":.*$/\"partition_activations\": true,/" $script_dir/ds_zero2_config.json 25 | else 26 | sed -i "s/\"partition_activations\":.*$/\"partition_activations\": false,/" $script_dir/ds_zero2_config.json 27 | fi 28 | sed -i "s/\"stage\":.*$/\"stage\": $ZERO_STAGE/" $script_dir/ds_zero2_config.json 29 | 30 | # gpt2-small 31 | num_layers=12 32 | num_attention_heads=12 33 | hidden_size=768 34 | 35 | # # gpt2-medium 36 | # num_layers=24 37 | # num_attention_heads=16 38 | # hidden_size=1024 39 | 40 | 41 | PREFIX=20201209-test_zero_gpt2-small 42 | rm -rf checkpoints 43 | LOG_FOLDER=./logs 44 | mkdir -p $LOG_FOLDER 45 | LOG=${LOG_FOLDER}/${PREFIX}_${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g_bz${BATCH_SIZE}_zero_stage${ZERO_STAGE}_${CHECKPOINT_ACTIVATIONS}_checkpoint_activation.log 46 | 47 | 48 | 49 | config_json="$script_dir/ds_zero2_config.json" 50 | gpt_options=" \ 51 | --save $PREFIX_checkpoint_${NUM_WORKERS}n${NUM_GPUS_PER_WORKER}g_bz${BATCH_SIZE}_zero_stage${ZERO_STAGE}_${CHECKPOINT_ACTIVATIONS}_checkpoint_activation \ 52 | --model-parallel-size ${MP_SIZE} \ 53 | --num-layers ${num_layers} \ 54 | --hidden-size ${hidden_size} \ 55 | --num-attention-heads ${num_attention_heads} \ 56 | --batch-size ${BATCH_SIZE} \ 57 | --seq-length 1024 \ 58 | --max-position-embeddings 1024 \ 59 | --train-iters ${ITER_NUM} \ 60 | --resume-dataloader \ 61 | --train-data wikipedia \ 62 | --lazy-loader \ 63 | --tokenizer-type GPT2BPETokenizer \ 64 | --split 949,50,1 \ 65 | --distributed-backend nccl \ 66 | --lr 0.00015 \ 67 | --no-load-optim \ 68 | --lr-decay-style cosine \ 69 | --weight-decay 1e-2 \ 70 | --clip-grad 1.0 \ 71 | --warmup .01 \ 72 | --fp16 \ 73 | " 74 | 75 | if [ ${CHECKPOINT_ACTIVATIONS} == "on" ];then 76 | gpt_options="${gpt_options} 77 | --checkpoint-activations --deepspeed-activation-checkpointing --deepspeed --deepspeed_config ${config_json} " 78 | else 79 | gpt_options="${gpt_options} 80 | --deepspeed \ 81 | --deepspeed_config ${config_json} \ 82 | " 83 | fi 84 | 85 | run_cmd="deepspeed --hostfile=deepspeed_hosts --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} pretrain_gpt2.py ${gpt_options} " 86 | echo ${run_cmd} 87 | eval ${run_cmd} 2>&1 | tee ${LOG} -------------------------------------------------------------------------------- /HugeCTR/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA HugeCTR Benchmark Test 2 | 3 | This folder holds NVIDIA HugeCTR Benchmark Test scripts, tools and reports. 4 | 5 | You can refer to [HugeCTR User Guide](https://github.com/NVIDIA/HugeCTR/blob/master/docs/hugectr_user_guide.md) for additional information. 
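For a rough sense of how these cases are driven before reading the tree below, here is a minimal sketch of a local WDL run. It is an illustration only, not an official entry point — the authoritative steps, dataset layout and container setup are in `wdl/README.md`; the working directory, the `mkdir -p log` step and the assumption that results land in `log/` are mine, while the script names are the real ones under `wdl/scripts/`.

```bash
# Minimal sketch (assumed workflow), not an official entry point.
# Assumes HugeCTR and its Python dependencies are already installed,
# e.g. inside the image built from wdl/docker, and that the dataset is
# placed where wdl.py expects it.
cd HugeCTR/wdl/scripts
mkdir -p log               # the scripts redirect logs and .mem files into log/
bash 500_iter.sh           # short run: 500 iterations
bash 300k_iter.sh          # long run: 300k iterations
bash fix_total_bsz.sh      # scaling case with the total batch size held fixed
```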
6 | 7 | ## folder structure 8 | 9 | ``` 10 | DLPerf/HugeCTR $ tree 11 | . 12 | └── wdl 13 |    ├── docker 14 |    ├── imgs 15 |    ├── scripts 16 |    ├── tools 17 |    └── README.md 18 | 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /HugeCTR/dlrm/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA HugeCTR DLRM Benchmark Test 2 | This folder holds NVIDIA HugeCTR DLRM Benchmark Test scripts, tools and reports. 3 | 4 | You can refer to [HugeCTR User Guide](https://github.com/NVIDIA/HugeCTR/blob/master/docs/hugectr_user_guide.md) for additional information. 5 | 6 | ## folder structure 7 | ## Benchmark Test Cases 8 | 9 | This report summarizes a HugeCTR test on 1 node with 8 x Tesla V100, run in December 2021. 10 | 11 | ### Test Environment 12 | - 1 node with Tesla V100-SXM2-16GB x 8 13 | - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family 14 | - Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz ($ cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c) 15 | - Memory 384G ($ cat /proc/meminfo) 16 | - Ubuntu 20.04.3 LTS ($ cat /etc/issue) (GNU/Linux 5.4.0-26-generic x86_64) ($ uname -a) 17 | - CUDA Version: 11.4 ($ nvcc -V), Driver Version: 470.57.02 ($ cat /proc/driver/nvidia/version) 18 | - HugeCTR version: 3.2 19 | - `nvidia-smi topo -m` 20 | 21 | ``` 22 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity 23 | GPU0 X NV1 NV2 NV1 SYS SYS SYS NV2 NODE SYS 0-23,48-71 0 24 | GPU1 NV1 X NV1 NV2 SYS SYS NV2 SYS NODE SYS 0-23,48-71 0 25 | GPU2 NV2 NV1 X NV2 SYS NV1 SYS SYS PIX SYS 0-23,48-71 0 26 | GPU3 NV1 NV2 NV2 X NV1 SYS SYS SYS PIX SYS 0-23,48-71 0 27 | GPU4 SYS SYS SYS NV1 X NV2 NV2 NV1 SYS NODE 24-47,72-95 1 28 | GPU5 SYS SYS NV1 SYS NV2 X NV1 NV2 SYS NODE 24-47,72-95 1 29 | GPU6 SYS NV2 SYS SYS NV2 NV1 X NV1 SYS PIX 24-47,72-95 1 30 | GPU7 NV2 SYS SYS SYS NV1 NV2 NV1 X SYS PIX 24-47,72-95 1 31 | mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X SYS 32 | mlx5_1 SYS SYS SYS SYS NODE NODE PIX PIX SYS X 33 | 34 | Legend: 35 | 36 | X = Self 37 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 38 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 39 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 40 | PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 41 | PIX = Connection traversing at most a single PCIe bridge 42 | NV# = Connection traversing a bonded set of # NVLinks 43 | ``` 44 | 45 | 46 | 47 | ### baseline 48 | 49 | command: bash dlrm.sh 50 | 51 | | parameter | value | 52 | | ---------------------------- | ------------------------------ | 53 | | gpu_num_per_node | 8 | 54 | | num_nodes | 1 | 55 | | eval_batchs | 70 | 56 | | batch_size | 65536 | 57 | | learning_rate | 0.5 | 58 | | warmup_steps | 1000 | 59 | | data_dir | /dataset/f9f659c5/hugectr_dlrm | 60 | | workspace_size_per_gpu_in_mb | 11645 | 61 | | embedding_vec_size | 128 | 62 | | max_iter | 12000 | 63 | | loss_print_every_n_iter | 100 | 64 | | eval_interval | 100 | 65 | | eval_batch_size | 65536 | 66 | | decay_start | 0 | 67 | | decay_steps | 1 | 68 | | decay_power | 2 | 69 | | end_lr | 0 | 70 | 71 | ### baseline run log 72 | 73 | See baseline_log_info.csv 74 | 75 | -------------------------------------------------------------------------------- /HugeCTR/dlrm/baseline_log_info.csv:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/HugeCTR/dlrm/baseline_log_info.csv -------------------------------------------------------------------------------- /HugeCTR/dlrm/dlrm_baseline_auc_lossses.sh: -------------------------------------------------------------------------------- 1 | max_iter=12000 2 | warmup_steps=1000 3 | lr=0.5 4 | test_case=dlrm_baseline_${max_iter}_${warmup_steps}_${lr} 5 | 6 | python dlrm_kaggle_fp32.py \ 7 | --gpu_num_per_node 8 \ 8 | --eval_batchs 70 \ 9 | --max_iter ${max_iter} \ 10 | --batch_size 65536 \ 11 | --learning_rate ${lr} \ 12 | --warmup_steps ${warmup_steps} \ 13 | --loss_print_every_n_iter 100 \ 14 | --eval_interval 100 | tee log/${test_case}.log 15 | -------------------------------------------------------------------------------- /HugeCTR/wdl/scripts/300k_iter.sh: -------------------------------------------------------------------------------- 1 | bsz=512 2 | 3 | test_case=300kiters-n1g1 4 | mem_usage_file=${test_case}.mem 5 | 6 | python gpu_memory_usage.py 1> log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 log/$mem_usage_file 2>&1 mem_threshold: 17 | state = 'Detecting' 18 | elif state == 'Detecting': 19 | if info.used < mem_threshold: 20 | running = False 21 | else: 22 | device0_max_used_mem = max(device0_max_used_mem, info.used) 23 | 24 | nvmlShutdown() 25 | print('max device0 memory usage is:', device0_max_used_mem) 26 | 27 | -------------------------------------------------------------------------------- /HugeCTR/wdl/scripts/vocab_size_x2.sh: -------------------------------------------------------------------------------- 1 | bsz=16384 2 | for ngpu in 1 8 3 | do 4 | for i in 1 2 4 8 16 5 | do 6 | test_case=vocab_x2_n1g${ngpu}_vsz${i}_h7 7 | mem_usage_file=${test_case}.mem 8 | wide_workspace_size_per_gpu_in_mb=$(( 12*${i} )) 9 | deep_workspace_size_per_gpu_in_mb=$(( 195*${i} )) 10 | 11 | python gpu_memory_usage.py 1> log/$mem_usage_file 2>&1 losses.tmp 9 | grep AUC: ${logfile} | cut -d " " -f 4 > aucs.tmp 10 | paste losses.tmp aucs.tmp > ${logfile}.losses_aucs 11 | echo "extract loss and AUC to ${logfile}.losses_aucs" 12 | rm losses.tmp aucs.tmp 13 | -------------------------------------------------------------------------------- /Megatron-LM/scripts/mutil_perf_pretrain_gpt_dp_mp_pp.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | export NCCL_SOCKET_IFNAME=ib0 4 | 5 | export NCCL_DEBUG=INFO 6 | export PYTHONUNBUFFERED=1 7 | 8 | PROJECT_ROOT=/data/Megatron-LM 9 | DATA_PATH=/data/gpt/gpt_sample_dataset_text_document 10 | CHECKPOINT_PATH=/data/perf_output 11 | rm -rf /data/perf_output/* 12 | M_P=${1:-1} 13 | P_P=${2:-1} 14 | MICRO_BATCH_SIZE=${3:-8} 15 | GLOABAL_BATCH_SIZE=${4:-16} 16 | NNODES=${5:-1} 17 | GPUS_PER_NODE=${6:-8} 18 | MASTER_ADDR=${7:-127.0.0.1} 19 | MASTER_PORT=21327 20 | NODE_RANK=${8:-0} 21 | echo NODE_RANK=$NODE_RANK 22 | TRAIN_ITERS=${9:-520} 23 | 24 | NUM_LAYERS=${10:-16} 25 | HIDDEN_SIZE=${11:-1536} 26 | NUM_ATTENTION_HEADS=${12:-16} 27 | SEQ_LENGTH=2048 28 | DROPOUT_RATE=0.1 29 | 30 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 31 | 32 | D_P=$(($WORLD_SIZE/$M_P/$P_P)) 33 | 34 | LOGFILE=./megatron_lm_perf_${NNODES}n${GPUS_PER_NODE}g_dp${D_P}_mp${M_P}_pp${P_P}_mbs${MICRO_BATCH_SIZE}_gbs${GLOABAL_BATCH_SIZE}_l${NUM_LAYERS}_hsz${HIDDEN_SIZE}_ahs${NUM_ATTENTION_HEADS}_pretrain_${NODE_RANK}.log 35 | 36 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank ${NODE_RANK} --master_addr ${MASTER_ADDR} --master_port $MASTER_PORT" 37 | 38 | python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \ 39 | --tensor-model-parallel-size $M_P \ 40 | --pipeline-model-parallel-size $P_P \ 41 | --num-layers $NUM_LAYERS \ 42 | --hidden-size $HIDDEN_SIZE \ 43 | --num-attention-heads $NUM_ATTENTION_HEADS \ 44 | --micro-batch-size $MICRO_BATCH_SIZE \ 45 | --global-batch-size $GLOABAL_BATCH_SIZE \ 46 | --seq-length $SEQ_LENGTH \ 47 | --max-position-embeddings $SEQ_LENGTH \ 48 | --train-iters $TRAIN_ITERS \ 49 | --lr-decay-iters 320000 \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH \ 52 | --data-path $DATA_PATH \ 53 | --vocab-file ${PROJECT_ROOT}/gpt2-vocab.json \ 54 | --merge-file ${PROJECT_ROOT}/gpt2-merges.txt \ 55 | --data-impl mmap \ 56 | --split 949,50,1 \ 57 | --distributed-backend nccl \ 58 | --lr 0.00015 \ 59 | --min-lr 1.0e-5 \ 60 | --lr-decay-style cosine \ 61 | --weight-decay 1e-2 \ 62 | --clip-grad 1.0 \ 63 | --lr-warmup-fraction .01 \ 64 | --checkpoint-activations \ 65 | --log-interval 10 \ 66 | --save-interval 100000 \ 67 | --eval-interval 10000 \ 68 | --eval-iters 10 \ 69 | --fp16 2>&1 | tee ${LOGFILE} 70 | 71 | echo "Writting log to ${LOGFILE}" 72 | -------------------------------------------------------------------------------- /MindSpore/bert/scripts/run_distributed_pretrain_for_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEVICE_ID=${1:-0} 4 | BATCH_SIZE=${2:-32} 5 | DTYPE=${3:-'fp32'} 6 | NUM_STEP=${4:-120} 7 | ENABLE_GRAPH_KERNEL=${5:-'false'} 8 | TEST_NUM=${6:-1} 9 | NODE_NUM=${7:-1} 10 | 11 | a=`expr ${#DEVICE_ID} + 1` 12 | GPUS_PER_NODE=`expr ${a} / 2` 13 | total_batch_size=`expr ${BATCH_SIZE} \* $GPUS_PER_NODE` 14 | echo "Use gpus: $DEVICE_ID" 15 | echo "Total batch size : $total_batch_size" 16 | 17 | TOTAL_GPU_NUM=`expr ${NODE_NUM} \* ${GPUS_PER_NODE}` 18 | echo "Total use: ${TOTAL_GPU_NUM} gpu" 19 | 20 | if [ ${NODE_NUM} -eq 1 ] ; then 21 | NODE_IP=localhost:${GPUS_PER_NODE} 22 | elif [ ${NODE_NUM} -eq 2 ] ; then 23 | NODE_IP=${NODE1}:${GPUS_PER_NODE},${NODE2}:${GPUS_PER_NODE} 24 | elif [ ${NODE_NUM} -eq 4 ] ; then 25 | NODE_IP=${NODE1}:${GPUS_PER_NODE},${NODE2}:${GPUS_PER_NODE},${NODE3}:${GPUS_PER_NODE},${NODE4}:${GPUS_PER_NODE} 26 | else 27 | echo "Invalid node num." 
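# NOTE: no exit on this branch — with an unsupported NODE_NUM, NODE_IP stays unset and the mpirun launch below will most likely fail.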
28 | fi 29 | 30 | ENABLE_LOSSSCALE="false" 31 | if [ ${DTYPE} == "fp16" ] ; then 32 | ENABLE_LOSSSCALE="true" 33 | fi 34 | 35 | export CUDA_VISIBLE_DEVICES=$DEVICE_ID 36 | export GLOG_logtostderr=1 37 | export GLOG_v=2 38 | LOG_FOLDER=./logs/mindspore/bert/bz${BATCH_SIZE}/${NODE_NUM}n${GPUS_PER_NODE}g 39 | mkdir -p $LOG_FOLDER 40 | LOGFILE=${LOG_FOLDER}/bert_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 41 | 42 | mpirun --allow-run-as-root \ 43 | --prefix /usr/local/openmpi-4.0.3 \ 44 | --output-filename log_output \ 45 | --merge-stderr-to-stdout \ 46 | -n $TOTAL_GPU_NUM -H $NODE_IP \ 47 | -x NCCL_DEBUG=INFO \ 48 | -mca plm_rsh_args "-p ${PORT}" \ 49 | python run_pretrain.py \ 50 | --device_target="GPU" \ 51 | --distribute="true" \ 52 | --epoch_size=1 \ 53 | --enable_save_ckpt="false" \ 54 | --enable_lossscale=$ENABLE_LOSSSCALE \ 55 | --enable_data_sink="true" \ 56 | --data_sink_steps=10 \ 57 | --train_steps=$NUM_STEP \ 58 | --data_dir="/workspace/bert/data/wiki" \ 59 | --enable_graph_kernel=$ENABLE_GRAPH_KERNEL \ 60 | --batch_size=$BATCH_SIZE \ 61 | --dtype=$DTYPE \ 62 | --schema_dir="" 2>&1 | tee $LOGFILE 63 | 64 | -------------------------------------------------------------------------------- /MindSpore/bert/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BATCH_SIZE=${1:-32} 4 | DTYPE=${2:-'fp32'} 5 | ENABLE_GRAPH_KERNEL=${3:-'false'} 6 | NUM_TESTING=${4:-5} 7 | NODE_NUM=${5:-2} 8 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 9 | 10 | export NODE1=10.11.0.2 11 | export NODE2=10.11.0.3 12 | export NODE3=10.11.0.4 13 | export NODE4=10.11.0.5 14 | export PORT=10000 15 | 16 | i=1 17 | while [ $i -le $NUM_TESTING ] 18 | do 19 | bash $SHELL_FOLDER/scripts/run_distributed_pretrain_for_gpu.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} ${DTYPE} 120 $ENABLE_GRAPH_KERNEL $i ${NODE_NUM} 20 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 21 | let i++ 22 | sleep 20 23 | done 24 | 25 | -------------------------------------------------------------------------------- /MindSpore/bert/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BATCH_SIZE=${1:-32} 4 | DTYPE=${2:-'fp32'} 5 | ENABLE_GRAPH_KERNEL=${3:-'false'} 6 | NUM_TESTING=${4:-5} 7 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 8 | 9 | i=1 10 | while [ $i -le $NUM_TESTING ] 11 | do 12 | bash $SHELL_FOLDER/scripts/run_standalone_pretrain_for_gpu.sh 0 ${BATCH_SIZE} ${DTYPE} 120 $ENABLE_GRAPH_KERNEL $i 13 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 14 | let i++ 15 | sleep 20 16 | done 17 | 18 | 19 | i=1 20 | while [ $i -le $NUM_TESTING ] 21 | do 22 | bash $SHELL_FOLDER/scripts/run_distributed_pretrain_for_gpu.sh 0,1 ${BATCH_SIZE} ${DTYPE} 120 $ENABLE_GRAPH_KERNEL $i 23 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 24 | let i++ 25 | sleep 20 26 | done 27 | 28 | 29 | i=1 30 | while [ $i -le $NUM_TESTING ] 31 | do 32 | bash $SHELL_FOLDER/scripts/run_distributed_pretrain_for_gpu.sh 0,1,2,3 ${BATCH_SIZE} ${DTYPE} 120 $ENABLE_GRAPH_KERNEL $i 33 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 34 | let i++ 35 | sleep 20 36 | done 37 | 38 | 39 | i=1 40 | while [ $i -le $NUM_TESTING ] 41 | do 42 | bash $SHELL_FOLDER/scripts/run_distributed_pretrain_for_gpu.sh 0,1,2,3,4,5,6,7 
${BATCH_SIZE} ${DTYPE} 120 $ENABLE_GRAPH_KERNEL $i 43 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 44 | let i++ 45 | sleep 20 46 | done 47 | 48 | -------------------------------------------------------------------------------- /MindSpore/bert/scripts/run_standalone_pretrain_for_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEVICE_ID=${1:-0} 4 | BATCH_SIZE=${2:-32} 5 | DTYPE=${3:-'fp32'} 6 | NUM_STEP=${4:-120} 7 | ENABLE_GRAPH_KERNEL=${5:-'false'} 8 | TEST_NUM=${6:-1} 9 | 10 | ENABLE_LOSSSCALE="false" 11 | if [ ${DTYPE} == "fp16" ] ; then 12 | ENABLE_LOSSSCALE="true" 13 | fi 14 | 15 | export CUDA_VISIBLE_DEVICES=$DEVICE_ID 16 | export GLOG_logtostderr=1 17 | export GLOG_v=2 18 | LOG_FOLDER=./logs/mindspore/bert/bz${BATCH_SIZE}/1n1g 19 | mkdir -p $LOG_FOLDER 20 | LOGFILE=${LOG_FOLDER}/bert_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 21 | 22 | python run_pretrain.py \ 23 | --device_target="GPU" \ 24 | --distribute="false" \ 25 | --epoch_size=1 \ 26 | --enable_save_ckpt="false" \ 27 | --enable_lossscale=$ENABLE_LOSSSCALE \ 28 | --enable_data_sink="true" \ 29 | --data_sink_steps=10 \ 30 | --train_steps=$NUM_STEP \ 31 | --data_dir="/workspace/bert/data/wiki" \ 32 | --enable_graph_kernel=$ENABLE_GRAPH_KERNEL \ 33 | --batch_size=$BATCH_SIZE \ 34 | --dtype=$DTYPE \ 35 | --schema_dir="" 2>&1 | tee $LOGFILE 36 | 37 | -------------------------------------------------------------------------------- /MindSpore/resnet50v1.5/scripts/run_distribute_train_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEVICE_ID=${1:-0} 4 | BATCH_SIZE=${2:-128} 5 | DTYPE=${3:-'fp32'} 6 | NUM_STEP=${4:-120} 7 | TEST_NUM=${5:-1} 8 | NODE_NUM=${6:-1} 9 | 10 | a=`expr ${#DEVICE_ID} + 1` 11 | GPUS_PER_NODE=`expr ${a} / 2` 12 | total_batch_size=`expr ${BATCH_SIZE} \* $GPUS_PER_NODE` 13 | echo "Use gpus: $DEVICE_ID" 14 | echo "Total batch size : $total_batch_size" 15 | 16 | TOTAL_GPU_NUM=`expr ${NODE_NUM} \* ${GPUS_PER_NODE}` 17 | echo "Total use: ${TOTAL_GPU_NUM} gpu" 18 | 19 | if [ ${NODE_NUM} -eq 1 ] ; then 20 | NODE_IP=localhost:${GPUS_PER_NODE} 21 | elif [ ${NODE_NUM} -eq 2 ] ; then 22 | NODE_IP=${NODE1}:${GPUS_PER_NODE},${NODE2}:${GPUS_PER_NODE} 23 | elif [ ${NODE_NUM} -eq 4 ] ; then 24 | NODE_IP=${NODE1}:${GPUS_PER_NODE},${NODE2}:${GPUS_PER_NODE},${NODE3}:${GPUS_PER_NODE},${NODE4}:${GPUS_PER_NODE} 25 | else 26 | echo "Invalid node num." 
27 | fi 28 | 29 | export CUDA_VISIBLE_DEVICES=$DEVICE_ID 30 | export GLOG_logtostderr=1 31 | export GLOG_v=2 32 | LOG_FOLDER=./logs/mindspore/resnet50/bz${BATCH_SIZE}/${NODE_NUM}n${GPUS_PER_NODE}g 33 | mkdir -p $LOG_FOLDER 34 | LOGFILE=${LOG_FOLDER}/rn50_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 35 | 36 | mpirun --allow-run-as-root \ 37 | --prefix /usr/local/openmpi-4.0.3 \ 38 | --output-filename log_output \ 39 | --merge-stderr-to-stdout \ 40 | -n $TOTAL_GPU_NUM -H $NODE_IP \ 41 | -x NCCL_DEBUG=INFO \ 42 | -mca plm_rsh_args "-p ${PORT}" \ 43 | python train.py \ 44 | --net="resnet50" \ 45 | --dataset="imagenet2012" \ 46 | --run_distribute=True \ 47 | --device_target="GPU" \ 48 | --data_sink_steps=10 \ 49 | --train_steps=$NUM_STEP \ 50 | --dataset_path="/workspace/resnet/data/ImageNet/train" \ 51 | --batch_size=$BATCH_SIZE \ 52 | --dtype=$DTYPE \ 53 | --device_num=$GPUS_PER_NODE \ 54 | 2>&1 | tee $LOGFILE 55 | -------------------------------------------------------------------------------- /MindSpore/resnet50v1.5/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BATCH_SIZE=${1:-128} 4 | DTYPE=${2:-'fp32'} 5 | NUM_TESTING=${3:-5} 6 | NODE_NUM=${4:-2} 7 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 8 | 9 | export NODE1=10.11.0.2 10 | export NODE2=10.11.0.3 11 | export NODE3=10.11.0.4 12 | export NODE4=10.11.0.5 13 | export PORT=10000 14 | 15 | i=1 16 | while [ $i -le $NUM_TESTING ] 17 | do 18 | bash $SHELL_FOLDER/scripts/run_distribute_train_gpu.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} ${DTYPE} 200 $i ${NODE_NUM} 19 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 20 | let i++ 21 | sleep 20 22 | done 23 | 24 | -------------------------------------------------------------------------------- /MindSpore/resnet50v1.5/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BATCH_SIZE=${1:-128} 4 | DTYPE=${2:-'fp32'} 5 | NUM_TESTING=${3:-5} 6 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 7 | 8 | i=1 9 | while [ $i -le $NUM_TESTING ] 10 | do 11 | bash $SHELL_FOLDER/scripts/run_standalone_train_gpu.sh 0 ${BATCH_SIZE} ${DTYPE} 200 $i 12 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 13 | let i++ 14 | sleep 20 15 | done 16 | 17 | 18 | i=1 19 | while [ $i -le $NUM_TESTING ] 20 | do 21 | bash $SHELL_FOLDER/scripts/run_distribute_train_gpu.sh 0,1 ${BATCH_SIZE} ${DTYPE} 200 $i 22 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 23 | let i++ 24 | sleep 20 25 | done 26 | 27 | 28 | i=1 29 | while [ $i -le $NUM_TESTING ] 30 | do 31 | bash $SHELL_FOLDER/scripts/run_distribute_train_gpu.sh 0,1,2,3 ${BATCH_SIZE} ${DTYPE} 200 $i 32 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 33 | let i++ 34 | sleep 20 35 | done 36 | 37 | 38 | i=1 39 | while [ $i -le $NUM_TESTING ] 40 | do 41 | bash $SHELL_FOLDER/scripts/run_distribute_train_gpu.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} ${DTYPE} 200 $i 42 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 43 | let i++ 44 | sleep 20 45 | done 46 | 47 | -------------------------------------------------------------------------------- /MindSpore/resnet50v1.5/scripts/run_standalone_train_gpu.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEVICE_ID=${1:-0} 4 | BATCH_SIZE=${2:-128} 5 | DTYPE=${3:-'fp32'} 6 | NUM_STEP=${4:-120} 7 | TEST_NUM=${5:-1} 8 | 9 | export CUDA_VISIBLE_DEVICES=$DEVICE_ID 10 | 11 | export GLOG_logtostderr=1 12 | export GLOG_v=2 13 | LOG_FOLDER=./logs/mindspore/resnet50/bz${BATCH_SIZE}/1n1g 14 | mkdir -p $LOG_FOLDER 15 | LOGFILE=${LOG_FOLDER}/rn50_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 16 | 17 | python train.py \ 18 | --net="resnet50" \ 19 | --dataset="imagenet2012" \ 20 | --device_target="GPU" \ 21 | --data_sink_steps=10 \ 22 | --train_steps=$NUM_STEP \ 23 | --dataset_path="/workspace/resnet/data/ImageNet/train" \ 24 | --batch_size=$BATCH_SIZE \ 25 | --dtype=$DTYPE \ 26 | 2>&1 | tee $LOGFILE 27 | -------------------------------------------------------------------------------- /MxNet/BERT/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | MODEL=${1:-"bert_base"} 3 | BZ_PER_DEVICE=${2:-32} 4 | ITER_NUM=${3:-200} 5 | GPUS=${4:-0} 6 | NODE_NUM=${5:-1} 7 | DTYPE=${6:-"fp32"} 8 | TEST_NUM=${7:-1} 9 | 10 | a=`expr ${#GPUS} + 1` 11 | gpu_num_per_node=`expr ${a} / 2` 12 | gpu_num=`expr ${gpu_num_per_node} \* ${NODE_NUM}` 13 | total_bz=`expr ${BZ_PER_DEVICE} \* ${gpu_num}` 14 | 15 | if [ "$DTYPE" = "fp16" ] ; then 16 | PRECISION="float16" 17 | else 18 | PRECISION="float32" 19 | fi 20 | 21 | log_folder=logs/mxnet/bert/bz${BZ_PER_DEVICE}/${NODE_NUM}n${gpu_num_per_node}g 22 | mkdir -p $log_folder 23 | log_file=$log_folder/bert_b${BZ_PER_DEVICE}_${DTYPE}_$TEST_NUM.log 24 | 25 | if [ ${NODE_NUM} -eq 1 ] ; then 26 | node_ip=localhost:${gpu_num_per_node} 27 | elif [ ${NODE_NUM} -eq 2 ] ; then 28 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node} 29 | elif [ ${NODE_NUM} -eq 4 ] ; then 30 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node},${NODE3}:${gpu_num_per_node},${NODE4}:${gpu_num_per_node} 31 | else 32 | echo "Not a valid node." 
33 | fi 34 | 35 | 36 | ONE_PART_NPY=$(eval ls ${DATA_DIR}/* | tr " " "\n" | awk '{printf "%s,",$1}' | sed s'/.$//') 37 | 38 | CMD="" 39 | case $MODEL in 40 | "bert_base") CMD+="--model bert_12_768_12 ";; 41 | "bert_large") CMD+="--model bert_24_1024_16 ";; 42 | esac 43 | 44 | 45 | CMD+="--dtype ${PRECISION} \ 46 | --warmup_ratio 1 \ 47 | --comm_backend horovod \ 48 | --total_batch_size ${total_bz} \ 49 | --total_batch_size_eval ${total_bz} \ 50 | --accumulate 1 \ 51 | --lr 1e-4 \ 52 | --max_seq_length 128 \ 53 | --max_predictions_per_seq 20 \ 54 | --num_steps ${ITER_NUM} \ 55 | --log_interval 1 \ 56 | --ckpt_interval 1000 \ 57 | --no_compute_acc \ 58 | --data ${ONE_PART_NPY} " 59 | 60 | echo "begin time: "; date; 61 | # horovodrun -np ${gpu_num} -H ${node_ip} -p ${PORT} \ 62 | # --start-timeout 600 \ 63 | # python3 ${WORKSPACE}/run_pretraining.py ${CMD} 2>&1 | tee ${log_file} 64 | 65 | mpirun -oversubscribe -np ${gpu_num} -H ${node_ip} \ 66 | -bind-to none -map-by slot \ 67 | -x LD_LIBRARY_PATH -x PATH \ 68 | -mca pml ob1 -mca btl ^openib \ 69 | -mca plm_rsh_args "-p 22 -q -o StrictHostKeyChecking=no" \ 70 | -mca btl_tcp_if_include ib0 \ 71 | python3 ${WORKSPACE}/run_pretraining.py ${CMD} 2>&1 | tee ${log_file} 72 | 73 | 74 | echo "Writting log to $log_file" 75 | echo "end time: "; date; -------------------------------------------------------------------------------- /MxNet/BERT/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BZ_PER_DEVICE=${1:-32} 4 | DTYPE=${2:-'fp32'} 5 | 6 | 7 | export WORKSPACE=/home/leinao/lyon_test/gluon-nlp/scripts/bert 8 | export DATA_DIR=/datasets/bert/mxnet/wiki_128_npy_part_0 9 | 10 | export NODE1=10.11.0.2 11 | export NODE2=10.11.0.3 12 | export NODE3=10.11.0.4 13 | export NODE4=10.11.0.5 14 | export PORT=22 15 | echo "BZ_PER_DEVICE >> ${BZ_PER_DEVICE}" 16 | 17 | 18 | i=1 19 | while [ $i -le 5 ] 20 | do 21 | rm -rf ckpt_dir 22 | bash $SHELL_FOLDER/pretrain.sh bert_base ${BZ_PER_DEVICE} 200 0 1 $DTYPE ${i} 23 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 24 | let i++ 25 | sleep 20s 26 | done 27 | 28 | 29 | i=1 30 | while [ $i -le 5 ] 31 | do 32 | rm -rf ckpt_dir 33 | bash $SHELL_FOLDER/pretrain.sh bert_base ${BZ_PER_DEVICE} 200 0,1 1 $DTYPE ${i} 34 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 35 | let i++ 36 | sleep 20s 37 | done 38 | 39 | 40 | i=1 41 | while [ $i -le 5 ] 42 | do 43 | rm -rf ckpt_dir 44 | bash $SHELL_FOLDER/pretrain.sh bert_base ${BZ_PER_DEVICE} 200 0,1,2,3 1 $DTYPE ${i} 45 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 46 | let i++ 47 | sleep 20s 48 | done 49 | 50 | 51 | i=1 52 | while [ $i -le 5 ] 53 | do 54 | rm -rf ckpt_dir 55 | bash $SHELL_FOLDER/pretrain.sh bert_base ${BZ_PER_DEVICE} 200 0,1,2,3,4,5,6,7 1 $DTYPE ${i} 56 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 57 | let i++ 58 | sleep 20s 59 | done 60 | 61 | 62 | i=1 63 | while [ $i -le 5 ] 64 | do 65 | rm -rf ckpt_dir 66 | bash $SHELL_FOLDER/pretrain.sh bert_base ${BZ_PER_DEVICE} 200 0,1,2,3,4,5,6,7 2 $DTYPE ${i} 67 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 68 | let i++ 69 | sleep 20s 70 | done 71 | 72 | 73 | i=1 74 | while [ $i -le 5 ] 75 | do 76 | rm -rf ckpt_dir 
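# remove checkpoints left by the previous repetition so the next timed run starts from a clean state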
77 | bash $SHELL_FOLDER/pretrain.sh bert_base ${BZ_PER_DEVICE} 200 0,1,2,3,4,5,6,7 4 $DTYPE ${i} 78 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 79 | let i++ 80 | sleep 20s 81 | done -------------------------------------------------------------------------------- /MxNet/Classification/RN50v1b/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BZ_PER_DEVICE=${1:-128} 4 | DTYPE=${2:-'fp32'} 5 | 6 | 7 | export NODE1=10.11.0.5 8 | export NODE2=10.11.0.4 9 | export NODE3=10.11.0.3 10 | export NODE4=10.11.0.2 11 | export PORT=22 12 | echo "BZ_PER_DEVICE >> ${BZ_PER_DEVICE}" 13 | 14 | 15 | i=1 16 | while [ $i -le 5 ] 17 | do 18 | bash $SHELL_FOLDER/runner.sh ${BZ_PER_DEVICE} 0 1 $DTYPE ${i} 19 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 20 | let i++ 21 | sleep 20s 22 | done 23 | 24 | 25 | i=1 26 | while [ $i -le 5 ] 27 | do 28 | bash $SHELL_FOLDER/runner.sh ${BZ_PER_DEVICE} 0,1 1 $DTYPE ${i} 29 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 30 | let i++ 31 | sleep 20s 32 | done 33 | 34 | 35 | i=1 36 | while [ $i -le 5 ] 37 | do 38 | bash $SHELL_FOLDER/runner.sh ${BZ_PER_DEVICE} 0,1,2,3 1 $DTYPE ${i} 39 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 40 | let i++ 41 | sleep 20s 42 | done 43 | 44 | 45 | i=1 46 | while [ $i -le 5 ] 47 | do 48 | bash $SHELL_FOLDER/runner.sh ${BZ_PER_DEVICE} 0,1,2,3,4,5,6,7 1 $DTYPE ${i} 49 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 50 | let i++ 51 | sleep 20s 52 | done 53 | 54 | 55 | i=1 56 | while [ $i -le 5 ] 57 | do 58 | bash $SHELL_FOLDER/runner.sh ${BZ_PER_DEVICE} 0,1,2,3,4,5,6,7 2 $DTYPE ${i} 59 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 60 | let i++ 61 | sleep 20s 62 | done 63 | 64 | 65 | i=1 66 | while [ $i -le 5 ] 67 | do 68 | bash $SHELL_FOLDER/runner.sh ${BZ_PER_DEVICE} 0,1,2,3,4,5,6,7 4 $DTYPE ${i} 69 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 70 | let i++ 71 | sleep 20s 72 | done -------------------------------------------------------------------------------- /MxNet/Classification/RN50v1b/runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | BATCH_SIZE=${1:-128} 3 | GPUS=${2:-0,1,2,3,4,5,6,7} 4 | NODE_NUM=${3:-1} 5 | DTYPE=${4:-"fp32"} 6 | TEST_NUM=${5:-1} 7 | 8 | a=`expr ${#GPUS} + 1` 9 | gpu_num_per_node=`expr ${a} / 2` 10 | gpu_num=`expr ${gpu_num_per_node} \* ${NODE_NUM}` 11 | total_bz=`expr ${BATCH_SIZE} \* ${gpu_num}` 12 | 13 | if [ "$DTYPE" = "fp16" ] ; then 14 | PRECISION="float16" 15 | else 16 | PRECISION="float32" 17 | fi 18 | 19 | 20 | log_folder=../logs/mxnet/resnet50/bz${BATCH_SIZE}/${NODE_NUM}n${gpu_num_per_node}g 21 | mkdir -p $log_folder 22 | log_file=$log_folder/rn50_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log 23 | 24 | if [ ${NODE_NUM} -eq 1 ] ; then 25 | node_ip=localhost:${gpu_num_per_node} 26 | elif [ ${NODE_NUM} -eq 2 ] ; then 27 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node} 28 | elif [ ${NODE_NUM} -eq 4 ] ; then 29 | 
node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node},${NODE3}:${gpu_num_per_node},${NODE4}:${gpu_num_per_node} 30 | else 31 | echo "Not a valid node." 32 | fi 33 | 34 | export CUDA_VISIBLE_DEVICES=$GPUS 35 | DATA_DIR=/datasets/ImageNet/MXNet 36 | 37 | mpirun --allow-run-as-root -oversubscribe -np ${gpu_num} -H ${node_ip} \ 38 | -bind-to none -map-by slot \ 39 | -x LD_LIBRARY_PATH -x PATH \ 40 | -mca pml ob1 -mca btl ^openib \ 41 | -mca plm_rsh_args "-p 22 -q -o StrictHostKeyChecking=no" \ 42 | -mca btl_tcp_if_include ib0 python3 train_horovod.py \ 43 | --mode='hybrid' \ 44 | --model='resnet50_v1b' \ 45 | --use-rec \ 46 | --rec-train=$DATA_DIR/train.rec \ 47 | --rec-val=$DATA_DIR/val.rec \ 48 | --batch-size=${BATCH_SIZE} \ 49 | --dtype=${PRECISION} \ 50 | --log-interval=1 \ 51 | --save-frequency=10000 \ 52 | --lr=0.001 \ 53 | --momentum=0.875 \ 54 | --wd=0.000030518 \ 55 | --num-epochs=1 \ 56 | --warmup-epochs=1 2>&1 | tee ${log_file} 57 | -------------------------------------------------------------------------------- /MxNet/InsightFace/ArcFace/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | MODEL=${1:-r100} 4 | BZ_PER_DEVICE=${2:-64} 5 | SAMPLE_RATIO=${3:-1.0} 6 | DTYPE=${4:-'fp32'} 7 | TEST_NUM=${5:-5} 8 | 9 | export NODE1=10.11.0.2 10 | export NODE2=10.11.0.3 11 | export NODE3=10.11.0.4 12 | export NODE4=10.11.0.5 13 | 14 | 15 | i=1 16 | while [ $i -le ${TEST_NUM} ] 17 | do 18 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0 1 $SAMPLE_RATIO $DTYPE ${i} 19 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 20 | let i++ 21 | sleep 20s 22 | done 23 | 24 | 25 | i=1 26 | while [ $i -le ${TEST_NUM} ] 27 | do 28 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1 1 $SAMPLE_RATIO $DTYPE ${i} 29 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 30 | let i++ 31 | sleep 20s 32 | done 33 | 34 | 35 | i=1 36 | while [ $i -le ${TEST_NUM} ] 37 | do 38 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1,2,3 1 $SAMPLE_RATIO $DTYPE ${i} 39 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 40 | let i++ 41 | sleep 20s 42 | done 43 | 44 | 45 | i=1 46 | while [ $i -le ${TEST_NUM} ] 47 | do 48 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 1 $SAMPLE_RATIO $DTYPE ${i} 49 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 50 | let i++ 51 | sleep 20s 52 | done -------------------------------------------------------------------------------- /MxNet/InsightFace/ArcFace/runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | MODEL=${1:-"r100"} 3 | BZ_PER_DEVICE=${2:-64} 4 | ITER_NUM=${3:-120} 5 | GPUS=${4:-7} 6 | NODE_NUM=${5:-1} 7 | DTYPE=${6:-"fp32"} 8 | TEST_NUM=${7:-1} 9 | DATASET=${8:-emore} 10 | MODEL_PARALLEL=${9:-"True"} 11 | 12 | a=`expr ${#GPUS} + 1` 13 | gpu_num_per_node=`expr ${a} / 2` 14 | gpu_num=`expr ${gpu_num_per_node} \* ${NODE_NUM}` 15 | total_bz=`expr ${BZ_PER_DEVICE} \* ${gpu_num}` 16 | 17 | if [ "$DTYPE" = "fp16" ] ; then 18 | PRECISION="float16" 19 | else 20 | PRECISION="float32" 21 | fi 22 | 23 | case $MODEL in 24 | "r100") LOSS=arcface ;; 25 | "y1") LOSS=arcface ;; 26 | esac 27 | 28 | 29 | 30 | 
log_folder=20210204-logs-${MODEL}-${LOSS}/insightface/arcface/bz${BZ_PER_DEVICE}/${NODE_NUM}n${gpu_num_per_node}g 31 | mkdir -p $log_folder 32 | log_file=$log_folder/${MODEL}_b${BZ_PER_DEVICE}_${DTYPE}_$TEST_NUM.log 33 | 34 | if [ ${NODE_NUM} -eq 1 ] ; then 35 | node_ip=localhost:${gpu_num_per_node} 36 | else 37 | echo "Not a valid node." 38 | fi 39 | 40 | export CUDA_VISIBLE_DEVICES=${GPUS} 41 | sed -i "s/\(default.per_batch_size = \)\S*/\default.per_batch_size = ${BZ_PER_DEVICE}/" config.py 42 | 43 | 44 | echo "Begin time: "; date; 45 | 46 | if [ "$MODEL_PARALLEL" = "True" ] ; then 47 | echo "Use model patallel mode" 48 | python train_parall.py \ 49 | --network ${MODEL} \ 50 | --loss ${LOSS} \ 51 | --dataset ${DATASET} 2>&1 | tee ${log_file} 52 | else 53 | echo "Use data patallel mode" 54 | python train.py \ 55 | --network ${MODEL} \ 56 | --loss ${LOSS} \ 57 | --dataset ${DATASET} 2>&1 | tee ${log_file} 58 | fi 59 | 60 | 61 | echo "Writting log to $log_file" 62 | echo "End time: "; date; 63 | 64 | 65 | -------------------------------------------------------------------------------- /MxNet/InsightFace/PartailFC/run_multi_node_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | MODEL=${1:-r100} 4 | BZ_PER_DEVICE=${2:-64} 5 | SAMPLE_RATIO=${3:-0.1} 6 | DTYPE=${4:-'fp32'} 7 | TEST_NUM=${5:-5} 8 | 9 | export NODE1=10.11.0.2 10 | export NODE2=10.11.0.3 11 | export NODE3=10.11.0.4 12 | export NODE4=10.11.0.5 13 | 14 | 15 | i=1 16 | while [ $i -le ${TEST_NUM} ] 17 | do 18 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 2 $SAMPLE_RATIO $DTYPE ${i} 19 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 20 | let i++ 21 | sleep 20s 22 | done 23 | 24 | 25 | i=1 26 | while [ $i -le ${TEST_NUM} ] 27 | do 28 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 4 $SAMPLE_RATIO $DTYPE ${i} 29 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 30 | let i++ 31 | sleep 20s 32 | done 33 | -------------------------------------------------------------------------------- /MxNet/InsightFace/PartailFC/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | MODEL=${1:-r100} 4 | BZ_PER_DEVICE=${2:-64} 5 | SAMPLE_RATIO=${3:-1.0} 6 | DTYPE=${4:-'fp32'} 7 | TEST_NUM=${5:-5} 8 | 9 | export NODE1=10.11.0.2 10 | export NODE2=10.11.0.3 11 | export NODE3=10.11.0.4 12 | export NODE4=10.11.0.5 13 | 14 | 15 | i=1 16 | while [ $i -le ${TEST_NUM} ] 17 | do 18 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0 1 $SAMPLE_RATIO $DTYPE ${i} 19 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 20 | let i++ 21 | sleep 20s 22 | done 23 | 24 | 25 | i=1 26 | while [ $i -le ${TEST_NUM} ] 27 | do 28 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1,2,3 1 $SAMPLE_RATIO $DTYPE ${i} 29 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 30 | let i++ 31 | sleep 20s 32 | done 33 | 34 | 35 | i=1 36 | while [ $i -le ${TEST_NUM} ] 37 | do 38 | bash $SHELL_FOLDER/runner.sh ${MODEL} ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 1 $SAMPLE_RATIO $DTYPE ${i} 39 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case 
${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 40 | let i++ 41 | sleep 20s 42 | done 43 | 44 | -------------------------------------------------------------------------------- /MxNet/InsightFace/PartailFC/runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | MODEL=${1:-"r100"} 3 | BZ_PER_DEVICE=${2:-64} 4 | ITER_NUM=${3:-120} 5 | GPUS=${4:-0} 6 | NODE_NUM=${5:-1} 7 | SAMPLE_RATIO=${6:-1.0} 8 | DTYPE=${7:-"fp32"} 9 | TEST_NUM=${8:-1} 10 | 11 | a=`expr ${#GPUS} + 1` 12 | gpu_num_per_node=`expr ${a} / 2` 13 | gpu_num=`expr ${gpu_num_per_node} \* ${NODE_NUM}` 14 | total_bz=`expr ${BZ_PER_DEVICE} \* ${gpu_num}` 15 | 16 | export CUDA_VISIBLE_DEVICES=${GPUS} 17 | export HOROVOD_GPU_ALLREDUCE=NCCL 18 | export HOROVOD_GPU_ALLGATHER=NCCL 19 | export HOROVOD_GPU_BROADCAST=NCLL 20 | export MXNET_CPU_WORKER_NTHREADS=3 21 | 22 | if [ ${NODE_NUM} -eq 1 ] ; then 23 | node_ip=localhost:${gpu_num_per_node} 24 | elif [ ${NODE_NUM} -eq 2 ] ; then 25 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node} 26 | elif [ ${NODE_NUM} -eq 4 ] ; then 27 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node},${NODE3}:${gpu_num_per_node},${NODE4}:${gpu_num_per_node} 28 | else 29 | echo "Not a valid node." 30 | fi 31 | 32 | if [ "$DTYPE" = "fp16" ] ; then 33 | sed -i "s/\(config.fp16 = \)\S*/config.fp16 = True/" default.py 34 | else 35 | sed -i "s/\(config.fp16 = \)\S*/config.fp16 = False/" default.py 36 | fi 37 | sed -i "s/\(config.batch_size = \)\S*/config.batch_size = ${BZ_PER_DEVICE}/" default.py 38 | sed -i "s/\(config.max_update = \)\S*/config.max_update = ${ITER_NUM}/" default.py 39 | sed -i "s/\(config.sample_ratio = \)\S*/config.sample_ratio = ${SAMPLE_RATIO}/" default.py 40 | 41 | 42 | log_folder=./logs-20210222-sample-ratio-${SAMPLE_RATIO}/mxnet/partial_fc/bz${BZ_PER_DEVICE}/${NODE_NUM}n${gpu_num_per_node}g 43 | mkdir -p $log_folder 44 | log_file=$log_folder/${MODEL}_b${BZ_PER_DEVICE}_${DTYPE}_$TEST_NUM.log 45 | 46 | # use `which python` to get the absolute path of your python interpreter 47 | # dataset: emore webface glint360k_8GPU ; loss : arcface; cosface 48 | 49 | dataset=emore 50 | loss=arcface 51 | 52 | 53 | PYTHON_EXEC=/home/leinao/anaconda3/envs/mxnet/bin/python 54 | FOLDER=$(dirname $(readlink -f "$0")) 55 | 56 | if [ ${NODE_NUM} -eq 1 ] ; then 57 | export HOROVOD_CACHE_CAPACITY=0 58 | horovodrun -np ${gpu_num} -H ${node_ip} ${PYTHON_EXEC} \ 59 | ${FOLDER}/train_memory.py \ 60 | --dataset ${dataset} \ 61 | --loss ${loss} \ 62 | --network ${MODEL} 2>&1 | tee ${log_file} 63 | else 64 | export HOROVOD_CACHE_CAPACITY=1024 65 | mpirun --allow-run-as-root -oversubscribe \ 66 | -np ${gpu_num} -H ${node_ip} \ 67 | -x HOROVOD_CACHE_CAPACITY=1024 \ 68 | -bind-to none -map-by slot \ 69 | -x LD_LIBRARY_PATH -x PATH \ 70 | -mca pml ob1 -mca btl ^openib \ 71 | -mca plm_rsh_args "-p 22 -q -o StrictHostKeyChecking=no" \ 72 | -mca btl_tcp_if_include ib0 \ 73 | -x OMP_NUM_THREADS=2 \ 74 | -x MXNET_USE_OPERATOR_TUNING=1 \ 75 | -x MXNET_USE_NUM_CORES_OPERATOR_TUNING=1 \ 76 | -x MXNET_CUDNN_AUTOTUNE_DEFAULT=1 \ 77 | ${PYTHON_EXEC} ${FOLDER}/train_memory.py \ 78 | --dataset ${dataset} \ 79 | --loss ${loss} \ 80 | --network ${MODEL} 2>&1 | tee ${log_file} 81 | fi 82 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/MxNet/Classification/RN50v1.5/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 
SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BZ_PER_DEVICE=${1:-128} 4 | DTYPE=${2:-"fp32"} 5 | TEST_NUM=${3:-5} 6 | 7 | export DATA_DIR=/data/imagenet/train-val-recordio-passthrough 8 | 9 | export NODE1=10.11.0.2 10 | export NODE2=10.11.0.3 11 | export NODE3=10.11.0.4 12 | export NODE4=10.11.0.5 13 | export PORT=10001 14 | 15 | 16 | i=1 17 | while [ $i -le $TEST_NUM ] 18 | do 19 | bash $SHELL_FOLDER/runner.sh resnetv15 ${BZ_PER_DEVICE} 120 0 1 ${DTYPE} ${i} 20 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 21 | let i++ 22 | sleep 20s 23 | done 24 | 25 | 26 | i=1 27 | while [ $i -le $TEST_NUM ] 28 | do 29 | bash $SHELL_FOLDER/runner.sh resnetv15 ${BZ_PER_DEVICE} 120 0,1,2,3 1 ${DTYPE} ${i} 30 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 31 | let i++ 32 | sleep 20s 33 | done 34 | 35 | 36 | i=1 37 | while [ $i -le $TEST_NUM ] 38 | do 39 | bash $SHELL_FOLDER/runner.sh resnetv15 ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 1 ${DTYPE} ${i} 40 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 41 | let i++ 42 | sleep 20s 43 | done 44 | 45 | 46 | i=1 47 | while [ $i -le $TEST_NUM ] 48 | do 49 | bash $SHELL_FOLDER/runner.sh resnetv15 ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 2 ${DTYPE} ${i} 50 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 51 | let i++ 52 | sleep 20s 53 | done 54 | 55 | 56 | i=1 57 | while [ $i -le $TEST_NUM ] 58 | do 59 | bash $SHELL_FOLDER/runner.sh resnetv15 ${BZ_PER_DEVICE} 120 0,1,2,3,4,5,6,7 4 ${DTYPE} ${i} 60 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 61 | let i++ 62 | sleep 20s 63 | done 64 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/MxNet/Classification/RN50v1.5/runner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | MODEL=${1:-"resnetv15"} 3 | BZ_PER_DEVICE=${2:-32} 4 | ITER_NUM=${3:-120} 5 | GPUS=${4:-0} 6 | NODE_NUM=${5:-1} 7 | DTYPE=${6:-"fp32"} 8 | TEST_NUM=${7:-1} 9 | 10 | a=`expr ${#GPUS} + 1` 11 | gpu_num_per_node=`expr ${a} / 2` 12 | gpu_num=`expr ${gpu_num_per_node} \* ${NODE_NUM}` 13 | total_bz=`expr ${BZ_PER_DEVICE} \* ${gpu_num}` 14 | 15 | if [ "$DTYPE" == "fp16" ] ; then 16 | PRECISION="float16" 17 | else 18 | PRECISION="float32" 19 | fi 20 | 21 | 22 | log_folder=logs/ngc/mxnet/resnet50/bz${BZ_PER_DEVICE}/${NODE_NUM}n${gpu_num_per_node}g 23 | mkdir -p $log_folder 24 | log_file=$log_folder/r50_b${BZ_PER_DEVICE}_${DTYPE}_$TEST_NUM.log 25 | 26 | 27 | if [ ${NODE_NUM} -eq 1 ] ; then 28 | node_ip=localhost:${gpu_num_per_node} 29 | elif [ ${NODE_NUM} -eq 2 ] ; then 30 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node} 31 | elif [ ${NODE_NUM} -eq 4 ] ; then 32 | node_ip=${NODE1}:${gpu_num_per_node},${NODE2}:${gpu_num_per_node},${NODE3}:${gpu_num_per_node},${NODE4}:${gpu_num_per_node} 33 | else 34 | echo "Not a valid node." 
35 | fi 36 | 37 | echo ${node_ip} 38 | 39 | CMD="" 40 | case $PRECISION in 41 | "float32") CMD+="--dtype float32 --input-layout NHWC --fuse-bn-relu 0 --fuse-bn-add-relu 0 ";; 42 | "float16") CMD+="--dtype float16 --amp --fuse-bn-relu 1 --fuse-bn-add-relu 1 \ 43 | --input-layout NCHW --conv-layout NHWC --batchnorm-layout NHWC \ 44 | --pooling-layout NHWC ";; 45 | esac 46 | 47 | CMD+="--arch resnetv15 \ 48 | --num-layers 50 \ 49 | --num-classes 1000 \ 50 | --mode train \ 51 | --data-train ${DATA_DIR}/train.rec \ 52 | --data-train-idx ${DATA_DIR}/train.idx \ 53 | --gpus ${GPUS} \ 54 | --batch-size ${total_bz} \ 55 | --image-shape 3,224,224 \ 56 | --lr 0.256 \ 57 | --lr-schedule cosine \ 58 | --optimizer sgd \ 59 | --mom 0.875 \ 60 | --wd 3.0517578125e-05 \ 61 | --label-smoothing 0.1 \ 62 | --kv-store horovod \ 63 | --data-backend dali-gpu \ 64 | --benchmark-iters ${ITER_NUM} \ 65 | --no-metrics \ 66 | --disp-batches 1 \ 67 | --save-frequency 0 \ 68 | --num-epochs 1" 69 | 70 | export MXNET_UPDATE_ON_KVSTORE=0 71 | export MXNET_EXEC_ENABLE_ADDTO=1 72 | export MXNET_USE_TENSORRT=0 73 | export MXNET_GPU_WORKER_NTHREADS=2 74 | export MXNET_GPU_COPY_NTHREADS=1 75 | export MXNET_OPTIMIZER_AGGREGATION_SIZE=54 76 | export HOROVOD_CYCLE_TIME=0.1 77 | export HOROVOD_FUSION_THRESHOLD=67108864 78 | export HOROVOD_NUM_NCCL_STREAMS=2 79 | export MXNET_HOROVOD_NUM_GROUPS=16 80 | export MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD=999 81 | export MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD=25 82 | 83 | echo "begin time: "; date; 84 | horovodrun -np ${gpu_num} \ 85 | -H ${node_ip} -p ${PORT} \ 86 | --start-timeout 600 \ 87 | python3 train.py ${CMD} 2>&1 | tee ${log_file} 88 | 89 | 90 | # mpirun --allow-run-as-root -oversubscribe -np ${gpu_num} -H ${node_ip} \ 91 | # -bind-to none -map-by slot \ 92 | # -x LD_LIBRARY_PATH -x PATH \ 93 | # -mca pml ob1 -mca btl ^openib \ 94 | # -mca plm_rsh_args "-p ${PORT} -q -o StrictHostKeyChecking=no" \ 95 | # -mca btl_tcp_if_include ib0 \ 96 | # python3 train.py ${CMD} 2>&1 | tee ${log_file} 97 | 98 | echo "Writting to ${log_file}" 99 | echo "end time: "; date; -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/BERT/scripts/run_multi_nodes.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | MODEL="bert-base" 4 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 5 | BATCH_SIZE=48 6 | NUM_NODES=4 7 | MASTER_NODE=10.11.0.2 8 | MASTER_PORT=22334 9 | ITER_NUM=150 10 | PREC=fp32 11 | 12 | i=1 13 | while [ $i -le 5 ] 14 | do 15 | bash $SHELL_FOLDER/single_node_train.sh ${BATCH_SIZE} ${NUM_NODES} ${MASTER_NODE} ${MASTER_PORT} 8 ${ITER_NUM} ${PREC} $i 16 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished ${MODEL} Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 17 | let i++ 18 | sleep 20 19 | done 20 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/BERT/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | MODEL="bert-base" 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BATCH_SIZE=96 4 | NUM_NODES=1 5 | MASTER_NODE=127.0.0.1 6 | MASTER_PORT=29500 7 | PREC=fp16 8 | ITER_NUM=150 9 | 10 | i=1 11 | while [ $i -le 5 ] 12 | do 13 | bash $SHELL_FOLDER/single_node_train.sh ${BATCH_SIZE} ${NUM_NODES} ${MASTER_NODE} ${MASTER_PORT} 1 ${ITER_NUM} ${PREC} $i 14 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished ${MODEL} Test Case ${i}! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 15 | let i++ 16 | sleep 20 17 | done 18 | 19 | i=1 20 | while [ $i -le 5 ] 21 | do 22 | bash $SHELL_FOLDER/single_node_train.sh ${BATCH_SIZE} ${NUM_NODES} ${MASTER_NODE} ${MASTER_PORT} 4 ${ITER_NUM} ${PREC} $i 23 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished ${MODEL} Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 24 | let i++ 25 | sleep 20 26 | done 27 | 28 | 29 | i=1 30 | while [ $i -le 5 ] 31 | do 32 | bash $SHELL_FOLDER/single_node_train.sh ${BATCH_SIZE} ${NUM_NODES} ${MASTER_NODE} ${MASTER_PORT} 8 ${ITER_NUM} ${PREC} $i 33 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished ${MODEL} Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 34 | let i++ 35 | sleep 20 36 | done 37 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/BERT/scripts/run_two_nodes.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | MODEL="bert-base" 4 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 5 | BATCH_SIZE=96 6 | NUM_NODES=2 7 | MASTER_NODE=10.11.0.2 8 | MASTER_PORT=22334 9 | PREC=fp16 10 | ITER_NUM=150 11 | 12 | i=1 13 | while [ $i -le 5 ] 14 | do 15 | NCCL_DEBUG=INFO bash $SHELL_FOLDER/single_node_train.sh ${BATCH_SIZE} ${NUM_NODES} ${MASTER_NODE} ${MASTER_PORT} 8 ${ITER_NUM} ${PREC} $i 16 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished ${MODEL} Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 17 | let i++ 18 | sleep 20 19 | done 20 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/BERT/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Container nvidia build = " $NVIDIA_BUILD_ID 4 | train_batch_size=${1:-48} 5 | num_nodes=${2:-1} 6 | master_node=${3:-127.0.0.1} 7 | master_port=${4:-29500} 8 | num_gpus=${5:-1} 9 | train_steps=${6:-120} 10 | precision=${7:-"fp32"} 11 | test_times=${8:-1} 12 | learning_rate=${9:-"6e-3"} 13 | DATASET=workspace/examples/bert/data/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets 14 | CODEDIR=${10:-"/workspace/examples/bert"} 15 | LOGDIR=./${precision}_ngc_bert_b${train_batch_size}/pytorch/${num_nodes}n${num_gpus}g 16 | CHECKPOINTS_DIR=${CODEDIR}/results/checkpoints 17 | job_name=${11:-"bert-base-adam-training"} 18 | seed=${12:-42} 19 | accumulate_gradients=${13:-"false"} 20 | allreduce_post_accumulation=${14:-"false"} 21 | gradient_accumulation_steps=${15:-1} 22 | allreduce_post_accumulation_fp16=${16:-"false"} 23 | DATA_DIR_PHASE=${17:-$BERT_PREP_WORKING_DIR/${DATASET}/} 24 | resume_training=${18:-"false"} 25 | create_logfile=${19:-"true"} 26 | warmup_proportion=${20:-"1"} 27 | save_checkpoint_steps=${21:-1000} 28 | init_checkpoint=${22:-"None"} 29 | BERT_CONFIG=${CODEDIR}/bert_config.json 30 | mkdir -p $LOGDIR 31 | mkdir -p $CHECKPOINTS_DIR 32 | 33 | if [ ! -d "$DATA_DIR_PHASE" ] ; then 34 | echo "Warning! $DATA_DIR_PHASE directory missing. Training cannot start" 35 | fi 36 | if [ ! -d "$LOGDIR" ] ; then 37 | echo "Error! $LOGDIR directory missing." 38 | exit -1 39 | fi 40 | if [ ! -d "$CHECKPOINTS_DIR" ] ; then 41 | echo "Warning! $CHECKPOINTS_DIR directory missing." 42 | echo "Checkpoints will be written to $LOGDIR instead." 43 | CHECKPOINTS_DIR=$LOGDIR 44 | fi 45 | if [ ! -f "$BERT_CONFIG" ] ; then 46 | echo "Error! 
BERT large configuration file not found at $BERT_CONFIG" 47 | exit -1 48 | fi 49 | 50 | PREC="" 51 | if [ "$precision" = "fp16" ] ; then 52 | PREC="--fp16" 53 | elif [ "$precision" = "fp32" ] ; then 54 | PREC="" 55 | else 56 | echo "Unknown argument" 57 | exit -2 58 | fi 59 | 60 | ACCUMULATE_GRADIENTS="" 61 | if [ "$accumulate_gradients" == "true" ] ; then 62 | ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps" 63 | fi 64 | 65 | CHECKPOINT="" 66 | if [ "$resume_training" == "true" ] ; then 67 | CHECKPOINT="--resume_from_checkpoint" 68 | fi 69 | 70 | ALL_REDUCE_POST_ACCUMULATION="" 71 | if [ "$allreduce_post_accumulation" == "true" ] ; then 72 | ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" 73 | fi 74 | ALL_REDUCE_POST_ACCUMULATION_FP16="" 75 | if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then 76 | ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" 77 | fi 78 | 79 | INIT_CHECKPOINT="" 80 | if [ "$init_checkpoint" != "None" ] ; then 81 | INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint" 82 | fi 83 | 84 | echo $DATA_DIR_PHASE 85 | INPUT_DIR=$DATA_DIR_PHASE 86 | CMD=" $CODEDIR/run_pretraining.py" 87 | CMD+=" --input_dir=$DATA_DIR_PHASE" 88 | CMD+=" --output_dir=$CHECKPOINTS_DIR" 89 | CMD+=" --config_file=$BERT_CONFIG" 90 | CMD+=" --bert_model=bert-base-uncased" 91 | CMD+=" --train_batch_size=$train_batch_size" 92 | CMD+=" --max_seq_length=128" 93 | CMD+=" --max_predictions_per_seq=20" 94 | CMD+=" --max_steps=$train_steps" 95 | CMD+=" --warmup_proportion=$warmup_proportion" 96 | CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" 97 | CMD+=" --learning_rate=$learning_rate" 98 | CMD+=" --seed=$seed" 99 | CMD+=" $PREC" 100 | CMD+=" $ACCUMULATE_GRADIENTS" 101 | CMD+=" $CHECKPOINT" 102 | CMD+=" $ALL_REDUCE_POST_ACCUMULATION" 103 | CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" 104 | CMD+=" $INIT_CHECKPOINT" 105 | CMD+=" --do_train" 106 | CMD+=" --json-summary ${CODEDIR}/dllogger.json " 107 | 108 | CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus --nnodes $num_nodes --node_rank=0 --master_addr=$master_node --master_port=$master_port $CMD" 109 | if [ "$create_logfile" = "true" ] ; then 110 | export GBS=$(expr $train_batch_size \* $num_gpus) 111 | printf -v TAG "pytorch_bert_pretraining_phase_%s_gbs%d" "$precision" $GBS 112 | LOGFILE=$LOGDIR/${job_name}_b${train_batch_size}_${precision}_${test_times}.log 113 | printf "Logs written to %s\n" "$LOGFILE" 114 | fi 115 | 116 | set -x 117 | if [ -z "$LOGFILE" ] ; then 118 | $CMD 119 | else 120 | ( 121 | $CMD 122 | ) |& tee $LOGFILE 123 | fi 124 | 125 | set +x 126 | 127 | # in order to test continuously 128 | rm -rf $CHECKPOINTS_DIR 129 | 130 | echo "finished training" 131 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/resnet50v1.5/scripts/run_multi_nodes.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | WORKSPACE=${1:-"/workspace/rn50"} 4 | DATA_DIR=${2:-"/data/image"} 5 | 6 | NUM_NODES=4 7 | MASTER_NODE=10.11.0.2 8 | MASTER_PORT=22333 9 | bz_per_device=${3:-256} 10 | TRAIN_STEPS=150 11 | PREC=amp 12 | 13 | i=1 14 | while [ $i -le 5 ] 15 | do 16 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NUM_NODES} 8 ${MASTER_NODE} ${MASTER_PORT} ${bz_per_device} ${TRAIN_STEPS} ${PREC} $i 17 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 18 | let i++ 19 | sleep 20 20 | done 21 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/resnet50v1.5/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | 2 | # !/bin/bash 3 | 4 | WORKSPACE=${1:-"/workspace/rn50"} 5 | DATA_DIR=${2:-"/data/image"} 6 | bz_per_device=${3:-256} 7 | NUM_NODES=1 8 | MASTER_NODE=127.0.0.1 9 | MASTER_PORT=29500 10 | ITER_TIMES=${4:-150} 11 | PREC=fp16 12 | 13 | i=1 14 | while [ $i -le 5 ] 15 | do 16 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NUM_NODES} 1 ${MASTER_NODE} ${MASTER_PORT} ${bz_per_device} ${ITER_TIMES} ${PREC} $i 17 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 18 | let i++ 19 | sleep 20 20 | done 21 | 22 | 23 | i=1 24 | while [ $i -le 5 ] 25 | do 26 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NUM_NODES} 4 ${MASTER_NODE} ${MASTER_PORT} ${bz_per_device} ${ITER_TIMES} ${PREC} $i 27 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 28 | let i++ 29 | sleep 20 30 | done 31 | 32 | i=1 33 | while [ $i -le 5 ] 34 | do 35 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NUM_NODES} 8 ${MASTER_NODE} ${MASTER_PORT} ${bz_per_device} ${ITER_TIMES} ${PREC} $i 36 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 37 | let i++ 38 | sleep 20 39 | done 40 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/resnet50v1.5/scripts/run_two_nodes.sh: -------------------------------------------------------------------------------- 1 | 2 | # !/bin/bash 3 | 4 | WORKSPACE=${1:-"/workspace/rn50"} 5 | DATA_DIR=${2:-"/data/image"} 6 | 7 | NUM_NODES=2 8 | MASTER_NODE=10.11.0.2 9 | MASTER_PORT=22333 10 | bz_per_device=256 11 | TRAIN_STEPS=150 12 | PREC=fp16 13 | 14 | i=1 15 | while [ $i -le 5 ] 16 | do 17 | NCCL_DEBUG=INFO bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NUM_NODES} 8 ${MASTER_NODE} ${MASTER_PORT} ${bz_per_device} ${TRAIN_STEPS} ${PREC} $i 18 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 19 | let i++ 20 | sleep 20 21 | done 22 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/PyTorch/resnet50v1.5/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | WORKSPACE=${1:-"/workspace/rn50"} 3 | DATA_DIR=${2:-"/data"} 4 | num_nodes=${3:-1} 5 | num_gpus=${4:-1} 6 | master_node=${5:-'127.0.0.1'} 7 | master_port=${6:-29500} 8 | bz_per_device=${7:-128} 9 | TRAIN_STEPS=${8:-120} 10 | precision=${9:-amp} 11 | TEST_TIMES=${10:-1} 12 | MODEL="resnet50" 13 | 14 | PREC="" 15 | if [ "$precision" = "amp" ] ; then 16 | PREC="--amp --dynamic-loss-scale" 17 | elif [ "$precision" = "fp16" ] ; then 18 | PREC="--amp --dynamic-loss-scale" 19 | elif [ "$precision" = "fp32" ] ; then 20 | PREC="" 21 | else 22 | echo "Unknown argument" 23 | exit -2 24 | fi 25 | 26 | total_bz=`expr ${bz_per_device} \* ${num_gpus}` 27 | LR=$(awk -v total_bz="$total_bz" 'BEGIN{print total_bz / 1000}') 28 | 29 | LOG_FOLDER=${precision}_ngc/pytorch/${num_nodes}n${num_gpus}g 30 | mkdir -p $LOG_FOLDER 31 | LOGFILE=${LOG_FOLDER}/r50_b${bz_per_device}_${precision}_$TEST_TIMES.log 32 | 33 | CMD="$WORKSPACE/main.py" 34 | CMD+=" --data-backend dali-cpu" 35 | CMD+=" --raport-file $WORKSPACE/raport.json" 36 | CMD+=" -j8 -p 1 --lr $LR" 37 | CMD+=" --optimizer-batch-size -1" 38 | CMD+=" --warmup 8 --arch $MODEL" 39 | CMD+=" -c fanin --label-smoothing 0.1" 40 | CMD+=" --lr-schedule cosine --mom 0.125" 41 | CMD+=" --wd 3.0517578125e-05" 42 | CMD+=" --workspace ${1:-./} -b ${bz_per_device}" 43 | CMD+=" --epochs 1 --prof $TRAIN_STEPS" 44 | CMD+=" --training-only --no-checkpoints" 45 | CMD+=" $PREC" 46 | 47 | CMD=" python $WORKSPACE/multiproc.py --nnodes ${num_nodes} --node_rank 0 --nproc_per_node ${num_gpus} --master_addr ${master_node} --master_port=${master_port} $CMD $DATA_DIR" 48 | 49 | set -x 50 | if [ -z "$LOGFILE" ] ; then 51 | $CMD 52 | else 53 | ( 54 | $CMD 55 | ) |& tee $LOGFILE 56 | fi 57 | set +x 58 | 59 | echo "Writting log to ${LOGFILE}" 60 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA DeepLearningExamples 性能评测复现 2 | 3 | 本目录提供了[NVIDIA DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples) 仓库的性能评测复现,目前支持了TensorFlow、MXNet、PyTorch的ResNet50 v1.5性能评测和TensorFlow、PyTorch的BERT-base的性能评测。 4 | 5 | 我们所有的测试都是在4台配置8卡V100-SXM2-16GB的服务器中进行,主要硬软件配置如下: 6 | 7 | ### 环境 8 | 9 | - Tesla V100-SXM2-16GB x 8 10 | - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family 11 | - Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz 12 | - Memory 384G 13 | - Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64) 14 | - CUDA Version: 10.2, Driver Version: 440.33.01 15 | - `nvidia-smi topo -m` 16 | 17 | ```shell 18 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 CPU Affinity 19 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS NODE 0-11,24-35 20 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS NODE 0-11,24-35 21 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PIX 0-11,24-35 22 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PIX 0-11,24-35 23 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS 12-23,36-47 24 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS 12-23,36-47 25 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS 12-23,36-47 26 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS 12-23,36-47 27 | mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X 28 | 29 | Legend: 30 | 31 | X = 
Self 32 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 33 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 34 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 35 | PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 36 | PIX = Connection traversing at most a single PCIe bridge 37 | NV# = Connection traversing a bonded set of # NVLinks 38 | 39 | ``` 40 | 41 | 软件环境使用的是[NGC 20.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)容器。单机测试结果与NVIDIA的官方公布结果相符,同时还增加了2机16卡、4机32卡的测试,用于比较NVIDIA优化后的各个框架的横向扩展性。 42 | 43 | 44 | 具体各框架的性能评测结果、复现方式等信息详见各框架目录。 45 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/Classification/ConvNets/resnet50v1.5/scripts/multi_node_train.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE=${1:-"/workspace/rn50v15_tf"} 2 | DATA_DIR=${2:-"/data"} 3 | GPUS_PER_NODE=${3:-8} 4 | NUM_STEP=${4:-120} 5 | BATCH_SIZE=${5:-128} 6 | DTYPE=${6:-"fp32"} 7 | NODES=${7:-$NODE1,$NODE2} 8 | NUM_TEST=${8:-1} 9 | OTHER=${@:9} 10 | 11 | node_num=$(echo $NODES | tr ',' '\n' | wc -l) 12 | gpu_num=`expr ${node_num} \* ${GPUS_PER_NODE}` 13 | echo "Nodes : ${NODES}" 14 | echo "Total use: ${gpu_num} gpu" 15 | 16 | LOG_FOLDER=../logs/ngc/tensorflow/resnet50/bz${BATCH_SIZE}/${node_num}n${GPUS_PER_NODE}g 17 | mkdir -p $LOG_FOLDER 18 | LOGFILE=${LOG_FOLDER}/r50_b${BATCH_SIZE}_${DTYPE}_$NUM_TEST.log 19 | 20 | if [[ ! -z "${BIND_TO_SOCKET}" ]]; then 21 | BIND_TO_SOCKET="--bind-to socket" 22 | fi 23 | 24 | if [[ ! -z "${USE_DALI}" ]]; then 25 | USE_DALI="--use_dali --data_idx_dir=${DATA_DIR}/dali_idx" 26 | fi 27 | 28 | if [[ ! 
-z "${USE_XLA}" ]]; then 29 | USE_XLA="--use_xla" 30 | fi 31 | 32 | CMD="" 33 | case $DTYPE in 34 | "fp32") CMD+="--precision=fp32";; 35 | "fp16") CMD+="--precision=fp16 --use_static_loss_scaling --loss_scale=128";; 36 | "amp") CMD+="--precision=fp32 --use_tf_amp --use_static_loss_scaling --loss_scale=128";; 37 | esac 38 | 39 | CMD="--arch=resnet50 --mode=train --iter_unit=batch --num_iter=${NUM_STEP} \ 40 | --batch_size=${BATCH_SIZE} --warmup_steps=0 --use_cosine_lr --label_smoothing 0.1 \ 41 | --lr_init=0.256 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \ 42 | ${CMD} --data_dir=${DATA_DIR}/tfrecords ${USE_DALI} ${USE_XLA} \ 43 | --results_dir=${LOG_FOLDER}/results --weight_init=fan_in ${OTHER} \ 44 | --display_every=1 --gpu_memory_fraction=0.98" 45 | 46 | if [[ ${gpu_num} -eq 1 ]]; then 47 | python3 main.py ${CMD} 48 | else 49 | mpirun --allow-run-as-root --bind-to socket -np ${gpu_num} -H $NODES \ 50 | -bind-to none -map-by slot -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \ 51 | -mca plm_rsh_args "-p 12345 -q -o StrictHostKeyChecking=no" \ 52 | -mca btl_tcp_if_include ib0 \ 53 | python3 main.py ${CMD} 2>&1 | tee ${LOGFILE} 54 | fi 55 | 56 | # horovodrun -p 12345 -np $gpu_num \ 57 | # -H $NODES \ 58 | # python3 main.py ${CMD} 2>&1 | tee ${LOGFILE} 59 | 60 | echo "Writting log to ${LOGFILE}" -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/Classification/ConvNets/resnet50v1.5/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | GPUS_PER_NODE=8 2 | NODE1=10.11.0.2:$GPUS_PER_NODE 3 | NODE2=10.11.0.3:$GPUS_PER_NODE 4 | NODE3=10.11.0.4:$GPUS_PER_NODE 5 | NODE4=10.11.0.5:$GPUS_PER_NODE 6 | 7 | WORKSPACE="/workspace/rn50v15_tf" 8 | DATA_DIR="/data" 9 | BATCH_SIZE=${1:-128} 10 | DTYPE=${2:-"fp32"} 11 | NUM_STEP=${3:-120} 12 | NODES=${4:-$NODE1,$NODE2} 13 | NUM_TEST=${5:-5} 14 | 15 | 16 | 17 | i=1 18 | while [ $i -le $NUM_TEST ] 19 | do 20 | USE_DALI=1 bash ${WORKSPACE}/resnet50v1.5/training/multi_node_train.sh ${WORKSPACE} ${DATA_DIR} \ 21 | $GPUS_PER_NODE $NUM_STEP $BATCH_SIZE $DTYPE $NODE1,$NODE2,$NODE3,$NODE4 $i 22 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 23 | let i++ 24 | sleep 30 25 | done -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/Classification/ConvNets/resnet50v1.5/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE="/workspace/rn50v15_tf" 2 | DATA_DIR="/data" 3 | BATCH_SIZE=${1:-128} 4 | DTYPE=${2:-"fp32"} 5 | NUM_STEP=${3:-120} 6 | NUM_TEST=${4:-5} 7 | 8 | 9 | i=1 10 | while [ $i -le $NUM_TEST ] 11 | do 12 | USE_DALI=1 bash ${WORKSPACE}/resnet50v1.5/training/single_node_train.sh ${WORKSPACE} ${DATA_DIR} 1 $NUM_STEP $BATCH_SIZE $DTYPE $i 13 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 14 | let i++ 15 | sleep 20 16 | done 17 | 18 | i=1 19 | while [ $i -le $NUM_TEST ] 20 | do 21 | USE_DALI=1 bash ${WORKSPACE}/resnet50v1.5/training/single_node_train.sh ${WORKSPACE} ${DATA_DIR} 4 $NUM_STEP $BATCH_SIZE $DTYPE $i 22 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 23 | let i++ 24 | sleep 20 25 | done 26 | 27 | i=1 28 | while [ $i -le $NUM_TEST ] 29 | do 30 | USE_DALI=1 bash 
${WORKSPACE}/resnet50v1.5/training/single_node_train.sh ${WORKSPACE} ${DATA_DIR} 8 $NUM_STEP $BATCH_SIZE $DTYPE $i 31 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 32 | let i++ 33 | sleep 20 34 | done -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/Classification/ConvNets/resnet50v1.5/scripts/run_two_node.sh: -------------------------------------------------------------------------------- 1 | GPUS_PER_NODE=8 2 | NODE1=10.11.0.2:$GPUS_PER_NODE 3 | NODE2=10.11.0.3:$GPUS_PER_NODE 4 | 5 | WORKSPACE="/workspace/rn50v15_tf" 6 | DATA_DIR="/data" 7 | BATCH_SIZE=${1:-128} 8 | DTYPE=${2:-"fp32"} 9 | NUM_STEP=${3:-120} 10 | NODES=${4:-$NODE1,$NODE2} 11 | NUM_TEST=${5:-5} 12 | 13 | 14 | 15 | i=1 16 | while [ $i -le $NUM_TEST ] 17 | do 18 | USE_DALI=1 bash ${WORKSPACE}/resnet50v1.5/training/multi_node_train.sh ${WORKSPACE} ${DATA_DIR} \ 19 | $GPUS_PER_NODE $NUM_STEP $BATCH_SIZE $DTYPE $NODE1,$NODE2 $i 20 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 21 | let i++ 22 | sleep 20 23 | done -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/Classification/ConvNets/resnet50v1.5/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE=${1:-"/workspace/rn50v15_tf"} 2 | DATA_DIR=${2:-"/data"} 3 | NUM_GPU=${3:-8} 4 | NUM_STEP=${4:-120} 5 | BATCH_SIZE=${5:-128} 6 | DTYPE=${6:-"fp32"} 7 | NUM_TEST=${7:-1} 8 | OTHER=${@:8} 9 | 10 | LOG_FOLDER=../logs/ngc/tensorflow/resnet50/bz${BATCH_SIZE}/1n${NUM_GPU}g 11 | mkdir -p $LOG_FOLDER 12 | LOGFILE=${LOG_FOLDER}/r50_b${BATCH_SIZE}_${DTYPE}_$NUM_TEST.log 13 | 14 | if [[ ! -z "${BIND_TO_SOCKET}" ]]; then 15 | BIND_TO_SOCKET="--bind-to socket" 16 | fi 17 | 18 | if [[ ! -z "${USE_DALI}" ]]; then 19 | USE_DALI="--use_dali --data_idx_dir=${DATA_DIR}/dali_idx" 20 | fi 21 | 22 | if [[ ! -z "${USE_XLA}" ]]; then 23 | USE_XLA="--use_xla" 24 | fi 25 | 26 | CMD="" 27 | case $DTYPE in 28 | "fp32") CMD+="--precision=fp32";; 29 | "fp16") CMD+="--precision=fp16 --use_static_loss_scaling --loss_scale=128";; 30 | "amp") CMD+="--precision=fp32 --use_tf_amp --use_static_loss_scaling --loss_scale=128";; 31 | esac 32 | 33 | CMD="--arch=resnet50 --mode=train --iter_unit=batch --num_iter=${NUM_STEP} \ 34 | --batch_size=${BATCH_SIZE} --warmup_steps=10000 --use_cosine_lr --label_smoothing 0.1 \ 35 | --lr_init=0.001 --lr_warmup_epochs=8 --momentum=0.875 --weight_decay=3.0517578125e-05 \ 36 | ${CMD} --data_dir=${DATA_DIR}/tfrecords ${USE_DALI} ${USE_XLA} \ 37 | --results_dir=${LOG_FOLDER}/results --weight_init=fan_in ${OTHER} \ 38 | --display_every=1" 39 | 40 | if [[ ${NUM_GPU} -eq 1 ]]; then 41 | python3 main.py ${CMD} 2>&1 | tee ${LOGFILE} 42 | else 43 | mpiexec --allow-run-as-root ${BIND_TO_SOCKET} -np ${NUM_GPU} python3 main.py ${CMD} 2>&1 | tee ${LOGFILE} 44 | fi 45 | 46 | 47 | echo "Writting log to ${LOGFILE}" 48 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/LanguageModeling/BERT/scripts/multi_node_run_pretraining_adam.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | rm -rf /results/* 16 | echo "Container nvidia build = " $NVIDIA_BUILD_ID 17 | 18 | DATA_DIR=${1:-"data"} 19 | GPUS_PER_NODE=${2:-8} 20 | train_batch_size=${3:-32} 21 | train_steps=${4:-120} 22 | bert_model="base" 23 | max_pred_per_seq=20 24 | seq_len=128 25 | precision=${5:-"fp32"} 26 | use_xla=${6:-"false"} 27 | NODES=${7:-$NODE1,$NODE2} 28 | TEST_NUM=${8:-1} 29 | num_accumulation_steps=1 30 | 31 | node_num=$(echo $NODES | tr ',' '\n' | wc -l) 32 | gpu_num=`expr ${node_num} \* ${GPUS_PER_NODE}` 33 | echo "Nodes : ${NODES}" 34 | echo "Total use: ${gpu_num} gpu" 35 | 36 | # DATA_DIR=data/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus 37 | 38 | if [ "$bert_model" = "large" ] ; then 39 | export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json 40 | else 41 | # export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json 42 | export BERT_CONFIG=data/bert_config.json 43 | fi 44 | 45 | PREC="" 46 | if [ "$precision" = "fp16" ] ; then 47 | PREC="--use_fp16" 48 | elif [ "$precision" = "fp32" ] ; then 49 | PREC="" 50 | elif [ "$precision" = "manual_fp16" ] ; then 51 | PREC="--manual_fp16" 52 | else 53 | echo "Unknown argument" 54 | exit -2 55 | fi 56 | 57 | if [ "$use_xla" = "true" ] ; then 58 | PREC="$PREC --use_xla" 59 | echo "XLA activated" 60 | fi 61 | 62 | export GBS=$(expr $train_batch_size \* $GPUS_PER_NODE \* $num_accumulation_steps) 63 | printf -v TAG "tf_bert_pretraining_adam_%s_%s_gbs%d" "$bert_model" "$precision" $GBS 64 | DATESTAMP=`date +'%y%m%d%H%M%S'` 65 | 66 | #Edit to save logs & checkpoints in a different directory 67 | RESULTS_DIR=${RESULTS_DIR:-/results/${TAG}_${DATESTAMP}} 68 | LOG_FOLDER=../logs//ngc/tensorflow/bert/bz${train_batch_size}/${node_num}n${GPUS_PER_NODE}g 69 | mkdir -p $LOG_FOLDER 70 | LOGFILE=${LOG_FOLDER}/bert_b${train_batch_size}_${precision}_$TEST_NUM.log 71 | mkdir -m 777 -p $RESULTS_DIR 72 | printf "Saving checkpoints to %s\n" "$RESULTS_DIR" 73 | printf "Logs written to %s\n" "$LOGFILE" 74 | 75 | INPUT_FILES="$DATA_DIR/tfrecord" 76 | EVAL_FILES="$DATA_DIR/tfrecord" 77 | 78 | horovod_str="--horovod" 79 | mpi="" 80 | 81 | 82 | CMD="$mpi python3 /workspace/bert/run_pretraining.py" 83 | CMD+=" --input_files_dir=$INPUT_FILES" 84 | CMD+=" --eval_files_dir=$EVAL_FILES" 85 | CMD+=" --output_dir=$RESULTS_DIR" 86 | CMD+=" --bert_config_file=$BERT_CONFIG" 87 | CMD+=" --do_train=True" 88 | CMD+=" --do_eval=False" 89 | CMD+=" --train_batch_size=$train_batch_size" 90 | CMD+=" --eval_batch_size=32" 91 | CMD+=" --max_seq_length=$seq_len" 92 | CMD+=" --max_predictions_per_seq=$max_pred_per_seq" 93 | CMD+=" --num_train_steps=$train_steps" 94 | CMD+=" --num_warmup_steps=10000" 95 | CMD+=" --num_accumulation_steps=$num_accumulation_steps" 96 | CMD+=" --save_checkpoints_steps=10000" 97 | CMD+=" 
--learning_rate=1e-4" 98 | CMD+=" --optimizer_type=adam" 99 | CMD+=" $horovod_str $PREC" 100 | CMD+=" --allreduce_post_accumulation=False" 101 | 102 | #Check if all necessary files are available before training 103 | for DIR_or_file in $DATA_DIR $BERT_CONFIG $RESULTS_DIR; do 104 | if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then 105 | echo "Error! $DIR_or_file directory missing. Please mount correctly" 106 | exit -1 107 | fi 108 | done 109 | 110 | # echo "-np ${gpu_num}, -H ${NODES}, CMD >>>>>>>>>>>>>>>>>>>> ${CMD}" 111 | horovodrun -p 10000 -np $gpu_num -H $NODES $CMD 2>&1 | tee ${LOGFILE} 112 | echo "Writting log to ${LOGFILE}" -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/LanguageModeling/BERT/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE="/workspace/bert" 2 | DATA_DIR="data" 3 | BATCH_SIZE=${1:-32} 4 | DTYPE=${2:-'fp32'} 5 | USE_XLA=${3:-'fasle'} 6 | NUM_TEST=${4:-5} 7 | NODE1=10.11.0.2:8 8 | NODE2=10.11.0.3:8 9 | NODE3=10.11.0.4:8 10 | NODE4=10.11.0.5:8 11 | NODES=${4:-$NODE1,$NODE2,$NODE3,$NODE4} 12 | 13 | 14 | i=1 15 | while [ $i -le $NUM_TEST ] 16 | do 17 | bash ${WORKSPACE}/scripts/multi_node_run_pretraining_adam.sh ${DATA_DIR} 8 ${BATCH_SIZE} 120 $DTYPE $USE_XLA $NODES $i 18 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 19 | let i++ 20 | sleep 20 21 | done 22 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/LanguageModeling/BERT/scripts/run_pretraining_adam.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
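# Usage sketch (arguments in the order parsed below; the "data" directory layout and the
# NGC TensorFlow BERT container are assumptions that must match your own setup):
#   bash scripts/run_pretraining_adam.sh data 8 32 120 fp16 false 1
# i.e. 120 BERT-base pretraining steps on 8 GPUs, batch size 32 per GPU, fp16, no XLA, test run 1.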
15 | rm -rf /results/* 16 | echo "Container nvidia build = " $NVIDIA_BUILD_ID 17 | 18 | DATA_DIR=${1:-"data"} 19 | num_gpus=${2:-8} 20 | train_batch_size=${3:-32} 21 | train_steps=${4:-120} 22 | bert_model="base" 23 | precision=${5:-"fp32"} 24 | use_xla=${6:-"false"} 25 | max_pred_per_seq=20 26 | seq_len=128 27 | TEST_NUM=${7:-1} 28 | num_accumulation_steps=1 29 | 30 | # DATA_DIR=data/tfrecord/lower_case_1_seq_len_${seq_len}_max_pred_${max_pred_per_seq}_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus 31 | 32 | if [ "$bert_model" = "large" ] ; then 33 | export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json 34 | else 35 | # export BERT_CONFIG=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12/bert_config.json 36 | export BERT_CONFIG=data/bert_config.json 37 | fi 38 | 39 | PREC="" 40 | if [ "$precision" = "fp16" ] ; then 41 | PREC="--use_fp16" 42 | elif [ "$precision" = "fp32" ] ; then 43 | PREC="" 44 | elif [ "$precision" = "manual_fp16" ] ; then 45 | PREC="--manual_fp16" 46 | else 47 | echo "Unknown argument" 48 | exit -2 49 | fi 50 | 51 | if [ "$use_xla" = "true" ] ; then 52 | PREC="$PREC --use_xla" 53 | echo "XLA activated" 54 | fi 55 | 56 | export GBS=$(expr $train_batch_size \* $num_gpus \* $num_accumulation_steps) 57 | printf -v TAG "tf_bert_pretraining_adam_%s_%s_gbs%d" "$bert_model" "$precision" $GBS 58 | DATESTAMP=`date +'%y%m%d%H%M%S'` 59 | 60 | #Edit to save logs & checkpoints in a different directory 61 | RESULTS_DIR=${RESULTS_DIR:-/results/${TAG}_${DATESTAMP}} 62 | LOG_FOLDER=../logs/ngc/tensorflow/bert/bz${train_batch_size}/1n${num_gpus}g 63 | mkdir -p $LOG_FOLDER 64 | LOGFILE=${LOG_FOLDER}/bert_b${train_batch_size}_${precision}_$TEST_NUM.log 65 | mkdir -m 777 -p $RESULTS_DIR 66 | printf "Saving checkpoints to %s\n" "$RESULTS_DIR" 67 | printf "Logs written to %s\n" "$LOGFILE" 68 | 69 | INPUT_FILES="$DATA_DIR/tfrecord" 70 | EVAL_FILES="$DATA_DIR/tfrecord" 71 | 72 | horovod_str="" 73 | mpi="" 74 | if [ $num_gpus -gt 1 ] ; then 75 | mpi="mpiexec --allow-run-as-root -np $num_gpus --bind-to socket" 76 | horovod_str="--horovod" 77 | fi 78 | 79 | CMD="$mpi python3 /workspace/bert/run_pretraining.py" 80 | CMD+=" --input_files_dir=$INPUT_FILES" 81 | CMD+=" --eval_files_dir=$EVAL_FILES" 82 | CMD+=" --output_dir=$RESULTS_DIR" 83 | CMD+=" --bert_config_file=$BERT_CONFIG" 84 | CMD+=" --do_train=True" 85 | CMD+=" --do_eval=False" 86 | CMD+=" --train_batch_size=$train_batch_size" 87 | CMD+=" --eval_batch_size=16" 88 | CMD+=" --max_seq_length=$seq_len" 89 | CMD+=" --max_predictions_per_seq=$max_pred_per_seq" 90 | CMD+=" --num_train_steps=$train_steps" 91 | CMD+=" --num_warmup_steps=10000" 92 | CMD+=" --num_accumulation_steps=$num_accumulation_steps" 93 | CMD+=" --save_checkpoints_steps=10000" 94 | CMD+=" --learning_rate=1e-4" 95 | CMD+=" --optimizer_type=adam" 96 | CMD+=" $horovod_str $PREC" 97 | CMD+=" --allreduce_post_accumulation=False" 98 | 99 | #Check if all necessary files are available before training 100 | for DIR_or_file in $DATA_DIR $BERT_CONFIG $RESULTS_DIR; do 101 | if [ ! -d "$DIR_or_file" ] && [ ! -f "$DIR_or_file" ]; then 102 | echo "Error! $DIR_or_file directory missing. 
Please mount correctly" 103 | exit -1 104 | fi 105 | done 106 | 107 | 108 | $CMD 2>&1 | tee ${LOGFILE} 109 | 110 | echo "Writting log to ${LOGFILE}" 111 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/LanguageModeling/BERT/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE="/workspace/bert" 2 | DATA_DIR="data" 3 | BATCH_SIZE=${1:-32} 4 | DTYPE=${2:-'fp32'} 5 | USE_XLA=${3:-'false'} 6 | NUM_TEST=${4:-5} 7 | 8 | i=1 9 | while [ $i -le $NUM_TEST ] 10 | do 11 | bash ${WORKSPACE}/scripts/run_pretraining_adam.sh ${DATA_DIR} 1 ${BATCH_SIZE} 120 $DTYPE $USE_XLA $i 12 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 13 | let i++ 14 | sleep 20 15 | done 16 | 17 | 18 | i=1 19 | while [ $i -le $NUM_TEST ] 20 | do 21 | bash ${WORKSPACE}/scripts/run_pretraining_adam.sh ${DATA_DIR} 4 ${BATCH_SIZE} 120 $DTYPE $USE_XLA $i 22 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 23 | let i++ 24 | sleep 20 25 | done 26 | 27 | 28 | i=1 29 | while [ $i -le $NUM_TEST ] 30 | do 31 | bash ${WORKSPACE}/scripts/run_pretraining_adam.sh ${DATA_DIR} 8 ${BATCH_SIZE} 120 $DTYPE $USE_XLA $i 32 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 33 | let i++ 34 | sleep 20 35 | done 36 | -------------------------------------------------------------------------------- /NVIDIADeepLearningExamples/TensorFlow/LanguageModeling/BERT/scripts/run_two_node.sh: -------------------------------------------------------------------------------- 1 | WORKSPACE="/workspace/bert" 2 | DATA_DIR="data" 3 | BATCH_SIZE=${1:-32} 4 | DTYPE=${2:-"fp32"} 5 | USE_XLA=${3:-"false"} 6 | NUM_TEST=${4:-5} 7 | 8 | GPUS_PER_NODE=8 9 | NODE1=10.11.0.3:$GPUS_PER_NODE 10 | NODE2=10.11.0.4:$GPUS_PER_NODE 11 | NODES=${4:-$NODE1,$NODE2} 12 | 13 | i=1 14 | while [ $i -le $NUM_TEST ] 15 | do 16 | bash ${WORKSPACE}/scripts/multi_node_run_pretraining_adam.sh ${DATA_DIR} 8 ${BATCH_SIZE} 120 $DTYPE $USE_XLA $NODES $i 17 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 18 | let i++ 19 | sleep 20 20 | done 21 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/README.md: -------------------------------------------------------------------------------- 1 | # OneFlow Benchmark Test Scripts 2 | 3 | 本文介绍如何使用脚本批量测试ResNet50 V1.5: 4 | 5 | 1. `rn50_train.sh`,可以本地单机训练resnet 50,也可以通过ssh发送到远端节点运行; 6 | 2. `launch_all.sh`,发送脚本到指定的机器节点并运行; 7 | 3. `cp_logs.sh`,拷贝日志到指定目录; 8 | 4. `schedule_launch.sh`,批量顺序执行多组`launch_all.sh`; 9 | 5. `extract_cnn_result.py`,从cnn训练日志中提取结果,并打印成markdown表格。 10 | 6. `reports`,[测试报告](reports)目录 11 | 12 | 通常这几个文件只需要修改很少的配置就能正常运行,下面对各个脚本进行详细介绍。 13 | 14 | ## 本地训练启动脚本:`rn50_train.sh` 15 | 16 | 这个脚本用于本地运行OneFlow的训练,可以独立使用,调用前需要手动修改基本配置,调用时需要传入3个参数。 17 | 18 | ### 手工配置选项 19 | 20 | 有三处需要修改的地方: 21 | 22 | ``` 23 | BENCH_ROOT=cnns 24 | DATA_ROOT=/path/to/imagenet_ofrecord 25 | DATA_PART_NUM=32 26 | ``` 27 | 28 | 1. `BENCH_ROOT`: 模型脚本所在的目录,对应OneFlow-Benchmark项目中的`Classification/cnns`目录 29 | 2. `DATA_ROOT`: 测试所用数据集路径 30 | 3. `DATA_PART_NUM`: 测试所用数据集文件数量 31 | 32 | ### 脚本参数 33 | 34 | 调用时需要传入4个参数: 35 | 36 | ``` 37 | NUM_NODES=$1 38 | GPU_NUM_PER_NODE=$2 39 | BSZ_PER_DEVICE=$3 40 | NODE_IPS=$4 41 | ``` 42 | 43 | 1. 
`NUM_NODES`: 测试训练用的机器节点数量; 44 | 2. `GPU_NUM_PER_NODE`: 每台机器节点中GPU设备的数量; 45 | 3. `BSZ_PER_DEVICE`: 训练时每个批次每个GPU设备对应的图片/句子数量; 46 | 4. `NODE_IPS`: 各个节点的IP列表,可选,多机训练须配置(如果NUM_NODES=1,则NODE_IPS被忽略)。 47 | 48 | 注:这两个脚本只能够在本地运行OneFlow,如果多机训练,可以在各台机器上分别启动该脚本,OneFlow会自动根据配置的机器节点信息进行通信连接,完成训练。另外一种方式就是使用`launch_all.sh`,自动把脚本发送到各个机器节点进行训练。 49 | 50 | 本地单机8卡训练ResNet50,执行命令: 51 | 52 | ``` 53 | ./rn50_train.sh 1 8 128 54 | ``` 55 | 56 | ## 远程训练启动脚本:`launch_all.sh` 57 | 58 | `launch_all.sh`负责发送本地训练启动脚本(如单机训练脚本rn50_train.sh)和`BENCH_ROOT`路径下的模型脚本(如Classification/cnns/of_cnn_train_val.py等)到各台机器节点,并通过ssh的方式在各个机器节点运行本地训练启动脚本。启动时,需要传入5个参数: 59 | 60 | ``` 61 | LOCAL_RUN=$1 62 | BENCH_ROOT=$2 63 | NUM_NODES=$3 64 | GPU_NUM_PER_NODE=$4 65 | BSZ=$5 66 | ``` 67 | 68 | 1. `LOCAL_RUN`:待发送的本地训练启动脚本; 69 | 2. `BENCH_ROOT`: 待发送的OneFlow模型脚本所在目录; 70 | 3. `NUM_NODES`: 测试训练用的机器节点数量; 71 | 4. `GPU_NUM_PER_NODE`: 每台机器节点中GPU设备的数量; 72 | 5. `BSZ_PER_DEVICE`: 训练时每个批次每个GPU设备对应的图片/句子数量; 73 | 74 | 75 | 发送相关脚本到单机,使用8卡训练ResNet50: 76 | 77 | ``` 78 | ./launch_all.sh rn50_train.sh cnns 1 8 128 79 | ``` 80 | 81 | 发送相关脚本到4机,每机都使用8卡(共4机32卡)训练ResNet50: 82 | 83 | ``` 84 | ./launch_all.sh rn50_train.sh cnns 4 8 128 85 | ``` 86 | 87 | ## `cp_logs.sh` 88 | 89 | 根据下列参数拷贝日志到指定路径并重命名: 90 | 91 | ``` 92 | NUM_NODES=$1 93 | GPU_NUM_PER_NODE=$2 94 | BSZ=$3 95 | REPEAT_ID=$4 96 | ``` 97 | 98 | `cp_logs.sh`负责从本地(主节点)拷贝日志到指定路径下,并按照`logs/oneflow/${NUM_NODES}n${GPU_NUM_PER_NODE}g/${model_name}_b${BSZ}_fp32_${REPEAT_ID}.log`的格式保存。 99 | 100 | ## `schedule_launch.sh` 101 | 102 | 本次测评会测试多组batch_size、单机多机配置,每组实验重复7次。 103 | 104 | 根据测试次数,批量自动运行`launch_all.sh`和`cp_logs.sh`,完成训练和备份日志。 105 | 需要两个参数: 106 | 107 | 1. `LOCAL_RUN`:待发送的本地训练启动脚本; 108 | 2. `BENCH_ROOT`: 待发送的OneFlow模型脚本所在目录。 109 | 110 | `schedule_launch.sh`脚本会根据实验次数,循环测试不同batch size,4组节点和GPU设备数量,每组实验重复7次。实验结束后,`logs/oneflow`路径下会保存实验日志。 111 | 112 | ## `extract_cnn_result.py` 113 | 114 | 运行方式如下: 115 | 116 | ``` 117 | python3 extract_cnn_result.py 118 | ``` 119 | 120 | 结果为markdown格式,方便直接引用,输出如下: 121 | 122 | ``` 123 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | 124 | | -------- | -------- | -------- | -------- | 125 | | 4 | 8 | 96 | 4449.85 | 126 | | 4 | 8 | 96 | 4456.82 | 127 | | 4 | 8 | 96 | 4460.17 | 128 | | 4 | 8 | 96 | 4454.99 | 129 | | 4 | 8 | 96 | 4455.97 | 130 | | 4 | 8 | 96 | 4451.41 | 131 | | 4 | 8 | 96 | 4458.06 | 132 | 133 | 134 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 135 | | -------- | -------- | -------- | -------- | -------- | 136 | | 1 | 1 | 96 | 149.84 | 1.00 | 137 | | 1 | 8 | 96 | 1158.51 | 7.73 | 138 | | 2 | 8 | 96 | 2257.71 | 15.07 | 139 | | 4 | 8 | 96 | 4455.97 | 29.74 | 140 | ``` 141 | 142 | ### 输入参数 143 | 144 | - `benchmark_log_dir`: 日志存放的目录,脚本中会自动遍历所有`*.log`文件进行信息提取; 145 | - `start_iter` `end_iter`: 待提取的起始和终止步数,脚本中会利用这两个步数的时间戳计算吞吐率。 146 | - `print_mode`: 打印输出格式设置,缺省`markdown` 147 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/extract_cnn_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from extract_util import extract_result 3 | 4 | 5 | parser = argparse.ArgumentParser(description="flags for cnn benchmark") 6 | parser.add_argument( 7 | "--benchmark_log_dir", type=str, default="./logs/oneflow", 8 | required=False) 9 | parser.add_argument("--start_iter", type=int, default=20) 10 | parser.add_argument("--end_iter", type=int, default=120) 11 | parser.add_argument("--print_mode", 
type=str, default='markdown') 12 | args = parser.parse_args() 13 | 14 | 15 | def extract_info_from_file(log_file): 16 | ''' 17 | model = resnet50 18 | batch_size_per_device = 128 19 | gpu_num_per_node = 8 20 | num_nodes = 2 21 | train: epoch 0, iter 20, loss: 7.087004, top_1: 0.000000, top_k: 0.000000, samples/s: 3988.891 1597933942.9863544 22 | train: epoch 0, iter 120, loss: 1.050499, top_1: 1.000000, top_k: 1.000000, samples/s: 5917.583 1597933977.6064055 23 | ''' 24 | # extract info from file name 25 | result_dict = {} 26 | with open(log_file, 'r') as f: 27 | for line in f.readlines(): 28 | ss = line.split(' ') 29 | if ss[0] in ['model', 'batch_size_per_device', 'gpu_num_per_node', 'num_nodes']: 30 | result_dict[ss[0]] = ss[2].strip() 31 | elif ss[0] == 'train:': 32 | it = int(ss[4][:-1]) 33 | result_dict[it] = ss[-1].strip() 34 | 35 | return result_dict 36 | 37 | 38 | if __name__ == "__main__": 39 | extract_result(args, extract_info_from_file) 40 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/extract_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from statistics import median 4 | 5 | 6 | def compute_throughput(result_dict, args): 7 | assert args.start_iter in result_dict and args.end_iter in result_dict 8 | duration = float(result_dict[args.end_iter]) - float(result_dict[args.start_iter]) 9 | 10 | num_nodes = int(result_dict['num_nodes']) 11 | gpu_num_per_node = int(result_dict['gpu_num_per_node']) 12 | batch_size_per_device = int(result_dict['batch_size_per_device']) 13 | 14 | total_batch_size = batch_size_per_device * gpu_num_per_node * num_nodes 15 | 16 | num_examples = total_batch_size * (args.end_iter - args.start_iter) 17 | throughput = num_examples / duration 18 | 19 | return num_nodes, gpu_num_per_node, batch_size_per_device, throughput 20 | 21 | 22 | def get_mode_print(mode): 23 | def mode_print(lst): 24 | if mode == 'markdown': 25 | print('|', ' | '.join(('{:.2f}' if type(v) is float else '{}').format(v) for v in lst), '|') 26 | else: 27 | print(','.join(('{:.2f}' if type(v) is float else '{}').format(v) for v in lst)) 28 | return mode_print 29 | 30 | 31 | def extract_result(args, extract_func): 32 | mode_print = get_mode_print(args.print_mode) 33 | logs_list = glob.glob(os.path.join(args.benchmark_log_dir, "*/*.log")) 34 | logs_list = sorted(logs_list) 35 | 36 | final_result_dict = {} 37 | print("## All Results") 38 | mode_print(['num_nodes', 'gpu_num_per_node', 'batch_size_per_device', 'throughput']) 39 | if args.print_mode == 'markdown': 40 | mode_print(['--------' for _ in range(4)]) 41 | for l in logs_list: 42 | result_dict = extract_func(l) 43 | num_nodes, gpu_num_per_node, batch_size_per_device, throughput = compute_throughput(result_dict, args) 44 | mode_print([num_nodes, gpu_num_per_node, batch_size_per_device, throughput]) 45 | key = (num_nodes, gpu_num_per_node, batch_size_per_device) 46 | if key in final_result_dict: 47 | final_result_dict[key].append(throughput) 48 | else: 49 | final_result_dict[key] = [throughput] 50 | print() 51 | 52 | # calculate n1g1 reference 53 | n1g1_throughput = {} 54 | for k, v in final_result_dict.items(): 55 | if k[0] == 1 and k[1] == 1: 56 | n1g1_throughput[k] = median(v) 57 | 58 | # calculate median throughput and speedup 59 | final_result_list = [] 60 | for k, v in final_result_dict.items(): 61 | res = list(k) 62 | res.append(median(v)) 63 | n1g1 = (1, 1, k[2]) 64 | speedup = 
median(v) / n1g1_throughput[n1g1] if n1g1 in n1g1_throughput else 0.0 65 | res.append(speedup) 66 | final_result_list.append(res) 67 | 68 | # sort final_result_list 69 | final_result_list = sorted(final_result_list, key=lambda x: (-x[2], x[0], x[1])) 70 | 71 | # print results 72 | print("## Filtered Result `median value`") 73 | mode_print(['num_nodes', 'gpu_num_per_node', 'batch_size_per_device', 'throughput', 'speedup']) 74 | if args.print_mode == 'markdown': 75 | mode_print(['--------' for _ in range(5)]) 76 | for res in final_result_list: 77 | mode_print(res) 78 | 79 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/reports/README.md: -------------------------------------------------------------------------------- 1 | # OneFlow ConvNets Benchmark Test Report 2 | This folder contains OneFlow ConvNets Benchmark test reports. 3 | 4 | ## Changelog 5 | Note: latest on the top 6 | 7 | ## Data 8 | Please click [here](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/master/Classification/cnns/tools/README.md) 9 | 10 | ### OneFlow v0.3.1 11 | - ResNet50-v1.5 FP16 with dynamic loss scale test [resnet50_oneflow_v0.3.1_report_1202.md](resnet50_oneflow_v0.3.1_report_1202.md) 12 | ### OneFlow v0.2.0 13 | - ResNet50-v1.5 without XLA test [resnet50_oneflow_v0.2_report_1009.md](resnet50_oneflow_v0.2_report_1009.md) 14 | ### Aug 21st 2020 15 | - ResNet50-v1.5 fp32 without XLA test [report](rn50_fp32_report_0821.md) 16 | 17 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/reports/resnet50_oneflow_v0.2_report_1009.md: -------------------------------------------------------------------------------- 1 | # OneFlow ResNet50-V1.5 Benchmark Test Report 2 | 3 | 本报告总结了OneFlow v0.2.0 下的ResNet50-V1.5 评测结果。 4 | 5 | ## Test Environment 6 | 7 | 所有的测试都是在4台配置了8张 V100-SXM2-16GB GPU的服务器中进行,主要硬软件配置如下: 8 | 9 | - Tesla V100-SXM2-16GB x 8 10 | - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family 11 | - Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz 12 | - Memory 384G 13 | - Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64) 14 | - CUDA Version: 10.2, Driver Version: 440.33.01 15 | - OneFlow: v0.2.0 16 | - OneFlow-Benchmark: master@8a78044 17 | - `nvidia-smi topo -m` 18 | 19 | ``` 20 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 CPU Affinity 21 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS NODE 0-11,24-35 22 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS NODE 0-11,24-35 23 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PIX 0-11,24-35 24 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PIX 0-11,24-35 25 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS 12-23,36-47 26 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS 12-23,36-47 27 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS 12-23,36-47 28 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS 12-23,36-47 29 | mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X 30 | 31 | Legend: 32 | 33 | X = Self 34 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 35 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 36 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 37 | PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 38 | PIX = Connection traversing at most a single PCIe bridge 39 | NV# = Connection traversing a bonded set of # NVLinks 40 | 41 | ``` 42 | 43 | ## Test Descriptions 44 | 45 | - OneFlow版本: 
[v0.2.0](https://github.com/Oneflow-Inc/oneflow/tree/v0.2.0) 46 | - OneFlow Benchmark仓库版本: [v0.2.0](https://github.com/Oneflow-Inc/OneFlow-Benchmark/tree/v0.2.0) 47 | - XLA: 未采用 48 | - 测试共有四组,分别使用单机单卡、单机8卡、2机16卡、4机32卡进行测试,每组测试7次,选取这7次数据中的中位数作为最后结果。 49 | 50 | ## Finial Results 51 | 52 | - ### FP16 53 | 54 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 55 | |-----------|------------------|-----------------------|------------|---------| 56 | | 1 | 1 | 256 | 1472.72 | 1.00 | 57 | | 1 | 8 | 256 | 10629.32 | 7.22 | 58 | | 2 | 8 | 256 | 17920.40 | 12.17 | 59 | | 4 | 8 | 256 | 33141.02 | 22.50 | 60 | 61 | 全部日志可以点击[resnet50_fp16_256_logs.tar](http://oneflow-public.oss-cn-beijing.aliyuncs.com/oneflow_test_log/oneflow_0.2/DLPerf/resnet50_fp16_256_logs.tar)获取。 62 | 63 | - ### FP32 64 | 65 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 66 | |-----------|------------------|-----------------------|------------|---------| 67 | | 1 | 1 | 144 | 394.87 | 1.00 | 68 | | 2 | 8 | 144 | 6254.94 | 15.84 | 69 | | 4 | 8 | 144 | 12407.59 | 31.42 | 70 | | 1 | 1 | 128 | 397.64 | 1.00 | 71 | | 1 | 8 | 128 | 3130.34 | 7.87 | 72 | | 2 | 8 | 128 | 6260.30 | 15.74 | 73 | | 4 | 8 | 128 | 12411.97 | 31.21 | 74 | | 1 | 1 | 96 | 394.62 | 1.00 | 75 | | 1 | 8 | 96 | 3095.36 | 7.84 | 76 | | 2 | 8 | 96 | 6141.07 | 15.56 | 77 | | 4 | 8 | 96 | 12162.41 | 30.82 | 78 | 79 | 全部日志可以点击[resnet50_fp32_96_128_144_logs.tar](http://oneflow-public.oss-cn-beijing.aliyuncs.com/oneflow_test_log/oneflow_0.2/DLPerf/resnet50_fp32_96_128_144_logs.tar)获取。 80 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/reports/resnet50_oneflow_v0.3.1_report_1202.md: -------------------------------------------------------------------------------- 1 | # OneFlow ResNet50-V1.5 Benchmark Test Report 2 | 3 | 本报告总结了OneFlow v0.3.1 的ResNet50-V1.5 混合精度情况下dynamic loss scale的评测结果。 4 | 5 | ## Test Environment 6 | 7 | 所有的测试都是在4台配置了8张 V100-SXM2-16GB GPU的服务器中进行,主要硬软件配置如下: 8 | 9 | - Tesla V100-SXM2-16GB x 8 10 | - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family 11 | - Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz 12 | - Memory 384G 13 | - Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64) 14 | - CUDA Version: 10.2, Driver Version: 440.33.01 15 | - OneFlow: v0.3.1@f4bf35f7a 16 | - OneFlow-Benchmark: v0.3.0@854ddd06b 17 | - `nvidia-smi topo -m` 18 | 19 | ``` 20 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 CPU Affinity 21 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS NODE 0-11,24-35 22 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS NODE 0-11,24-35 23 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PIX 0-11,24-35 24 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PIX 0-11,24-35 25 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS 12-23,36-47 26 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS 12-23,36-47 27 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS 12-23,36-47 28 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS 12-23,36-47 29 | mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X 30 | 31 | Legend: 32 | 33 | X = Self 34 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 35 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 36 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 37 | PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 38 | PIX = Connection traversing at most a single PCIe bridge 39 | NV# = 
Connection traversing a bonded set of # NVLinks 40 | 41 | ``` 42 | 43 | ## Test Descriptions 44 | 45 | - OneFlow版本: [v0.3.1@f4bf35f7a](https://github.com/Oneflow-Inc/oneflow/tree/v0.3.1) 46 | - OneFlow Benchmark仓库版本: [v0.3.0@854ddd06b](https://github.com/Oneflow-Inc/OneFlow-Benchmark/tree/v0.3.0) 47 | - Dynamic Loss Scale: 开启 48 | - XLA: 未采用 49 | - 测试共有四组,分别使用单机单卡、单机8卡、2机16卡、4机32卡进行测试,每组测试7次,选取这7次数据中的中位数作为最后结果。 50 | 51 | ## Finial Results 52 | 53 | - ### FP16 54 | 55 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 56 | |-----------|------------------|-----------------------|------------|---------| 57 | | 1 | 1 | 256 | 1443.55 | 1.00 | 58 | | 1 | 8 | 256 | 10274.30 | 7.12 | 59 | | 2 | 8 | 256 | 17440.76 | 12.08 | 60 | | 4 | 8 | 256 | 31958.78 | 22.14 | 61 | 62 | 全部日志可以点击[rn50_dls_fp16_256_logs.zip](https://oneflow-public.oss-cn-beijing.aliyuncs.com/oneflow_test_log/oneflow_0.3.1/rn50_dls_fp16_256_logs.zip)获取。 63 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/scripts/cp_logs.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=$1 2 | GPU_NUM_PER_NODE=$2 3 | BSZ=$3 4 | REPEAT_ID=$4 5 | 6 | log_root=logs/oneflow 7 | log_dir=$log_root/${NUM_NODES}n${GPU_NUM_PER_NODE}g 8 | 9 | log_file=bert_base_b${BSZ}_fp32_${REPEAT_ID}.log 10 | summary_file=bert_base_b${BSZ}_fp32_${REPEAT_ID}.csv 11 | 12 | [ ! -d "${log_dir}" ] && mkdir -p ${log_dir} 13 | 14 | cp ~/oneflow_temp/oneflow.log ${log_dir}/${log_file} 15 | cp ~/oneflow_temp/log/summary.csv ${log_dir}/${summary_file} 16 | 17 | # cp oneflow.INFO to log_dir 18 | #[ ! -d "${log_dir}/oneflow.INFO" ] && cp ~/oneflow_temp/log/VS002/oneflow.INFO ${log_dir}/oneflow.INFO 19 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/scripts/launch_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 4 | LOCAL_RUN=$1 5 | BENCH_ROOT=$2 6 | NUM_NODES=$3 7 | GPU_NUM_PER_NODE=$4 8 | BSZ=$5 9 | 10 | #0 prepare the host list ips for training 11 | declare -a host_list=("10.11.0.2" "10.11.0.3" "10.11.0.4" "10.11.0.5") 12 | 13 | if [ $NUM_NODES -gt ${#host_list[@]} ] 14 | then 15 | echo num_nodes should be less than or equal to length of host_list. 
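  # If you land here, either pass a smaller NUM_NODES or extend the hard-coded host_list
  # above (10.11.0.2-10.11.0.5) to cover your own cluster.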
16 | exit 17 | fi 18 | 19 | hosts=("${host_list[@]:0:${NUM_NODES}}") 20 | echo "Working on hosts:${hosts[@]}" 21 | 22 | ips=${hosts[0]} 23 | for host in "${hosts[@]:1}" 24 | do 25 | ips+=",${host}" 26 | done 27 | 28 | #1 prepare oneflow_temp folder on each host 29 | for host in "${hosts[@]}" 30 | do 31 | ssh $USER@$host "mkdir -p ~/oneflow_temp" 32 | done 33 | 34 | #2 copy files to slave hosts and start work with nohup 35 | for host in "${hosts[@]:1}" 36 | do 37 | echo "start training on ${host}" 38 | ssh $USER@$host 'rm -rf ~/oneflow_temp/*' 39 | scp -r $BENCH_ROOT ./$LOCAL_RUN $USER@$host:~/oneflow_temp 40 | ssh $USER@$host "cd ~/oneflow_temp; nohup ./$LOCAL_RUN $NUM_NODES $GPU_NUM_PER_NODE $BSZ $ips 1>oneflow.log 2>&1 oneflow.log" 49 | 50 | echo "done" 51 | 52 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/scripts/rn50_train.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=$1 2 | GPU_NUM_PER_NODE=$2 3 | BSZ_PER_DEVICE=$3 4 | 5 | if [ -n "$4" ]; then 6 | NODE_IPS=$4 7 | else 8 | NODE_IPS='10.11.0.2','10.11.0.3','10.11.0.4','10.11.0.5' 9 | fi 10 | 11 | BENCH_ROOT=cnns 12 | DATA_ROOT=/datasets/ImageNet/ofrecord 13 | DATA_PART_NUM=256 14 | 15 | rm -rf ./log 16 | mkdir ./log 17 | 18 | NUM_ITERS=120 19 | NUM_EXAMPLES=$(($NUM_NODES * $GPU_NUM_PER_NODE * $BSZ_PER_DEVICE * $NUM_ITERS)) 20 | 21 | export PYTHONUNBUFFERED=1 22 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED 23 | export NCCL_LAUNCH_MODE=PARALLEL 24 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE 25 | 26 | python3 ./$BENCH_ROOT/of_cnn_train_val.py \ 27 | --num_examples=$NUM_EXAMPLES \ 28 | --train_data_dir=$DATA_ROOT/train \ 29 | --train_data_part_num=$DATA_PART_NUM \ 30 | --num_nodes=$NUM_NODES \ 31 | --gpu_num_per_node=$GPU_NUM_PER_NODE \ 32 | --optimizer="sgd" \ 33 | --momentum=0.875 \ 34 | --label_smoothing=0.1 \ 35 | --learning_rate=0.001 \ 36 | --loss_print_every_n_iter=20 \ 37 | --batch_size_per_device=$BSZ_PER_DEVICE \ 38 | --val_batch_size_per_device=125 \ 39 | --num_epoch=1 \ 40 | --use_fp16 \ 41 | --channel_last=True \ 42 | --pad_output \ 43 | --fuse_bn_relu=True \ 44 | --fuse_bn_add_relu=True \ 45 | --nccl_fusion_threshold_mb=16 \ 46 | --nccl_fusion_max_ops=24 \ 47 | --gpu_image_decoder=True \ 48 | --log_dir=./log \ 49 | --node_ips=$NODE_IPS \ 50 | --model="resnet50" 51 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/scripts/schedule_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | LOCAL_RUN=$1 4 | BENCH_ROOT=$2 5 | 6 | REPEAT_TIMES=7 7 | 8 | declare -a num_nodes_list=(1 1 2 4) 9 | declare -a num_gpus_list=(1 8 8 8) 10 | len=${#num_nodes_list[@]} 11 | for bsz in 96 64 32 24 12 | do 13 | for (( i=0; i<$len; i++ )) 14 | do 15 | num_nodes=${num_nodes_list[$i]} 16 | num_gpus=${num_gpus_list[$i]} 17 | 18 | for (( j=0; j<$REPEAT_TIMES; j++ )) 19 | do 20 | echo $num_nodes $num_gpus $j $bsz 21 | ./launch_all.sh $LOCAL_RUN $BENCH_ROOT $num_nodes $num_gpus $bsz 22 | ./cp_logs.sh $num_nodes $num_gpus $bsz $j 23 | done 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /OneFlow/Classification/ConvNets/resnet50v1.5/scripts/stop_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 4 | ############################################## 5 | #0 prepare the host 
list for training 6 | #comment unused hosts with `#` 7 | #or use first arg to limit the hosts number 8 | #e.g.: `./train.sh 4` will use first 4 hosts. 9 | #declare -a host_list=( 10 | # #"10.11.1.1" 11 | # "10.11.1.2" 12 | # "10.11.1.3" 13 | # "10.11.1.4" 14 | # "10.11.1.5" 15 | # ) 16 | declare -a host_list=(ln1 ln2 ln3 ln4) 17 | 18 | if [ -n "$1" ] 19 | then 20 | host_num=$1 21 | else 22 | host_num=${#host_list[@]} 23 | fi 24 | 25 | 26 | if [ ${host_num} -gt ${#host_list[@]} ] 27 | then 28 | host_num=${#host_list[@]} 29 | fi 30 | 31 | hosts=("${host_list[@]:0:${host_num}}") 32 | echo "plan to pkill python3 on hosts:${hosts[@]}" 33 | 34 | ############################################## 35 | #2 copy files to each host and start work 36 | for host in "${hosts[@]}" 37 | do 38 | echo "pkill python3 on ${host}" 39 | ssh $USER@$host 'pkill python3' 40 | done 41 | 42 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/docker/build.sh: -------------------------------------------------------------------------------- 1 | docker build \ 2 | -f ubuntu.dockerfile \ 3 | -t oneflow:WDL . 4 | 5 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/docker/launch.sh: -------------------------------------------------------------------------------- 1 | ONEFLOW_BENCHMARK_ROOT=/path/to/OneFlow-Benchmark 2 | DLPERF_WDL_SCRIPTS_ROOT=/path/to/DLPerf/OneFlow/ClickThroughRate/WideDeepLearning/scripts 3 | DATASET_ROOT=/path/to/datasets:/data 4 | 5 | docker run --rm -it \ 6 | --privileged \ 7 | --shm-size=16g \ 8 | --ulimit memlock=-1 \ 9 | --net=host \ 10 | --cap-add=IPC_LOCK \ 11 | --device=/dev/infiniband \ 12 | -v ${ONEFLOW_BENCHMARK_ROOT}:/OneFlow-Benchmark \ 13 | -v ${DLPERF_WDL_SCRIPTS_ROOT}:/workspace \ 14 | -v ${DATASET_ROOT}:/data \ 15 | -w /workspace \ 16 | oneflow:WDL bash \ 17 | -c "mkdir -p /run/sshd && /usr/sbin/sshd -p 12395 && bash" 18 | 19 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/docker/ubuntu.dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON=3.8 2 | ARG pip_args="-i https://pypi.tuna.tsinghua.edu.cn/simple" 3 | FROM nvidia/cuda:10.2-base-ubuntu18.04 4 | 5 | WORKDIR /etc/apt/sources.list.d 6 | RUN rm cuda.list nvidia-ml.list 7 | WORKDIR / 8 | 9 | RUN apt-get update && \ 10 | apt-get -y install --no-install-recommends openssh-server vim python3 python3-pip wget perl lsb-core google-perftools numactl 11 | 12 | ENV MOFED_DIR MLNX_OFED_LINUX-4.3-1.0.1.0-ubuntu18.04-x86_64 13 | ENV IGNOREEOF 3 14 | 15 | RUN wget https://oneflow-static.oss-cn-beijing.aliyuncs.com/deps/${MOFED_DIR}.tgz && \ 16 | tar -xzvf ${MOFED_DIR}.tgz && \ 17 | ${MOFED_DIR}/mlnxofedinstall --user-space-only --without-fw-update --all -q && \ 18 | cd .. 
&& \ 19 | rm -rf ${MOFED_DIR} && \ 20 | rm -rf *.tgz 21 | 22 | RUN ssh-keygen -t rsa -N "" -f /root/.ssh/id_rsa 23 | RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \ 24 | chmod 600 /root/.ssh/authorized_keys 25 | RUN /etc/init.d/ssh start && \ 26 | ssh-keyscan -H localhost >> /root/.ssh/known_hosts 27 | RUN echo "Host *\n\tStrictHostKeyChecking no" >> /root/.ssh/config && \ 28 | chmod 600 /root/.ssh/config 29 | 30 | RUN echo 'ALL ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers 31 | RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config 32 | 33 | RUN python3 -m pip install --upgrade pip 34 | RUN python3 -m pip install $pip_args scikit-learn pynvml 35 | RUN python3 -m pip install --find-links https://release.oneflow.info oneflow_cu102==0.2.0 --user 36 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/extract_info_from_log.sh: -------------------------------------------------------------------------------- 1 | benchmark_log_dir = /dataset/.../log/ 2 | python3 extract_info_from_log.py \ 3 | --benchmark_log_dir $benchmark_log_dir 4 | --repo models 5 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/gpu_memory_usage.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pynvml import * 3 | 4 | nvmlInit() 5 | handle = nvmlDeviceGetHandleByIndex(0) 6 | running = True 7 | 8 | mem_threshold = 32*1024*1024 9 | state = 'init' #'Detecting' 10 | 11 | device0_max_used_mem = 0 12 | while running == True: 13 | time.sleep(1) 14 | info = nvmlDeviceGetMemoryInfo(handle) 15 | if state == 'init': 16 | if info.used > mem_threshold: 17 | state = 'Detecting' 18 | elif state == 'Detecting': 19 | if info.used < mem_threshold: 20 | running = False 21 | else: 22 | device0_max_used_mem = max(device0_max_used_mem, info.used) 23 | 24 | nvmlShutdown() 25 | print('max device0 memory usage is:', device0_max_used_mem) 26 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node1Device_latency_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node1Device_latency_memory.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node1Device_vocabsize_latency_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node1Device_vocabsize_latency_memory.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node8Device_latency_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node8Device_latency_memory.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node8Device_vocabsize_latency_memory.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_1node8Device_vocabsize_latency_memory.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_4node32Device_latency_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_4node32Device_latency_memory.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_4node32Device_vocabsize_latency_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_4node32Device_vocabsize_latency_memory.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_fixed_batchsize_per_device.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_fixed_batchsize_per_device.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_fixed_total_batchsize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/ClickThroughRate/WideDeepLearning/imgs/of_fixed_total_batchsize.png -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/scripts/300k_iters.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=1 2 | DEVICE_NUM_PER_NODE=1 3 | BATHSIZE=512 4 | EMBD_SIZE=2000000 5 | HIDDEN_UNITS_NUM=2 6 | DEEP_VEC_SIZE=16 7 | PREFIX=500iters 8 | MASTER_ADDR=127.0.0.1 9 | NODE_RANK=0 10 | DATA_DIR=/dataset/f9f659c5/wdl_ofrecord 11 | WDL_MODEL_DIR=/dataset/227246e8/wide_and_deep/train.py 12 | 13 | log_root=./log 14 | test_case=${log_root}/300kiters-n1g1 15 | oneflow_log_file=${test_case}.log 16 | mem_file=${test_case}.mem 17 | 18 | python3 -m oneflow.distributed.launch \ 19 | --nproc_per_node $DEVICE_NUM_PER_NODE \ 20 | --nnodes $NUM_NODES \ 21 | --node_rank $NODE_RANK \ 22 | --master_addr $MASTER_ADDR \ 23 | gpu_memory_usage.py 1>$mem_file 2>&1 $oneflow_log_file 41 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/scripts/500_iters.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=1 2 | DEVICE_NUM_PER_NODE=1 3 | BATHSIZE=512 4 | EMBD_SIZE=2000000 5 | HIDDEN_UNITS_NUM=2 6 | DEEP_VEC_SIZE=16 7 | PREFIX=500iters 8 | MASTER_ADDR=127.0.0.1 9 | NODE_RANK=0 10 | DATA_DIR=/dataset/f9f659c5/wdl_ofrecord 11 | WDL_MODEL_DIR=/dataset/227246e8/wide_and_deep/train.py 12 | 13 | log_root=./log 14 | test_case=${log_root}/500iters-n1g1 15 | oneflow_log_file=${test_case}.log 16 | mem_file=${test_case}.mem 17 | 18 | 19 | python3 -m 
oneflow.distributed.launch \ 20 | --nproc_per_node $DEVICE_NUM_PER_NODE \ 21 | --nnodes $NUM_NODES \ 22 | --node_rank $NODE_RANK \ 23 | --master_addr $MASTER_ADDR \ 24 | gpu_memory_usage.py 1>$mem_file 2>&1 $oneflow_log_file 42 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/scripts/bsz_x2.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=1 2 | EMBD_SIZE=2322444 3 | HIDDEN_UNITS_NUM=2 4 | DEEP_VEC_SIZE=16 5 | PREFIX=bsz_x2 6 | MASTER_ADDR=127.0.0.1 7 | NODE_RANK=0 8 | DATA_DIR=/dataset/f9f659c5/wdl_ofrecord 9 | WDL_MODEL_DIR=/dataset/227246e8/wide_and_deep/train.py 10 | 11 | export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 12 | export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 13 | export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 14 | export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 15 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 16 | 17 | for DEVICE_NUM_PER_NODE in 1 8 18 | do 19 | for BATHSIZE in 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 20 | do 21 | log_root=./log 22 | test_case=${log_root}/$PREFIX'_n'$NUM_NODES'g'$DEVICE_NUM_PER_NODE'_b'$BATHSIZE'_h'$HIDDEN_UNITS_NUM 23 | oneflow_log_file=${test_case}.log 24 | mem_file=${test_case}.mem 25 | batch_size_per_proc=$(( ${BATHSIZE}/${DEVICE_NUM_PER_NODE} )) 26 | 27 | python3 gpu_memory_usage.py 1>$mem_file 2>&1 $oneflow_log_file 52 | done 53 | done 54 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/scripts/fix_bsz_per_device.sh: -------------------------------------------------------------------------------- 1 | declare -a num_nodes_list=(1 1 1 1) 2 | declare -a num_gpus_list=(1 2 4 8) 3 | len=${#num_nodes_list[@]} 4 | 5 | NUM_NODES=1 6 | EMBD_SIZE=2322444 7 | HIDDEN_UNITS_NUM=7 8 | DEEP_VEC_SIZE=32 9 | PREFIX=fix_total_bsz 10 | MASTER_ADDR=127.0.0.1 11 | NODE_RANK=0 12 | DATA_DIR=/dataset/f9f659c5/wdl_ofrecord 13 | WDL_MODEL_DIR=/dataset/227246e8/wide_and_deep/train.py 14 | 15 | export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 16 | export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 17 | export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 18 | export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 19 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 20 | 21 | for (( i=0; i<$len; i++ )) 22 | do 23 | num_nodes=${num_nodes_list[$i]} 24 | num_gpus_per_node=${num_gpus_list[$i]} 25 | gpu_num=$(( ${num_nodes} * ${num_gpus_per_node} )) 26 | bsz=$(( 16384 * ${gpu_num} )) 27 | 28 | log_root=./log 29 | test_case=${log_root}/fix_bsz_per_device_n1g${num_gpus_per_node}_b${bsz}_h${HIDDEN_UNITS_NUM} 30 | oneflow_log_file=${test_case}.log 31 | mem_file=${test_case}.mem 32 | batch_size_per_proc=$(( ${bsz}/${num_gpus_per_node} )) 33 | 34 | python3 gpu_memory_usage.py 1>$mem_file 2>&1 $oneflow_log_file 59 | done 60 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/scripts/fix_total_bsz.sh: -------------------------------------------------------------------------------- 1 | declare -a num_nodes_list=(1 1 1 1) 2 | declare -a num_gpus_list=(1 2 4 8) 3 | len=${#num_nodes_list[@]} 4 | 5 | NUM_NODES=1 6 | BATHSIZE=16384 7 | EMBD_SIZE=2322444 8 | HIDDEN_UNITS_NUM=7 9 | DEEP_VEC_SIZE=32 10 | PREFIX=fix_total_bsz 11 | MASTER_ADDR=127.0.0.1 12 | NODE_RANK=0 13 | DATA_DIR=/dataset/f9f659c5/wdl_ofrecord 14 | WDL_MODEL_DIR=/dataset/227246e8/wide_and_deep/train.py 15 | 16 | 
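# Fixed-total-batch-size sweep: the global batch stays at $BATHSIZE (16384) while the
# GPU count per node scales through 1/2/4/8, so batch_size_per_proc shrinks accordingly.
# The exports below are OneFlow runtime tuning switches (CUDA Graph kernel capture,
# light actors, local message queue, disabled blob access checker); tcmalloc is
# preloaded to reduce host allocation overhead.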
export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 17 | export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 18 | export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 19 | export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 20 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 21 | 22 | for (( i=0; i<$len; i++ )) 23 | do 24 | DEVICE_NUM_PER_NODE=${num_gpus_list[$i]} 25 | log_root=./log 26 | test_case=${log_root}/fix_total_bsz_n1g${DEVICE_NUM_PER_NODE}_b${BATHSIZE}_h${HIDDEN_UNITS_NUM} 27 | oneflow_log_file=${test_case}.log 28 | mem_file=${test_case}.mem 29 | batch_size_per_proc=$(( ${BATHSIZE}/${DEVICE_NUM_PER_NODE} )) 30 | 31 | python3 gpu_memory_usage.py 1>$mem_file 2>&1 $oneflow_log_file 56 | done 57 | -------------------------------------------------------------------------------- /OneFlow/ClickThroughRate/WideDeepLearning/scripts/vocab_x2.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=1 2 | BATHSIZE=16384 3 | HIDDEN_UNITS_NUM=7 4 | DEEP_VEC_SIZE=16 5 | PREFIX=vsz_x2 6 | MASTER_ADDR=127.0.0.1 7 | NODE_RANK=0 8 | DATA_DIR=/dataset/f9f659c5/wdl_ofrecord 9 | WDL_MODEL_DIR=/dataset/227246e8/wide_and_deep/train.py 10 | 11 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 12 | export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 13 | export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 14 | export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 15 | export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 16 | 17 | for DEVICE_NUM_PER_NODE in 1 8 18 | do 19 | for i in 1 2 4 8 16 20 | do 21 | EMBD_SIZE=$(( 3200000*${i} )) 22 | batch_size_per_proc=$(( ${BATHSIZE}/${DEVICE_NUM_PER_NODE} )) 23 | log_root=./log 24 | test_case=${log_root}/$PREFIX'_n'$NUM_NODES'g'$DEVICE_NUM_PER_NODE'_vsz'$EMBD_SIZE'_h'$HIDDEN_UNITS_NUM 25 | oneflow_log_file=${test_case}.log 26 | mem_file=${test_case}.mem 27 | 28 | python3 gpu_memory_usage.py 1>$mem_file 2>&1 $oneflow_log_file 53 | done 54 | done 55 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/README.md: -------------------------------------------------------------------------------- 1 | # OneFlow Benchmark Test Scripts 2 | 3 | 本文介绍如何使用脚本批量测试BERT base: 4 | 5 | 1. `bert_base_train.sh`,可以本地单机进行BERT base预训练,也可以通过ssh发送到远端节点运行; 6 | 2. `launch_all.sh`,发送脚本到指定的机器节点并运行; 7 | 3. `cp_logs.sh`,拷贝日志到指定目录; 8 | 4. `schedule_launch.sh`,批量顺序执行多组`launch_all.sh`; 9 | 5. `extract_bert_result.py`,从BERT预训练日志中提取结果,并打印成markdown表格。 10 | 6. `reports`,[测试报告](OneFlow/LanguageModeling/BERT/reports)目录 11 | 12 | 通常这几个文件只需要修改很少的配置就能正常运行,下面对各个脚本进行详细介绍。 13 | 14 | ## 本地训练启动脚本:`bert_base_train.sh` 15 | 16 | 这个脚本用于本地运行OneFlow的训练,可以独立使用,调用前需要手动修改基本配置,调用时需要传入3个参数。 17 | 18 | ### 手工配置选项 19 | 20 | 有三处需要修改的地方: 21 | 22 | ``` 23 | BENCH_ROOT=BERT 24 | DATA_ROOT=/path/to/bert_base_ofrecord 25 | DATA_PART_NUM=32 26 | ``` 27 | 28 | 1. `BENCH_ROOT`: 模型脚本所在的目录,对应OneFlow-Benchmark项目中的`LanguageModeling/BERT`; 29 | 2. `DATA_ROOT`: 测试所用数据集路径 30 | 3. `DATA_PART_NUM`: 测试所用数据集文件数量 31 | 32 | ### 脚本参数 33 | 34 | 调用时需要传入4个参数: 35 | 36 | ``` 37 | NUM_NODES=$1 38 | GPU_NUM_PER_NODE=$2 39 | BSZ_PER_DEVICE=$3 40 | NODE_IPS=$4 41 | ``` 42 | 43 | 1. `NUM_NODES`: 测试训练用的机器节点数量; 44 | 2. `GPU_NUM_PER_NODE`: 每台机器节点中GPU设备的数量; 45 | 3. `BSZ_PER_DEVICE`: 训练时每个批次每个GPU设备对应的图片/句子数量; 46 | 4. 
`NODE_IPS`: 各个节点的IP列表,可选,多机训练须配置(如果NUM_NODES=1,则NODE_IPS被忽略)。 47 | 48 | 注:这个脚本只能够在本地运行OneFlow,如果多机训练,可以在各台机器上分别启动该脚本,OneFlow会自动根据配置的机器节点信息进行通信连接,完成训练。另外一种方式就是使用`launch_all.sh`,自动把脚本发送到各个机器节点进行训练。 49 | 50 | 本地单机8卡训练BERT base,执行命令: 51 | 52 | ``` 53 | ./bert_base_pretrain.sh 1 8 128 54 | ``` 55 | 56 | ## 远程训练启动脚本:`launch_all.sh` 57 | 58 | `launch_all.sh`负责发送本地训练启动脚本(如单机训练脚本bert_base_pretrain.sh)和`BENCH_ROOT`路径下的模型脚本(如LanguageModeling/BERT/run_pretraining.py等)到各台机器节点,并通过ssh的方式在各个机器节点运行本地训练启动脚本。启动时,需要传入5个参数: 59 | 60 | ``` 61 | LOCAL_RUN=$1 62 | BENCH_ROOT=$2 63 | NUM_NODES=$3 64 | GPU_NUM_PER_NODE=$4 65 | BSZ=$5 66 | ``` 67 | 68 | 1. `LOCAL_RUN`:待发送的本地训练启动脚本; 69 | 2. `BENCH_ROOT`: 待发送的OneFlow模型脚本所在目录; 70 | 3. `NUM_NODES`: 测试训练用的机器节点数量; 71 | 4. `GPU_NUM_PER_NODE`: 每台机器节点中GPU设备的数量; 72 | 5. `BSZ_PER_DEVICE`: 训练时每个批次每个GPU设备对应的图片/句子数量; 73 | 74 | 75 | 发送相关脚本到单机,使用8卡训练BERT base: 76 | 77 | ``` 78 | ./launch_all.sh bert_base_pretrain.sh BERT 1 8 64 79 | ``` 80 | 81 | 发送相关脚本到4机,每机都使用8卡(共4机32卡)训练BERT base: 82 | 83 | ``` 84 | ./launch_all.sh bert_base_pretrain.sh BERT 4 8 96 85 | ``` 86 | 87 | ## `cp_logs.sh` 88 | 89 | 根据下列参数拷贝日志到指定路径并重命名: 90 | 91 | ``` 92 | NUM_NODES=$1 93 | GPU_NUM_PER_NODE=$2 94 | BSZ=$3 95 | REPEAT_ID=$4 96 | ``` 97 | 98 | `cp_logs.sh`负责从本地(主节点)拷贝日志到指定路径下,并按照`logs/oneflow/${NUM_NODES}n${GPU_NUM_PER_NODE}g/${model_name}_b${BSZ}_fp32_${REPEAT_ID}.log`的格式保存。 99 | 100 | ## `schedule_launch.sh` 101 | 102 | 本次测评会测试多组batch_size、单机多机配置,每组实验重复7次。 103 | 104 | 根据测试次数,批量自动运行`launch_all.sh`和`cp_logs.sh`,完成训练和备份日志。 105 | 需要两个参数: 106 | 107 | 1. `LOCAL_RUN`:待发送的本地训练启动脚本; 108 | 2. `BENCH_ROOT`: 待发送的OneFlow模型脚本所在目录。 109 | 110 | `schedule_launch.sh`脚本会根据实验次数,循环测试不同batch size,4组节点和GPU设备数量,每组实验重复7次。实验结束后,`logs/oneflow`路径下会保存实验日志。 111 | 112 | ## `extract_bert_result.py` 113 | 114 | 运行方式如下: 115 | 116 | ``` 117 | python3 extract_bert_result.py 118 | ``` 119 | 120 | 结果为markdown格式,方便直接引用,输出如下: 121 | 122 | ``` 123 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | 124 | | -------- | -------- | -------- | -------- | 125 | | 4 | 8 | 96 | 4449.85 | 126 | | 4 | 8 | 96 | 4456.82 | 127 | | 4 | 8 | 96 | 4460.17 | 128 | | 4 | 8 | 96 | 4454.99 | 129 | | 4 | 8 | 96 | 4455.97 | 130 | | 4 | 8 | 96 | 4451.41 | 131 | | 4 | 8 | 96 | 4458.06 | 132 | 133 | 134 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 135 | | -------- | -------- | -------- | -------- | -------- | 136 | | 1 | 1 | 96 | 149.84 | 1.00 | 137 | | 1 | 8 | 96 | 1158.51 | 7.73 | 138 | | 2 | 8 | 96 | 2257.71 | 15.07 | 139 | | 4 | 8 | 96 | 4455.97 | 29.74 | 140 | ``` 141 | 142 | ### 输入参数 143 | 144 | - `benchmark_log_dir`: 日志存放的目录,脚本中会自动遍历所有`*.log`文件进行信息提取; 145 | - `start_iter` `end_iter`: 待提取的起始和终止步数,脚本中会利用这两个步数的时间戳计算吞吐率。 146 | - `print_mode`: 打印输出格式设置,缺省`markdown` 147 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/extract_bert_result.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from extract_util import extract_result 4 | 5 | 6 | parser = argparse.ArgumentParser(description="flags for BERT benchmark") 7 | parser.add_argument( 8 | "--benchmark_log_dir", type=str, default="./logs/oneflow", 9 | required=False) 10 | parser.add_argument("--start_iter", type=int, default=19) 11 | parser.add_argument("--end_iter", type=int, default=119) 12 | parser.add_argument("--print_mode", type=str, default='markdown') 13 | args = parser.parse_args() 14 | 15 | 16 | def 
extract_info_from_file(log_file): 17 | ''' 18 | batch_size_per_device = 24 19 | gpu_num_per_node = 1 20 | num_nodes = 1 21 | step: 19, total_loss: 9.725, mlm_loss: 8.960, nsp_loss: 0.766, throughput: 54.982 1598056519.8844702 22 | step: 119, total_loss: 8.010, mlm_loss: 7.331, nsp_loss: 0.679, throughput: 139.896 1598056537.0029895 23 | ''' 24 | # extract info from file name 25 | result_dict = {} 26 | with open(log_file, 'r') as f: 27 | for line in f.readlines(): 28 | ss = line.split(' ') 29 | if ss[0] in ['batch_size_per_device', 'gpu_num_per_node', 'num_nodes']: 30 | result_dict[ss[0]] = ss[2].strip() 31 | elif ss[0] == 'step:': 32 | it = int(ss[1][:-1]) 33 | result_dict[it] = ss[-1].strip() 34 | 35 | return result_dict 36 | 37 | 38 | if __name__ == "__main__": 39 | extract_result(args, extract_info_from_file) 40 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/extract_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from statistics import median 4 | 5 | 6 | def compute_throughput(result_dict, args): 7 | assert args.start_iter in result_dict and args.end_iter in result_dict 8 | duration = float(result_dict[args.end_iter]) - float(result_dict[args.start_iter]) 9 | 10 | num_nodes = int(result_dict['num_nodes']) 11 | gpu_num_per_node = int(result_dict['gpu_num_per_node']) 12 | batch_size_per_device = int(result_dict['batch_size_per_device']) 13 | 14 | total_batch_size = batch_size_per_device * gpu_num_per_node * num_nodes 15 | 16 | num_examples = total_batch_size * (args.end_iter - args.start_iter) 17 | throughput = num_examples / duration 18 | 19 | return num_nodes, gpu_num_per_node, batch_size_per_device, throughput 20 | 21 | 22 | def get_mode_print(mode): 23 | def mode_print(lst): 24 | if mode == 'markdown': 25 | print('|', ' | '.join(('{:.2f}' if type(v) is float else '{}').format(v) for v in lst), '|') 26 | else: 27 | print(','.join(('{:.2f}' if type(v) is float else '{}').format(v) for v in lst)) 28 | return mode_print 29 | 30 | 31 | def extract_result(args, extract_func): 32 | mode_print = get_mode_print(args.print_mode) 33 | logs_list = glob.glob(os.path.join(args.benchmark_log_dir, "*/*.log")) 34 | logs_list = sorted(logs_list) 35 | 36 | final_result_dict = {} 37 | print("## All Results") 38 | mode_print(['num_nodes', 'gpu_num_per_node', 'batch_size_per_device', 'throughput']) 39 | if args.print_mode == 'markdown': 40 | mode_print(['--------' for _ in range(4)]) 41 | for l in logs_list: 42 | result_dict = extract_func(l) 43 | num_nodes, gpu_num_per_node, batch_size_per_device, throughput = compute_throughput(result_dict, args) 44 | mode_print([num_nodes, gpu_num_per_node, batch_size_per_device, throughput]) 45 | key = (num_nodes, gpu_num_per_node, batch_size_per_device) 46 | if key in final_result_dict: 47 | final_result_dict[key].append(throughput) 48 | else: 49 | final_result_dict[key] = [throughput] 50 | print() 51 | 52 | # calculate n1g1 reference 53 | n1g1_throughput = {} 54 | for k, v in final_result_dict.items(): 55 | if k[0] == 1 and k[1] == 1: 56 | n1g1_throughput[k] = median(v) 57 | 58 | # calculate median throughput and speedup 59 | final_result_list = [] 60 | for k, v in final_result_dict.items(): 61 | res = list(k) 62 | res.append(median(v)) 63 | n1g1 = (1, 1, k[2]) 64 | speedup = median(v) / n1g1_throughput[n1g1] if n1g1 in n1g1_throughput else 0.0 65 | res.append(speedup) 66 | final_result_list.append(res) 67 | 68 | # sort 
final_result_list 69 | final_result_list = sorted(final_result_list, key=lambda x: (-x[2], x[0], x[1])) 70 | 71 | # print results 72 | print("## Filtered Result `median value`") 73 | mode_print(['num_nodes', 'gpu_num_per_node', 'batch_size_per_device', 'throughput', 'speedup']) 74 | if args.print_mode == 'markdown': 75 | mode_print(['--------' for _ in range(5)]) 76 | for res in final_result_list: 77 | mode_print(res) 78 | 79 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/reports/README.md: -------------------------------------------------------------------------------- 1 | # OneFlow BERT Benchmark Test Report 2 | This folder contains OneFlow BERT Benchmark test reports. 3 | 4 | ## Changelog 5 | Note: latest on the top 6 | 7 | ## Data 8 | 9 | - Pretrain datasets 10 | Note that the dataset is about 200GB, so we provide a [sample dataset](https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/wiki_ofrecord_seq_len_128_example.tgz), for testing purposes only. 11 | - SQuAD datasets 12 | Contains the complete dataset and tools. Please click [here](https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/squad_dataset_tools.tgz) for download. 13 | ``` 14 | squad_dataset_tools 15 | ├── ofrecord 16 | ├── dev-v1.1.json 17 | ├── dev-v2.0.json 18 | ├── train-v1.1.json 19 | ├── train-v2.0.json 20 | ├── evaluate-v1.1.py 21 | ├── evaluate-v2.0.py 22 | ``` 23 | - GLUE(CoLA, MRPC) 24 | Contains the complete dataset. Please click [here](https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/glue_ofrecord.tgz) for download. 25 | ``` 26 | glue_ofrecord 27 | ├── CoLA 28 | │ ├── eval 29 | │ │ └── eval.of_record-0 30 | │ ├── test 31 | │ │ └── predict.of_record-0 32 | │ └── train 33 | │ └── train.of_record-0 34 | └── MRPC 35 | ├── eval 36 | │ └── eval.of_record-0 37 | ├── test 38 | │ └── predict.of_record-0 39 | └── train 40 | └── train.of_record-0 41 | ``` 42 | More information can be found [here](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/master/LanguageModeling/BERT/README.md).You can also see [here](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md) how to make the OneFlow dataset. 
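For a quick smoke test, the sample pretraining set above can be fetched and unpacked as sketched below (the target directory is a placeholder; point the pretraining script's `DATA_ROOT` at the extracted ofrecord directory and adjust `DATA_PART_NUM` to match the number of part files):

```
mkdir -p /path/to/bert_base_ofrecord
wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/wiki_ofrecord_seq_len_128_example.tgz
tar -xzf wiki_ofrecord_seq_len_128_example.tgz -C /path/to/bert_base_ofrecord
```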
43 | 44 | ### OneFlow v0.3.1 45 | - BERT base FP16 with dynamic loss scale test [bert_base_oneflow_v0.3.1_report_1202.md](bert_base_oneflow_v0.3.1_report_1202.md) 46 | ### OneFlow v0.2.0 47 | - BERT base without XLA test [bert_base_oneflow_v0.2_report_1009.md](bert_base_oneflow_v0.2_report_1009.md) 48 | ### Aug 22nd 2020 49 | - BERT base fp32 without XLA test [report](bert_base_fp32_report_0822.md) 50 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/reports/bert_base_oneflow_v0.3.1_report_1202.md: -------------------------------------------------------------------------------- 1 | # BERT base Benchmark Test Report 2 | 3 | 本报告总结了OneFlow v0.3.1 下BERT base 混合精度开启dynamic loss scale 的评测结果。 4 | 5 | ## Test Environment 6 | 7 | 所有的测试都是在4台配置8张V100-SXM2-16GB GPU的服务器中进行,主要硬软件配置如下: 8 | 9 | - Tesla V100-SXM2-16GB x 8 10 | - InfiniBand 100 Gb/sec (4X EDR), Mellanox Technologies MT27700 Family 11 | - Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz 12 | - Memory 384G 13 | - Ubuntu 16.04.4 LTS (GNU/Linux 4.4.0-116-generic x86_64) 14 | - CUDA Version: 10.2, Driver Version: 440.33.01 15 | - OneFlow: v0.3.1@f4bf35f7a 16 | - OneFlow-Benchmark: v0.3.0@854ddd06b 17 | - `nvidia-smi topo -m` 18 | 19 | ``` 20 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 CPU Affinity 21 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS NODE 0-11,24-35 22 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS NODE 0-11,24-35 23 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PIX 0-11,24-35 24 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PIX 0-11,24-35 25 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS 12-23,36-47 26 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS 12-23,36-47 27 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS 12-23,36-47 28 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS 12-23,36-47 29 | mlx5_0 NODE NODE PIX PIX SYS SYS SYS SYS X 30 | 31 | Legend: 32 | 33 | X = Self 34 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 35 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 36 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 37 | PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) 38 | PIX = Connection traversing at most a single PCIe bridge 39 | NV# = Connection traversing a bonded set of # NVLinks 40 | 41 | ``` 42 | 43 | ## Test Descriptions 44 | 45 | - OneFlow版本: [v0.3.1@f4bf35f7a](https://github.com/Oneflow-Inc/oneflow/tree/v0.3.1) 46 | - OneFlow Benchmark仓库版本: [v0.3.0@854ddd06b](https://github.com/Oneflow-Inc/OneFlow-Benchmark/tree/v0.3.0) 47 | - Dynamic Loss Scale: 开启 48 | - XLA: 未采用 49 | - 测试共有四组,分别使用单机单卡、单机8卡、2机16卡、4机32卡进行测试,每组测试7次,选取这7次数据中的中位数作为最后结果。 50 | 51 | 52 | 53 | ## Test Results 54 | 55 | ### FP16 with clip 56 | 57 | - ### batch size = 160 58 | 59 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 60 | |-----------|------------------|-----------------------|------------|---------| 61 | | 1 | 1 | 160 | 613.46 | 1.00 | 62 | | 1 | 8 | 160 | 4514.64 | 7.36 | 63 | | 2 | 8 | 160 | 8325.87 | 13.57 | 64 | | 4 | 8 | 160 | 16001.63 | 26.08 | 65 | 66 | - ### batch size = 128 67 | 68 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 69 | |-----------|------------------|-----------------------|------------|---------| 70 | | 1 | 1 | 128 | 595.94 | 1.00 | 71 | | 1 | 8 | 128 | 4313.91 | 7.24 | 72 | | 2 | 8 | 128 | 7878.62 | 13.22 | 73 | | 4 | 8 | 128 | 15113.94 | 25.36 | 74 | 75 | - ### batch 
size = 64 76 | 77 | | num_nodes | gpu_num_per_node | batch_size_per_device | throughput | speedup | 78 | |-----------|------------------|-----------------------|------------|---------| 79 | | 1 | 1 | 64 | 534.17 | 1.00 | 80 | | 1 | 8 | 64 | 3519.61 | 6.59 | 81 | | 2 | 8 | 64 | 5991.10 | 11.22 | 82 | | 4 | 8 | 64 | 10026.29 | 18.77 | 83 | 84 | 全部日志可以点击[bert_dls_fp16_logs.zip](https://oneflow-public.oss-cn-beijing.aliyuncs.com/oneflow_test_log/oneflow_0.3.1/bert_dls_fp16_logs.zip)获取。 85 | 86 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/reports/imgs/of_bert_base_latency_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/LanguageModeling/BERT/reports/imgs/of_bert_base_latency_throughput.png -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/reports/imgs/of_bert_base_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/LanguageModeling/BERT/reports/imgs/of_bert_base_speedup.png -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/reports/imgs/of_bert_base_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/OneFlow/LanguageModeling/BERT/reports/imgs/of_bert_base_throughput.png -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/scripts/bert_base_pretrain.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=$1 2 | GPU_NUM_PER_NODE=$2 3 | BSZ_PER_DEVICE=$3 4 | 5 | BENCH_ROOT=BERT 6 | DATA_ROOT=/path/to/bert_base_ofrecord 7 | DATA_PART_NUM=32 8 | 9 | if [ -n "$4" ]; then 10 | NODE_IPS=$4 11 | else 12 | NODE_IPS='10.11.0.2','10.11.0.3','10.11.0.4','10.11.0.5' 13 | fi 14 | 15 | rm -rf ./log 16 | mkdir ./log 17 | 18 | export PYTHONUNBUFFERED=1 19 | python3 ./$BENCH_ROOT/run_pretraining.py \ 20 | --gpu_num_per_node=$GPU_NUM_PER_NODE \ 21 | --num_nodes=$NUM_NODES \ 22 | --node_ips=$NODE_IPS \ 23 | --learning_rate=1e-4 \ 24 | --batch_size_per_device=$BSZ_PER_DEVICE \ 25 | --iter_num=140 \ 26 | --loss_print_every_n_iter=20 \ 27 | --seq_length=128 \ 28 | --max_predictions_per_seq=20 \ 29 | --num_hidden_layers=12 \ 30 | --num_attention_heads=12 \ 31 | --max_position_embeddings=512 \ 32 | --type_vocab_size=2 \ 33 | --vocab_size=30522 \ 34 | --attention_probs_dropout_prob=0.1 \ 35 | --hidden_dropout_prob=0.1 \ 36 | --hidden_size_per_head=64 \ 37 | --data_dir=$DATA_ROOT \ 38 | --data_part_num=$DATA_PART_NUM \ 39 | --log_dir=./log \ 40 | --model_save_every_n_iter=10000 \ 41 | --save_last_snapshot=False \ 42 | --model_save_dir=./snapshots 43 | #--node_ips='10.11.0.2','10.11.0.3','10.11.0.4','10.11.0.5' \ 44 | 45 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/scripts/cp_logs.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=$1 2 | GPU_NUM_PER_NODE=$2 3 | BSZ=$3 4 | REPEAT_ID=$4 5 | 6 | log_root=logs/oneflow 7 | log_dir=$log_root/${NUM_NODES}n${GPU_NUM_PER_NODE}g 8 | 9 | 
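# Each repeat's log is renamed to bert_base_b<BSZ>_fp32_<REPEAT_ID>.log (plus the matching
# summary.csv) under logs/oneflow/<NUM_NODES>n<GPU_NUM_PER_NODE>g/, which is the layout
# extract_bert_result.py expects when it globs "*/*.log".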
log_file=bert_base_b${BSZ}_fp32_${REPEAT_ID}.log 10 | summary_file=bert_base_b${BSZ}_fp32_${REPEAT_ID}.csv 11 | 12 | [ ! -d "${log_dir}" ] && mkdir -p ${log_dir} 13 | 14 | cp ~/oneflow_temp/oneflow.log ${log_dir}/${log_file} 15 | cp ~/oneflow_temp/log/summary.csv ${log_dir}/${summary_file} 16 | 17 | # cp oneflow.INFO to log_dir 18 | #[ ! -d "${log_dir}/oneflow.INFO" ] && cp ~/oneflow_temp/log/VS002/oneflow.INFO ${log_dir}/oneflow.INFO 19 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/scripts/launch_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 4 | LOCAL_RUN=$1 5 | BENCH_ROOT=$2 6 | NUM_NODES=$3 7 | GPU_NUM_PER_NODE=$4 8 | BSZ=$5 9 | 10 | #0 prepare the host list ips for training 11 | declare -a host_list=("10.11.0.2" "10.11.0.3" "10.11.0.4" "10.11.0.5") 12 | 13 | if [ $NUM_NODES -gt ${#host_list[@]} ] 14 | then 15 | echo num_nodes should be less than or equal to length of host_list. 16 | exit 17 | fi 18 | 19 | hosts=("${host_list[@]:0:${NUM_NODES}}") 20 | echo "Working on hosts:${hosts[@]}" 21 | 22 | ips=${hosts[0]} 23 | for host in "${hosts[@]:1}" 24 | do 25 | ips+=",${host}" 26 | done 27 | 28 | #1 prepare oneflow_temp folder on each host 29 | for host in "${hosts[@]}" 30 | do 31 | ssh $USER@$host "mkdir -p ~/oneflow_temp" 32 | done 33 | 34 | #2 copy files to slave hosts and start work with nohup 35 | for host in "${hosts[@]:1}" 36 | do 37 | echo "start training on ${host}" 38 | ssh $USER@$host 'rm -rf ~/oneflow_temp/*' 39 | scp -r $BENCH_ROOT ./$LOCAL_RUN $USER@$host:~/oneflow_temp 40 | ssh $USER@$host "cd ~/oneflow_temp; nohup ./$LOCAL_RUN $NUM_NODES $GPU_NUM_PER_NODE $BSZ $ips 1>oneflow.log 2>&1 oneflow.log" 49 | 50 | echo "done" 51 | 52 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/scripts/schedule_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | LOCAL_RUN=$1 4 | BENCH_ROOT=$2 5 | 6 | REPEAT_TIMES=7 7 | 8 | declare -a num_nodes_list=(1 1 2 4) 9 | declare -a num_gpus_list=(1 8 8 8) 10 | len=${#num_nodes_list[@]} 11 | for bsz in 96 64 32 24 12 | do 13 | for (( i=0; i<$len; i++ )) 14 | do 15 | num_nodes=${num_nodes_list[$i]} 16 | num_gpus=${num_gpus_list[$i]} 17 | 18 | for (( j=0; j<$REPEAT_TIMES; j++ )) 19 | do 20 | echo $num_nodes $num_gpus $j $bsz 21 | ./launch_all.sh $LOCAL_RUN $BENCH_ROOT $num_nodes $num_gpus $bsz 22 | ./cp_logs.sh $num_nodes $num_gpus $bsz $j 23 | done 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/BERT/scripts/stop_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | 4 | ############################################## 5 | #0 prepare the host list for training 6 | #comment unused hosts with `#` 7 | #or use first arg to limit the hosts number 8 | #e.g.: `./train.sh 4` will use first 4 hosts. 
9 | #declare -a host_list=( 10 | # #"10.11.1.1" 11 | # "10.11.1.2" 12 | # "10.11.1.3" 13 | # "10.11.1.4" 14 | # "10.11.1.5" 15 | # ) 16 | declare -a host_list=(ln1 ln2 ln3 ln4) 17 | 18 | if [ -n "$1" ] 19 | then 20 | host_num=$1 21 | else 22 | host_num=${#host_list[@]} 23 | fi 24 | 25 | 26 | if [ ${host_num} -gt ${#host_list[@]} ] 27 | then 28 | host_num=${#host_list[@]} 29 | fi 30 | 31 | hosts=("${host_list[@]:0:${host_num}}") 32 | echo "plan to pkill python3 on hosts:${hosts[@]}" 33 | 34 | ############################################## 35 | #2 copy files to each host and start work 36 | for host in "${hosts[@]}" 37 | do 38 | echo "pkill python3 on ${host}" 39 | ssh $USER@$host 'pkill python3' 40 | done 41 | 42 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/GPT/scripts/openweb_to_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tarfile 3 | import json 4 | 5 | with tarfile.open(sys.argv[1], 'r') as tar: 6 | for member in tar.getmembers(): 7 | print(json.dumps({'url': member.name, 'text': str(tar.extractfile(member).read())})) 8 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/GPT/scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export PYTHONUNBUFFERED=1 5 | # export ONEFLOW_DEBUG_MODE=1 6 | # export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 7 | # export NCCL_DEBUG=INFO 8 | 9 | dataset=/data/gpt/gpt_sample_dataset_text_document 10 | seq_length=2048 11 | 12 | num_nodes=1 13 | node_ips="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 14 | num_gpus_per_node=8 15 | 16 | tensor_model_parallel_size=4 17 | pipeline_model_parallel_size=1 18 | world_size=$(($num_gpus_per_node*$num_nodes)) 19 | data_parallel_size=$((${world_size}/$tensor_model_parallel_size/$pipeline_model_parallel_size)) 20 | 21 | num_layers=16 22 | hidden_size=1536 23 | num_heads=16 24 | 25 | micro_batch_size=8 26 | global_batch_size=16 27 | 28 | train_iters=100 29 | log_interval=10 30 | 31 | log_file=pretrain_gpt_${num_nodes}n${num_gpus_per_node}d_dp${data_parallel_size}_mp${tensor_model_parallel_size}_pp${pipeline_model_parallel_size}_mbz${micro_batch_size}_gbz${global_batch_size}_s${seq_length}_l${num_layers}_h${hidden_size}_nh${num_heads}.log 32 | 33 | python3 -m oneflow_gpt.training \ 34 | --num-layers ${num_layers} \ 35 | --hidden-size ${hidden_size} \ 36 | --num-attention-heads ${num_heads} \ 37 | --micro-batch-size ${micro_batch_size} \ 38 | --global-batch-size ${global_batch_size} \ 39 | --tensor-model-parallel-size ${tensor_model_parallel_size} \ 40 | --pipeline-model-parallel-size ${pipeline_model_parallel_size} \ 41 | --num-gpus-per-node ${num_gpus_per_node} \ 42 | --num-nodes ${num_nodes} \ 43 | --node-ips ${node_ips} \ 44 | --train-iters ${train_iters} \ 45 | --dataset ${dataset} \ 46 | --seq-length ${seq_length} \ 47 | --log-interval ${log_interval} \ 48 | --learning-rate 0.00015 \ 49 | --min-lr 1.0e-5 \ 50 | --lr-decay-style cosine \ 51 | --lr-decay-iters 320000 \ 52 | --lr-warmup-fraction 0.01 \ 53 | --initial-loss-scale 1048576 \ 54 | --optimizer adamw \ 55 | --weight-decay 1e-2 \ 56 | --clip-grad 1.0 \ 57 | --vocab-size 50257 \ 58 | --split 949,50,1 \ 59 | --load checkpoint \ 60 | --save checkpoint \ 61 | --save-interval 20000 \ 62 | --metric-print-format table \ 63 | --checkpoint-activations \ 64 | --multihead-attention-fusion \ 
65 | --fp16 \ 66 | --use-rdma \ 67 | | tee $log_file 68 | -------------------------------------------------------------------------------- /OneFlow/LanguageModeling/GPT/scripts/pretrain_with_container.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | dataset=/data/gpt/gpt_sample_dataset_text_document 5 | seq_length=2048 6 | 7 | num_nodes=1 8 | node_ips="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 9 | num_gpus_per_node=8 10 | 11 | tensor_model_parallel_size=4 12 | pipeline_model_parallel_size=1 13 | world_size=$(($num_gpus_per_node*$num_nodes)) 14 | data_parallel_size=$((${world_size}/$tensor_model_parallel_size/$pipeline_model_parallel_size)) 15 | 16 | num_layers=16 17 | hidden_size=1536 18 | num_heads=16 19 | 20 | micro_batch_size=8 21 | global_batch_size=16 22 | 23 | train_iters=100 24 | log_interval=10 25 | 26 | log_file=pretrain_gpt_${num_nodes}n${num_gpus_per_node}d_dp${data_parallel_size}_mp${tensor_model_parallel_size}_pp${pipeline_model_parallel_size}_mbz${micro_batch_size}_gbz${global_batch_size}_s${seq_length}_l${num_layers}_h${hidden_size}_nh${num_heads}.log 27 | 28 | oneflow_gpt_src_dir=$HOME/repos/OneFlow-Benchmark/LanguageModeling/GPT 29 | python_version=3.7 30 | image=oneflow-manylinux2014-cuda11.2:0.1 31 | wheel=oneflow-0.3.5+cu112.git.75f11b825-cp37-cp37m-manylinux2014_x86_64.whl 32 | 33 | python3 ${oneflow_gpt_src_dir}/tools/launch_container.py \ 34 | --src ${oneflow_gpt_src_dir} \ 35 | --py ${python_version} \ 36 | --image ${image} \ 37 | --wheel ${wheel} \ 38 | --extra-mount "/data" \ 39 | --cmd "python3 -m oneflow_gpt.training \ 40 | --num-layers ${num_layers} \ 41 | --hidden-size ${hidden_size} \ 42 | --num-attention-heads ${num_heads} \ 43 | --micro-batch-size ${micro_batch_size} \ 44 | --global-batch-size ${global_batch_size} \ 45 | --tensor-model-parallel-size ${tensor_model_parallel_size} \ 46 | --pipeline-model-parallel-size ${pipeline_model_parallel_size} \ 47 | --num-gpus-per-node ${num_gpus_per_node} \ 48 | --num-nodes ${num_nodes} \ 49 | --node-ips ${node_ips} \ 50 | --train-iters ${train_iters} \ 51 | --log-interval ${log_interval} \ 52 | --dataset ${dataset} \ 53 | --seq-length ${seq_length} \ 54 | --learning-rate 0.00015 \ 55 | --min-lr 1.0e-5 \ 56 | --lr-decay-style cosine \ 57 | --lr-decay-iters 320000 \ 58 | --lr-warmup-fraction 0.01 \ 59 | --initial-loss-scale 1048576 \ 60 | --optimizer adamw \ 61 | --weight-decay 1e-2 \ 62 | --clip-grad 1.0 \ 63 | --vocab-size 50257 \ 64 | --split 949,50,1 \ 65 | --load checkpoint \ 66 | --save checkpoint \ 67 | --save-interval 20000 \ 68 | --metric-print-format table \ 69 | --checkpoint-activations \ 70 | --multihead-attention-fusion \ 71 | --fp16 \ 72 | --use-rdma \ 73 | | tee $log_file" 74 | -------------------------------------------------------------------------------- /OneFlow/Megatron-LM/scripts/train_gpt2.sh: -------------------------------------------------------------------------------- 1 | non_distributed_optimizer=${1:-"off"} 2 | gpus=${2:-0,1,2,3,4,5,6,7} 3 | batch_size_per_device=${3:-4} 4 | dtype=${4:-"fp16"} 5 | 6 | a=`expr ${#gpus} + 1` 7 | gpu_num_per_node=`expr ${a} / 2` 8 | 9 | export CUDA_VISIBLE_DEVICES=${gpus} 10 | export PYTHONUNBUFFERED=1 11 | export ONEFLOW_DEBUG_MODE=1 12 | 13 | # gpt2-small 14 | n_head=12 15 | n_embd=768 16 | n_layer=12 17 | # # gpt2-medium 18 | # n_head=16 19 | # n_embd=1024 20 | # n_layer=24 21 | 22 | num_node=4 23 | node_ips="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 24 | 25 | if [ "$dtype" = "fp16" ] 
; then 26 | use_fp16=True 27 | else 28 | use_fp16=False 29 | fi 30 | 31 | PREFIX=1209-test-19-oneflow-gpt2-small 32 | test_case=${PREFIX}_${dtype}_${num_node}n${gpu_num_per_node}g_bz${batch_size_per_device}_${non_distributed_optimizer} 33 | mem_file=$test_case.mem 34 | log_file=$test_case.log 35 | output_dir=${PREFIX}_output_${non_distributed_optimizer}_distributed_split_${num_node}n${gpu_num_per_node}g_bz${batch_size_per_device} 36 | mkdir -p $output_dir 37 | 38 | # nsys is a nvidia analysis tool 39 | # /usr/local/cuda-10.2/bin/nsys profile 40 | python3 src/train.py \ 41 | --log_dir=${output_dir} \ 42 | --non_distributed_optimizer=${non_distributed_optimizer} \ 43 | --num_nodes=${num_node} \ 44 | --node_ips=${node_ips} \ 45 | --use_fp16=${use_fp16} \ 46 | --dataset=/datasets/wiki/enwiki/AA \ 47 | --batch_size_per_device=${batch_size_per_device} \ 48 | --gpu_num_per_node=$gpu_num_per_node \ 49 | --seq_len=1024 \ 50 | --iter_num=1000 \ 51 | --optimizer=adamw \ 52 | --embedding_dropout=0.1 \ 53 | --output_dropout=0.1 \ 54 | --attention_dropout=0.1 \ 55 | --n_vocab=50304 \ 56 | --n_ctx=1024 \ 57 | --n_embd=$n_embd \ 58 | --n_head=$n_head \ 59 | --n_layer=${n_layer} 2>&1 | tee ${log_file} 60 | -------------------------------------------------------------------------------- /OneFlow/Recognition/insightface/scripts/multi_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | WORKSPACE=~/oneflow_temp 6 | SCRIPTS_PATH=$WORKSPACE/oneflow_face 7 | host_num=${1:-4} 8 | network=${2:-"r100"} 9 | dataset=${3:-"emore"} 10 | loss=${4:-"arcface"} 11 | num_nodes=${5:-${host_num}} 12 | bz_per_device=${6:-64} 13 | train_unit=${7:-"batch"} 14 | train_iter=${8:-150} 15 | gpu_num_per_node=${9:-8} 16 | precision=${10:-fp32} 17 | model_parallel=${11:-1} 18 | partial_fc=${12:-1} 19 | test_times=${13:-5} 20 | sample_ratio=${14:-0.1} 21 | num_classes=${15:-85744} 22 | use_synthetic_data=${16:-False} 23 | 24 | # 2n8g 25 | sed 26 | sed -i "s/num_nodes = 1/num_nodes = 2/g" $SCRIPTS_PATH/sample_config.py 27 | sed -i "s/node_ips = \['10.11.0.2'\]/node_ips = \['10.11.0.2', '10.11.0.3'\]/g" $SCRIPTS_PATH/sample_config.py 28 | sed -i "s/\"10.11.0.3\"/\#\"10.11.0.3\"/g" $WORKSPACE/run_multi_nodes.sh 29 | sed -i "s/\"10.11.0.4\"/\#\"10.11.0.4\"/g" $WORKSPACE/run_multi_nodes.sh 30 | 31 | 32 | i=1 33 | while [ $i -le ${test_times} ] 34 | do 35 | bash $SCRIPTS_PATH/run_multi_nodes.sh 2 ${network} ${dataset} ${loss} 2 $bz_per_device $train_unit $train_iter ${gpu_num_per_node} $precision $model_parallel $partial_fc $i $sample_ratio $num_classes $use_synthetic_data 36 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 37 | let i++ 38 | sleep 20s 39 | done 40 | 41 | # 4n8g 42 | sed -i "s/num_nodes = 2/num_nodes = 4/g" $SCRIPTS_PATH/sample_config.py 43 | sed -i "s/node_ips = \['10.11.0.2', '10.11.0.3'\]/node_ips = \['10.11.0.2', '10.11.0.3', '10.11.0.4', '10.11.0.5'\]/g" $SCRIPTS_PATH/sample_config.py 44 | sed -i "s/\#\"10.11.0.3\"/\"10.11.0.3\"/g" $WORKSPACE/run_multi_nodes.sh 45 | sed -i "s/\#\"10.11.0.4\"/\"10.11.0.4\"/g" $WORKSPACE/run_multi_nodes.sh 46 | 47 | i=1 48 | while [ $i -le ${test_times} ] 49 | do bash $SCRIPTS_PATH/run_multi_nodes.sh 4 ${network} ${dataset} ${loss} 4 $bz_per_device $train_unit $train_iter ${gpu_num_per_node} $precision $model_parallel $partial_fc $i $sample_ratio $num_classes $use_synthetic_data 50 | echo " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case 
${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< " 51 | let i++ 52 | sleep 20s 53 | done 54 | -------------------------------------------------------------------------------- /OneFlow/Recognition/insightface/scripts/run_multi_nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | 6 | workdir=/workdir 7 | 8 | host_num=${1:-4} 9 | network=${2:-"r100"} 10 | dataset=${3:-"emore"} 11 | loss=${4:-"arcface"} 12 | num_nodes=${5:-${host_num}} 13 | bz_per_device=${6:-64} 14 | train_unit=${7:-"batch"} 15 | train_iter=${8:-150} 16 | gpu_num_per_node=${9:-8} 17 | precision=${10:-fp32} 18 | model_parallel=${11:-1} 19 | partial_fc=${12:-1} 20 | test_times=${13:-5} 21 | sample_ratio=${14:-0.1} 22 | num_classes=${15:-85744} 23 | use_synthetic_data=${16:-False} 24 | 25 | 26 | port=22 27 | 28 | SCRIPTS_PATH=${workdir}/oneflow_face 29 | TEST_SCRIPTS=${SCRIPTS_PATH}/scripts 30 | LOCAL_RUN=${SCRIPTS_PATH}/scripts/train_insightface.sh 31 | 32 | 33 | ############################################## 34 | #0 prepare the host list for training 35 | #comment unused hosts with `#` 36 | #or use first arg to limit the hosts number 37 | declare -a host_list=( 38 | "10.11.0.2" 39 | "10.11.0.3" 40 | "10.11.0.4" 41 | "10.11.0.5" 42 | ) 43 | 44 | if [ -n "$1" ] 45 | then 46 | host_num=$1 47 | else 48 | host_num=${#host_list[@]} 49 | fi 50 | 51 | 52 | if [ ${host_num} -gt ${#host_list[@]} ] 53 | then 54 | host_num=${#host_list[@]} 55 | fi 56 | 57 | hosts=("${host_list[@]:0:${host_num}}") 58 | echo "Working on hosts:${hosts[@]}" 59 | 60 | 61 | if [ ${host_num} == 2 ]; then 62 | sed -i "s/node_ips = \[.*\]/node_ips = \[\"10.11.0.2\", \"10.11.0.3\"\]/g" $SCRIPTS_PATH/sample_config.py 63 | elif [ ${host_num} == 4 ]; then 64 | sed -i "s/node_ips = \[.*\]/node_ips = \[\"10.11.0.2\", \"10.11.0.3\", \"10.11.0.4\", \"10.11.0.5\"\]/g" $SCRIPTS_PATH/sample_config.py 65 | else 66 | echo "Please modify parameters in oneflow_face/sample_config.py, run_multi_nodes.sh manually! 
" 67 | fi 68 | 69 | 70 | test_case=${host_num}n${gpu_num_per_node}g_b${bz_per_device}_${network}_${dataset}_${loss} 71 | log_file=${test_case}.log 72 | 73 | logs_folder=logs 74 | mkdir -p $logs_folder 75 | 76 | echo log file: ${log_file} 77 | ############################################## 78 | #1 prepare oneflow_temp folder on each host 79 | for host in "${hosts[@]}" 80 | do 81 | ssh -p ${port} $host "mkdir -p ~/oneflow_temp" 82 | done 83 | 84 | ############################################## 85 | #2 copy files to each host and start work 86 | for host in "${hosts[@]:1}" 87 | do 88 | echo "start training on ${host}" 89 | ssh -p ${port} $host "rm -rf ~/oneflow_temp/*" 90 | scp -P ${port} -r $SCRIPTS_PATH $LOCAL_RUN $host:~/oneflow_temp 91 | 92 | ssh -p ${port} $host "cd ~/oneflow_temp; nohup bash train_insightface.sh ~/oneflow_temp/oneflow_face ${network} ${dataset} ${loss} ${num_nodes} $bz_per_device $train_unit $train_iter ${gpu_num_per_node} $precision $model_parallel $partial_fc $test_times $sample_ratio $num_classes 1>${log_file} 2>&1 ${log_file}" 101 | 102 | echo "done" 103 | 104 | cp ~/oneflow_temp/${log_file} $logs_folder/${log_file} 105 | sleep 3 106 | 107 | -------------------------------------------------------------------------------- /OneFlow/Recognition/insightface/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | export PYTHONUNBUFFERED=1 3 | 4 | workspace=${1:-"/oneflow_face"} 5 | network=${2:-"r100"} 6 | dataset=${3:-"emore"} 7 | loss=${4:-"arcface"} 8 | num_nodes=${5:-1} 9 | bz_per_device=${6:-64} 10 | train_unit=${7:-"batch"} 11 | train_iter=${8:-150} 12 | gpu_num_per_node=${9:-8} 13 | precision=${10:-fp32} 14 | model_parallel=${11:-1} 15 | partial_fc=${12:-1} 16 | test_times=${13:-1} 17 | sample_ratio=${14:-0.1} 18 | num_classes=${15:-85744} 19 | use_synthetic_data=${16:-False} 20 | 21 | i=1 22 | while [ $i -le 5 ]; do 23 | bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${train_iter} 1 ${precision} ${model_parallel} ${partial_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data} 24 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 25 | let i++ 26 | sleep 20 27 | done 28 | 29 | i=1 30 | while [ $i -le 5 ]; do 31 | bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${train_iter} 4 ${precision} ${model_parallel} ${partial_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data} 32 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 33 | let i++ 34 | sleep 20 35 | done 36 | 37 | i=1 38 | while [ $i -le 5 ]; do 39 | bash ${workspace}/scripts/train_insightface.sh ${workspace} ${network} ${dataset} ${loss} 1 ${bz_per_device} ${train_unit} ${train_iter} 8 ${precision} ${model_parallel} ${partial_fc} $i ${sample_ratio} ${num_classes} ${use_synthetic_data} 40 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 41 | let i++ 42 | sleep 20 43 | done 44 | -------------------------------------------------------------------------------- /OneFlow/Recognition/insightface/scripts/train_insightface.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #export ONEFLOW_DEBUG_MODE=True 3 | export PYTHONUNBUFFERED=1 4 | 5 | workspace=${1:-"/oneflow_face"} 6 | network=${2:-"r100"} 7 | dataset=${3:-"emore"} 8 | loss=${4:-"arcface"} 9 | num_nodes=${5:-4} 10 | batch_size_per_device=${6:-64} 11 | train_unit=${7:-"batch"} 12 | train_iter=${8:-150} 13 | gpu_num_per_node=${9:-8} 14 | precision=${10:-fp32} 15 | model_parallel=${11:-1} 16 | partial_fc=${12:-1} 17 | test_times=${13:-1} 18 | sample_ratio=${14:-0.1} 19 | num_classes=${15:-85744} 20 | use_synthetic_data=${16:-False} 21 | 22 | MODEL_SAVE_DIR=${num_classes}_${precision}_b${batch_size_per_device}_oneflow_model_parallel_${model_parallel}_partial_fc_${partial_fc}/${num_nodes}n${gpu_num_per_node}g 23 | LOG_DIR=$MODEL_SAVE_DIR 24 | 25 | if [ $gpu_num_per_node -gt 1 ]; then 26 | if [ $network = "r100" ]; then 27 | data_part_num=32 28 | elif [ $network = "r100_glint360k" ]; then 29 | data_part_num=200 30 | else 31 | echo "Please modify exact data part num in sample_config.py!" 32 | fi 33 | else 34 | data_part_num=1 35 | fi 36 | 37 | sed -i "s/${dataset}.train_data_part_num = [[:digit:]]*/${dataset}.train_data_part_num = $data_part_num/g" $workspace/sample_config.py 38 | sed -i "s/${dataset}.num_classes = [[:digit:]]*/${dataset}.num_classes = $num_classes/g" $workspace/sample_config.py 39 | 40 | PREC="" 41 | if [ "$precision" = "fp16" ]; then 42 | PREC=" --use_fp16=True" 43 | elif [ "$precision" = "fp32" ]; then 44 | PREC=" --use_fp16=False" 45 | else 46 | echo "Unknown argument" 47 | exit -2 48 | fi 49 | 50 | LOG_FILE=${LOG_DIR}/${network}_b${batch_size_per_device}_${precision}_$test_times.log 51 | 52 | mkdir -p $MODEL_SAVE_DIR 53 | 54 | time=$(date "+%Y-%m-%d %H:%M:%S") 55 | echo $time 56 | 57 | CMD="$workspace/insightface_train.py" 58 | CMD+=" --network=${network}" 59 | CMD+=" --dataset=${dataset}" 60 | CMD+=" --loss=${loss}" 61 | CMD+=" --num_nodes=${num_nodes}" 62 | CMD+=" --train_batch_size=$(expr $num_nodes '*' $gpu_num_per_node '*' $batch_size_per_device)" 63 | CMD+=" --train_unit=${train_unit}" 64 | CMD+=" --train_iter=${train_iter}" 65 | CMD+=" --device_num_per_node=${gpu_num_per_node}" 66 | CMD+=" --model_parallel=${model_parallel}" 67 | CMD+=" --partial_fc=${partial_fc}" 68 | CMD+=" --sample_ratio=${sample_ratio}" 69 | CMD+=" --log_dir=${LOG_DIR}" 70 | CMD+=" $PREC" 71 | CMD+=" --sample_ratio=${sample_ratio}" 72 | CMD+=" --use_synthetic_data=${use_synthetic_data}" 73 | CMD+=" --iter_num_in_snapshot=5000" 74 | CMD+=" --validation_interval=5000" 75 | 76 | CMD="/home/leinao/anaconda3/envs/insightface/bin/python3 $CMD " 77 | set -x 78 | if [ -z "$LOG_FILE" ]; then 79 | $CMD 80 | else 81 | ( 82 | $CMD 83 | ) |& tee $LOG_FILE 84 | 85 | fi 86 | set +x 87 | echo "Writing log to ${LOG_FILE}" 88 | -------------------------------------------------------------------------------- /PaddlePaddle/bert/scripts/make_pretrain_data.sh: -------------------------------------------------------------------------------- 1 | echo "gzip train/demo_wiki_train.gz ..." 2 | gzip -d train/demo_wiki_train.gz 3 | echo "Done!" 
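# Duplicate the decompressed demo wiki shard 50 times into demo_wiki_train_50, then
# re-compress it, so the benchmark runs on a larger pretraining file.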
4 | 5 | for i in {1..50} 6 | do 7 | cat train/demo_wiki_train >> train/demo_wiki_train_50 8 | echo "Copy $i times" 9 | let i+=1 10 | done 11 | 12 | rm train/demo_wiki_train 13 | echo "gzip demo_wiki_train_50 to demo_wiki_train_50.gz ..." 14 | gzip train/demo_wiki_train_50 15 | echo "Success!" 16 | -------------------------------------------------------------------------------- /PaddlePaddle/bert/scripts/multi_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_DIR=../output 3 | MODEL=${1:-"bert_base"} 4 | BATCH_SIZE=${2:-32} 5 | gpus=${3:-"0,1,2,3,4,5,6,7"} 6 | node_ips=${4:-$NODE1,$NODE2,$NODE3,$NODE4} 7 | CURRENT_NODE=${5:-$NODE1} 8 | TEST_NUM=${6:-1} 9 | DTYPE=${7:-"fp32"} 10 | echo "Node IPs : $node_ips" 11 | 12 | a=`expr ${#gpus} + 1` 13 | NUM_GPU=`expr ${a} / 2` 14 | paddle_batch_size=`expr ${BATCH_SIZE} \* 128` 15 | node_num=$(echo $node_ips | tr ',' '\n' | wc -l) 16 | 17 | echo "Use gpus: $gpus" 18 | echo "Batch size per device : $BATCH_SIZE" 19 | echo "Paddle Batch size : $paddle_batch_size" 20 | 21 | 22 | 23 | LOG_FOLDER=./logs/paddle/bert/bz${BATCH_SIZE}/${node_num}n${NUM_GPU}g 24 | mkdir -p $LOG_FOLDER 25 | LOGFILE=${LOG_FOLDER}/bert_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log 26 | 27 | 28 | export CUDA_VISIBLE_DEVICES=${gpus} 29 | export NCCL_IB_DISABLE=1 30 | # Unset proxy 31 | unset https_proxy http_proxy 32 | export worker_endpoints=$node_ips 33 | export current_endpoint=$CURRENT_NODE 34 | 35 | echo "CUDA_VISIBLE_DEVICES:${gpus}" 36 | if [ "$MODEL" = "bert_base" ] ; then 37 | CONFIG_PATH=${BERT_BASE_CONFIG} 38 | VOCAB_PATH='data/demo_config/vocab.txt' 39 | max_seq_len=128 40 | max_predictions_per_seq=20 41 | PADDLE_BERT_DATA_DIR=$PADDLE_BERT_BASE_DATA_DIR 42 | else 43 | CONFIG_PATH=${BERT_LARGE_CONFIG} 44 | VOCAB_PATH=${SCRIPT_ROOT_DIR}/configs/bert_model_config/uncased_L-24_H-1024_A-16/vocab.txt 45 | max_seq_len=512 46 | max_predictions_per_seq=80 47 | PADDLE_BERT_DATA_DIR=$PADDLE_BERT_LARGE_DATA_DIR 48 | fi 49 | 50 | 51 | if [ "$DTYPE" == "fp16" ] ; then 52 | use_fp16=True 53 | use_dynamic_loss_scaling=True 54 | init_loss_scaling=128.0 55 | else 56 | use_fp16=False 57 | use_dynamic_loss_scaling=False 58 | init_loss_scaling=128.0 59 | fi 60 | 61 | 62 | # Change your train arguments: 63 | python -u ./train.py --is_distributed true \ 64 | --use_cuda true \ 65 | --use_fast_executor true \ 66 | --weight_sharing true \ 67 | --batch_size ${paddle_batch_size} \ 68 | --data_dir ${PADDLE_BERT_DATA_DIR:-'data/train'} \ 69 | --validation_set_dir ${PADDLE_BERT_DATA_DIR:-'data/train'} \ 70 | --bert_config_path ${CONFIG_PATH:-'data/demo_config/bert_config.json'} \ 71 | --vocab_path ${VOCAB_PATH} \ 72 | --generate_neg_sample true \ 73 | --save_steps 10000 \ 74 | --learning_rate 1e-4 \ 75 | --weight_decay 0.01 \ 76 | --warmup_steps 120 \ 77 | --num_train_steps 120 \ 78 | --max_seq_len ${max_seq_len} \ 79 | --skip_steps 1 \ 80 | --validation_steps 1000 \ 81 | --use_fp16 ${use_fp16} \ 82 | --use_dynamic_loss_scaling ${use_dynamic_loss_scaling} \ 83 | --init_loss_scaling ${init_loss_scaling} \ 84 | --verbose true \ 85 | --checkpoints $OUTPUT_DIR/paddle/runtime_output/checkpoints 2>&1 | tee $LOGFILE 86 | 87 | echo "Writting log to $LOGFILE" 88 | 89 | -------------------------------------------------------------------------------- /PaddlePaddle/bert/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 2 | BATCH_SIZE=${1:-32} 3 | 
DTYPE=${2:-"fp32"} 4 | NODE1='10.11.0.2:9999' 5 | NODE2='10.11.0.3:9999' 6 | NODE3='10.11.0.4:9999' 7 | NODE4='10.11.0.5:9999' 8 | CURRENT_NODE=$NODE1 9 | NODES=$NODE1,$NODE2,$NODE3,$NODE4 10 | 11 | i=5 12 | while [ $i -le 5 ] 13 | do 14 | bash $SHELL_FOLDER/multi_node_train.sh "bert_base" $BATCH_SIZE 0,1,2,3,4,5,6,7 $NODES $CURRENT_NODE $i $DTYPE 15 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 16 | let i++ 17 | sleep 20 18 | done -------------------------------------------------------------------------------- /PaddlePaddle/bert/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 2 | BATCH_SIZE=${1:-32} 3 | DTYPE=${2:-"fp32"} 4 | 5 | i=1 6 | while [ $i -le 5 ] 7 | do 8 | bash $SHELL_FOLDER/single_node_train.sh "bert_base" 0 $BATCH_SIZE $i $DTYPE 9 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 10 | let i++ 11 | sleep 20 12 | done 13 | 14 | 15 | 16 | i=1 17 | while [ $i -le 5 ] 18 | do 19 | bash $SHELL_FOLDER/single_node_train.sh "bert_base" 0,1,2,3 $BATCH_SIZE $i $DTYPE 20 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 21 | let i++ 22 | sleep 20 23 | done 24 | 25 | 26 | i=1 27 | while [ $i -le 5 ] 28 | do 29 | bash $SHELL_FOLDER/single_node_train.sh "bert_base" 0,1,2,3,4,5,6,7 $BATCH_SIZE $i $DTYPE 30 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 31 | let i++ 32 | sleep 20 33 | done -------------------------------------------------------------------------------- /PaddlePaddle/bert/scripts/run_two_node.sh: -------------------------------------------------------------------------------- 1 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 2 | BATCH_SIZE=${1:-32} 3 | DTYPE=${2:-"fp32"} 4 | NODE1='10.11.0.2:9999' 5 | NODE2='10.11.0.3:9999' 6 | NODE3='10.11.0.4:9999' 7 | NODE4='10.11.0.5:9999' 8 | CURRENT_NODE=$NODE1 9 | NODES=$NODE1,$NODE2 10 | 11 | i=1 12 | while [ $i -le 5 ] 13 | do 14 | bash $SHELL_FOLDER/multi_node_train.sh "bert_base" $BATCH_SIZE 0,1,2,3,4,5,6,7 $NODES $CURRENT_NODE $i $DTYPE 15 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 16 | let i++ 17 | sleep 20 18 | done -------------------------------------------------------------------------------- /PaddlePaddle/bert/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_DIR=../output 3 | MODEL=${1:-"bert_base"} 4 | gpus=${2:-"0,1,2,3,4,5,6,7"} 5 | BATCH_SIZE=${3:-32} 6 | TEST_NUM=${4:-1} 7 | DTYPE=${5:-"fp32"} 8 | 9 | a=`expr ${#gpus} + 1` 10 | NUM_GPU=`expr ${a} / 2` 11 | paddle_batch_size=`expr ${BATCH_SIZE} \* 128` 12 | echo "Use gpus: $gpus" 13 | echo "Batch size per device : $BATCH_SIZE, Paddle Batch size : $paddle_batch_size" 14 | 15 | 16 | LOG_FOLDER=./logs/paddle/bert/bz${BATCH_SIZE}/1n${NUM_GPU}g 17 | mkdir -p $LOG_FOLDER 18 | LOGFILE=${LOG_FOLDER}/bert_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log 19 | 20 | export CUDA_VISIBLE_DEVICES=${gpus} 21 | if [ "$MODEL" = "bert_base" ] ; then 22 | CONFIG_PATH=${BERT_BASE_CONFIG} 23 | VOCAB_PATH='data/demo_config/vocab.txt' 24 | max_seq_len=128 25 | max_predictions_per_seq=20 26 | PADDLE_BERT_DATA_DIR=$PADDLE_BERT_BASE_DATA_DIR 27 | else 28 | CONFIG_PATH=${BERT_LARGE_CONFIG} 29 | 
VOCAB_PATH=${SCRIPT_ROOT_DIR}/configs/bert_model_config/uncased_L-24_H-1024_A-16/vocab.txt 30 | max_seq_len=512 31 | max_predictions_per_seq=80 32 | PADDLE_BERT_DATA_DIR=$PADDLE_BERT_LARGE_DATA_DIR 33 | fi 34 | 35 | 36 | if [ "$DTYPE" == "fp16" ] ; then 37 | use_fp16=True 38 | use_dynamic_loss_scaling=True 39 | init_loss_scaling=128.0 40 | else 41 | use_fp16=False 42 | use_dynamic_loss_scaling=False 43 | init_loss_scaling=128.0 44 | fi 45 | 46 | 47 | # Change your train arguments: 48 | python -u ./train.py --is_distributed false \ 49 | --use_cuda true \ 50 | --use_fast_executor true \ 51 | --weight_sharing true \ 52 | --batch_size ${paddle_batch_size} \ 53 | --data_dir ${PADDLE_BERT_DATA_DIR:-'data/train'} \ 54 | --validation_set_dir ${PADDLE_BERT_DATA_DIR:-'data/train'} \ 55 | --bert_config_path ${CONFIG_PATH:-'data/demo_config/bert_config.json'} \ 56 | --vocab_path ${VOCAB_PATH} \ 57 | --generate_neg_sample true\ 58 | --save_steps 10000 \ 59 | --learning_rate 1e-4 \ 60 | --weight_decay 0.01 \ 61 | --warmup_steps 120 \ 62 | --num_train_steps 120 \ 63 | --max_seq_len ${max_seq_len} \ 64 | --skip_steps 1 \ 65 | --validation_steps 1000 \ 66 | --use_fp16 ${use_fp16} \ 67 | --use_dynamic_loss_scaling ${use_dynamic_loss_scaling} \ 68 | --init_loss_scaling ${init_loss_scaling} \ 69 | --verbose true \ 70 | --checkpoints $OUTPUT_DIR/paddle/runtime_output/checkpoints 2>&1 | tee $LOGFILE 71 | 72 | echo "Writting log to $LOGFILE" 73 | 74 | -------------------------------------------------------------------------------- /PaddlePaddle/resnet50v1.5/scripts/multi_node_train.sh: -------------------------------------------------------------------------------- 1 | MODEL=${1:-"resnet50"} 2 | gpus=${2:-"0,1,2,3,4,5,6,7"} 3 | BATCH_SIZE=${3:-128} 4 | IMAGE_SIZE=${4:-224} 5 | nodes=${5:-$NODE1,$NODE2,NODE3,$NODE4} 6 | CURRENT_NODE=${6:-NODE1} 7 | TEST_NUM=${7:-1} 8 | DTYPE=${8:-"fp32"} 9 | 10 | a=`expr ${#gpus} + 1` 11 | GPUS_PER_NODE=`expr ${a} / 2` 12 | total_bz=`expr ${BATCH_SIZE} \* ${GPUS_PER_NODE}` 13 | LR=$(awk -v total_bz="$total_bz" 'BEGIN{print total_bz / 1000}') 14 | node_num=$(echo $nodes | tr ',' '\n' | wc -l) 15 | NUM_EPOCH=`expr ${node_num} \* 4` 16 | 17 | 18 | LOG_FOLDER=../logs/paddle/resnet50/bz${BATCH_SIZE}/${node_num}n${GPUS_PER_NODE}g 19 | mkdir -p $LOG_FOLDER 20 | LOGFILE=${LOG_FOLDER}/r50_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 21 | 22 | 23 | DATA_DIR=/datasets/ImageNet/Paddle 24 | 25 | 26 | # bash run.sh train ResNet50_fp16 27 | if [ "$DTYPE" == "fp16" ] ; then 28 | export FLAGS_conv_workspace_size_limit=4000 #MB 29 | export FLAGS_cudnn_exhaustive_search=1 30 | export FLAGS_cudnn_batchnorm_spatial_persistent=1 31 | DATA_FORMAT="NHWC" 32 | FP16_PARAMS=" --use_fp16=True --use_dynamic_loss_scaling=true --scale_loss=128.0 --fuse_elewise_add_act_ops=true --fuse_bn_act_ops=true " 33 | else 34 | DATA_FORMAT="NCHW" 35 | FP16_PARAMS=" " 36 | fi 37 | 38 | 39 | USE_DALI=false 40 | if ${USE_DALI}; then 41 | export FLAGS_fraction_of_gpu_memory_to_use=0.8 42 | export DALI_EXTRA_PATH=/home/leinao/paddle/DALI_extra 43 | THREAD=10 44 | else 45 | export FLAGS_fraction_of_gpu_memory_to_use=0.98 46 | THREAD=8 47 | fi 48 | echo "FLAGS_fraction_of_gpu_memory_to_use=$FLAGS_fraction_of_gpu_memory_to_use" 49 | 50 | 51 | echo "Nodes : $nodes" 52 | echo "Use gpus: $gpus, Batch size per device : $BATCH_SIZE, Total Batch size : $total_bz" 53 | echo "Learning rate: $LR" 54 | 55 | 56 | export CUDA_VISIBLE_DEVICES=${gpus} 57 | python3 -m paddle.distributed.launch --cluster_node_ips=${nodes} \ 58 | 
--node_ip=$CURRENT_NODE \ 59 | train.py \ 60 | $FP16_PARAMS \ 61 | --data_format=${DATA_FORMAT} \ 62 | --reader_thread=$THREAD \ 63 | --data_dir=${DATA_DIR} \ 64 | --total_images=1302936 \ 65 | --class_dim=1000 \ 66 | --validate=False \ 67 | --batch_size=$total_bz \ 68 | --image_shape 3 $IMAGE_SIZE $IMAGE_SIZE \ 69 | --print_step=1 \ 70 | --save_step=10000 \ 71 | --lr_strategy=piecewise_decay \ 72 | --lr=0.001 \ 73 | --momentum_rate=0.875 \ 74 | --max_iter=120 \ 75 | --model='ResNet50' \ 76 | --model_save_dir=output/ \ 77 | --l2_decay=0.000030518 \ 78 | --warm_up_epochs=1 \ 79 | --use_mixup=False \ 80 | --use_label_smoothing=True \ 81 | --label_smoothing_epsilon=0.1 2>&1 | tee ${LOGFILE} 82 | 83 | echo "Writting log to ${LOGFILE}" -------------------------------------------------------------------------------- /PaddlePaddle/resnet50v1.5/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | MODEL="resnet50" 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BATCH_SIZE=${1:-128} 4 | DTYPE=${2:-"fp32"} 5 | NODE1='10.11.0.2' 6 | NODE2='10.11.0.3' 7 | NODE3='10.11.0.4' 8 | NODE4='10.11.0.5' 9 | CURRENT_NODE=$NODE1 10 | NODES=$NODE1,$NODE2,$NODE3,$NODE4 11 | 12 | i=1 13 | while [ $i -le 5 ] 14 | do 15 | bash $SHELL_FOLDER/multi_node_train.sh $MODEL 0,1,2,3,4,5,6,7 ${BATCH_SIZE} 224 $NODES $CURRENT_NODE $i $DTYPE 16 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 17 | let i++ 18 | sleep 20 19 | done 20 | -------------------------------------------------------------------------------- /PaddlePaddle/resnet50v1.5/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | MODEL="resnet50" 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BATCH_SIZE=${1:-128} 4 | DTYPE=${2:-"fp32"} 5 | 6 | i=1 7 | while [ $i -le 5 ] 8 | do 9 | bash $SHELL_FOLDER/single_node_train.sh $MODEL 0 ${BATCH_SIZE} 224 $i $DTYPE 10 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 11 | let i++ 12 | sleep 20 13 | done 14 | 15 | 16 | 17 | i=1 18 | while [ $i -le 5 ] 19 | do 20 | bash $SHELL_FOLDER/single_node_train.sh $MODEL 0,1,2,3 ${BATCH_SIZE} 224 $i $DTYPE 21 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 22 | let i++ 23 | sleep 20 24 | done 25 | 26 | 27 | 28 | i=1 29 | while [ $i -le 5 ] 30 | do 31 | bash $SHELL_FOLDER/single_node_train.sh $MODEL 0,1,2,3,4,5,6,7 ${BATCH_SIZE} 224 $i $DTYPE 32 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 33 | let i++ 34 | sleep 20 35 | done 36 | -------------------------------------------------------------------------------- /PaddlePaddle/resnet50v1.5/scripts/run_two_node.sh: -------------------------------------------------------------------------------- 1 | MODEL="resnet50" 2 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 3 | BATCH_SIZE=${1:-128} 4 | DTYPE=${2:-"fp32"} 5 | NODE1='10.11.0.2' 6 | NODE2='10.11.0.3' 7 | CURRENT_NODE=$NODE1 8 | 9 | 10 | i=1 11 | while [ $i -le 5 ] 12 | do 13 | bash $SHELL_FOLDER/multi_node_train.sh $MODEL 0,1,2,3,4,5,6,7 ${BATCH_SIZE} 224 $NODE1,$NODE2 $CURRENT_NODE $i $DTYPE 14 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 15 | let i++ 16 | sleep 20 17 | done -------------------------------------------------------------------------------- 
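The PaddlePaddle ResNet50 scripts above (like the BERT scripts before them) recover the device count from the character length of the GPU id list (`expr ${#gpus} + 1` divided by 2), which is only valid while every GPU index is a single digit. A minimal sketch of a length-independent count, reusing the same `tr`/`wc` pattern these scripts already use to count nodes; the variable names below are illustrative and not part of the original scripts:

```bash
#!/bin/bash
# Sketch only: count the comma-separated GPU ids by counting list entries,
# so multi-digit ids (e.g. "8,9,10,11") are handled as well.
gpus=${1:-"0,1,2,3,4,5,6,7"}
BATCH_SIZE=${2:-128}
GPUS_PER_NODE=$(echo "$gpus" | tr ',' '\n' | wc -l)
total_bz=$(expr ${BATCH_SIZE} \* ${GPUS_PER_NODE})
echo "Use gpus: $gpus (count: $GPUS_PER_NODE), total batch size: $total_bz"
```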
/PaddlePaddle/resnet50v1.5/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | MODEL=${1:-"resnet50"} 2 | gpus=${2:-"0,1,2,3,4,5,6,7"} 3 | BATCH_SIZE=${3:-128} 4 | IMAGE_SIZE=${4:-224} 5 | TEST_NUM=${5:-1} 6 | DTYPE=${6:-"fp32"} 7 | 8 | a=`expr ${#gpus} + 1` 9 | GPU_COUNT=`expr ${a} / 2` 10 | total_bz=`expr ${BATCH_SIZE} \* ${GPU_COUNT}` 11 | LR=$(awk -v total_bz="$total_bz" 'BEGIN{print total_bz / 1000}') 12 | 13 | 14 | LOG_FOLDER=../logs/paddle/resnet50/bz${BATCH_SIZE}/1n${GPU_COUNT}g 15 | mkdir -p $LOG_FOLDER 16 | LOGFILE=${LOG_FOLDER}/r50_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 17 | 18 | 19 | DATA_DIR=/datasets/ImageNet/Paddle 20 | 21 | MULTI_PROCESS="-m paddle.distributed.launch" 22 | if [ $GPU_COUNT -le 2 ] ; then 23 | THREAD=8 24 | elif [ $GPU_COUNT -le 4 ] ; then 25 | THREAD=12 26 | else 27 | THREAD=8 28 | fi 29 | 30 | 31 | # bash run.sh train ResNet50_fp16 32 | if [ "$DTYPE" == "fp16" ] ; then 33 | export FLAGS_conv_workspace_size_limit=4000 #MB 34 | export FLAGS_cudnn_exhaustive_search=1 35 | export FLAGS_cudnn_batchnorm_spatial_persistent=1 36 | DATA_FORMAT="NHWC" 37 | FP16_PARAMS=" --use_fp16=True --use_dynamic_loss_scaling=true --scale_loss=128.0 --fuse_elewise_add_act_ops=true --fuse_bn_act_ops=true " 38 | else 39 | DATA_FORMAT="NCHW" 40 | FP16_PARAMS=" " 41 | fi 42 | 43 | 44 | USE_DALI=false 45 | if ${USE_DALI}; then 46 | export FLAGS_fraction_of_gpu_memory_to_use=0.8 47 | export DALI_EXTRA_PATH=/home/leinao/paddle/DALI_extra 48 | THREAD=10 49 | else 50 | export FLAGS_fraction_of_gpu_memory_to_use=0.98 51 | fi 52 | echo "FLAGS_fraction_of_gpu_memory_to_use=$FLAGS_fraction_of_gpu_memory_to_use" 53 | 54 | 55 | echo "Use gpus: $gpus, Batch size per device : $BATCH_SIZE, Total Batch size : $total_bz" 56 | echo "Learning rate: $LR" 57 | # echo "Use fp16 : $use_fp16" 58 | 59 | export CUDA_VISIBLE_DEVICES=${gpus} 60 | python3 $MULTI_PROCESS \ 61 | train.py ${FP16_PARAMS} \ 62 | --data_format=${DATA_FORMAT} \ 63 | --data_dir=${DATA_DIR} \ 64 | --total_images=1302936 \ 65 | --class_dim=1000 \ 66 | --validate=False \ 67 | --model="ResNet50" \ 68 | --batch_size=${total_bz} \ 69 | --print_step=1 \ 70 | --save_step=10000 \ 71 | --reader_thread=${THREAD} \ 72 | --lr_strategy=cosine_decay \ 73 | --lr=0.001 \ 74 | --momentum_rate=0.875 \ 75 | --image_shape 3 $IMAGE_SIZE $IMAGE_SIZE \ 76 | --max_iter=120 \ 77 | --model_save_dir=output/ \ 78 | --l2_decay=0.000030518 \ 79 | --warm_up_epochs=1 \ 80 | --use_mixup=False \ 81 | --use_label_smoothing=True \ 82 | --use_dali=$USE_DALI \ 83 | --label_smoothing_epsilon=0.1 2>&1 | tee ${LOGFILE} 84 | echo "Writting log to ${LOGFILE}" 85 | -------------------------------------------------------------------------------- /PyTorch/resnet50v1.5/scripts/run_multi_nodes.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | WORKSPACE=${1:-"/examples/imagenet"} 4 | DATA_DIR=${2:-"/data"} 5 | PORT=11222 6 | NODE1='10.11.0.2:'${PORT} 7 | MASTER_NODE=$NODE1 8 | TEST_TIMES=${3:-1} 9 | echo ${MASTER_NODE} 10 | 11 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${MASTER_NODE} 0,1,2,3,4,5,6,7 128 4 ${TEST_TIMES} 12 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${TEST_TIMES}! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 13 | -------------------------------------------------------------------------------- /PyTorch/resnet50v1.5/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | WORKSPACE=${1:-"/examples/imagenet"} 4 | DATA_DIR=${2:-"/data"} 5 | NODE="127.0.0.1:11222" 6 | 7 | 8 | i=1 9 | while [ $i -le 5 ] 10 | do 11 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NODE} 0 128 1 $i 12 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 13 | let i++ 14 | sleep 20 15 | done 16 | 17 | 18 | i=1 19 | while [ $i -le 5 ] 20 | do 21 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NODE} 0,1,2,3 128 1 $i 22 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 23 | let i++ 24 | sleep 20 25 | done 26 | 27 | i=1 28 | while [ $i -le 5 ] 29 | do 30 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${NODE} 0,1,2,3,4,5,6,7 128 1 $i 31 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${i}! <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 32 | let i++ 33 | sleep 20 34 | done 35 | 36 | 37 | -------------------------------------------------------------------------------- /PyTorch/resnet50v1.5/scripts/run_two_nodes.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | WORKSPACE=${1:-"/examples/imagenet"} 4 | DATA_DIR=${2:-"/data"} 5 | PORT=11222 6 | NODE1='10.11.0.2:'${PORT} 7 | MASTER_NODE=$NODE1 8 | TEST_TIMES=${3:-1} 9 | 10 | bash ${WORKSPACE}/scripts/single_node_train.sh ${WORKSPACE} ${DATA_DIR} ${MASTER_NODE} 0,1,2,3,4,5,6,7 128 2 ${TEST_TIMES} 11 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished Test Case ${TEST_TIMES}! 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 12 | -------------------------------------------------------------------------------- /PyTorch/resnet50v1.5/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | WORKSPACE=${1:-"/examples/imagenet"} 4 | DATA_DIR=${2:-"/data"} 5 | MODEL="resnet50" 6 | NODE1=127.0.0.1:11222 7 | master_node=${3:-$NODE1} 8 | 9 | gpus=${4:-0} 10 | bz_per_device=${5:-128} 11 | NUM_NODES=${6:-1} 12 | TEST_TIMES=${7:-1} 13 | 14 | a=`expr ${#gpus} + 1` 15 | NUM_GPUS=`expr ${a} / 2` 16 | total_bz=`expr ${bz_per_device} \* ${NUM_GPUS}` 17 | LR=$(awk -v total_bz="$total_bz" 'BEGIN{print total_bz / 1000}') 18 | 19 | export CUDA_VISIBLE_DEVICES=${gpus} 20 | LOG_FOLDER=pytorch/${NUM_NODES}n${NUM_GPUS}g 21 | mkdir -p $LOG_FOLDER 22 | LOGFILE=${LOG_FOLDER}/r50_b${bz_per_device}_fp32_$TEST_TIMES.log 23 | 24 | CMD="$WORKSPACE/main.py" 25 | CMD+=" --arch $MODEL" 26 | CMD+=" --epochs 1" 27 | CMD+=" --batch-size $total_bz" 28 | CMD+=" --lr $LR --workers 8" 29 | CMD+=" --momentum 0.125" 30 | CMD+=" --print-freq 1" 31 | CMD+=" --multiprocessing-distributed" 32 | CMD+=" --dist-backend nccl" 33 | CMD+=" --dist-url tcp://${master_node}" 34 | CMD+=" --world-size ${NUM_NODES}" 35 | CMD+=" --rank 0" 36 | 37 | CMD=" python $CMD $DATA_DIR " 38 | 39 | if [ -z "$LOGFILE" ] ; then 40 | $CMD 41 | else 42 | ( 43 | $CMD 44 | ) |& tee $LOGFILE 45 | fi 46 | 47 | echo "Writting log to ${LOGFILE}" 48 | -------------------------------------------------------------------------------- /TensorFlow/bert/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | BATCH_SIZE=${1:-32} 2 | NUM_TESTING=${2:-5} 3 | DTYPE=${3:-'fp32'} 4 | USE_XLA=${4:-'false'} 5 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 6 | export PYTHONPATH=$PYTHONPATH:/home/leinao/tensorflow/models-2.3.0 7 | NODE1='10.11.0.2:11111' 8 | NODE2='10.11.0.3:11111' 9 | NODE3='10.11.0.4:11111' 10 | NODE4='10.11.0.5:11111' 11 | nodes=$NODE1,$NODE2,$NODE3,$NODE4 12 | task_index=${5:-0} 13 | 14 | i=1 15 | while [ $i -le $NUM_TESTING ] 16 | do 17 | bash $SHELL_FOLDER/single_node_train.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} $DTYPE 120 $USE_XLA $i $nodes $task_index 18 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 19 | let i++ 20 | sleep 20 21 | done 22 | -------------------------------------------------------------------------------- /TensorFlow/bert/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | BATCH_SIZE=${1:-32} 2 | NUM_TESTING=${2:-5} 3 | DTYPE=${3:-'fp32'} 4 | USE_XLA=${4:-'false'} 5 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 6 | export PYTHONPATH=$PYTHONPATH:/home/leinao/tensorflow/models-2.3.0 7 | 8 | i=1 9 | while [ $i -le $NUM_TESTING ] 10 | do 11 | bash $SHELL_FOLDER/single_node_train.sh 0 ${BATCH_SIZE} $DTYPE 120 $USE_XLA $i 12 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 13 | let i++ 14 | sleep 20 15 | done 16 | 17 | 18 | # i=1 19 | # while [ $i -le $NUM_TESTING ] 20 | # do 21 | # bash $SHELL_FOLDER/single_node_train.sh 0,1 ${BATCH_SIZE} $DTYPE 120 $USE_XLA $i 22 | # echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 23 | # let i++ 24 | # sleep 20 25 | # done 26 | 27 | 28 | i=1 29 | while [ $i -le $NUM_TESTING ] 30 | do 31 | bash $SHELL_FOLDER/single_node_train.sh 0,1,2,3 
${BATCH_SIZE} $DTYPE 120 $USE_XLA $i 32 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 33 | let i++ 34 | sleep 20 35 | done 36 | 37 | 38 | i=1 39 | while [ $i -le $NUM_TESTING ] 40 | do 41 | bash $SHELL_FOLDER/single_node_train.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} $DTYPE 120 $USE_XLA $i 42 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 43 | let i++ 44 | sleep 20 45 | done -------------------------------------------------------------------------------- /TensorFlow/bert/scripts/run_two_node.sh: -------------------------------------------------------------------------------- 1 | BATCH_SIZE=${1:-32} 2 | NUM_TESTING=${2:-5} 3 | DTYPE=${3:-'fp32'} 4 | USE_XLA=${4:-'false'} 5 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 6 | export PYTHONPATH=$PYTHONPATH:/home/leinao/tensorflow/models-2.3.0 7 | NODE1='10.11.0.2:11111' 8 | NODE2='10.11.0.3:11111' 9 | NODE3='10.11.0.4:11111' 10 | NODE4='10.11.0.5:11111' 11 | nodes=$NODE1,$NODE2 12 | task_index=${5:-0} 13 | 14 | i=1 15 | while [ $i -le $NUM_TESTING ] 16 | do 17 | bash $SHELL_FOLDER/single_node_train.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} $DTYPE 120 $USE_XLA $i $nodes $task_index 18 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 19 | let i++ 20 | sleep 20 21 | done 22 | -------------------------------------------------------------------------------- /TensorFlow/bert/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_DIR=../output 3 | rm -rf $MODEL_DIR 4 | gpus=${1:-"0"} 5 | BATCH_SIZE=${2:-32} 6 | DTYPE=${3:-'fp32'} 7 | NUM_STEP=${4:-120} 8 | USE_XLA=${5:-'false'} 9 | TEST_NUM=${6:-1} 10 | NODE1='10.11.0.2:11111' 11 | NODE_IPS=${7:-$NODE1} 12 | task_index=${8:-0} 13 | 14 | NODE_NUM=$(echo $NODE_IPS | tr ',' '\n' | wc -l) 15 | a=`expr ${#gpus} + 1` 16 | NUM_GPU_PER_NODE=`expr ${a} / 2` 17 | NUM_GPU=`expr $NODE_NUM \* $NUM_GPU_PER_NODE` 18 | total_batch_size=`expr ${BATCH_SIZE} \* $NUM_GPU` 19 | 20 | if [ $NODE_NUM -gt 1 ] ; then 21 | export NCCL_DEBUG=INFO 22 | echo "Node ip : $NODE_IPS" 23 | fi 24 | echo "Gpu num: $NUM_GPU" 25 | echo "Total batch size : $total_batch_size" 26 | 27 | 28 | if [ "$USE_XLA" == "true" ] ; then 29 | enable_xla='true' 30 | export TF_XLA_FLAGS=--tf_xla_cpu_global_jit 31 | else 32 | enable_xla='false' 33 | fi 34 | 35 | BERT_BASE_CONFIG_FILE='/datasets/bert/uncased_L-12_H-768_A-12/bert_config.json' 36 | LOG_FOLDER=./logs/tensorflow/bert/bz${BATCH_SIZE}/${NODE_NUM}n${NUM_GPU_PER_NODE}g 37 | mkdir -p $LOG_FOLDER 38 | LOGFILE=${LOG_FOLDER}/bert_b${BATCH_SIZE}_${DTYPE}_${TEST_NUM}.log 39 | 40 | 41 | export CUDA_VISIBLE_DEVICES=$gpus 42 | CMD="python run_pretraining.py" 43 | CMD+=" --input_files=/datasets/bert/wiki/*.tfrecord" 44 | CMD+=" --max_seq_length=128" 45 | CMD+=" --max_predictions_per_seq=20" 46 | CMD+=" --train_batch_size=$total_batch_size" 47 | CMD+=" --num_steps_per_epoch=$NUM_STEP" 48 | CMD+=" --num_train_epochs=1" 49 | CMD+=" --warmup_steps=10000" 50 | CMD+=" --use_next_sentence_label=True" 51 | CMD+=" --train_summary_interval=0" 52 | CMD+=" --optimizer_type=adamw" 53 | CMD+=" --num_gpus=$NUM_GPU_PER_NODE" 54 | CMD+=" --datasets_num_private_threads=8" 55 | CMD+=" --dtype=$DTYPE" 56 | CMD+=" --enable_xla=$enable_xla" 57 | CMD+=" --model_dir=$MODEL_DIR" 58 | CMD+=" --bert_config_file=${BERT_BASE_CONFIG_FILE}" 59 | 60 | if [ $NODE_NUM -gt 1 ] ; then 61 | CMD+=" 
--distribution_strategy=multi_worker_mirrored" 62 | CMD+=" --worker_hosts=$NODE_IPS" 63 | CMD+=" --task_index=$task_index" 64 | CMD+=" --all_reduce_alg=nccl" 65 | fi 66 | 67 | $CMD 2>&1 | tee $LOGFILE 68 | 69 | echo "Writting log to $LOGFILE" 70 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/gpu.yaml: -------------------------------------------------------------------------------- 1 | # Training configuration for ResNet trained on ImageNet on GPUs. 2 | # Reaches > 76.1% within 90 epochs. 3 | # Note: This configuration uses a scaled per-replica batch size based on the number of devices. 4 | # distribution_strategy: 'mirrored' 'multi_worker_mirrored' 5 | 6 | runtime: 7 | distribution_strategy: 'mirrored' 8 | num_gpus: 1 9 | batchnorm_spatial_persistent: True 10 | train_dataset: 11 | name: 'imagenet2012' 12 | data_dir: null 13 | builder: 'records' 14 | split: 'train' 15 | image_size: 224 16 | num_classes: 1000 17 | num_examples: 640512 18 | batch_size: 128 19 | use_per_replica_batch_size: True 20 | dtype: 'float32' 21 | mean_subtract: True 22 | standardize: True 23 | validation_dataset: 24 | name: 'imagenet2012' 25 | data_dir: null 26 | builder: 'records' 27 | split: 'validation' 28 | image_size: 224 29 | num_classes: 1000 30 | num_examples: 50000 31 | batch_size: 128 32 | use_per_replica_batch_size: True 33 | dtype: 'float32' 34 | mean_subtract: True 35 | standardize: True 36 | model: 37 | name: 'resnet' 38 | model_params: 39 | rescale_inputs: False 40 | optimizer: 41 | name: 'momentum' 42 | momentum: 0.875 43 | decay: 0.9 44 | epsilon: 0.001 45 | learning_rate: 46 | initial_lr: 0.256 47 | examples_per_epoch: 640512 48 | warmup_epochs: 0 49 | name: 'piecewise_constant_with_warmup' 50 | loss: 51 | label_smoothing: 0.1 52 | train: 53 | resume_checkpoint: False 54 | epochs: 1 55 | steps: 600 56 | evaluation: 57 | epochs_between_evals: 10 58 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/gpu_fp16.yaml: -------------------------------------------------------------------------------- 1 | # Training configuration for ResNet trained on ImageNet on GPUs. 2 | # Reaches > 76.1% within 90 epochs. 3 | # Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
4 | # distribution_strategy: 'mirrored' 'multi_worker_mirrored' 5 | 6 | runtime: 7 | distribution_strategy: 'mirrored' 8 | num_gpus: 1 9 | batchnorm_spatial_persistent: True 10 | train_dataset: 11 | name: 'imagenet2012' 12 | data_dir: null 13 | builder: 'records' 14 | split: 'train' 15 | image_size: 224 16 | num_classes: 1000 17 | num_examples: 640512 18 | batch_size: 256 19 | use_per_replica_batch_size: True 20 | dtype: 'float16' 21 | mean_subtract: True 22 | standardize: True 23 | validation_dataset: 24 | name: 'imagenet2012' 25 | data_dir: null 26 | builder: 'records' 27 | split: 'validation' 28 | image_size: 224 29 | num_classes: 1000 30 | num_examples: 50000 31 | batch_size: 128 32 | use_per_replica_batch_size: True 33 | dtype: 'float16' 34 | mean_subtract: True 35 | standardize: True 36 | model: 37 | name: 'resnet' 38 | model_params: 39 | rescale_inputs: False 40 | optimizer: 41 | name: 'momentum' 42 | momentum: 0.875 43 | decay: 0.9 44 | epsilon: 0.001 45 | learning_rate: 46 | initial_lr: 0.256 47 | examples_per_epoch: 640512 48 | warmup_epochs: 0 49 | name: 'piecewise_constant_with_warmup' 50 | loss: 51 | label_smoothing: 0.1 52 | train: 53 | resume_checkpoint: False 54 | epochs: 1 55 | steps: 600 56 | evaluation: 57 | epochs_between_evals: 10 58 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/multi_node_gpu.yaml: -------------------------------------------------------------------------------- 1 | # Training configuration for ResNet trained on ImageNet on GPUs. 2 | # Reaches > 76.1% within 90 epochs. 3 | # Note: This configuration uses a scaled per-replica batch size based on the number of devices. 4 | 5 | 6 | runtime: 7 | distribution_strategy: 'multi_worker_mirrored' 8 | worker_hosts: '10.11.0.2:11111,10.11.0.3:11111,10.11.0.4:11111,10.11.0.5:11111' 9 | num_gpus: 8 10 | task_index: 0 11 | all_reduce_alg: 'nccl' 12 | batchnorm_spatial_persistent: False 13 | train_dataset: 14 | name: 'imagenet2012' 15 | data_dir: null 16 | builder: 'records' 17 | split: 'train' 18 | image_size: 224 19 | num_classes: 1000 20 | num_examples: 640512 21 | batch_size: 128 22 | use_per_replica_batch_size: True 23 | dtype: 'float32' 24 | mean_subtract: True 25 | standardize: True 26 | validation_dataset: 27 | name: 'imagenet2012' 28 | data_dir: null 29 | builder: 'records' 30 | split: 'validation' 31 | image_size: 224 32 | num_classes: 1000 33 | num_examples: 50000 34 | batch_size: 128 35 | use_per_replica_batch_size: True 36 | dtype: 'float32' 37 | mean_subtract: True 38 | standardize: True 39 | model: 40 | name: 'resnet' 41 | model_params: 42 | rescale_inputs: False 43 | optimizer: 44 | name: 'momentum' 45 | momentum: 0.875 46 | decay: 0.9 47 | epsilon: 0.001 48 | learning_rate: 49 | initial_lr: 1.024 50 | name: 'piecewise_constant_with_warmup' 51 | examples_per_epoch: 640512 52 | warmup_epochs: 1 53 | loss: 54 | label_smoothing: 0.1 55 | train: 56 | resume_checkpoint: False 57 | epochs: 1 58 | steps: 600 59 | evaluation: 60 | epochs_between_evals: 10 61 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/multi_node_gpu_fp16.yaml: -------------------------------------------------------------------------------- 1 | # Training configuration for ResNet trained on ImageNet on GPUs. 2 | # Reaches > 76.1% within 90 epochs. 3 | # Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
4 | 5 | 6 | runtime: 7 | distribution_strategy: 'multi_worker_mirrored' 8 | worker_hosts: '10.11.0.2:11111,10.11.0.3:11111,10.11.0.4:11111,10.11.0.5:11111' 9 | num_gpus: 8 10 | task_index: 0 11 | all_reduce_alg: 'nccl' 12 | batchnorm_spatial_persistent: False 13 | train_dataset: 14 | name: 'imagenet2012' 15 | data_dir: null 16 | builder: 'records' 17 | split: 'train' 18 | image_size: 224 19 | num_classes: 1000 20 | num_examples: 640512 21 | batch_size: 256 22 | use_per_replica_batch_size: True 23 | dtype: 'float16' 24 | mean_subtract: True 25 | standardize: True 26 | validation_dataset: 27 | name: 'imagenet2012' 28 | data_dir: null 29 | builder: 'records' 30 | split: 'validation' 31 | image_size: 224 32 | num_classes: 1000 33 | num_examples: 50000 34 | batch_size: 128 35 | use_per_replica_batch_size: True 36 | dtype: 'float16' 37 | mean_subtract: True 38 | standardize: True 39 | model: 40 | name: 'resnet' 41 | model_params: 42 | rescale_inputs: False 43 | optimizer: 44 | name: 'momentum' 45 | momentum: 0.875 46 | decay: 0.9 47 | epsilon: 0.001 48 | learning_rate: 49 | initial_lr: 1.024 50 | name: 'piecewise_constant_with_warmup' 51 | examples_per_epoch: 640512 52 | warmup_epochs: 1 53 | loss: 54 | label_smoothing: 0.1 55 | train: 56 | resume_checkpoint: False 57 | epochs: 1 58 | steps: 600 59 | evaluation: 60 | epochs_between_evals: 10 61 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/multi_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_DIR=../output 3 | rm -rf $MODEL_DIR 4 | GPUS=${1:-"0,1,2,3,4,5,6,7"} 5 | BATCH_SIZE=${2:-128} 6 | NODE_IPS=${3:-$NODE1,$NODE2,$NODE3,$NODE4} 7 | TEST_NUM=${4:-1} 8 | DTYPE=${5:-"fp32"} 9 | 10 | node_num=$(echo $NODE_IPS | tr ',' '\n' | wc -l) 11 | a=`expr ${#GPUS} + 1` 12 | num_gpu=`expr ${a} / 2` 13 | echo "Node ip : $NODE_IPS" 14 | echo "Use gpus: $GPUS" 15 | echo "Batch size : $BATCH_SIZE" 16 | 17 | 18 | LOG_FOLDER=../logs/tensorflow/resnet50/bz${BATCH_SIZE}/${node_num}n${num_gpu}g 19 | mkdir -p $LOG_FOLDER 20 | LOGFILE=${LOG_FOLDER}/rn50_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log 21 | 22 | if [ "$DTYPE" == "fp16" ] ; then 23 | config_file=configs/examples/resnet/imagenet/multi_node_gpu_fp16.yaml 24 | else 25 | config_file=configs/examples/resnet/imagenet/multi_node_gpu.yaml 26 | fi 27 | 28 | # export PYTHONPATH=$PYTHONPATH:$BENCH_ROOT_DIR/tensorflow/models-2.3.0 29 | export PYTHONPATH=$PYTHONPATH:/home/leinao/tensorflow/models-2.3.0 30 | export CUDA_VISIBLE_DEVICES=$GPUS 31 | DATA_DIR=/datasets/ImageNet/tfrecord # Set up your tfrecord path 32 | 33 | python3 classifier_trainer.py \ 34 | --mode=train_and_eval \ 35 | --model_type='resnet' \ 36 | --dataset=imagenet \ 37 | --model_dir=$MODEL_DIR \ 38 | --data_dir=$DATA_DIR \ 39 | --config_file=$config_file \ 40 | --params_override='runtime.num_gpus='$num_gpu 2>&1 | tee ${LOGFILE} 41 | 42 | echo "Writting log to ${LOGFILE}" -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/run_multi_node.sh: -------------------------------------------------------------------------------- 1 | BATCH_SIZE=${1:-128} 2 | DTYPE=${2:-"fp32"} 3 | TEST_NUM=${3:-5} 4 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 5 | NODE1='10.11.0.2:11111' 6 | NODE2='10.11.0.3:11111' 7 | NODE3='10.11.0.4:11111' 8 | NODE4='10.11.0.5:11111' 9 | nodes=$NODE1,$NODE2,$NODE3,$NODE4 10 | 11 | i=1 12 | while [ $i -le $TEST_NUM ] 13 | do 14 | bash 
$SHELL_FOLDER/multi_node_train.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} $nodes $i $DTYPE 15 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 16 | let i++ 17 | sleep 30 18 | done 19 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | BATCH_SIZE=${1:-128} 2 | DTYPE=${2:-"fp32"} 3 | TEST_NUM=${3:-5} 4 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 5 | 6 | 7 | i=1 8 | while [ $i -le $TEST_NUM ] 9 | do 10 | bash $SHELL_FOLDER/single_node_train.sh 0 ${BATCH_SIZE} $i $DTYPE 11 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 12 | let i++ 13 | sleep 20 14 | done 15 | 16 | 17 | i=1 18 | while [ $i -le $TEST_NUM ] 19 | do 20 | bash $SHELL_FOLDER/single_node_train.sh 0,1,2,3 ${BATCH_SIZE} $i $DTYPE 21 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 22 | let i++ 23 | sleep 20 24 | done 25 | 26 | 27 | i=1 28 | while [ $i -le $TEST_NUM ] 29 | do 30 | bash $SHELL_FOLDER/single_node_train.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} $i $DTYPE 31 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 32 | let i++ 33 | sleep 20 34 | done 35 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/run_two_node.sh: -------------------------------------------------------------------------------- 1 | BATCH_SIZE=${1:-128} 2 | DTYPE=${2:-"fp32"} 3 | TEST_NUM=${3:-5} 4 | SHELL_FOLDER=$(dirname $(readlink -f "$0")) 5 | NODE1='10.11.0.2:11111' 6 | NODE2='10.11.0.3:11111' 7 | 8 | i=1 9 | while [ $i -le $TEST_NUM ] 10 | do 11 | bash $SHELL_FOLDER/two_node_train.sh 0,1,2,3,4,5,6,7 ${BATCH_SIZE} $NODE1,$NODE2 $i $DTYPE 12 | echo ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Finished Test Case ${i}!<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 13 | let i++ 14 | sleep 30 15 | done 16 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/single_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | MODEL_DIR=../output 3 | rm -rf $MODEL_DIR 4 | GPUS=${1:-"0"} 5 | BATCH_SIZE=${2:-128} 6 | TEST_NUM=${3:-1} 7 | DTYPE=${4:-"fp32"} 8 | 9 | a=`expr ${#GPUS} + 1` 10 | num_gpu=`expr ${a} / 2` 11 | echo "Use gpus: $GPUS" 12 | echo "Batch size per device : $BATCH_SIZE" 13 | 14 | 15 | LOG_FOLDER=../logs/tensorflow/resnet50/bz${BATCH_SIZE}/1n${num_gpu}g 16 | mkdir -p $LOG_FOLDER 17 | LOGFILE=${LOG_FOLDER}/rn50_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log 18 | 19 | if [ "$DTYPE" == "fp16" ] ; then 20 | config_file=configs/examples/resnet/imagenet/gpu_fp16.yaml 21 | else 22 | config_file=configs/examples/resnet/imagenet/gpu.yaml 23 | fi 24 | 25 | export PYTHONPATH=$PYTHONPATH:/home/leinao/tensorflow/models-2.3.0 26 | export CUDA_VISIBLE_DEVICES=${GPUS} 27 | DATA_DIR=/datasets/ImageNet/tfrecord 28 | 29 | python3 classifier_trainer.py \ 30 | --mode=train_and_eval \ 31 | --model_type='resnet' \ 32 | --dataset=imagenet \ 33 | --model_dir=$MODEL_DIR \ 34 | --data_dir=$DATA_DIR \ 35 | --config_file=${config_file} \ 36 | --params_override=runtime.num_gpus=$num_gpu 2>&1 | tee ${LOGFILE} 37 | 38 | echo "Writting log to ${LOGFILE}" -------------------------------------------------------------------------------- 
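For reference, the TensorFlow ResNet50 wrappers above call `single_node_train.sh` with positional arguments in the order GPUS, BATCH_SIZE, TEST_NUM, DTYPE; the batch-size argument only affects log naming, since the effective per-replica batch size comes from the selected YAML config (128 for fp32, 256 for fp16). A hypothetical one-off invocation, assuming the script is launched from the directory that contains `classifier_trainer.py` and that the scripts were copied into a `scripts/` subdirectory there:

```bash
# Sketch of a single manual fp16 test case on all eight GPUs:
#   $1 = GPU id list, $2 = batch size (used for log naming), $3 = test number, $4 = dtype
bash scripts/single_node_train.sh 0,1,2,3,4,5,6,7 256 1 fp16
# The log is then written to ../logs/tensorflow/resnet50/bz256/1n8g/rn50_b256_fp16_1.log
```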
/TensorFlow/resnet50v1.5/scripts/two_node_gpu.yaml: -------------------------------------------------------------------------------- 1 | # Training configuration for ResNet trained on ImageNet on GPUs. 2 | # Reaches > 76.1% within 90 epochs. 3 | # Note: This configuration uses a scaled per-replica batch size based on the number of devices. 4 | 5 | runtime: 6 | distribution_strategy: 'multi_worker_mirrored' 7 | worker_hosts: '10.11.0.2:11111,10.11.0.3:11111' 8 | num_gpus: 8 9 | task_index: 0 10 | all_reduce_alg: 'nccl' 11 | batchnorm_spatial_persistent: False 12 | train_dataset: 13 | name: 'imagenet2012' 14 | data_dir: null 15 | builder: 'records' 16 | split: 'train' 17 | image_size: 224 18 | num_classes: 1000 19 | num_examples: 640512 20 | batch_size: 128 21 | use_per_replica_batch_size: True 22 | dtype: 'float32' 23 | mean_subtract: True 24 | standardize: True 25 | validation_dataset: 26 | name: 'imagenet2012' 27 | data_dir: null 28 | builder: 'records' 29 | split: 'validation' 30 | image_size: 224 31 | num_classes: 1000 32 | num_examples: 50000 33 | batch_size: 128 34 | use_per_replica_batch_size: True 35 | dtype: 'float32' 36 | mean_subtract: True 37 | standardize: True 38 | model: 39 | name: 'resnet' 40 | model_params: 41 | rescale_inputs: False 42 | optimizer: 43 | name: 'momentum' 44 | momentum: 0.875 45 | decay: 0.9 46 | epsilon: 0.001 47 | learning_rate: 48 | initial_lr: 1.024 49 | name: 'piecewise_constant_with_warmup' 50 | examples_per_epoch: 640512 51 | warmup_epochs: 1 52 | loss: 53 | label_smoothing: 0.1 54 | train: 55 | resume_checkpoint: False 56 | epochs: 1 57 | steps: 600 58 | evaluation: 59 | epochs_between_evals: 10 60 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/two_node_gpu_fp16.yaml: -------------------------------------------------------------------------------- 1 | # Training configuration for ResNet trained on ImageNet on GPUs. 2 | # Reaches > 76.1% within 90 epochs. 3 | # Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
4 | 5 | runtime: 6 | distribution_strategy: 'multi_worker_mirrored' 7 | worker_hosts: '10.11.0.2:11111,10.11.0.3:11111' 8 | num_gpus: 8 9 | task_index: 0 10 | all_reduce_alg: 'nccl' 11 | batchnorm_spatial_persistent: False 12 | train_dataset: 13 | name: 'imagenet2012' 14 | data_dir: null 15 | builder: 'records' 16 | split: 'train' 17 | image_size: 224 18 | num_classes: 1000 19 | num_examples: 640512 20 | batch_size: 256 21 | use_per_replica_batch_size: True 22 | dtype: 'float16' 23 | mean_subtract: True 24 | standardize: True 25 | validation_dataset: 26 | name: 'imagenet2012' 27 | data_dir: null 28 | builder: 'records' 29 | split: 'validation' 30 | image_size: 224 31 | num_classes: 1000 32 | num_examples: 50000 33 | batch_size: 128 34 | use_per_replica_batch_size: True 35 | dtype: 'float16' 36 | mean_subtract: True 37 | standardize: True 38 | model: 39 | name: 'resnet' 40 | model_params: 41 | rescale_inputs: False 42 | optimizer: 43 | name: 'momentum' 44 | momentum: 0.875 45 | decay: 0.9 46 | epsilon: 0.001 47 | learning_rate: 48 | initial_lr: 1.024 49 | name: 'piecewise_constant_with_warmup' 50 | examples_per_epoch: 640512 51 | warmup_epochs: 1 52 | loss: 53 | label_smoothing: 0.1 54 | train: 55 | resume_checkpoint: False 56 | epochs: 1 57 | steps: 600 58 | evaluation: 59 | epochs_between_evals: 10 60 | -------------------------------------------------------------------------------- /TensorFlow/resnet50v1.5/scripts/two_node_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_DIR=../output 3 | rm -rf $MODEL_DIR 4 | GPUS=${1:-"0,1,2,3,4,5,6,7"} 5 | BATCH_SIZE=${2:-128} 6 | NODE_IPS=${3:-$NODE1,$NODE2} 7 | TEST_NUM=${4:-1} 8 | DTYPE=${5:-"fp32"} 9 | 10 | node_num=$(echo $NODE_IPS | tr ',' '\n' | wc -l) 11 | a=`expr ${#GPUS} + 1` 12 | num_gpu=`expr ${a} / 2` 13 | echo "Node ip : $NODE_IPS" 14 | echo "Use gpus: $GPUS" 15 | echo "Batch size per device : $BATCH_SIZE" 16 | 17 | 18 | LOG_FOLDER=../logs/tensorflow/resnet50/bz${BATCH_SIZE}/${node_num}n${num_gpu}g 19 | mkdir -p $LOG_FOLDER 20 | LOGFILE=${LOG_FOLDER}/rn50_b${BATCH_SIZE}_${DTYPE}_$TEST_NUM.log 21 | 22 | 23 | if [ "$DTYPE" == "fp16" ] ; then 24 | config_file=configs/examples/resnet/imagenet/two_node_gpu_fp16.yaml 25 | else 26 | config_file=configs/examples/resnet/imagenet/two_node_gpu.yaml 27 | fi 28 | 29 | 30 | # export PYTHONPATH=$PYTHONPATH:$BENCH_ROOT_DIR/tensorflow/models-2.3.0 31 | export PYTHONPATH=$PYTHONPATH:/home/leinao/tensorflow/models-2.3.0 32 | export CUDA_VISIBLE_DEVICES=$GPUS 33 | DATA_DIR=/datasets/ImageNet/tfrecord # Set up your tfrecord path 34 | 35 | python3 classifier_trainer.py \ 36 | --mode=train_and_eval \ 37 | --model_type='resnet' \ 38 | --dataset=imagenet \ 39 | --model_dir=$MODEL_DIR \ 40 | --data_dir=$DATA_DIR \ 41 | --config_file=${config_file} \ 42 | --params_override='runtime.num_gpus='$num_gpu 2>&1 | tee ${LOGFILE} 43 | 44 | echo "Writting log to ${LOGFILE}" -------------------------------------------------------------------------------- /reports/DLPerf_report_v1.0.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/DLPerf_report_v1.0.xlsm -------------------------------------------------------------------------------- /reports/README.md: -------------------------------------------------------------------------------- 1 | This Folder contains top level reports of DLPerf test. 
2 | 3 | ## Changelog 4 | Note: the latest entries appear at the top. 5 | 6 | ### Feb 7th, 2021 7 | - [InsightFace Benchmark Test Report V1.0](insightface/dlperf_insightface_test_report_v1.md) 8 | 9 | - [InsightFace Benchmark Test Report V1.0 (Chinese)](insightface/dlperf_insightface_test_report_v1_cn.md) 10 | 11 | ### Jan 29th, 2021 12 | 13 | - [WideDeepLearning Benchmark Test Report V1.0](WideDeepLearning/dlperf_wide_and_deep_test_report_v1.md) 14 | 15 | - [WideDeepLearning Benchmark Test Report V1.0 (Chinese)](WideDeepLearning/dlperf_wide_and_deep_test_report_v1_cn.md) 16 | 17 | ### Oct 9th, 2020 18 | - [DLPerf Benchmark Test Report V1.0](dlperf_benchmark_test_report_v1.md) 19 | - [DLPerf Benchmark Test Report v1.0 (Chinese)](dlperf_benchmark_test_report_v1_cn.md) 20 | 21 | -------------------------------------------------------------------------------- /reports/WideDeepLearning/imgs/wdl_vecx2_1n1g_mem_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/WideDeepLearning/imgs/wdl_vecx2_1n1g_mem_latency.png -------------------------------------------------------------------------------- /reports/WideDeepLearning/imgs/wdl_vecx2_1n8g_mem_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/WideDeepLearning/imgs/wdl_vecx2_1n8g_mem_latency.png -------------------------------------------------------------------------------- /reports/WideDeepLearning/imgs/wdl_vecx2_4n8g_mem_latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/WideDeepLearning/imgs/wdl_vecx2_4n8g_mem_latency.png -------------------------------------------------------------------------------- /reports/imgs/NCCL_debug_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/NCCL_debug_0.jpg -------------------------------------------------------------------------------- /reports/imgs/NCCL_debug_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/NCCL_debug_1.jpg -------------------------------------------------------------------------------- /reports/imgs/NCCL_debug_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/NCCL_debug_2.jpg -------------------------------------------------------------------------------- /reports/imgs/bert_base_amp_bz64_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_amp_bz64_speedup.png -------------------------------------------------------------------------------- /reports/imgs/bert_base_amp_bz64_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_amp_bz64_throughput.png
-------------------------------------------------------------------------------- /reports/imgs/bert_base_amp_bz_max_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_amp_bz_max_speedup.png -------------------------------------------------------------------------------- /reports/imgs/bert_base_amp_bz_max_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_amp_bz_max_throughput.png -------------------------------------------------------------------------------- /reports/imgs/bert_base_fp32_bz32_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_fp32_bz32_speedup.png -------------------------------------------------------------------------------- /reports/imgs/bert_base_fp32_bz32_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_fp32_bz32_throughput.png -------------------------------------------------------------------------------- /reports/imgs/bert_base_fp32_bz_max_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_fp32_bz_max_speedup.png -------------------------------------------------------------------------------- /reports/imgs/bert_base_fp32_bz_max_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/bert_base_fp32_bz_max_throughput.png -------------------------------------------------------------------------------- /reports/imgs/data_parallel_face_emore_r100_bz64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/data_parallel_face_emore_r100_bz64.png -------------------------------------------------------------------------------- /reports/imgs/data_parallel_face_emore_r100_bz_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/data_parallel_face_emore_r100_bz_max.png -------------------------------------------------------------------------------- /reports/imgs/data_parallel_face_emore_y1_bz256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/data_parallel_face_emore_y1_bz256.png -------------------------------------------------------------------------------- /reports/imgs/data_parallel_face_emore_y1_bz_max.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/data_parallel_face_emore_y1_bz_max.png -------------------------------------------------------------------------------- /reports/imgs/emore_r100_fp32_b64_pf_en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/emore_r100_fp32_b64_pf_en.png -------------------------------------------------------------------------------- /reports/imgs/emore_r100_fp32_bmax_pf_en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/emore_r100_fp32_bmax_pf_en.png -------------------------------------------------------------------------------- /reports/imgs/model_parallel_face_emore_r100_bz64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/model_parallel_face_emore_r100_bz64.png -------------------------------------------------------------------------------- /reports/imgs/model_parallel_face_emore_r100_bz_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/model_parallel_face_emore_r100_bz_max.png -------------------------------------------------------------------------------- /reports/imgs/model_parallel_face_emore_y1_bz256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/model_parallel_face_emore_y1_bz256.png -------------------------------------------------------------------------------- /reports/imgs/model_parallel_face_emore_y1_bz_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/model_parallel_face_emore_y1_bz_max.png -------------------------------------------------------------------------------- /reports/imgs/partial_fc_sample_ratio_0_1_face_emore_r100_bz64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/partial_fc_sample_ratio_0_1_face_emore_r100_bz64.png -------------------------------------------------------------------------------- /reports/imgs/partial_fc_sample_ratio_0_1_face_emore_r100_bz_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/partial_fc_sample_ratio_0_1_face_emore_r100_bz_max.png -------------------------------------------------------------------------------- /reports/imgs/partial_fc_sample_ratio_0_1_glint_r100_bz64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/partial_fc_sample_ratio_0_1_glint_r100_bz64.png -------------------------------------------------------------------------------- 
/reports/imgs/partial_fc_sample_ratio_0_1_glint_r100_bz_max.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/partial_fc_sample_ratio_0_1_glint_r100_bz_max.png -------------------------------------------------------------------------------- /reports/imgs/r50_amp_bz256_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/r50_amp_bz256_speedup.png -------------------------------------------------------------------------------- /reports/imgs/r50_amp_bz256_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/r50_amp_bz256_throughput.png -------------------------------------------------------------------------------- /reports/imgs/r50_fp32_bz128_speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/r50_fp32_bz128_speedup.png -------------------------------------------------------------------------------- /reports/imgs/r50_fp32_bz128_throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/DLPerf/c89430bac64f64caabb0445ff7eb80175641454c/reports/imgs/r50_fp32_bz128_throughput.png --------------------------------------------------------------------------------